In [251]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [252]:
# Check for TensorFlow GPU access
print(f"TensorFlow has access to the following devices:\n{tf.config.list_physical_devices()}")

# See TensorFlow version
print(f"TensorFlow version: {tf.__version__}")

TensorFlow has access to the following devices:
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
TensorFlow version: 2.8.0


In [253]:
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

In [254]:
# Inspecting the training data
df_train.shape

(891, 12)

In [255]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [256]:
df_train.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [257]:
df_train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [258]:
# Empty check
df_train.isnull().sum().sort_values() / len(df_train)

PassengerId    0.000000
Survived       0.000000
Pclass         0.000000
Name           0.000000
Sex            0.000000
SibSp          0.000000
Parch          0.000000
Ticket         0.000000
Fare           0.000000
Embarked       0.002245
Age            0.198653
Cabin          0.771044
dtype: float64

In [259]:
# Delete rows without embarked values
df_train = df_train[df_train['Embarked'].notna()]

In [260]:
df_train['Age'].fillna(df_train['Age'].mean(), inplace = True)

In [261]:
df_train.isnull().sum().sort_values() / len(df_train)

PassengerId    0.000000
Survived       0.000000
Pclass         0.000000
Name           0.000000
Sex            0.000000
Age            0.000000
SibSp          0.000000
Parch          0.000000
Ticket         0.000000
Fare           0.000000
Embarked       0.000000
Cabin          0.772778
dtype: float64

In [262]:
df_train['Sex'].replace(['female', 'male'], [0, 1], inplace = True)
df_train['Embarked'].replace(['C', 'Q', 'S'], [0, 1, 2], inplace = True)

In [263]:
df_train.drop(['Name', 'Cabin', 'Ticket', 'Sex', 'Embarked', 'PassengerId'], axis = 1, inplace = True)

In [264]:
df_train.dtypes

Survived      int64
Pclass        int64
Age         float64
SibSp         int64
Parch         int64
Fare        float64
dtype: object

In [265]:
df_train.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
0,0,3,22.0,1,0,7.25
1,1,1,38.0,1,0,71.2833
2,1,3,26.0,0,0,7.925
3,1,1,35.0,1,0,53.1
4,0,3,35.0,0,0,8.05


In [266]:
x_train = df_train.drop('Survived', axis = 1).to_numpy()

In [267]:
y_train = df_train['Survived'].to_numpy()

In [268]:
# Inspect test data
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [269]:
df_test.isnull().sum().sort_values() / len(df_test)

PassengerId    0.000000
Pclass         0.000000
Name           0.000000
Sex            0.000000
SibSp          0.000000
Parch          0.000000
Ticket         0.000000
Embarked       0.000000
Fare           0.002392
Age            0.205742
Cabin          0.782297
dtype: float64

In [270]:
df_test['Age'].fillna(df_test['Age'].mean(), inplace = True)
df_test['Fare'].fillna(df_test['Fare'].mean(), inplace = True)

In [271]:
df_test['Sex'].replace(['female', 'male'], [0, 1], inplace = True)
df_test['Embarked'].replace(['C', 'Q', 'S'], [0, 1, 2], inplace = True)

In [272]:
df_test.drop(['Name', 'Cabin', 'Ticket', 'Sex', 'Embarked'], axis = 1, inplace = True)

In [273]:
df_test.isnull().sum().sort_values() / len(df_test)

PassengerId    0.0
Pclass         0.0
Age            0.0
SibSp          0.0
Parch          0.0
Fare           0.0
dtype: float64

In [274]:
df_test.dtypes

PassengerId      int64
Pclass           int64
Age            float64
SibSp            int64
Parch            int64
Fare           float64
dtype: object

In [275]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
0,892,3,34.5,0,0,7.8292
1,893,3,47.0,1,0,7.0
2,894,2,62.0,0,0,9.6875
3,895,3,27.0,0,0,8.6625
4,896,3,22.0,1,1,12.2875


In [276]:
test_data = df_test.drop('PassengerId', axis = 1).to_numpy()

In [277]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

scaler.fit(x_train)

scaled_x_train = scaler.transform(x_train)
scaled_test_data = scaler.transform(test_data)


In [278]:
scaled_x_train.shape

(889, 5)

In [279]:
y_train.shape

(889,)

In [281]:
from sklearn.linear_model import LogisticRegression

logmodel=LogisticRegression()

logmodel.fit(scaled_x_train, y_train)

y_pred = logmodel.predict(scaled_test_data)

In [282]:

df_predictions = pd.DataFrame(y_pred.reshape(418,1).tolist(), columns=['Survived'], index=df_test['PassengerId'])

df_predictions.to_csv('predictions_log.csv')

In [137]:
# Import train_test_split
from sklearn.model_selection import train_test_split

# Split the 'features' and 'income' data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(scaled_x_train, 
                                                    y_train, 
                                                    test_size = 0.2, 
                                                    random_state = 0)

# Show the results of the split
print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))

Training set has 711 samples.
Testing set has 178 samples.


In [138]:
from sklearn.metrics import fbeta_score, accuracy_score
from time import time

def train_predict(learner, sample_size, X_train, y_train, X_test, y_test): 
    '''
    inputs:
       - learner: the learning algorithm to be trained and predicted on
       - sample_size: the size of samples (number) to be drawn from training set
       - X_train: features training set
       - y_train: income training set
       - X_test: features testing set
       - y_test: income testing set
    '''
    
    results = {}
    
    # TODO: Fit the learner to the training data using slicing with 'sample_size' using .fit(training_features[:], training_labels[:])
    start = time() # Get start time
    learner = learner.fit(X_train[:sample_size], y_train[:sample_size])
    end = time() # Get end time
    
    # TODO: Calculate the training time
    results['train_time'] = end - start
        
    # TODO: Get the predictions on the test set(X_test),
    #       then get predictions on the first 300 training samples(X_train) using .predict()
    start = time() # Get start time
    predictions_test = learner.predict(X_test)
    predictions_train = learner.predict(X_train)
    end = time() # Get end time
    
    # TODO: Calculate the total prediction time
    results['pred_time'] = end - start
            
    # TODO: Compute accuracy on the first 300 training samples which is y_train[:300]
    results['acc_train'] = accuracy_score(y_train, predictions_train)
        
    # TODO: Compute accuracy on test set using accuracy_score()
    results['acc_test'] = accuracy_score(y_test, predictions_test)
    
    # TODO: Compute F-score on the the first 300 training samples using fbeta_score()
    results['f_train'] = fbeta_score(y_train, predictions_train, beta = 0.5)
        
    # TODO: Compute F-score on the test set which is y_test
    results['f_test'] = fbeta_score(y_test, predictions_test, beta = 0.5)
       
    # Success
    print("{} trained on {} samples.".format(learner.__class__.__name__, sample_size))
        
    # Return the results
    return results

In [139]:
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression

# TODO: Initialize the three models
clf_A = SVC(random_state = 42)
clf_B = AdaBoostClassifier(random_state = 42)
clf_C = LogisticRegression(random_state = 42)

# TODO: Calculate the number of samples for 1%, 10%, and 100% of the training data
# HINT: samples_100 is the entire training set i.e. len(y_train)
# HINT: samples_10 is 10% of samples_100 (ensure to set the count of the values to be `int` and not `float`)
# HINT: samples_1 is 1% of samples_100 (ensure to set the count of the values to be `int` and not `float`)
samples = len(y_train)

# Collect results on the learners
results = []
for clf in [clf_A, clf_B, clf_C]:
    results.append(train_predict(clf, samples, X_train, y_train, X_test, y_test))

# Run metrics visualization for the three supervised learning models chosen
results

SVC trained on 711 samples.
AdaBoostClassifier trained on 711 samples.
LogisticRegression trained on 711 samples.


[{'train_time': 0.01690816879272461,
  'pred_time': 0.020408153533935547,
  'acc_train': 0.8607594936708861,
  'acc_test': 0.7471910112359551,
  'f_train': 0.8441558441558441,
  'f_test': 0.7049180327868851},
 {'train_time': 0.03523707389831543,
  'pred_time': 0.006052255630493164,
  'acc_train': 0.8466947960618847,
  'acc_test': 0.7303370786516854,
  'f_train': 0.7943578322197477,
  'f_test': 0.6744868035190614},
 {'train_time': 0.001483917236328125,
  'pred_time': 6.794929504394531e-05,
  'acc_train': 0.8227848101265823,
  'acc_test': 0.7078651685393258,
  'f_train': 0.7710464201416208,
  'f_test': 0.6456456456456456}]

In [34]:
predictions = clf_B.predict(test)

In [35]:
df_predictions = pd.DataFrame(predictions, columns=['Survived'], index=df_test['PassengerId'])

df_predictions.to_csv('predictions.csv')

In [36]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.tree import DecisionTreeClassifier

clf = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(), random_state = 42)

parameters = {'base_estimator__max_depth': [5, 10, 25, 50, 100], 'n_estimators':[5, 10, 25, 50, 75, 100, 150]}

scorer = make_scorer(fbeta_score, beta = 0.5)

grid_obj = GridSearchCV(estimator = clf, param_grid = parameters, scoring = scorer)

grid_fit = grid_obj.fit(X_train, y_train)

best_clf = grid_fit.best_estimator_

predictions = (clf.fit(X_train, y_train)).predict(X_test)
best_predictions = best_clf.predict(X_test)

# Report the before-and-afterscores
print("Unoptimized model\n------")
print("Accuracy score on testing data: {:.4f}".format(accuracy_score(y_test, predictions)))
print("F-score on testing data: {:.4f}".format(fbeta_score(y_test, predictions, beta = 0.5)))
print("\nOptimized Model\n------")
print("Final accuracy score on the testing data: {:.4f}".format(accuracy_score(y_test, best_predictions)))
print("Final F-score on the testing data: {:.4f}".format(fbeta_score(y_test, best_predictions, beta = 0.5)))



Unoptimized model
------
Accuracy score on testing data: 0.7821
F-score on testing data: 0.7207

Optimized Model
------
Final accuracy score on the testing data: 0.7821
Final F-score on the testing data: 0.7207


In [37]:
print('Is there a GPU Available:', tf.test.is_gpu_available())

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
Is there a GPU Available: True


2022-02-13 11:08:40.996098: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-02-13 11:08:40.996343: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Metal device set to: Apple M1 Pro


In [38]:
model = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=(7)),
    tf.keras.layers.Dense(units=256, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(units=192, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(units=128, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 256)               2048      
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 192)               49344     
                                                                 
 dropout_1 (Dropout)         (None, 192)               0         
                                                                 
 dense_2 (Dense)             (None, 128)               24704     
                                                                 
 dense_3 (Dense)             (None, 1)                 129       
                                                                 
Total params: 76,225
Trainable params: 76,225
Non-traina

2022-02-13 11:08:42.922234: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-02-13 11:08:42.922280: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [39]:
X_train.shape

(712, 7)

In [40]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

EPOCHS = 10

history = model.fit(X_train, y_train, epochs = 100)

Epoch 1/100


2022-02-13 11:08:48.503431: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


 1/23 [>.............................] - ETA: 9s - loss: 0.6986 - accuracy: 0.5000

2022-02-13 11:08:48.769795: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

In [None]:
history = model.fit(X_train, y_train, epochs = 100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
 1/23 [>.............................] - ETA: 0s - loss: 0.2210 - accuracy: 0.9062

In [42]:
model.evaluate(X_test, y_test)



2022-02-13 11:09:37.680578: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


[0.8499911427497864, 0.7932961583137512]

In [None]:
predictions_nn = model.predict(test)

In [None]:
predictions_nn

In [None]:
predictions_nn_0_1 = map(lambda x: 0 if x <= 0.5 else 1, predictions_nn)

In [None]:
df_predictions_nn = pd.DataFrame(predictions_nn_0_1, columns=['Survived'], index=df_test['PassengerId'])

df_predictions_nn.to_csv('predictions_nn.csv')