In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

In [2]:
# Check for TensorFlow GPU access
print(f"TensorFlow has access to the following devices:\n{tf.config.list_physical_devices()}")

# See TensorFlow version
print(f"TensorFlow version: {tf.__version__}")

TensorFlow has access to the following devices:
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
TensorFlow version: 2.8.0


In [3]:
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

In [4]:
# Inspecting the data
df_train.shape

(891, 12)

In [5]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
df_train.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [7]:
df_train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [8]:
# Empty check
df_train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [9]:
df_test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [10]:
empty = df_train.isnull().sum()
total = len(df_train.index)
for key, val in empty.items():
    print("{} has {}% missing values".format(key, val/total * 100))

PassengerId has 0.0% missing values
Survived has 0.0% missing values
Pclass has 0.0% missing values
Name has 0.0% missing values
Sex has 0.0% missing values
Age has 19.865319865319865% missing values
SibSp has 0.0% missing values
Parch has 0.0% missing values
Ticket has 0.0% missing values
Fare has 0.0% missing values
Cabin has 77.10437710437711% missing values
Embarked has 0.22446689113355783% missing values


In [11]:
# Preprocessing the data
df_train['Sex'].replace(['female', 'male'], [0, 1], inplace = True)
df_test['Sex'].replace(['female', 'male'], [0, 1], inplace = True)

In [12]:
df_train.drop('Name', axis = 1, inplace = True)
df_train.drop('Cabin', axis = 1, inplace = True)

df_test.drop('Name', axis = 1, inplace = True)
df_test.drop('Cabin', axis = 1, inplace = True)

In [13]:
df_train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Sex              int64
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Embarked        object
dtype: object

In [14]:
df_train.fillna(0, inplace = True)

df_test.fillna(0, inplace = True)

In [15]:
empty = df_train.isnull().sum()
total = len(df_train.index)
for key, val in empty.items():
    print("{} has {}% missing values".format(key, val/total * 100))

PassengerId has 0.0% missing values
Survived has 0.0% missing values
Pclass has 0.0% missing values
Sex has 0.0% missing values
Age has 0.0% missing values
SibSp has 0.0% missing values
Parch has 0.0% missing values
Ticket has 0.0% missing values
Fare has 0.0% missing values
Embarked has 0.0% missing values


In [16]:
df_train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Sex              int64
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Embarked        object
dtype: object

In [17]:
df_train['Embarked'].replace(['C', 'Q', 'S'], [0, 1, 2], inplace = True)

df_test['Embarked'].replace(['C', 'Q', 'S'], [0, 1, 2], inplace = True)

In [18]:
df_train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Sex              int64
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Embarked         int64
dtype: object

In [19]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,1,22.0,1,0,A/5 21171,7.25,2
1,2,1,1,0,38.0,1,0,PC 17599,71.2833,0
2,3,1,3,0,26.0,0,0,STON/O2. 3101282,7.925,2
3,4,1,1,0,35.0,1,0,113803,53.1,2
4,5,0,3,1,35.0,0,0,373450,8.05,2


In [20]:
df_train.drop('Ticket', axis = 1, inplace = True)

df_test.drop('Ticket', axis = 1, inplace = True)

In [21]:
df_train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Sex              int64
Age            float64
SibSp            int64
Parch            int64
Fare           float64
Embarked         int64
dtype: object

In [22]:
df_train.drop('PassengerId', axis = 1, inplace = True)

In [23]:
df_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.925,2
3,1,1,0,35.0,1,0,53.1,2
4,0,3,1,35.0,0,0,8.05,2


In [25]:
x_train = df_train.drop('Survived', axis = 1).to_numpy()

In [26]:
y_train = df_train['Survived'].to_numpy()

array([[ 3.    ,  1.    , 22.    , ...,  0.    ,  7.25  ,  2.    ],
       [ 1.    ,  0.    , 38.    , ...,  0.    , 71.2833,  0.    ],
       [ 3.    ,  0.    , 26.    , ...,  0.    ,  7.925 ,  2.    ],
       ...,
       [ 3.    ,  0.    ,  0.    , ...,  2.    , 23.45  ,  2.    ],
       [ 1.    ,  1.    , 26.    , ...,  0.    , 30.    ,  0.    ],
       [ 3.    ,  1.    , 32.    , ...,  0.    ,  7.75  ,  1.    ]])

In [27]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,1,34.5,0,0,7.8292,1
1,893,3,0,47.0,1,0,7.0,2
2,894,2,1,62.0,0,0,9.6875,1
3,895,3,1,27.0,0,0,8.6625,2
4,896,3,0,22.0,1,1,12.2875,2


In [28]:
test = df_test.drop('PassengerId', axis = 1).to_numpy()

In [29]:
# Import train_test_split
from sklearn.model_selection import train_test_split

# Split the 'features' and 'income' data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(x_train, 
                                                    y_train, 
                                                    test_size = 0.2, 
                                                    random_state = 0)

# Show the results of the split
print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))

Training set has 712 samples.
Testing set has 179 samples.


In [30]:
from sklearn.metrics import fbeta_score, accuracy_score
from time import time

def train_predict(learner, sample_size, X_train, y_train, X_test, y_test): 
    '''
    inputs:
       - learner: the learning algorithm to be trained and predicted on
       - sample_size: the size of samples (number) to be drawn from training set
       - X_train: features training set
       - y_train: income training set
       - X_test: features testing set
       - y_test: income testing set
    '''
    
    results = {}
    
    # TODO: Fit the learner to the training data using slicing with 'sample_size' using .fit(training_features[:], training_labels[:])
    start = time() # Get start time
    learner = learner.fit(X_train[:sample_size], y_train[:sample_size])
    end = time() # Get end time
    
    # TODO: Calculate the training time
    results['train_time'] = end - start
        
    # TODO: Get the predictions on the test set(X_test),
    #       then get predictions on the first 300 training samples(X_train) using .predict()
    start = time() # Get start time
    predictions_test = learner.predict(X_test)
    predictions_train = learner.predict(X_train)
    end = time() # Get end time
    
    # TODO: Calculate the total prediction time
    results['pred_time'] = end - start
            
    # TODO: Compute accuracy on the first 300 training samples which is y_train[:300]
    results['acc_train'] = accuracy_score(y_train, predictions_train)
        
    # TODO: Compute accuracy on test set using accuracy_score()
    results['acc_test'] = accuracy_score(y_test, predictions_test)
    
    # TODO: Compute F-score on the the first 300 training samples using fbeta_score()
    results['f_train'] = fbeta_score(y_train, predictions_train, beta = 0.5)
        
    # TODO: Compute F-score on the test set which is y_test
    results['f_test'] = fbeta_score(y_test, predictions_test, beta = 0.5)
       
    # Success
    print("{} trained on {} samples.".format(learner.__class__.__name__, sample_size))
        
    # Return the results
    return results

In [31]:
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression

# TODO: Initialize the three models
clf_A = SVC(random_state = 42)
clf_B = AdaBoostClassifier(random_state = 42)
clf_C = LogisticRegression(random_state = 42)

# TODO: Calculate the number of samples for 1%, 10%, and 100% of the training data
# HINT: samples_100 is the entire training set i.e. len(y_train)
# HINT: samples_10 is 10% of samples_100 (ensure to set the count of the values to be `int` and not `float`)
# HINT: samples_1 is 1% of samples_100 (ensure to set the count of the values to be `int` and not `float`)
samples = len(y_train)

# Collect results on the learners
results = []
for clf in [clf_A, clf_B, clf_C]:
    results.append(train_predict(clf, samples, X_train, y_train, X_test, y_test))

# Run metrics visualization for the three supervised learning models chosen
results

SVC trained on 712 samples.
AdaBoostClassifier trained on 712 samples.
LogisticRegression trained on 712 samples.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[{'train_time': 0.013957738876342773,
  'pred_time': 0.03526186943054199,
  'acc_train': 0.6797752808988764,
  'acc_test': 0.7206703910614525,
  'f_train': 0.5381165919282511,
  'f_test': 0.6476683937823834},
 {'train_time': 0.02806401252746582,
  'pred_time': 0.0054628849029541016,
  'acc_train': 0.8300561797752809,
  'acc_test': 0.8156424581005587,
  'f_train': 0.782543265613243,
  'f_test': 0.7657657657657658},
 {'train_time': 0.009641408920288086,
  'pred_time': 7.486343383789062e-05,
  'acc_train': 0.7879213483146067,
  'acc_test': 0.8156424581005587,
  'f_train': 0.7298985167837627,
  'f_test': 0.7593123209169055}]

In [32]:
predictions = clf_B.predict(test)

In [33]:
df_predictions = pd.DataFrame(predictions, columns=['Survived'], index=df_test['PassengerId'])

df_predictions.to_csv('predictions.csv')

In [35]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.tree import DecisionTreeClassifier

clf = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(), random_state = 42)

parameters = {'base_estimator__max_depth': [5, 10, 25, 50, 100], 'n_estimators':[5, 10, 25, 50, 75, 100, 150]}

scorer = make_scorer(fbeta_score, beta = 0.5)

grid_obj = GridSearchCV(estimator = clf, param_grid = parameters, scoring = scorer)

grid_fit = grid_obj.fit(X_train, y_train)

best_clf = grid_fit.best_estimator_

predictions = (clf.fit(X_train, y_train)).predict(X_test)
best_predictions = best_clf.predict(X_test)

# Report the before-and-afterscores
print("Unoptimized model\n------")
print("Accuracy score on testing data: {:.4f}".format(accuracy_score(y_test, predictions)))
print("F-score on testing data: {:.4f}".format(fbeta_score(y_test, predictions, beta = 0.5)))
print("\nOptimized Model\n------")
print("Final accuracy score on the testing data: {:.4f}".format(accuracy_score(y_test, best_predictions)))
print("Final F-score on the testing data: {:.4f}".format(fbeta_score(y_test, best_predictions, beta = 0.5)))



Unoptimized model
------
Accuracy score on testing data: 0.7821
F-score on testing data: 0.7207

Optimized Model
------
Final accuracy score on the testing data: 0.7821
Final F-score on the testing data: 0.7207


In [37]:
print('Is there a GPU Available:', tf.test.is_gpu_available())

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
Is there a GPU Available: True


2022-02-13 10:19:59.617373: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-02-13 10:19:59.617449: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [98]:
model = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=(7)),
    tf.keras.layers.Dense(units=256, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(units=192, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(units=128, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.summary()

Model: "sequential_17"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_55 (Dense)            (None, 256)               2048      
                                                                 
 dropout_20 (Dropout)        (None, 256)               0         
                                                                 
 dense_56 (Dense)            (None, 192)               49344     
                                                                 
 dropout_21 (Dropout)        (None, 192)               0         
                                                                 
 dense_57 (Dense)            (None, 128)               24704     
                                                                 
 dense_58 (Dense)            (None, 1)                 129       
                                                                 
Total params: 76,225
Trainable params: 76,225
Non-tra

In [99]:
X_train.shape

(712, 7)

In [100]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

EPOCHS = 10

history = model.fit(X_train, y_train, epochs = 100)

Epoch 1/100

2022-02-13 10:46:31.043646: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [101]:
model.evaluate(X_test, y_test)



2022-02-13 10:46:49.070803: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


[0.42611798644065857, 0.7932961583137512]

In [102]:
predictions_nn = model.predict(test)

2022-02-13 10:46:52.671499: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


In [103]:
predictions_nn

array([[1.89343959e-01],
       [1.77389622e-01],
       [5.20856753e-02],
       [1.85960263e-01],
       [4.26267922e-01],
       [2.11655498e-01],
       [5.01177549e-01],
       [1.74172074e-01],
       [6.44906282e-01],
       [7.17123523e-02],
       [1.01956502e-01],
       [3.23146820e-01],
       [9.63842213e-01],
       [1.86488837e-01],
       [9.91995454e-01],
       [8.83425117e-01],
       [2.38098547e-01],
       [2.29973122e-01],
       [4.43246663e-01],
       [1.93428844e-01],
       [3.27072263e-01],
       [3.82403225e-01],
       [9.37981248e-01],
       [3.65675300e-01],
       [5.10816991e-01],
       [1.33704543e-01],
       [9.35614049e-01],
       [2.49765560e-01],
       [3.62933010e-01],
       [9.69814062e-02],
       [1.72148034e-01],
       [1.11271419e-01],
       [4.23269123e-01],
       [6.49674177e-01],
       [3.72394443e-01],
       [2.09229037e-01],
       [1.88095301e-01],
       [7.27414131e-01],
       [1.20616868e-01],
       [3.96405548e-01],


In [115]:
predictions_nn_0_1 = map(lambda x: 0 if x <= 0.5 else 1, predictions_nn)

In [116]:
df_predictions_nn = pd.DataFrame(predictions_nn_0_1, columns=['Survived'], index=df_test['PassengerId'])

df_predictions_nn.to_csv('predictions_nn.csv')