In [26]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [27]:
# Check for TensorFlow GPU access
print(f"TensorFlow has access to the following devices:\n{tf.config.list_physical_devices()}")

# See TensorFlow version
print(f"TensorFlow version: {tf.__version__}")

TensorFlow has access to the following devices:
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
TensorFlow version: 2.8.0


In [28]:
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

In [29]:
# Inspecting the data
df_train.shape

(891, 12)

In [30]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [31]:
df_train.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [32]:
df_train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [33]:
# Empty check
df_train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [34]:
empty = df_train.isnull().sum()
total = len(df_train.index)
for key, val in empty.items():
    print("{} has {}% missing values".format(key, val/total * 100))

PassengerId has 0.0% missing values
Survived has 0.0% missing values
Pclass has 0.0% missing values
Name has 0.0% missing values
Sex has 0.0% missing values
Age has 19.865319865319865% missing values
SibSp has 0.0% missing values
Parch has 0.0% missing values
Ticket has 0.0% missing values
Fare has 0.0% missing values
Cabin has 77.10437710437711% missing values
Embarked has 0.22446689113355783% missing values


In [35]:
# Preprocessing the data
df_train['Sex'].replace(['female', 'male'], [0, 1], inplace = True)
df_test['Sex'].replace(['female', 'male'], [0, 1], inplace = True)

In [36]:
df_train.drop('Name', axis = 1, inplace = True)
df_train.drop('Cabin', axis = 1, inplace = True)

df_test.drop('Name', axis = 1, inplace = True)
df_test.drop('Cabin', axis = 1, inplace = True)

In [37]:
df_train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Sex              int64
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Embarked        object
dtype: object

In [38]:
df_train.fillna(0, inplace = True)

df_test.fillna(0, inplace = True)

In [39]:
empty = df_train.isnull().sum()
total = len(df_train.index)
for key, val in empty.items():
    print("{} has {}% missing values".format(key, val/total * 100))

PassengerId has 0.0% missing values
Survived has 0.0% missing values
Pclass has 0.0% missing values
Sex has 0.0% missing values
Age has 0.0% missing values
SibSp has 0.0% missing values
Parch has 0.0% missing values
Ticket has 0.0% missing values
Fare has 0.0% missing values
Embarked has 0.0% missing values


In [40]:
df_train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Sex              int64
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Embarked        object
dtype: object

In [41]:
df_train['Embarked'].replace(['C', 'Q', 'S'], [0, 1, 2], inplace = True)

df_test['Embarked'].replace(['C', 'Q', 'S'], [0, 1, 2], inplace = True)

In [42]:
df_train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Sex              int64
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Embarked         int64
dtype: object

In [43]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,1,22.0,1,0,A/5 21171,7.25,2
1,2,1,1,0,38.0,1,0,PC 17599,71.2833,0
2,3,1,3,0,26.0,0,0,STON/O2. 3101282,7.925,2
3,4,1,1,0,35.0,1,0,113803,53.1,2
4,5,0,3,1,35.0,0,0,373450,8.05,2


In [44]:
df_train.drop('Ticket', axis = 1, inplace = True)

df_test.drop('Ticket', axis = 1, inplace = True)

In [45]:
df_train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Sex              int64
Age            float64
SibSp            int64
Parch            int64
Fare           float64
Embarked         int64
dtype: object

In [46]:
df_train.drop('PassengerId', axis = 1, inplace = True)

In [47]:
df_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.925,2
3,1,1,0,35.0,1,0,53.1,2
4,0,3,1,35.0,0,0,8.05,2


In [49]:
x_train = df_train.drop('Survived', axis = 1).to_numpy()

In [51]:
y_train = df_train['Survived'].to_numpy()

In [53]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,1,34.5,0,0,7.8292,1
1,893,3,0,47.0,1,0,7.0,2
2,894,2,1,62.0,0,0,9.6875,1
3,895,3,1,27.0,0,0,8.6625,2
4,896,3,0,22.0,1,1,12.2875,2


In [54]:
test = df_test.drop('PassengerId', axis = 1).to_numpy()

In [55]:
# Import train_test_split
from sklearn.model_selection import train_test_split

# Split the 'features' and 'income' data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(x_train, 
                                                    y_train, 
                                                    test_size = 0.2, 
                                                    random_state = 0)

# Show the results of the split
print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))

Training set has 712 samples.
Testing set has 179 samples.


In [56]:
from sklearn.metrics import fbeta_score, accuracy_score
from time import time

def train_predict(learner, sample_size, X_train, y_train, X_test, y_test): 
    '''
    inputs:
       - learner: the learning algorithm to be trained and predicted on
       - sample_size: the size of samples (number) to be drawn from training set
       - X_train: features training set
       - y_train: income training set
       - X_test: features testing set
       - y_test: income testing set
    '''
    
    results = {}
    
    # TODO: Fit the learner to the training data using slicing with 'sample_size' using .fit(training_features[:], training_labels[:])
    start = time() # Get start time
    learner = learner.fit(X_train[:sample_size], y_train[:sample_size])
    end = time() # Get end time
    
    # TODO: Calculate the training time
    results['train_time'] = end - start
        
    # TODO: Get the predictions on the test set(X_test),
    #       then get predictions on the first 300 training samples(X_train) using .predict()
    start = time() # Get start time
    predictions_test = learner.predict(X_test)
    predictions_train = learner.predict(X_train[:300])
    end = time() # Get end time
    
    # TODO: Calculate the total prediction time
    results['pred_time'] = end - start
            
    # TODO: Compute accuracy on the first 300 training samples which is y_train[:300]
    results['acc_train'] = accuracy_score(y_train[:300], predictions_train)
        
    # TODO: Compute accuracy on test set using accuracy_score()
    results['acc_test'] = accuracy_score(y_test, predictions_test)
    
    # TODO: Compute F-score on the the first 300 training samples using fbeta_score()
    results['f_train'] = fbeta_score(y_train[:300], predictions_train, beta = 0.5)
        
    # TODO: Compute F-score on the test set which is y_test
    results['f_test'] = fbeta_score(y_test, predictions_test, beta = 0.5)
       
    # Success
    print("{} trained on {} samples.".format(learner.__class__.__name__, sample_size))
        
    # Return the results
    return results

In [57]:
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression

# TODO: Initialize the three models
clf_A = SVC(random_state = 42)
clf_B = AdaBoostClassifier(random_state = 42)
clf_C = LogisticRegression(random_state = 42)

# TODO: Calculate the number of samples for 1%, 10%, and 100% of the training data
# HINT: samples_100 is the entire training set i.e. len(y_train)
# HINT: samples_10 is 10% of samples_100 (ensure to set the count of the values to be `int` and not `float`)
# HINT: samples_1 is 1% of samples_100 (ensure to set the count of the values to be `int` and not `float`)
samples = len(y_train)

# Collect results on the learners
results = []
for clf in [clf_A, clf_B, clf_C]:
    results.append(train_predict(clf, samples, X_train, y_train, X_test, y_test))

# Run metrics visualization for the three supervised learning models chosen
results

SVC trained on 712 samples.
AdaBoostClassifier trained on 712 samples.
LogisticRegression trained on 712 samples.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[{'train_time': 0.0069501399993896484,
  'pred_time': 0.008662939071655273,
  'acc_train': 0.69,
  'acc_test': 0.7206703910614525,
  'f_train': 0.4893617021276595,
  'f_test': 0.6476683937823834},
 {'train_time': 0.03214883804321289,
  'pred_time': 0.004682064056396484,
  'acc_train': 0.8333333333333334,
  'acc_test': 0.8156424581005587,
  'f_train': 0.7729941291585127,
  'f_test': 0.7657657657657658},
 {'train_time': 0.008093833923339844,
  'pred_time': 7.295608520507812e-05,
  'acc_train': 0.7766666666666666,
  'acc_test': 0.8156424581005587,
  'f_train': 0.6893203883495145,
  'f_test': 0.7593123209169055}]

In [59]:
predictions = clf_B.predict(test)

In [66]:
df_predictions = pd.DataFrame(predictions, columns=['Survived'], index=df_test['PassengerId'])

df_predictions.to_csv('predictions.csv')