In [65]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

In [66]:
# Check for TensorFlow GPU access
print(f"TensorFlow has access to the following devices:\n{tf.config.list_physical_devices()}")

# See TensorFlow version
print(f"TensorFlow version: {tf.__version__}")

TensorFlow has access to the following devices:
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
TensorFlow version: 2.8.0


In [67]:
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

In [68]:
# Inspecting the training data
df_train.shape

(891, 12)

In [69]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [70]:
df_train.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [71]:
df_train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [72]:
# Empty check
df_train.isnull().sum().sort_values() / len(df_train)

PassengerId    0.000000
Survived       0.000000
Pclass         0.000000
Name           0.000000
Sex            0.000000
SibSp          0.000000
Parch          0.000000
Ticket         0.000000
Fare           0.000000
Embarked       0.002245
Age            0.198653
Cabin          0.771044
dtype: float64

In [73]:
# Delete rows without embarked values
df_train = df_train[df_train['Embarked'].notna()]

In [74]:
df_train['Age'].fillna(df_train['Age'].mean(), inplace = True)

In [75]:
df_train.isnull().sum().sort_values() / len(df_train)

PassengerId    0.000000
Survived       0.000000
Pclass         0.000000
Name           0.000000
Sex            0.000000
Age            0.000000
SibSp          0.000000
Parch          0.000000
Ticket         0.000000
Fare           0.000000
Embarked       0.000000
Cabin          0.772778
dtype: float64

In [76]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [77]:
x_train = df_train.drop('Survived', axis = 1).to_numpy()

In [78]:
y_train = df_train['Survived'].to_numpy()

In [79]:
# Inspect test data
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [80]:
df_test.isnull().sum().sort_values() / len(df_test)

PassengerId    0.000000
Pclass         0.000000
Name           0.000000
Sex            0.000000
SibSp          0.000000
Parch          0.000000
Ticket         0.000000
Embarked       0.000000
Fare           0.002392
Age            0.205742
Cabin          0.782297
dtype: float64

In [81]:
df_test['Age'].fillna(df_test['Age'].mean(), inplace = True)
df_test['Fare'].fillna(df_test['Fare'].mean(), inplace = True)

In [82]:
df_test.isnull().sum().sort_values() / len(df_test)

PassengerId    0.000000
Pclass         0.000000
Name           0.000000
Sex            0.000000
Age            0.000000
SibSp          0.000000
Parch          0.000000
Ticket         0.000000
Fare           0.000000
Embarked       0.000000
Cabin          0.782297
dtype: float64

In [83]:
df_test.dtypes

PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [84]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [85]:
test_data = df_test.drop('PassengerId', axis = 1).to_numpy()

In [101]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

y = df_train["Survived"]

features = ["Pclass", "Sex", "SibSp", "Parch"]
X = pd.get_dummies(df_train[features])
X_test = pd.get_dummies(df_test[features])

model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(X, y)

print(accuracy_score(model.predict(X), y))

predictions = model.predict(X_test)



output = pd.DataFrame({'PassengerId': df_test.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

0.8143982002249719
Your submission was successfully saved!


In [105]:
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Always scale the input. The most convenient way is to use a pipeline.
clf = make_pipeline(StandardScaler(),
                    SGDClassifier(max_iter=1000, tol=1e-3))
clf.fit(X.to_numpy(), y.to_numpy())

print(clf(model.predict(X), y))

TypeError: 'Pipeline' object is not callable

In [106]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

scaler.fit(X)

scaled_x_train = scaler.transform(X)
scaled_test_data = scaler.transform(X_test)


In [112]:
from sklearn.model_selection import GridSearchCV

clf = SGDClassifier(max_iter=1000, tol=1e-3)
clf.fit(X, y)

print(accuracy_score(clf.predict(X), y))

0.8053993250843644


In [115]:
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import GridSearchCV

clf = SGDClassifier(loss='hinge', max_iter=100)
calibrated_clf = CalibratedClassifierCV(base_estimator=clf, method='sigmoid', cv=3)  # set the SGD classifier as the base estimator


grid_params = {'base_estimator__alpha': [0.0001, 0.001, 0.01]}  # note 'base_estimator__' in the params because you want to change params in the SGDClassifier
grid_search = GridSearchCV(estimator=calibrated_clf, param_grid=grid_params, cv=3)
grid_search.fit(X, y)

print(grid_search.best_params_)

{'base_estimator__alpha': 0.0001}


In [116]:
calibrated_clf.set_params(**grid_search.best_params_)
calibrated_clf.fit(X, y)

print(accuracy_score(calibrated_clf.predict(X), y))

0.8053993250843644


In [119]:
pred = calibrated_clf.predict(X_test)


output = pd.DataFrame({'PassengerId': df_test.PassengerId, 'Survived': pred})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


In [118]:
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(scaled_x_train, y)

print(accuracy_score(model.predict(scaled_x_train), y))

predictions = model.predict(scaled_test_data)



output = pd.DataFrame({'PassengerId': df_test.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

0.8143982002249719
Your submission was successfully saved!


In [None]:
scaled_x_train.shape

In [None]:
y_train.shape

In [None]:
from sklearn.linear_model import LogisticRegression

logmodel=LogisticRegression()

logmodel.fit(scaled_x_train, y_train)

y_pred = logmodel.predict(scaled_test_data)

In [None]:

df_predictions = pd.DataFrame(y_pred.reshape(418,1).tolist(), columns=['Survived'], index=df_test['PassengerId'])

df_predictions.to_csv('predictions_log.csv')

In [None]:
# Import train_test_split
from sklearn.model_selection import train_test_split

# Split the 'features' and 'income' data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(scaled_x_train, 
                                                    y_train, 
                                                    test_size = 0.2, 
                                                    random_state = 0)

# Show the results of the split
print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))

In [None]:
from sklearn.metrics import fbeta_score, accuracy_score
from time import time

def train_predict(learner, sample_size, X_train, y_train, X_test, y_test): 
    '''
    inputs:
       - learner: the learning algorithm to be trained and predicted on
       - sample_size: the size of samples (number) to be drawn from training set
       - X_train: features training set
       - y_train: income training set
       - X_test: features testing set
       - y_test: income testing set
    '''
    
    results = {}
    
    # TODO: Fit the learner to the training data using slicing with 'sample_size' using .fit(training_features[:], training_labels[:])
    start = time() # Get start time
    learner = learner.fit(X_train[:sample_size], y_train[:sample_size])
    end = time() # Get end time
    
    # TODO: Calculate the training time
    results['train_time'] = end - start
        
    # TODO: Get the predictions on the test set(X_test),
    #       then get predictions on the first 300 training samples(X_train) using .predict()
    start = time() # Get start time
    predictions_test = learner.predict(X_test)
    predictions_train = learner.predict(X_train)
    end = time() # Get end time
    
    # TODO: Calculate the total prediction time
    results['pred_time'] = end - start
            
    # TODO: Compute accuracy on the first 300 training samples which is y_train[:300]
    results['acc_train'] = accuracy_score(y_train, predictions_train)
        
    # TODO: Compute accuracy on test set using accuracy_score()
    results['acc_test'] = accuracy_score(y_test, predictions_test)
    
    # TODO: Compute F-score on the the first 300 training samples using fbeta_score()
    results['f_train'] = fbeta_score(y_train, predictions_train, beta = 0.5)
        
    # TODO: Compute F-score on the test set which is y_test
    results['f_test'] = fbeta_score(y_test, predictions_test, beta = 0.5)
       
    # Success
    print("{} trained on {} samples.".format(learner.__class__.__name__, sample_size))
        
    # Return the results
    return results

In [None]:
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression

# TODO: Initialize the three models
clf_A = SVC(random_state = 42)
clf_B = AdaBoostClassifier(random_state = 42)
clf_C = LogisticRegression(random_state = 42)

# TODO: Calculate the number of samples for 1%, 10%, and 100% of the training data
# HINT: samples_100 is the entire training set i.e. len(y_train)
# HINT: samples_10 is 10% of samples_100 (ensure to set the count of the values to be `int` and not `float`)
# HINT: samples_1 is 1% of samples_100 (ensure to set the count of the values to be `int` and not `float`)
samples = len(y_train)

# Collect results on the learners
results = []
for clf in [clf_A, clf_B, clf_C]:
    results.append(train_predict(clf, samples, X_train, y_train, X_test, y_test))

# Run metrics visualization for the three supervised learning models chosen
results

In [None]:
predictions = clf_B.predict(test)

In [None]:
df_predictions = pd.DataFrame(predictions, columns=['Survived'], index=df_test['PassengerId'])

df_predictions.to_csv('predictions.csv')

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.tree import DecisionTreeClassifier

clf = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(), random_state = 42)

parameters = {'base_estimator__max_depth': [5, 10, 25, 50, 100], 'n_estimators':[5, 10, 25, 50, 75, 100, 150]}

scorer = make_scorer(fbeta_score, beta = 0.5)

grid_obj = GridSearchCV(estimator = clf, param_grid = parameters, scoring = scorer)

grid_fit = grid_obj.fit(X_train, y_train)

best_clf = grid_fit.best_estimator_

predictions = (clf.fit(X_train, y_train)).predict(X_test)
best_predictions = best_clf.predict(X_test)

# Report the before-and-afterscores
print("Unoptimized model\n------")
print("Accuracy score on testing data: {:.4f}".format(accuracy_score(y_test, predictions)))
print("F-score on testing data: {:.4f}".format(fbeta_score(y_test, predictions, beta = 0.5)))
print("\nOptimized Model\n------")
print("Final accuracy score on the testing data: {:.4f}".format(accuracy_score(y_test, best_predictions)))
print("Final F-score on the testing data: {:.4f}".format(fbeta_score(y_test, best_predictions, beta = 0.5)))



In [None]:
print('Is there a GPU Available:', tf.test.is_gpu_available())

In [None]:
model = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=(7)),
    tf.keras.layers.Dense(units=256, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(units=192, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(units=128, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.summary()

In [None]:
X_train.shape

In [None]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

EPOCHS = 10

history = model.fit(X_train, y_train, epochs = 100)

In [None]:
history = model.fit(X_train, y_train, epochs = 100)

In [None]:
model.evaluate(X_test, y_test)

In [None]:
predictions_nn = model.predict(test)

In [None]:
predictions_nn

In [None]:
predictions_nn_0_1 = map(lambda x: 0 if x <= 0.5 else 1, predictions_nn)

In [None]:
df_predictions_nn = pd.DataFrame(predictions_nn_0_1, columns=['Survived'], index=df_test['PassengerId'])

df_predictions_nn.to_csv('predictions_nn.csv')