In [1]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
import csv
import numpy as np
import random
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import math

Using TensorFlow backend.


In [2]:
df = pd.read_csv('train.csv')

df_train_y = df['Survived'].values
df_train_x = df.drop(['Survived'], axis=1)
df_predict_x = pd.read_csv('test.csv')

test_passenger_ids = df_predict_x['PassengerId']
test_passenger_ids = np.reshape(test_passenger_ids.values, (df_predict_x.shape[0], 1))

labels = df_train_x.columns.values

# Remove unused Columns

In [3]:
def remove_unused_cols(dataset, labels):
    unused_cols = ["PassengerId", "Name", "Cabin", "Embarked", "Ticket"]
    
    for col_name in unused_cols:
        labels = labels[labels != col_name]
        dataset = dataset.drop([col_name], axis=1)
       
    
    return dataset, labels

df_train_x, labels = remove_unused_cols(df_train_x, labels)
df_predict_x, _ = remove_unused_cols(df_predict_x, labels)

# Preprocessing

**Fill NaN Values for**
- Age

**Apply One Hot Encoding on**
- Sex

In [4]:
# Should we check for sibling/parents and use this as a criteria for the random value?
# We could potentially even train a network to guess ages based on the other parameters
def fill_missing_age_fields(dataset):
    ages = dataset["Age"]
    mean_age = np.mean(ages[np.where(np.isnan(ages) == False)[0]])
    dataset["Age"].fillna(mean_age, inplace =True)
    
fill_missing_age_fields(df_train_x)
fill_missing_age_fields(df_predict_x)

In [5]:
def categorize_ages(dataset):
    ages = dataset["Age"]
    ages_categorized = np.where(ages >= 0, "          ", '')
    ages_categorized[np.where(ages < 2)] = "infant"
    ages_categorized[np.where((ages >= 2) & (ages < 10))] = "child"
    ages_categorized[np.where((ages >= 10) & (ages < 18))] = "teenager"
    ages_categorized[np.where((ages >= 18) & (ages < 30))] = "youngAdult"
    ages_categorized[np.where((ages >= 30) & (ages < 50))] = "midlife"
    ages_categorized[np.where((ages >= 50))] = "oldFart"
    return dataset.assign(Age = lambda x: ages_categorized)
    #dataset.assign()

df_train_x = categorize_ages(df_train_x)
df_predict_x = categorize_ages(df_predict_x)

In [6]:
def one_hot_encode_column(dataset, col_names):
    for col_name in col_names:
        col = dataset.loc[:, col_name]
        label_encoder = LabelEncoder()
        integer_encoded = label_encoder.fit_transform(col.values)
        onehot_encoder = OneHotEncoder(sparse=False)
        integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
        onehot_encoded = onehot_encoder.fit_transform(integer_encoded)

        dataset = dataset.drop([col_name], axis=1)

        for i in range(0, onehot_encoded.shape[1]):
            new_col_name = "{0}_{1}".format(col_name, str(i))
            dataset[new_col_name] = onehot_encoded[:, i]
    
    return dataset

columns_to_one_hot_encode = ['Pclass', 'Sex', 'Age']

df_train_x = one_hot_encode_column(df_train_x, columns_to_one_hot_encode)
df_predict_x = one_hot_encode_column(df_predict_x, columns_to_one_hot_encode)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


# Standardize Data

In [7]:
scaler = StandardScaler().fit(df_train_x)
train_x = scaler.transform(df_train_x)
predict_x = scaler.transform(df_predict_x)

  return self.partial_fit(X, y)
  
  This is separate from the ipykernel package so we can avoid doing imports until


# Split training data into test and verification set (80:20)

In [8]:
train_x, test_x, train_y, test_y = train_test_split(train_x, df_train_y, test_size=0.2, random_state=12)

In [15]:
results = {}

# Logistic Regression

In [47]:
from sklearn.linear_model import LogisticRegressionCV

lr = LogisticRegressionCV(cv=7, random_state=0)
lr.fit(train_x, train_y)

lr_score = lr.score(test_x, test_y)
print(lr_score)
results[lr_score] = {'name': "Logistic Regression",
                     'model': lr}

0.7877094972067039


# Use Variations of SVM

- Apply Grid Search Cross Validation to identify best Parameters (that might take a while depending on the number of steps...)

In [20]:
from sklearn import svm
from sklearn.model_selection import GridSearchCV

def find_svm_params():

    NUM_STEPS = 40

    param_grid = {
        'C': np.logspace(-3, 4, NUM_STEPS)
    }

    grid = GridSearchCV(svm.SVC(gamma='auto'), param_grid, cv=7)
    grid.fit(train_x, train_y)

    print(grid.best_params_)
    
#find_svm_params()

{'C': 2.5719138090593443}




In [48]:
svc_linear = svm.SVC(kernel='linear', C=2.5719138090593443)
svc_linear.fit(train_x, train_y)

svc_linear_score = svc_linear.score(test_x, test_y)
print(svc_linear_score)

results[svc_linear_score] = {
    'name': 'SVC Linear',
    'model': svc_linear
}

0.7597765363128491


In [14]:
def find_svm_kernel_params()
    param_grid = {
        'C': np.logspace(-3, 4, NUM_STEPS),
        'gamma': np.logspace(-3, 4, NUM_STEPS),
    }

    grid = GridSearchCV(svm.SVC(kernel='rbf'), param_grid, cv=7)
    grid.fit(train_x, train_y)

    print(grid.best_params_)
    
#find_svm_kernel_params()



{'C': 8.886238162743407, 'gamma': 0.04124626382901352}

In [54]:
svc_rbf = svm.SVC(kernel='rbf', gamma=0.04124626382901352, C=8.886238162743407)
svc_rbf.fit(train_x, train_y)

svc_rbf_score = svc_rbf.score(test_x, test_y)
print(svc_rbf_score)

results[svc_rbf_score] = {
    'name': 'SVC RBF',
    'model': svc_rbf
}

0.8044692737430168


# Naive Bayes

In [53]:
from sklearn.naive_bayes import GaussianNB

naive_gauss = GaussianNB()
naive_gauss.fit(train_x, train_y)

naive_gauss_score = naive_gauss.score(test_x, test_y)
print(naive_gauss_score)

results[naive_gauss_score] = {
    'name': 'Naive Gauss',
    'model': naive_gauss
}

0.770949720670391


# Classification Trees & Random Forests

In [51]:
from sklearn import tree
clf = tree.DecisionTreeClassifier(max_depth=6)

clf.fit(train_x, train_y)
clf_score = clf.score(test_x, test_y)
print(clf_score)

results[clf_score] = {
    'name': 'Classification Tree',
    'model': clf
}

0.7541899441340782


In [40]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

def random_search_for_params():
    param_grid = {
        'max_features': ['auto', 'sqrt'],
        'n_estimators': [int(x) for x in np.linspace(start = 200, stop = 7000, num = 10)],
        'max_depth': [int(x) for x in np.linspace(10, 110, num = 11)],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'bootstrap': [True, False],
    }

    rf = RandomForestRegressor()
    rf_random = RandomizedSearchCV(estimator = rf, param_distributions = param_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
    rf_random.fit(train_x, train_y)

    print(rf_random.best_params_)
    
def grid_search_for_params():
    param_grid = {
        'max_features': ['auto'],
        'n_estimators': [4000, 5000, 5500, 6000, 7000],
        'max_depth': [5, 8, 10, 15, 20, 50],
        'min_samples_split': [4, 5, 6],
        'min_samples_leaf': [1],
        'bootstrap': [True],
    }

    rf = RandomForestRegressor()
    rf_gridCV = GridSearchCV(estimator = rf, param_grid = param_grid, cv = 3, verbose=2, n_jobs = -1)
    rf_gridCV.fit(train_x, train_y)

    print(rf_gridCV.best_params_)
    
#random_search_for_params()
#grid_search_for_params()

Fitting 3 folds for each of 90 candidates, totalling 270 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   36.3s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 270 out of 270 | elapsed:  5.7min finished


{'bootstrap': True, 'max_depth': 5, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 5000}


In [52]:
rfc = RandomForestClassifier(max_features='auto', max_depth=5, n_estimators=5000, min_samples_split=4, min_samples_leaf=1, bootstrap=True)
rfc.fit(train_x, train_y)

rfc_score = rfc.score(test_x, test_y)
print(rfc_score)

results[rfc_score] = {
    'name': 'Random Forest',
    'model': rfc
}

0.7821229050279329


# Evaluate Candidates based on Score

In [58]:
best_score = 0

for key in results:
    model_name = results[key]['name']
    print("{0} scored {1}".format(model_name, key))
    
    if key > best_score:
        best_score = key
        best_model_name = model_name
        best_model = results[key]['model']
    
print('Best Model is: {0} with a score of {1} - will continue with this'.format(best_model_name, best_score))

Logistic Regression scored 0.7877094972067039
SVC RBF scored 0.8044692737430168
Naive Gauss scored 0.770949720670391
Classification Tree scored 0.7541899441340782
Random Forest scored 0.7821229050279329
SVC Linear scored 0.7597765363128491
Best Model is: SVC RBF with a score of 0.8044692737430168 - will continue with this


# Predict with best Model

In [59]:
# Run the model against the test data
predict_y = best_model.predict(predict_x)
predict_y = np.around(predict_y)
predict_y = predict_y.astype(np.integer)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [226]:
# Write our predictions to a csv file
csv_predict = np.concatenate((test_passenger_ids, predict_y), axis=1)
csv_predict = np.concatenate((np.reshape(["PassengerId", "Survived"], (1, 2)), csv_predict))
with open('prediction.csv', 'w', newline='') as csvFile:
    writer = csv.writer(csvFile)
    writer.writerows(csv_predict)
csvFile.close()