In [1]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
import csv
import numpy as np
import random
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import math
from matplotlib import pyplot as plt
import collections
import Dataset as ds
import DatasetModifier as dsm
import Classifications as classifications

Using TensorFlow backend.


In [2]:
# Dataset used for both training and evaluation; it is created with
# has_Y_param enabled and populated from train.csv.
dataset = ds.Dataset(has_Y_param=True)
dataset.load_dataset_from_csv('train.csv')

In [3]:
# Dataset the final predictions are made for; it is created with
# has_Y_param disabled and populated from test.csv.
dataset_prediction = ds.Dataset(has_Y_param=False)
dataset_prediction.load_dataset_from_csv('test.csv')

In [4]:
# A single modifier object; later cells pass both `dataset` and
# `dataset_prediction` to its generate_dataset() method.
datasetModifier = dsm.DatasetModifier()
# NOTE(review): no dataset has been handed to the modifier yet, so this
# presumably registers a shuffle step rather than shuffling immediately —
# confirm in DatasetModifier.
datasetModifier.dataset_randomize()

In [5]:
def extract_title_from_name(row):
    """Return the honorific (e.g. ``'Mr'``) embedded in a passenger's name.

    The title is taken from the second comma-separated segment of the name,
    up to (but not including) that segment's first period, with surrounding
    whitespace removed. For ``'Braund, Mr. Owen Harris'`` this yields ``'Mr'``.

    Args:
        row: Mapping-like record (for example a DataFrame row) that provides
            a ``'Name'`` entry.

    Returns:
        The extracted title, or ``'Unknown'`` when the name is not a string
        or contains no comma.
    """
    name = row['Name']
    # A missing value (e.g. NaN) is not a string and has no .split().
    if not isinstance(name, str):
        return 'Unknown'
    segments = name.split(',')
    # Without a comma there is no second segment to read the title from.
    if len(segments) < 2:
        return 'Unknown'
    return segments[1].split('.')[0].strip()

In [6]:
# Define the parameter creation steps
# All calls below are issued on datasetModifier before any dataset is passed
# to it; generate_dataset() receives the actual datasets in later cells.
# NOTE(review): presumably each call records a step that generate_dataset()
# replays in this order — confirm in DatasetModifier.

# Age: presumably fills missing ages from rows matching on Sex and Pclass
# (confirm), then buckets the number into named ranges and one-hot encodes it.
datasetModifier.dataset_fill_missing_value_based_on_criteria('Age', ['Sex', 'Pclass'])
# Each bin is [label, lower, upper]; neighbouring bins share a boundary value.
# NOTE(review): which side of a boundary is inclusive is decided inside
# dataset_categorize_number — confirm.
datasetModifier.dataset_categorize_number('Age', [['infant', 0, 2], ['child', 2, 10], ['teenager', 10, 18], ['youngadult', 18, 30], ['midlife', 30, 50], ['oldfart', 50, math.inf]])
datasetModifier.add_X_parameter('Age')
datasetModifier.one_hot_X_parameter('Age')

# Sex: added as a feature and one-hot encoded.
datasetModifier.add_X_parameter('Sex')
datasetModifier.one_hot_X_parameter('Sex')

# Fare: added as a feature without bucketing or one-hot encoding; the
# categorize / one-hot variants are left disabled below.
datasetModifier.dataset_fill_missing_value('Fare')
#datasetModifier.dataset_categorize_number('Fare', [['poor feck', 0, 10], ['middle class', 10, 50], ['richy rich', 50, math.inf]])
datasetModifier.add_X_parameter('Fare')
#datasetModifier.one_hot_X_parameter('Fare')

# FamilySize: new feature derived from SibSp and Parch.
# NOTE(review): how the two columns are combined is defined in
# dataset_add_new_feature_based_on_existing — confirm.
datasetModifier.dataset_add_new_feature_based_on_existing('FamilySize', ['SibSp', 'Parch'])
datasetModifier.add_X_parameter('FamilySize')

# Title: new feature computed by extract_title_from_name, then one-hot encoded.
datasetModifier.dataset_add_new_feature_based_on_custom_function('Title', extract_title_from_name)
datasetModifier.add_X_parameter('Title')
datasetModifier.one_hot_X_parameter('Title')

# Feature standardisation is currently disabled.
#datasetModifier.standardize_X(dataset)

# Target column.
datasetModifier.add_Y_parameter('Survived')

In [7]:
# Apply the parameter creation steps to the train/test dataset.
# (Only `dataset` is processed here; `dataset_prediction` is passed to the
# same method in a later cell.)
datasetModifier.generate_dataset(dataset)

In [8]:
# Split the processed data into training and hold-out parts.
# NOTE(review): 0.2 is presumably the hold-out fraction, and random_state
# presumably seeds the split so it is repeatable — confirm in
# Dataset.get_train_test_set.
train_X, test_X, train_Y, test_Y = dataset.get_train_test_set(0.2, random_state=42)

In [9]:
# Run the prediction dataset through the same modifier that processed the
# training data, then take its feature matrix.
datasetModifier.generate_dataset(dataset_prediction)
predict_X = dataset_prediction.X
# Passenger ids reshaped into a single column (one row per passenger).
test_passenger_ids = np.reshape(
    dataset_prediction.get_dataset_parameter('PassengerId').values,
    (-1, 1),
)

In [10]:
# Container object on which the candidate models are registered in the next cell.
clf = classifications.Classifications()
# Hyper-parameter searches, left disabled.
# NOTE(review): presumably these produced the constants used when the models
# are registered below — confirm before changing either.
#clf.find_svm_linear_params(train_X, train_Y)
#clf.find_svm_kernel_params(train_X, train_Y)
#clf.randomized_random_forest_parameter_search(train_X, train_Y)
#clf.grid_search_for_params(train_X, train_Y.reshape(-1, 1), ['sqrt'], [5500, 6000, 6500, 7000], [8, 9, 10, 20, 50], [4, 4, 5], [3, 4, 5], [False])

In [11]:
# Register the candidate classifiers that find_best_algorithm() compares.
# NOTE(review): the numeric arguments look like tuned values (see the disabled
# search calls in the previous cell); their exact meaning is defined in
# Classifications — confirm.
clf.add_logistic_regression(cross_validation=7, rnd_state=12, max_iter=4000)
# The linear and RBF SVMs are given the same first argument.
clf.add_linear_svm(8.886238162743407)
clf.add_rbf_svm(8.886238162743407, 0.04124626382901352)
clf.add_naive_bayes()
clf.add_classification_tree(depth=6)
clf.add_random_forest(max_features='sqrt', max_depth=8, n_estimators=5500, min_samples_split=4, min_samples_leaf=3, bootstrap=False)

In [12]:
# Score every registered model and keep the best one; per the printed log,
# each model gets a score and the highest-scoring model is selected.
# NOTE(review): test_X/test_Y are passed in here; if they drive the selection,
# the winning score is an optimistic estimate of real performance — confirm.
best_algorithm = clf.find_best_algorithm(train_X, train_Y, test_X, test_Y)

Evaluating Logistic Regression...
Logistic Regression scored 0.8379888268156425
Evaluating Linear SVM...
Linear SVM scored 0.8435754189944135
Evaluating RBF SVM...
RBF SVM scored 0.8156424581005587
Evaluating Naive Bayes...
Naive Bayes scored 0.7653631284916201
Evaluating Classification Tree...
Classification Tree scored 0.8491620111731844
Evaluating Random Forest...
Random Forest scored 0.8659217877094972
Best Model is: Random Forest with a score of 0.8659217877094972 - will continue with this


# Predict Model

In [13]:
# Run the selected model on the prediction features and shape the result as a
# single integer column (one row per sample).
predict_Y = best_algorithm.predict(predict_X)
# Round before casting so values are snapped to the nearest whole number.
# NOTE(review): presumably a safeguard for models that emit floats — confirm
# what best_algorithm.predict returns.
predict_Y = np.around(predict_Y)
# np.integer is an abstract scalar type, and passing it as a dtype is
# deprecated by NumPy; the builtin int resolves to the same default integer
# dtype without the warning.
predict_Y = predict_Y.astype(int)
predict_Y = np.reshape(predict_Y, (predict_Y.shape[0], 1))

In [14]:
# Assemble the submission table: passenger ids and predictions side by side,
# with a header row stacked on top so one writerows() call emits everything.
csv_predict = np.concatenate((test_passenger_ids, predict_Y), axis=1)
csv_predict = np.concatenate((np.reshape(["PassengerId", "Survived"], (1, 2)), csv_predict))
# newline='' leaves line-ending handling to the csv module, as its docs require.
with open('prediction.csv', 'w', newline='') as csvFile:
    writer = csv.writer(csvFile)
    writer.writerows(csv_predict)
# No explicit close() is needed: leaving the with-block closes the file.