In [15]:
'''
Author: Nick Quinn
Description: Following Alexis Cook's tutorial on the Kaggle "Titantic - Machine Learning from Disaster" Competition. In addition,
             I've applied some of the techniques learned from Kaggle's intro to ML course such as model validation to test different
             combinations of features and model parameters.
Links:
    - Alexis Cook's tutorial - https://github.com/FreakyNobleGas/machine-learning
    - Kaggle Titantic Competition - https://www.kaggle.com/c/titanic
'''

'\nAuthor: Nick Quinn\nDescription: Following Alexis Cook\'s tutorial on the Kaggle "Titantic - Machine Learning from Disaster" Competition\nLinks:\n    - Alexis Cook\'s tutorial - https://github.com/FreakyNobleGas/machine-learning\n    - Kaggle Titantic Competition - https://www.kaggle.com/c/titanic\n'

In [16]:
# Setting up environment

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from copy import deepcopy
import itertools

train_data_file_path = '.\\data\\train.csv'
test_data_file_path = '.\\data\\test.csv'
gender_data_file_path = '.\\data\\gender_submission.csv'

In [17]:
# Load training dataset and confirm data exists

train_data = pd.read_csv(train_data_file_path)
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [18]:
# Load test dataset and confirm data exists

test_data = pd.read_csv(test_data_file_path)
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [19]:
# Load gender dataset and confirm data exists

gender_data = pd.read_csv(gender_data_file_path)
gender_data.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [20]:
# Testing a pattern. Assume all women survived, and all males died.

women = train_data.loc[train_data.Sex == 'female']["Survived"]
rate_women = sum(women)/len(women)

print("% of women who survived:", rate_women)

% of women who survived: 0.7420382165605095


In [21]:
men = train_data.loc[train_data.Sex == 'male']["Survived"]
rate_men = sum(men)/len(men)

print("% of men who survived:", rate_men)

% of men who survived: 0.18890814558058924


In [22]:
# Create Random Forest Model and Generate Submission Data
train_data = pd.read_csv(train_data_file_path)
features = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

# Get all combinations of length 2 or greater
features_comb = []
for i in range(2, len(features)):
    features_comb += itertools.combinations(features, r=i)

# Go through each combination to find best feature set for model
best_features = []
best_mae = -1
num_of_comb = len(features_comb)
count = 0
for f in features_comb:
    f = list(f)
    
    all_features = deepcopy(f)
    all_features.append("Survived")

    # Drop any rows that contain at least one missing variable
    train_data = train_data[all_features]
    train_data = train_data.dropna()

    X = pd.get_dummies(train_data[f])

    y = train_data["Survived"]

    train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)

    model = RandomForestClassifier(n_estimators=200, max_depth=5, random_state=1)
    model.fit(train_X, train_y)

    val_predictions = model.predict(val_X)
    mae = mean_absolute_error(val_y, val_predictions)

    # If calculated MAE is better than best, set new mae and features
    if (best_mae == -1) or (mae < best_mae):
        best_mae = mae
        best_features = f
        print("Found better model...")

    count += 1
    if count % 10 == 0:
        print(str(count) + "/" + str(num_of_comb))

    train_data = pd.read_csv(train_data_file_path)



Found better model...
Found better model...
10/119
20/119
Found better model...
Found better model...
Found better model...
30/119
40/119
50/119
Found better model...
60/119
70/119
80/119
90/119
Found better model...
100/119
110/119
Found better model...


In [31]:
# Print Best Model
print(best_mae)
print(best_features)

0.1787709497206704
['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']


In [36]:
# Create Model with entire train dataset

# Bring in entire dataset
train_data = pd.read_csv(train_data_file_path)
all_features = deepcopy(best_features)
all_features.append("Survived")

# Drop any rows that contain at least one missing variable
train_data = train_data[all_features]
train_data = train_data.dropna()

X = pd.get_dummies(train_data[best_features])
y = train_data["Survived"]

model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(X, y)

test_data = pd.read_csv(test_data_file_path)

# Make predictions based on test data
test_data["Age"] = test_data["Age"].replace(np.NaN, test_data["Age"].mean())
test_data["Fare"] = test_data["Fare"].replace(np.NaN, test_data["Fare"].mean())

X_test = pd.get_dummies(test_data[best_features])
predictions = model.predict(X_test)

In [37]:
# Generate Submission
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
