In [None]:
### LOADING LIBRARIES ###

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


In [42]:
### LOADING, CLEANING AND FEATURE ENGINEERING ON TRAIN AND TEST DATA ###

data = pd.read_csv('/home/boris/Documents/MachineLearning/Kaggle_Titanic/train.csv')
df_test = pd.read_csv('/home/boris/Documents/MachineLearning/Kaggle_Titanic/test.csv')
df_test_PassId = df_test['PassengerId'] # Needed for submission

#Judgment call on which columns won't offer any correlation with survival rate.
data = data.drop(['Ticket','Cabin','PassengerId'], axis=1)
df_test = df_test.drop(['Ticket','Cabin','PassengerId'], axis=1)

#preprocess families by groups. Gives a very slight advantage
data['FamilySize'] = data['SibSp'] + data['Parch'] + 1
data['Singleton']=data['FamilySize'].map(lambda s:1 if s==1 else 0)
data['SmallFamily']=data['FamilySize'].map(lambda s:1 if 2 <= s <= 4 else 0)
data['LargeFamily']=data['FamilySize'].map(lambda s:1 if 5<= s else 0)

df_test['FamilySize'] = df_test['SibSp'] + df_test['Parch'] + 1
df_test['Singleton']=df_test['FamilySize'].map(lambda s:1 if s==1 else 0)
df_test['SmallFamily']=df_test['FamilySize'].map(lambda s:1 if 2 <= s <= 4 else 0)
df_test['LargeFamily']=df_test['FamilySize'].map(lambda s:1 if 5<= s else 0)

#preprocess Name titles, and categorize them. There is a strong correlation between passenger title and survival rate
Title_Dictionnary={
"Capt":"Officer",
"Col":"Officer",
"Major":"Officer",
"Jonkheer":"Royalty",
"Don":"Royalty",
"Sir":"Royalty",
"Dr":"Officer",
"Rev":"Officer",
"the Countess":"Royalty",
"Mme":"Mrs",
"Mlle":"Miss",
"Ms":"Mr",
"Mr":"Mr",
"Mrs":"Mrs",
"Miss":"Miss",
"Master":"Master",
"Lady":"Royalty"
}
data['Title']=data['Name'].map(lambda name:name.split(',')[1].split('.')[0].strip())
data['Title']=data.Title.map(Title_Dictionnary)

df_test['Title']=df_test['Name'].map(lambda name:name.split(',')[1].split('.')[0].strip())
df_test['Title']=df_test.Title.map(Title_Dictionnary)

#There are 100+ rows of empty Age, which sucks because there is good correlation between Age and survival rate. I have tried different type of impute methods, but none is actually good, 
# because deleting those records always give a better prediction
data.dropna(inplace=True)
#I can't remove data from the test dataset unfortunately, so just the easy way, I will fill the empty data with the mean, finger crossed it will be ok
df_test.fillna(df_test.mean(), inplace=True)

# encoding title in dummy variable
df_dummies = pd.get_dummies(data['Title'], prefix='Title')
data = pd.concat([data, df_dummies], axis=1)

df_dummies = pd.get_dummies(df_test['Title'], prefix='Title')
df_test = pd.concat([df_test, df_dummies], axis=1)
#Weird stuff that I don't have time to investigate, the test set doesn't contain any royaly, therefore the column didn't get created during dumiesation, and therefore my predict fails because fo the missing column. 
# So I am just manually inserting the column full of 0
df_test['Title_Royalty']=0

#Preproccess Embarked
data.Embarked.fillna('S', inplace=True)
# dummy encoding 
df_dummies = pd.get_dummies(data['Embarked'], prefix='Embarked')
data = pd.concat([data, df_dummies], axis=1)

df_dummies = pd.get_dummies(df_test['Embarked'], prefix='Embarked')
df_test = pd.concat([df_test, df_dummies], axis=1)


#Preproccess Sex
data.Sex.fillna('M', inplace=True)
# dummy encoding 
df_dummies = pd.get_dummies(data['Sex'], prefix='Sex')
data = pd.concat([data, df_dummies], axis=1)

df_dummies = pd.get_dummies(df_test['Sex'], prefix='Sex')
df_test = pd.concat([df_test, df_dummies], axis=1)

#Drop all obsolete column
data = data.drop(['SibSp','Parch','FamilySize','Sex','Embarked','Title','Name'], axis=1)
df_test = df_test.drop(['SibSp','Parch','FamilySize','Sex','Embarked','Title','Name'], axis=1)

#print(data.head())

Pclass           418
Name             418
Sex              418
Age              418
SibSp            418
Parch            418
Fare             418
Embarked         418
FamilySize       418
Singleton        418
SmallFamily      418
LargeFamily      418
Title            417
Title_Master     418
Title_Miss       418
Title_Mr         418
Title_Mrs        418
Title_Officer    418
Title_Royalty    418
Embarked_C       418
Embarked_Q       418
Embarked_S       418
Sex_female       418
Sex_male         418
dtype: int64
   Survived  Pclass   Age     Fare  Singleton  SmallFamily  LargeFamily  \
0         0       3  22.0   7.2500          0            1            0   
1         1       1  38.0  71.2833          0            1            0   
2         1       3  26.0   7.9250          1            0            0   
3         1       1  35.0  53.1000          0            1            0   
4         0       3  35.0   8.0500          1            0            0   

   Title_Master  Title_Miss  Tit

In [None]:
### TRAIN, TEST AND EVALUATE MODEL ###

y = data.Survived
X = data.drop('Survived', axis=1)

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=123, 
                                                    stratify=y)

# Pipelines Creation, testing different algorithm
# 1. Data Preprocessing by using Standard Scaler
# 2. Apply  Classifier

pipeline_lr=Pipeline([('scalar1',StandardScaler()),
                     ('lr_classifier',LogisticRegression(random_state=0))])

pipeline_dt=Pipeline([('scalar2',StandardScaler()),
                     ('dt_classifier',DecisionTreeClassifier())])

pipeline_randomforest=Pipeline([('scalar3',StandardScaler()),
                     ('rf_classifier',RandomForestClassifier())])

# Lets make the list of pipelines
pipelines = [pipeline_lr, pipeline_dt, pipeline_randomforest]

best_accuracy=0.0
best_classifier=0
best_pipeline=""

# Dictionary of pipelines and classifier types for ease of reference
pipe_dict = {0: 'Logistic Regression', 1: 'Decision Tree', 2: 'RandomForest'}

# Fit the pipelines
for pipe in pipelines:
	pipe.fit(X_train, y_train)

for i,model in enumerate(pipelines):
    print("{} Test Accuracy: {}".format(pipe_dict[i],model.score(X_test,y_test)))

In [None]:
### PERFORM PREDICTION ON TEST SET

#I have ran the previous bloc manually a few time and the good old Logistic Regression always come on top. So I am going to use it
pred = pipeline_lr.predict(df_test)

output = pd.DataFrame({'PassengerId': df_test_PassId, 'Survived': pred})
output.to_csv('/home/boris/Documents/MachineLearning/Kaggle_Titanic/my_submission_6.csv', index=False)
print('Your submission was successfully saved!')