# import and read data

In [1]:
import pandas as pd
import numpy as np
from math import log

from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

from sklearn.svm import SVC

from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, ParameterGrid
from sklearn.ensemble import RandomForestClassifier



# Data Dictionary
## Variable
Survived : Survival, 0 = No, 1 = Yes

Pclass : Ticket class, 1 = 1st, 2 = 2nd, 3 = 3rd

Sex : Sex

Age: Age in years

SibSp: Number of siblings / spouses aboard the Titanic

Parch: Number of parents / children aboard the Titanic

Ticket: Ticket number

Fare: Passenger fare

Cabin: Cabin number

Embarked: Part of Embarkation, C = Cherbourg, Q = Queenstown, S = Southampton

## Variable Notes
Pclass: A proxy for socio-economic status (SES) 1st = Upper 2nd = Middle 3rd = Lower

Age: Age is fractional if less than 1. If the age is estimated, is it in the form of xx.5

SibSp: The number of siblings/spouses

Parch: The number of parents/children

Some children travelled only with a nanny, therefore parch=0 for them.

In [4]:
dfRaw_train = pd.read_csv('train.csv')
dfRaw_test = pd.read_csv('test.csv')

In [None]:
dfRaw_test.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [7]:
# Seems that the Cabin number doesn't really help with so many nan, drop.
dfTrain = dfRaw_train.drop('Cabin',axis=1)

# Also drops the rows with null by embarked and Age
dfTrain = dfTrain.dropna(axis=0,subset=['Embarked','Age'])

In [59]:
dfTest = dfRaw_test.drop('Cabin',axis=1)

# Also drops the rows with null by embarked and Age
mean_age = dfTest.Age.mean()
mean_fare = dfTest.Fare.mean()
dfTest['Age'].fillna(mean_age,inplace=True)
dfTest['Fare'].fillna(mean_fare,inplace=True)


In [73]:
dfTrain.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

# Data ready

In [124]:
# Age, Sex, Pclass, Embarked, Fare, Sibsp

X_train = dfTrain[['Age','Fare','SibSp','Parch']].reset_index()
X_train['Fam_size'] = X_train['SibSp'] + X_train['Parch']
X_train.drop(columns=['SibSp','Parch'],axis=1,inplace=True)



dftemp = dfTrain[['Embarked','Pclass','Sex']].astype('category')
onehot = OneHotEncoder()
dftemp_onehot = pd.DataFrame(onehot.fit_transform(dftemp).toarray(),columns = onehot.get_feature_names())

X_train = pd.concat([X_train,dftemp_onehot],axis=1)

Y_train = dfTrain.Survived

In [125]:
X_test = dfTest[['Age','Fare','SibSp','Parch']].reset_index()
X_test['Fam_size'] = X_test['SibSp'] + X_test['Parch']
X_test.drop(columns=['SibSp','Parch'],axis=1,inplace=True)



dftemp = dfTest[['Embarked','Pclass','Sex']].astype('category')
onehot = OneHotEncoder()
dftemp_onehot = pd.DataFrame(onehot.fit_transform(dftemp).toarray(),columns = onehot.get_feature_names())

X_test = pd.concat([X_test,dftemp_onehot],axis=1)


# Data ready

In [154]:
clf = RandomForestClassifier(n_estimators=1000,
                                      min_samples_split=12,
                                      min_samples_leaf=1,
                                      oob_score=True,
                                     max_depth=10,
                                      random_state=1
                                     )

clf.fit(X_train,Y_train)
Y_train_pred = clf.predict(X_train)

clf.score(X_train,Y_train)

0.901685393258427

In [155]:
Y_test_pred=clf.predict(X_test)

dfPred = dfTest[['PassengerId']].copy()

dfPred['Survived'] = Y_test_pred

In [153]:
# dfPred.to_csv('SVN_005.csv',index=False)