# import and read data

In [1]:
import pandas as pd
import numpy as np
from math import log

from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder

from sklearn.svm import SVC

from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV, ParameterGrid,cross_validate
from sklearn.ensemble import RandomForestClassifier



# Data Dictionary
## Variable
Survived : Survival, 0 = No, 1 = Yes

Pclass : Ticket class, 1 = 1st, 2 = 2nd, 3 = 3rd

Sex : Sex

Age: Age in years

SibSp: Number of siblings / spouses aboard the Titanic

Parch: Number of parents / children aboard the Titanic

Ticket: Ticket number

Fare: Passenger fare

Cabin: Cabin number

Embarked: Part of Embarkation, C = Cherbourg, Q = Queenstown, S = Southampton

## Variable Notes
Pclass: A proxy for socio-economic status (SES) 1st = Upper 2nd = Middle 3rd = Lower

Age: Age is fractional if less than 1. If the age is estimated, is it in the form of xx.5

SibSp: The number of siblings/spouses

Parch: The number of parents/children

Some children travelled only with a nanny, therefore parch=0 for them.

In [2]:
dfRaw_train = pd.read_csv('train.csv')
dfRaw_test = pd.read_csv('test.csv')

In [3]:
# Seems that the Cabin number doesn't really help with so many nan, drop.
dfTrain = dfRaw_train.copy()
# dfTrain = dfTrain.drop('Cabin',axis=1)

# Also drops the rows with null by embarked and Age
dfTrain = dfTrain.dropna(axis=0,subset=['Embarked','Age'])

In [4]:
dfTest = dfRaw_test.copy()
# dfTest = dfTest.drop('Cabin',axis=1)

# Also drops the rows with null by embarked and Age
age_median = dfTest.Age.median()
fare_median = dfTest.Fare.median()
dfTest['Age'].fillna(age_median,inplace=True)
dfTest['Fare'].fillna(fare_median,inplace=True)


# Data ready

In [5]:
# Age, Sex, Pclass, Embarked, Fare, Sibsp

X_train = dfTrain[['Age','Fare','SibSp','Parch','Ticket','Embarked','Pclass','Sex','Name','Cabin','Survived']].reset_index(drop=True)
X_train['Fam_size'] = X_train['SibSp'] + X_train['Parch']
X_train.drop(columns=['SibSp','Parch'],axis=1,inplace=True)

Y_train = dfTrain.Survived



X_test = dfTest[['Age','Fare','SibSp','Parch','Ticket','Embarked','Pclass','Sex','Name','Cabin']].reset_index(drop=True)
X_test['Fam_size'] = X_test['SibSp'] + X_test['Parch']
X_test.drop(columns=['SibSp','Parch'],axis=1,inplace=True)



### Onehot Embarked,Pclass,Sex

In [6]:
X_train = pd.get_dummies(X_train, columns=['Embarked','Pclass','Sex'])
X_test = pd.get_dummies(X_test, columns=['Embarked','Pclass','Sex'])

### Ticket survival

In [7]:
ticket_cnt = pd.DataFrame(X_train['Ticket'].value_counts())

ticket_cnt['survived_cnt'] = (X_train.groupby('Ticket').sum()['Survived'])
# ticket_cnt['fully_survived'] = ticket_cnt['Ticket']-ticket_cnt['survived_cnt']
# ticket_cnt['fully_survived'] = (ticket_cnt['survived_cnt'] == ticket_cnt['Ticket'])
ticket_cnt['no_survived']= ticket_cnt['survived_cnt']==0
# ticket_cnt['partial_survived'] = ~(ticket_cnt['fully_survived'] ^ ticket_cnt['no_survived'])

# ticket_cnt['fully_survived'] = ticket_cnt['fully_survived'].astype('float')
ticket_cnt['no_survived']=  ticket_cnt['no_survived'].astype('float')
# ticket_cnt['partial_survived'] =  ticket_cnt['partial_survived'].astype('float')

ticket_cnt.drop(['Ticket','survived_cnt'],axis=1,inplace=True)
ticket_cnt = ticket_cnt.reset_index()

X_train = X_train.merge(ticket_cnt,left_on='Ticket',right_on='index')
X_test = X_test.merge(ticket_cnt,left_on='Ticket',right_on='index',how='left')
X_test = X_test.fillna(0.5)


# X_train['survive_rate'] = (X_train['no_survived'] == 0)


# X_test['survive_rate'] = (X_test['no_survived'] == 0)
X_test.drop(columns=['Ticket','index'],axis=1,inplace=True)
X_train.drop(columns=['Ticket','Survived','index'],axis=1,inplace=True)

### Fare bin 5 

In [8]:
label = LabelEncoder()

X_train['Fare_bin_5'] = pd.qcut(X_train['Fare'],5)
X_test['Fare_bin_5'] = pd.qcut(X_test['Fare'],5)

X_train['FareBin_Code_5'] = label.fit_transform(X_train['Fare_bin_5'])
X_test['FareBin_Code_5'] = label.fit_transform(X_test['Fare_bin_5'])

X_train = pd.get_dummies(X_train, columns=['FareBin_Code_5'])
X_test = pd.get_dummies(X_test, columns=['FareBin_Code_5'])

In [9]:
X_test.drop(columns=['Fare_bin_5','Fare'],axis=1,inplace=True)
X_train.drop(columns=['Fare_bin_5','Fare'],axis=1,inplace=True)

### Age

In [10]:
X_train['Age'] = np.where(X_train['Age'] <= 16, 1, X_train['Age'])
X_train['Age'] = np.where((X_train['Age'] > 16), 0, X_train['Age'])
X_train = X_train.astype({'Age':'int'})
X_test['Age'] = np.where(X_test['Age'] <=16, 1, X_test['Age'])
X_test['Age'] = np.where((X_test['Age'] > 16), 0, X_test['Age'])
X_test = X_test.astype({'Age':'int'})

X_train = pd.get_dummies(X_train, columns=['Age'])
X_test = pd.get_dummies(X_test, columns=['Age'])

### Prefix

In [11]:
X_train['prefix'] = X_train['Name'].str.extract(r'(Mr\.|Mrs\.|Miss\.|Master\.)')
X_train['prefix'].fillna('Ohter.',inplace=True)

X_test['prefix'] = X_test['Name'].str.extract(r'(Mr\.|Mrs\.|Miss\.|Master\.)')
X_test['prefix'].fillna('Ohter.',inplace=True)

X_train = pd.get_dummies(X_train, columns=['prefix'])
X_test = pd.get_dummies(X_test, columns=['prefix'])

In [12]:
X_test.drop(columns=['Name'],axis=1,inplace=True)
X_train.drop(columns=['Name'],axis=1,inplace=True)

### Cabin

In [13]:
dfTemp = dfTrain[['Cabin','Survived']].copy()

dfTemp['Char'] = dfTemp.Cabin.str.slice(start=0,stop=1)

dfTemp.Survived = dfTemp.Survived.astype('str')

pd.DataFrame(dfTemp.groupby('Char')['Survived'].value_counts()).sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,Survived
Char,Survived,Unnamed: 2_level_1
A,0,5
A,1,7
B,0,11
B,1,32
C,0,21
C,1,30
D,0,8
D,1,23
E,0,8
E,1,22


In [14]:
X_train['Canbin_Char'] = X_train.Cabin.str.slice(start=0,stop=1)
X_train['Canbin_Char'].fillna('NoCabin',inplace = True)

X_test['Canbin_Char'] = X_test.Cabin.str.slice(start=0,stop=1)
X_test['Canbin_Char'].fillna('NoCabin',inplace = True)

X_train = pd.get_dummies(X_train, columns=['Canbin_Char'])
X_test = pd.get_dummies(X_test, columns=['Canbin_Char'])

In [15]:
X_test.drop(columns=['Cabin'],axis=1,inplace=True)
X_train.drop(columns=['Cabin'],axis=1,inplace=True)

# Model Fit

In [16]:
clf = RandomForestClassifier(n_estimators=1000,
                                      min_samples_split=12,
                                      min_samples_leaf=1,
                                      oob_score=True,
                                     max_depth=10,
                                      random_state=1
                                     )

clf.fit(X_train,Y_train)
Y_train_pred = clf.predict(X_train)

clf.score(X_train,Y_train)

0.7148876404494382

In [17]:
X_train

Unnamed: 0,Fam_size,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,no_survived,...,prefix_Ohter.,Canbin_Char_A,Canbin_Char_B,Canbin_Char_C,Canbin_Char_D,Canbin_Char_E,Canbin_Char_F,Canbin_Char_G,Canbin_Char_NoCabin,Canbin_Char_T
0,1,0,0,1,0,0,1,0,1,1.0,...,0,0,0,0,0,0,0,0,1,0
1,1,1,0,0,1,0,0,1,0,0.0,...,0,0,0,1,0,0,0,0,0,0
2,0,0,0,1,0,0,1,1,0,0.0,...,0,0,0,0,0,0,0,0,1,0
3,1,0,0,1,1,0,0,1,0,0.0,...,0,0,0,1,0,0,0,0,0,0
4,1,0,0,1,1,0,0,0,1,0.0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
707,0,0,0,1,0,0,1,0,1,1.0,...,0,0,0,0,0,0,0,0,1,0
708,0,0,0,1,0,1,0,0,1,1.0,...,1,0,0,0,0,0,0,0,1,0
709,0,0,0,1,1,0,0,1,0,0.0,...,0,0,1,0,0,0,0,0,0,0
710,0,1,0,0,1,0,0,0,1,0.0,...,0,0,0,1,0,0,0,0,0,0


In [18]:
# cross_validate(clf, X_train,Y_train, cv=10)

In [19]:
Y_test_pred=clf.predict(X_test)

dfPred = dfTest[['PassengerId']].copy()

dfPred['Survived'] = Y_test_pred

Feature names seen at fit time, yet now missing:
- Canbin_Char_T



ValueError: X has 30 features, but RandomForestClassifier is expecting 31 features as input.

In [None]:
# dfPred.to_csv('SVN_005.csv',index=False)