In [28]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [29]:
# Reading the data: both CSVs are indexed by their PassengerId column.
train = pd.read_csv('train.csv', index_col='PassengerId')
test = pd.read_csv('test.csv', index_col='PassengerId')
# Target labels come from the training file only.
y = train['Survived']
# Stack train on top of test so the feature engineering below is applied to
# both at once. DataFrame.append was removed in pandas 2.0; pd.concat is the
# drop-in equivalent (columns missing from one frame are filled with NaN).
X = pd.concat([train, test])

In [30]:
# Title: the text between the first comma and the following period of Name
# (assumes every name contains both separators -- TODO confirm on new data).
X['Title'] = X['Name'].apply(lambda x: x.split(',')[1].split('.')[0].strip())
# Collapse the listed titles into two groups. The result is assigned back
# instead of calling replace(inplace=True) on the selected column: in-place
# mutation of X['Title'] is chained assignment, which is deprecated and does
# not update X when pandas Copy-on-Write is enabled.
X['Title'] = X['Title'].replace(
    ['Miss', 'Ms', 'Mlle', 'Lady', 'Mme', 'the Countess', 'Dona'], 'Miss/Ms')
X['Title'] = X['Title'].replace(
    ['Major', 'Col', 'Capt', 'Don', 'Sir', 'Jonkheer'], 'Mr')

In [31]:
# Embarked: fill missing values with 'S'.
X['Embarked'] = X['Embarked'].fillna('S')


# Fare
def rem_fare(r):
    """Return row ``r`` with a Fare of exactly 0 replaced by NaN.

    Row-wise helper; the cell itself performs the same step vectorized.
    """
    if r.Fare == 0:
        r = r.copy()  # do not mutate the caller's row
        r['Fare'] = np.nan  # the np.NaN alias was removed in NumPy 2.0
    return r


def fare(r):
    """Return Fare floor-divided by Ticket_Freq for row (or frame) ``r``."""
    return r.Fare // r.Ticket_Freq


# A fare of 0 is treated as missing. Done as one vectorized assignment instead
# of a row-wise DataFrame.apply over every column.
X.loc[X['Fare'] == 0, 'Fare'] = np.nan
# Number of rows (train and test combined) that share each ticket.
X['Ticket_Freq'] = X.groupby('Ticket')['Ticket'].transform('count')
# Fare per ticket holder; floor division is kept from the original.
X['Fare'] = X['Fare'] // X['Ticket_Freq']
# Remaining gaps (originally missing or zero fares) get the median.
X['Fare'] = X['Fare'].fillna(X['Fare'].median())

In [32]:
# Family size = siblings/spouses + parents/children + the passenger,
# then bucketed into the intervals (0,1], (1,4], (4,7], (7,11].
family_members = X['SibSp'] + X['Parch'] + 1
X['Fam_Size'] = pd.cut(family_members, [0, 1, 4, 7, 11])

In [33]:
# Ticket-derived features: the first two characters of the ticket string
# and the length of the ticket string.
tickets = X['Ticket']
X['Ticket_lett'] = tickets.map(lambda ticket: ticket[:2])
X['Ticket_len'] = tickets.map(len)

In [34]:
# Turn the family-size intervals into integer labels before one-hot encoding.
X['Fam_Size'] = LabelEncoder().fit_transform(X['Fam_Size'])

# One-hot encode each categorical feature into its own DataFrame whose columns
# are named '<feature>_1' ... '<feature>_k' and whose index matches X.
encoded_features = []
cat_features = ['Pclass', 'Embarked', 'Title', 'Fam_Size', 'Ticket_lett', 'Ticket_len']
for feature in cat_features:
    encoded_feat = OneHotEncoder().fit_transform(X[feature].values.reshape(-1, 1)).toarray()
    # Take the column count from the encoder output rather than nunique():
    # nunique() ignores NaN, so the two can disagree and break the DataFrame
    # construction. Also avoids reusing one name for the count and loop index.
    n_categories = encoded_feat.shape[1]
    cols = ['{}_{}'.format(feature, i) for i in range(1, n_categories + 1)]
    encoded_df = pd.DataFrame(encoded_feat, columns=cols, index=X.index)
    encoded_features.append(encoded_df)

# Only the first five encodings are attached to X; the last entry of
# cat_features ('Ticket_len') is encoded above but not used as model input.
X = pd.concat([X, *encoded_features[:5]], axis=1)

In [35]:
# Dropping columns that are not fed to the model: raw inputs, helper columns
# and the categorical columns listed in cat_features.
unused_columns = [
    'Sex', 'Age', 'Cabin', 'Ticket', 'Name', 'Ticket_Freq',
    'Embarked', 'Fam_Size', 'Pclass', 'Title', 'SibSp', 'Parch',
    'Ticket_lett', 'Ticket_len',
]
X = X.drop(columns=unused_columns)

In [36]:
# Split the combined frame back into train and test feature matrices.
# The training rows were stacked first when X was built, so the first
# len(train) rows are the training set; this replaces the hard-coded 891.
features = X.drop(['Survived'], axis=1)
n_train = len(train)
X_train = features.iloc[:n_train]
X_test = features.iloc[n_train:]

In [37]:
# Second Best model 79.1 accuracy
# max_features='auto' was deprecated in scikit-learn 1.1 and removed in 1.3,
# where it raises an error. For classifiers it meant 'sqrt', so 'sqrt' keeps
# the same model.
rfs = RandomForestClassifier(n_estimators=1000, max_depth=5,
                             max_features='sqrt', oob_score=True, n_jobs=-1,
                             verbose=1, random_state=10)
rfs.fit(X_train, y)
result = rfs.predict(X_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    0.8s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 1000 out of 1000 | elapsed:    0.0s finished


In [41]:
#Best model 79.6
# 1000 depth-limited trees with a fixed seed; fit() returns the estimator
# itself, so prediction is chained directly onto it.
rfs2 = RandomForestClassifier(
    n_estimators=1000,
    max_depth=5,
    random_state=0,
)
result = rfs2.fit(X_train, y).predict(X_test)

In [42]:
# 6-fold cross-validation scores of rfs2 on the training features;
# the bare name on the last line displays the array of per-fold scores.
a = cross_val_score(estimator=rfs2, X=X_train, y=y, cv=6)
a

array([0.79194631, 0.79865772, 0.89932886, 0.82432432, 0.79054054,
       0.84459459])

In [43]:
# Use the sample submission file as a template, overwrite its Survived column
# with the latest predictions and write the result without the row index.
res = pd.read_csv('gender_submission.csv').assign(Survived=result)
res.to_csv('SubHighestEver.csv', index=False)