In [37]:
import numpy as np
import pandas as pd

In [38]:
df = pd.read_csv("MF_imputed.csv")

In [39]:
df.head()

Unnamed: 0,y,Age,SibSp,ParCh,Fare,Sex_male,Class_2,Class_3
0,0.0,22.0,1.0,0.0,7.2,1.0,0.0,1.0
1,1.0,38.0,1.0,0.0,71.3,0.0,0.0,0.0
2,1.0,26.0,0.0,0.0,7.9,0.0,0.0,1.0
3,1.0,35.0,1.0,0.0,53.1,0.0,0.0,0.0
4,0.0,35.0,0.0,0.0,8.0,1.0,0.0,1.0


In [42]:
df = df.astype({'y': 'int64'}).copy()

In [43]:
df.head()

Unnamed: 0,y,Age,SibSp,ParCh,Fare,Sex_male,Class_2,Class_3
0,0,22.0,1.0,0.0,7.2,1.0,0.0,1.0
1,1,38.0,1.0,0.0,71.3,0.0,0.0,0.0
2,1,26.0,0.0,0.0,7.9,0.0,0.0,1.0
3,1,35.0,1.0,0.0,53.1,0.0,0.0,0.0
4,0,35.0,0.0,0.0,8.0,1.0,0.0,1.0


# Isolate X and Y 

In [152]:
y = df.y
X = df.drop(columns='y',axis=1)
X.head()

Unnamed: 0,Age,SibSp,ParCh,Fare,Sex_male,Class_2,Class_3
0,22.0,1.0,0.0,7.2,1.0,0.0,1.0
1,38.0,1.0,0.0,71.3,0.0,0.0,0.0
2,26.0,0.0,0.0,7.9,0.0,0.0,1.0
3,35.0,1.0,0.0,53.1,0.0,0.0,0.0
4,35.0,0.0,0.0,8.0,1.0,0.0,1.0


# Train, Test, Split

In [46]:
from sklearn.model_selection import train_test_split

In [153]:
df.y.mean()

0.3838383838383838

Data is imblanaced, 38% survived so we will use stratify to ensure balanced split

In [154]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = 0.2,
                                                    random_state = 1502,
                                                    stratify = y)

# RandomForestClassifier

In [155]:
from sklearn.ensemble import RandomForestClassifier

In [164]:
model = RandomForestClassifier(n_estimators = 425,
                               random_state = 1502)
model.fit(X_train, y_train)

RandomForestClassifier(n_estimators=425, random_state=1502)

# Predictions

In [165]:
predictions = model.predict(X_test)
predictions[:5]

array([1, 0, 1, 0, 1], dtype=int64)

# Assessing the model

In [52]:
from sklearn.metrics import classification_report, f1_score

In [166]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.85      0.89      0.87       110
           1       0.81      0.75      0.78        69

    accuracy                           0.84       179
   macro avg       0.83      0.82      0.83       179
weighted avg       0.84      0.84      0.84       179



In [167]:
f1_score(y_test, predictions)

0.7819548872180452

# Feature importance

In [168]:
importance = pd.Series(model.feature_importances_,
                       index = X_train.columns.values)

importance.sort_values(ascending=False)

Age         0.322797
Sex_male    0.263035
Fare        0.244380
Class_3     0.071713
SibSp       0.045439
ParCh       0.035527
Class_2     0.017109
dtype: float64

# Improving the model with parameter tuning

We're tuning the number of estimators and number of trees

In [58]:
from sklearn.model_selection import ParameterGrid

In [147]:
param_grid = {'n_estimators': range(425,450,1)}
param_grid

{'n_estimators': range(425, 450)}

In [148]:
grid = ParameterGrid(param_grid)
grid

<sklearn.model_selection._search.ParameterGrid at 0x1acbdfa5970>

In [149]:
#Storing results
f1score = []
number = []
i = 0
for p in grid:
  model = RandomForestClassifier(n_estimators = p['n_estimators'],
                                 random_state = 1502)
  model.fit(X_train, y_train)
  predictions = model.predict(X_test)
  f1 = f1_score(y_test, predictions)
  f1score.append(f1)
  number.append(grid[i])
  i += 1

In [150]:
results = pd.DataFrame((zip(number,f1score)),columns=['n_estimators', 'f1score'])

In [151]:
results.sort_values('f1score', ascending=False)

Unnamed: 0,n_estimators,f1score
0,{'n_estimators': 425},0.781955
13,{'n_estimators': 438},0.781955
23,{'n_estimators': 448},0.781955
22,{'n_estimators': 447},0.781955
21,{'n_estimators': 446},0.781955
20,{'n_estimators': 445},0.781955
19,{'n_estimators': 444},0.781955
18,{'n_estimators': 443},0.781955
17,{'n_estimators': 442},0.781955
16,{'n_estimators': 441},0.781955


In [144]:
print(max(f1score))

0.7819548872180452


#  Submission

## Trained model

In [169]:
train = df.copy()
train_y = train.y

In [170]:
train.columns

Index(['y', 'Age', 'SibSp', 'ParCh', 'Fare', 'Sex_male', 'Class_2', 'Class_3'], dtype='object')

In [171]:
predictors_cols = ['Age', 'SibSp', 'ParCh', 'Fare', 'Sex_male', 'Class_2', 'Class_3']

In [172]:
train_X = train[predictors_cols]

In [175]:
model = RandomForestClassifier(n_estimators = 425,
                               random_state = 1502)
model.fit(train_X, train_y)

RandomForestClassifier(n_estimators=425, random_state=1502)

## Test data

In [178]:
test = pd.read_csv('test_MF_imputed.csv')

In [179]:
test.shape

(418, 8)

In [180]:
test.head()

Unnamed: 0,PassengerId,Age,SibSp,ParCh,Fare,Class_2,Class_3,Sex_male
0,892.0,34.5,0.0,0.0,7.8,0.0,1.0,1.0
1,893.0,47.0,1.0,0.0,7.0,0.0,1.0,0.0
2,894.0,62.0,0.0,0.0,9.7,1.0,0.0,1.0
3,895.0,27.0,0.0,0.0,8.7,0.0,1.0,1.0
4,896.0,22.0,1.0,1.0,12.3,0.0,1.0,0.0


In [182]:
train_X.head(0), test.head(0)

(Empty DataFrame
 Columns: [Age, SibSp, ParCh, Fare, Sex_male, Class_2, Class_3]
 Index: [],
 Empty DataFrame
 Columns: [PassengerId, Age, SibSp, ParCh, Fare, Class_2, Class_3, Sex_male]
 Index: [])

In [184]:
test_X = test[predictors_cols]

In [186]:
predicted = model.predict(test_X)

In [188]:
test['Survived'] = predicted

In [189]:
test.head()

Unnamed: 0,PassengerId,Age,SibSp,ParCh,Fare,Class_2,Class_3,Sex_male,Survived
0,892.0,34.5,0.0,0.0,7.8,0.0,1.0,1.0,0
1,893.0,47.0,1.0,0.0,7.0,0.0,1.0,0.0,0
2,894.0,62.0,0.0,0.0,9.7,1.0,0.0,1.0,1
3,895.0,27.0,0.0,0.0,8.7,0.0,1.0,1.0,1
4,896.0,22.0,1.0,1.0,12.3,0.0,1.0,0.0,0


In [190]:
test = test[['PassengerId', 
                    #'Age', 'SibSp', 
                    #'ParCh', 'Fare', 
                    #'Class_2', 'Class_3',
                   #'Sex_male', 
                    'Survived']].copy()

In [191]:
test.dtypes

PassengerId    float64
Survived         int64
dtype: object

In [195]:
test = test.astype({'PassengerId': 'int64'}).copy()

In [196]:
test.dtypes

PassengerId    int64
Survived       int64
dtype: object

In [197]:
test.to_csv('RandomForestClassifier_Submission.csv', index=False)