1. Preprocessing: handle missing values, encode categorical features, and create new features (feature engineering).
2. Modelling: try different models, hyperparameters, and feature subsets (feature selection).
3. Evaluation: assess the models using various metrics, k-fold cross-validation, etc.


In [1]:
import pandas as pd

# Labelled rows used for fitting, and the separate file that is scored at the end.
df = pd.read_csv(r'train.csv')
test = pd.read_csv(r'test.csv')

# Pclass is a proxy for socio-economic status (SES):
#   1st = Upper, 2nd = Middle, 3rd = Lower

# Separate the feature columns from the target column.
X = df.drop(columns=["Survived"])
y = df["Survived"]

In [2]:
def get_numerical_features(df):
    """Return only the numeric feature columns of ``df``.

    The selected columns are Pclass, Age, SibSp, Parch and Fare, in that
    order. A ``KeyError`` is raised by pandas if any of them is missing.
    """
    numeric_columns = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
    return df[numeric_columns]

get_numerical_features(df)


Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
0,3,22.0,1,0,7.2500
1,1,38.0,1,0,71.2833
2,3,26.0,0,0,7.9250
3,1,35.0,1,0,53.1000
4,3,35.0,0,0,8.0500
...,...,...,...,...,...
886,2,27.0,0,0,13.0000
887,1,19.0,0,0,30.0000
888,3,,1,2,23.4500
889,1,26.0,0,0,30.0000


In [3]:
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt

Sex

In [4]:
# Encode Sex numerically: male -> 1, female -> 0. Any other value is left untouched.
X['Sex'] = X['Sex'].replace({'male': 1, 'female': 0})


Embarked

In [5]:
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the port of embarkation into three fixed indicator columns.
# The guard makes the cell safe to re-run: once 'Embarked' has been replaced
# there is nothing left to encode.
if 'Embarked' in X.columns:
    embarked_columns = ['Embarked_C', 'Embarked_Q', 'Embarked_S']
    embarked_dummies = (
        pd.get_dummies(X['Embarked'], prefix='Embarked')
        # Pin the column set and order by name, so a port that does not occur
        # in this frame yields an all-zero column instead of a shape mismatch.
        .reindex(columns=embarked_columns, fill_value=0)
        # get_dummies returns bool or uint8 depending on the pandas version;
        # normalise to plain 0/1 integers.
        .astype(int)
    )
    X[embarked_columns] = embarked_dummies
    X = X.drop(columns=['Embarked'])

NA Age

In [6]:
def NA_Age(data, fill_value=None):
    """Fill missing ``Age`` values with a single constant.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing an ``Age`` column. The column is overwritten in
        place on this object.
    fill_value : float, optional
        Value written into the missing ages. When omitted, the mean of the
        non-missing ages in ``data`` is used (the overall mean, not a
        per-class mean). Pass a value computed on the training split to
        impute validation or test data without using their own statistics.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, returned for convenient reassignment.
    """
    if fill_value is None:
        # Series.mean() skips NaN, so this averages the known ages only.
        fill_value = data['Age'].mean()

    data['Age'] = data['Age'].fillna(fill_value)
    return data

NA Fare

In [7]:
def NA_Fare(data):
    """Fill missing ``Fare`` values with the truncated mean fare of the row's ``Pclass``.

    The per-class mean is truncated to an integer before it is used as the
    fill value. The means for classes 3, 2 and 1 and the PassengerIds of the
    rows being filled are printed as a quick sanity check.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``PassengerId``, ``Pclass`` and ``Fare`` columns. ``Fare``
        is updated in place on this object.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object. A missing fare stays missing when its class
        has no known fare to average.
    """
    # Mean fare per class. Classes without any known fare are dropped so the
    # integer cast cannot hit NaN (int(NaN) raises ValueError).
    class_fare = data.groupby('Pclass')['Fare'].mean().dropna().astype(int)
    # .get() yields None for a class that is absent from this frame.
    print(class_fare.get(3), class_fare.get(2), class_fare.get(1))

    missing = data['Fare'].isna()
    print(data.loc[missing, 'PassengerId'].values.tolist())

    if missing.any():
        # Look up each affected row's class mean in one vectorised step.
        # Assigning a plain array keeps this positional, so it does not rely
        # on unique index labels or unique PassengerIds.
        fill = data.loc[missing, 'Pclass'].map(class_fare).to_numpy(dtype=float)
        data.loc[missing, 'Fare'] = fill

    return data


In [8]:
X['Fare'].isnull().any()

False

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the labelled rows for validation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# NA_Age and NA_Fare write into the frame they receive, so hand them explicit
# copies; writing into the frames returned by the split can make pandas emit
# SettingWithCopyWarning.
# NOTE(review): each split is imputed from its own statistics. Imputing the
# validation rows with values derived from X_train would avoid using
# held-out information; confirm which behaviour is wanted.
X_train = NA_Fare(NA_Age(X_train.copy()))
X_test = NA_Fare(NA_Age(X_test.copy()))

14 20 85
[]
11 20 81
[]


Models

In [10]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Both models are fitted on the same engineered feature columns.
model_inputs = X_train[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
                        'Embarked_C', 'Embarked_Q', 'Embarked_S']]

# Random forest: 100 trees, depth capped at 10, seeded for repeatable fits.
rf_model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=1).fit(model_inputs, y_train)

# XGBoost classifier with the library's default hyperparameters.
xgb_model = XGBClassifier().fit(model_inputs, y_train)

Train Accuracy/Error

In [11]:
from sklearn.metrics import classification_report, confusion_matrix

# Score the random forest on the rows it was fitted on (identifier/text columns removed).
train_features = X_train.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])
train_preds = rf_model.predict(train_features)
print(classification_report(y_train, train_preds))


              precision    recall  f1-score   support

           0       0.93      0.99      0.96       374
           1       0.99      0.86      0.92       222

    accuracy                           0.95       596
   macro avg       0.96      0.93      0.94       596
weighted avg       0.95      0.95      0.95       596



Validation Accuracy/Error

In [12]:
# Both models predict on the same held-out feature matrix.
validation_features = X_test.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])
rf_predictions = rf_model.predict(validation_features)
xgb_predictions = xgb_model.predict(validation_features)

In [13]:
# sklearn classification metrics
from sklearn.metrics import classification_report, confusion_matrix
# Reports are printed in this order: random forest first, then XGBoost.
for validation_preds in (rf_predictions, xgb_predictions):
    print(classification_report(y_test, validation_preds))

              precision    recall  f1-score   support

           0       0.81      0.88      0.84       175
           1       0.80      0.70      0.75       120

    accuracy                           0.81       295
   macro avg       0.81      0.79      0.80       295
weighted avg       0.81      0.81      0.80       295

              precision    recall  f1-score   support

           0       0.81      0.83      0.82       175
           1       0.74      0.72      0.73       120

    accuracy                           0.78       295
   macro avg       0.78      0.77      0.77       295
weighted avg       0.78      0.78      0.78       295



## From here, we apply preprocessing and prediction on test data

In [14]:
# Apply the same encodings to the test frame that were applied to the training features.

# Sex: male -> 1, female -> 0.
test['Sex'] = test['Sex'].replace({'male': 1, 'female': 0})

# Embarked: three fixed indicator columns. The guard makes the cell safe to
# re-run once 'Embarked' has already been replaced.
if 'Embarked' in test.columns:
    embarked_columns = ['Embarked_C', 'Embarked_Q', 'Embarked_S']
    embarked_dummies = (
        pd.get_dummies(test['Embarked'], prefix='Embarked')
        # Pin the column set and order by name, so a port that does not occur
        # in this frame yields an all-zero column instead of a shape mismatch.
        .reindex(columns=embarked_columns, fill_value=0)
        # get_dummies returns bool or uint8 depending on the pandas version;
        # normalise to plain 0/1 integers.
        .astype(int)
    )
    test[embarked_columns] = embarked_dummies
    test = test.drop(columns=['Embarked'])

In [15]:
test['Pclass'].isna().any()

False

In [16]:
test[test['Age'].isna()].PassengerId.values.tolist()

[902,
 914,
 921,
 925,
 928,
 931,
 933,
 939,
 946,
 950,
 957,
 968,
 975,
 976,
 977,
 980,
 983,
 985,
 994,
 999,
 1000,
 1003,
 1008,
 1013,
 1016,
 1019,
 1024,
 1025,
 1038,
 1040,
 1043,
 1052,
 1055,
 1060,
 1062,
 1065,
 1075,
 1080,
 1083,
 1091,
 1092,
 1097,
 1103,
 1108,
 1111,
 1117,
 1119,
 1125,
 1135,
 1136,
 1141,
 1147,
 1148,
 1157,
 1158,
 1159,
 1160,
 1163,
 1165,
 1166,
 1174,
 1178,
 1180,
 1181,
 1182,
 1184,
 1189,
 1193,
 1196,
 1204,
 1224,
 1231,
 1234,
 1236,
 1249,
 1250,
 1257,
 1258,
 1272,
 1274,
 1276,
 1300,
 1302,
 1305,
 1308,
 1309]

In [17]:
test['Fare'].isnull().any()

True

In [18]:
test[test['Fare'].isna()].PassengerId.values.tolist()

[1044]

In [19]:
# Fill the gaps in Age first, then in Fare, on the test frame.
test = NA_Fare(NA_Age(test))

12 22 94
[1044]


In [20]:
test[test['Fare'].isna()].PassengerId.values.tolist()

[]

In [21]:
# function to get columns with missing values in the dataset
def get_missing_columns(df):
    """Return the set of column names in ``df`` that hold at least one null value."""
    return {name for name in df.columns if df[name].isnull().any()}
get_missing_columns(test.drop(['PassengerId','Name','Ticket','Cabin'], axis=1))


set()

In [22]:
rf_model.predict(test.drop(['PassengerId','Name','Ticket','Cabin'], axis=1))

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [23]:
passengers = test['PassengerId']

In [24]:
# Build the submission table: one random-forest prediction per test passenger.
submission_features = test.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])
result = pd.DataFrame({
    'PassengerId': passengers,
    'Survived': rf_model.predict(submission_features),
})

In [25]:
result

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [26]:
result.to_csv('result.csv', index=False)
# ~0.77 accuracy