In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

pd.options.mode.chained_assignment = None  # default='warn'

# Load Data

In [2]:
all_train_df = pd.read_csv('data/train.csv')
train_df, val_df = train_test_split(all_train_df, test_size=0.2)
test_df = pd.read_csv('data/test.csv')

In [3]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
420,421,0,3,"Gheorgheff, Mr. Stanio",male,,0,0,349254,7.8958,,C
425,426,0,3,"Wiseman, Mr. Phillippe",male,,0,0,A/4. 34244,7.25,,S
407,408,1,2,"Richards, Master. William Rowe",male,3.0,1,1,29106,18.75,,S
761,762,0,3,"Nirva, Mr. Iisakki Antino Aijo",male,41.0,0,0,SOTON/O2 3101272,7.125,,S
478,479,0,3,"Karlsson, Mr. Nils August",male,22.0,0,0,350060,7.5208,,S


# Approaches

## Random Forest

In [54]:
from sklearn.ensemble import RandomForestClassifier

y_train = train_df["Survived"]
y_val = val_df["Survived"]

features = ["Pclass", "Sex", "SibSp", "Parch", "Fare"]
X_train = pd.get_dummies(train_df[features])
X_val = pd.get_dummies(val_df[features])

rf_model = RandomForestClassifier(n_estimators=1000, max_depth=5, random_state=1)
rf_model.fit(X_train, y_train)

In [55]:
y_val_pred = rf_model.predict(X_val)
acc = (1 - (sum(abs(y_val_pred - y_val.to_numpy())) / len(y_val_pred))) * 100
print(f"Train Set Accuracy: {round(acc, 1)}%")

Train Set Accuracy: 76.0%


## Histogram-based Gradient Boosting Classification

In [52]:
from sklearn.ensemble import HistGradientBoostingClassifier

y_train = train_df["Survived"]
y_val = val_df["Survived"]

features = ["Pclass", "Sex", "SibSp", "Parch", "Age", "Fare", "Embarked"]
X_train = pd.get_dummies(train_df[features])
X_val = pd.get_dummies(val_df[features])

hgbc_model = HistGradientBoostingClassifier(max_depth=5, random_state=1)
hgbc_model.fit(X_train, y_train)

In [53]:
y_val_pred = hgbc_model.predict(X_val)
acc = (1 - (sum(abs(y_val_pred - y_val.to_numpy())) / len(y_val_pred))) * 100
print(f"Val Set Accuracy: {round(acc, 1)}%")

Val Set Accuracy: 81.0%


## XGBoost

In [78]:
import xgboost as xgb
xgb.set_config(verbosity=0)

y_train = train_df["Survived"]
y_val = val_df["Survived"]

features = ["Pclass", "Sex", "SibSp", "Parch", "Age", "Fare", "Embarked"]
X_train = train_df[features]
X_val = val_df[features]
X_test = test_df[features]

def one_hot_encode(X_df):
    X_df['Sex'] = X_df.apply(lambda row: (row['Sex'] == 'male') * 1, axis=1)
    ports = X_df['Embarked'].unique().tolist()
    X_df['Embarked'] = X_df.apply(lambda row: ports.index(row["Embarked"]), axis=1)

    return X_df

X_train = one_hot_encode(X_train)
X_val = one_hot_encode(X_val)
X_test = one_hot_encode(X_test)

xgb_model = xgb.XGBClassifier(verbosity=0, silent=True)
bst = xgb_model.fit(X_train, y_train)

In [57]:
y_val_pred = xgb_model.predict(X_val)
acc = (1 - (sum(abs(y_val_pred - y_val.to_numpy())) / len(y_val_pred))) * 100
print(f"Val Set Accuracy: {round(acc, 1)}%")

Val Set Accuracy: 77.1%


## XGBoost Grid Search

In [63]:
from datetime import datetime

import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [64]:
def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

In [69]:
params = {
    'eta': np.linspace(0.01, 0.2, 10),
    'min_child_weight': [0.5, 1, 1.5],
    'max_depth': range(2, 10, 1),
    'gamma': np.linspace(0, 0.5, 10),
    'subsample': np.linspace(0.5, 1, 10),
    'colsample_bytree': np.linspace(0.5, 1, 10),
    'lambda': np.linspace(0.5, 1.5, 10),
    'n_estimators': range(60, 300, 10),
    'learning_rate': [0.1, 0.01, 0.005]
}

In [70]:
xgb_model = xgb.XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic',
                    silent=True, nthread=1)

In [74]:
folds = 3
param_comb = 10000

skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)

random_search = RandomizedSearchCV(xgb_model, param_distributions=params, n_iter=param_comb, scoring='roc_auc', n_jobs=4, cv=skf.split(X_train, y_train), random_state=1001 )

# Here we go
start_time = timer(None) # timing starts from this point for "start_time" variable
random_search.fit(X_train, y_train)
timer(start_time) # timing ends here for "start_time" variable

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.



In [75]:
print('\n Best hyperparameters:')
print(random_search.best_params_)


 Best hyperparameters:
{'subsample': 0.5, 'n_estimators': 110, 'min_child_weight': 0.5, 'max_depth': 2, 'learning_rate': 0.1, 'lambda': 0.9444444444444444, 'gamma': 0.38888888888888884, 'eta': 0.11555555555555555, 'colsample_bytree': 0.7777777777777778}


In [76]:
y_val_pred = random_search.predict(X_val)
acc = (1 - (sum(abs(y_val_pred - y_val.to_numpy())) / len(y_val_pred))) * 100
print(f"Val Set Accuracy: {round(acc, 1)}%")

Val Set Accuracy: 79.9%


# Generate Submission

In [80]:
predictions = random_search.predict(X_test)
output = pd.DataFrame({'PassengerId': test_df.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
