In [22]:
# import dataset from ./titanic/train.csv
import pandas as pd

train_data = pd.read_csv('./titanic/train.csv')
test_data = pd.read_csv('./titanic/test.csv')

In [23]:
# sanity check
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [24]:
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [25]:
# check for missing values
print(train_data.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


## Data Preprocessing

In [26]:
# show types of data for each feature
train_data.info()

train_data.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


Looking at this, it seems that passengerID is a feature we can drop, since it’s just a unique identifier and won’t provide any discriminatory power. Ticket may be the same but I'll some testing to confirm.

In [27]:
# print feature names of dataframe
print(train_data.columns)

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


I can build a pipeline with some custom preprocessing steps that extract a person's title, like Mr, Miss, etc. This might provide more insight and will reduce dimensionality after one-hot encoding compared to the unprocessed name field. I'm doing something similar for the deck extractor function, where we extract the cabin deck letter (A, B, C, etc.). This again reduces dimensionality after one-hot encoding while still retaining some classification power.

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

class TitleExtractor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        s = X.iloc[:, 0].astype(str)
        titles = s.str.extract(r',\s*([^\.]+)\.', expand=False).str.strip()
        return titles.to_frame(name='Title')

    def get_feature_names_out(self, input_features=None):
        return np.array(['Title'])

class DeckExtractor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        s = X.iloc[:, 0].astype(str)
        deck = s.str.extract(r'([A-Za-z])', expand=False)
        return deck.to_frame(name='Deck')

    def get_feature_names_out(self, input_features=None):
        return np.array(['Deck'])

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Features we want to keep
numerical_features = ['Age', 'SibSp', 'Parch', 'Fare', 'Pclass']
categorical_basic = ['Sex', 'Embarked', 'Ticket']

# Pipelines
numerical_transformer = SimpleImputer(strategy='median')

categorical_basic_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

title_pipeline = Pipeline(steps=[
    ('title', TitleExtractor()),
    ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

deck_pipeline = Pipeline(steps=[
    ('deck', DeckExtractor()),
    ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat_basic', categorical_basic_transformer, categorical_basic),
        ('title', title_pipeline, ['Name']),
        ('deck', deck_pipeline, ['Cabin'])
    ],
    remainder='drop'
)

preprocessing_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

# Split X/y
y = train_data['Survived']
X = train_data.drop(columns=['Survived'])

train_data_processed = preprocessing_pipeline.fit_transform(X)

print("Preprocessing with Title and Deck created and applied successfully")
print(f"Original shape: {train_data.shape}")
print(f"Processed shape: {train_data_processed.shape}")

Preprocessing with Title and Deck created and applied successfully
Original shape: (891, 12)
Processed shape: (891, 717)


### Feature Importance Scores

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
import numpy as np
import pandas as pd

def train_model(clf, X, y, seed=42, print_features=False):
    # 10-fold stratified cross-validation
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
    accuracies = cross_val_score(clf, X, y, cv=skf, scoring='accuracy')
    
    print(f"Mean accuracy: {accuracies.mean():.4f} ({chr(177)} {accuracies.std():.4f})")
    

    if print_features:
        clf.fit(X, y)
        feature_names = preprocessing_pipeline.named_steps['preprocessor'].get_feature_names_out()
        fi = pd.Series(clf.feature_importances_, index=feature_names).sort_values(ascending=False)
        
        print("\nTop features:")
        print(fi.head(25).to_string())
        print("\nLow-importance features:")
        print(fi[fi < 0.005].to_string())
    
    return accuracies



In [44]:

rf = RandomForestClassifier(n_estimators=300, random_state=42)
train_model(rf, train_data_processed, y, 28, True)

Mean accuracy: 0.8182 (± 0.0240)

Top features:
num__Fare                0.214100
num__Age                 0.206126
title__Title_Mr          0.092376
cat_basic__Sex_male      0.081787
cat_basic__Sex_female    0.081104
num__Pclass              0.066252
num__SibSp               0.051980
num__Parch               0.032385
deck__Deck_n             0.029468
title__Title_Mrs         0.028792
title__Title_Miss        0.027012
cat_basic__Embarked_S    0.014177
cat_basic__Embarked_C    0.012176
title__Title_Master      0.011070
deck__Deck_E             0.008933
cat_basic__Embarked_Q    0.007852
deck__Deck_D             0.006880
deck__Deck_C             0.006748
deck__Deck_B             0.006573
deck__Deck_A             0.002807
deck__Deck_F             0.001914
title__Title_Rev         0.001885
title__Title_Dr          0.001774
deck__Deck_G             0.001582
title__Title_Major       0.001101

Low-importance features:
deck__Deck_A                 0.002807
deck__Deck_F                 0.001914


array([0.82222222, 0.83146067, 0.79775281, 0.78651685, 0.7752809 ,
       0.80898876, 0.83146067, 0.85393258, 0.84269663, 0.83146067])

Looks like we won't be needing the ticket field, it doesn't provide much value and creates a whole lot of extra columns when we one-hot encode.

Now I'm going to train the basic set of models to check the performance of each

In [32]:

# Features we want to keep
categorical_basic = ['Sex', 'Embarked']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat_basic', categorical_basic_transformer, categorical_basic),
        ('title', title_pipeline, ['Name']),
        ('deck', deck_pipeline, ['Cabin'])
    ],
    remainder='drop'
)

preprocessing_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

train_data_processed = preprocessing_pipeline.fit_transform(X)

rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('rf', RandomForestClassifier(n_estimators=300, random_state=42))
])

X = train_data.drop(columns=['Survived'])
y = train_data['Survived']
train_model(rf, X, y, 28)


Mean accuracy: 0.8126 (± 0.0285)


array([0.82222222, 0.84269663, 0.7752809 , 0.78651685, 0.76404494,
       0.79775281, 0.83146067, 0.85393258, 0.83146067, 0.82022472])

In [33]:
# Linear SVM Classifier
from sklearn.svm import LinearSVC

svm = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('svm', LinearSVC(random_state=42))
])

X = train_data.drop(columns=['Survived'])
y = train_data['Survived']
train_model(svm, X, y, 28)

Mean accuracy: 0.8193 (± 0.0278)


array([0.83333333, 0.79775281, 0.80898876, 0.80898876, 0.7752809 ,
       0.84269663, 0.78651685, 0.86516854, 0.82022472, 0.85393258])

In [34]:
# RBF SVM Classifier
from sklearn.svm import SVC

rbf= Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('svm', SVC(kernel='rbf', random_state=42))
])

X = train_data.drop(columns=['Survived'])
y = train_data['Survived']
train_model(rbf, X, y, 28)

Mean accuracy: 0.6790 (± 0.0423)


array([0.71111111, 0.68539326, 0.62921348, 0.69662921, 0.62921348,
       0.75280899, 0.70786517, 0.65168539, 0.70786517, 0.61797753])

In [35]:
# naive bayes classifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import FunctionTransformer
nb = Pipeline(steps=[   
    ('preprocessor', preprocessor),
    # transform to array
    ('to_array', FunctionTransformer(lambda x: x.toarray())),
    ('nb', GaussianNB())
])

X = train_data.drop(columns=['Survived'])
y = train_data['Survived']
train_model(nb, X, y, 28)


Mean accuracy: 0.7407 (± 0.0606)


array([0.81111111, 0.74157303, 0.76404494, 0.75280899, 0.66292135,
       0.86516854, 0.69662921, 0.68539326, 0.75280899, 0.6741573 ])

In [36]:
from sklearn.linear_model import LogisticRegression
lf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('clf', LogisticRegression(solver='liblinear', max_iter=400, class_weight='balanced'))
])

X = train_data.drop(columns=['Survived'])
y = train_data['Survived']
train_model(lf, X, y, 28)

Mean accuracy: 0.8159 (± 0.0304)


array([0.84444444, 0.7752809 , 0.78651685, 0.78651685, 0.80898876,
       0.86516854, 0.78651685, 0.82022472, 0.83146067, 0.85393258])

In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

y = train_data['Survived']
X = train_data.drop(columns=['Survived'])

# Train/validation split for final evaluation
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

rf_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('clf', RandomForestClassifier(random_state=42, n_jobs=-1))
])

# Hyperparameter grid (tune 'clf__*')
param_grid = {
    'clf__n_estimators': [200, 400, 800],
    'clf__max_depth': [None, 8, 12, 20],
    'clf__min_samples_split': [2, 5, 10],
    'clf__min_samples_leaf': [1, 2, 4],
    'clf__max_features': ['sqrt', 'log2', None],
    'clf__class_weight': [None, 'balanced'],
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

gs = GridSearchCV(
    estimator=rf_pipe,
    param_grid=param_grid,
    scoring={'roc_auc': 'roc_auc', 'accuracy': 'accuracy', 'f1': 'f1'},
    refit='roc_auc',            # refit the best by ROC-AUC
    cv=cv,
    n_jobs=-1,
    verbose=1,
)

gs.fit(X_tr, y_tr)

print("Best params:", gs.best_params_)
print("Best CV ROC-AUC:", gs.best_score_)

# Evaluate on hold-out set
y_pred = gs.predict(X_te)
y_proba = gs.predict_proba(X_te)[:, 1]
print("Hold-out ROC-AUC:", roc_auc_score(y_te, y_proba))
print("Classification report:\n", classification_report(y_te, y_pred, digits=4))

Fitting 5 folds for each of 648 candidates, totalling 3240 fits
Best params: {'clf__class_weight': 'balanced', 'clf__max_depth': 20, 'clf__max_features': 'sqrt', 'clf__min_samples_leaf': 2, 'clf__min_samples_split': 5, 'clf__n_estimators': 800}
Best CV ROC-AUC: 0.8835273714153024
Hold-out ROC-AUC: 0.8477602108036891
Classification report:
               precision    recall  f1-score   support

           0     0.8393    0.8545    0.8468       110
           1     0.7612    0.7391    0.7500        69

    accuracy                         0.8101       179
   macro avg     0.8002    0.7968    0.7984       179
weighted avg     0.8092    0.8101    0.8095       179



In [38]:
# tain a model using the best parameters
rf_best = gs.best_estimator_

X = train_data.drop(columns=['Survived'])
y = train_data['Survived']

best_model = Pipeline(steps=[   
    ('preprocessor', preprocessor),
    ('clf', RandomForestClassifier(random_state=42, n_jobs=-1, class_weight='balanced', max_depth=20, max_features='sqrt', min_samples_split=5, min_samples_leaf=2, n_estimators=800))
])

train_model(best_model, X, y, 28)


Mean accuracy: 0.8339 (± 0.0257)


array([0.85555556, 0.83146067, 0.80898876, 0.82022472, 0.83146067,
       0.88764045, 0.79775281, 0.85393258, 0.84269663, 0.80898876])

In [39]:
# use the best model to predict the test set
y_pred = best_model.predict(test_data)

def save_preds(_fn, _y_pred, _df):
    import csv
    with open(_fn, 'w') as fout:
        writer = csv.writer(fout, delimiter=',', lineterminator='\n')
        writer.writerow(['PassengerId', 'Survived'])
        for yid, ypred in zip(_df['PassengerId'], _y_pred):
            writer.writerow([yid, ypred])

save_preds('predictions_colclough.csv', y_pred, test_data)

I was able to make my first submission on kaggle using this the output here. This was a really fun assignment! I decided to go with the random forrest classifier as it and the linear svm were very close in score, but I know how robust random forrest classifiers can be. In the assignment instructions had also mentioned using a random forest and getting a 79% score, which I thought would be a good benchmark for my classifier.

![Alt text](./score.png)
