In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

1. Below is a code cell, purpose of which is confirming basic stats like Dataframe shape and the proportion of Missing values in the training data.

In [None]:

train = pd.read_csv('../data/raw/train.csv')
test = pd.read_csv('../data/raw/test.csv')

train_raw = train.copy()
test_raw = test.copy()

train_pipeline = train_raw.copy()
test_pipeline = test_raw.copy()

print(f'Train Shape: {train.shape}')
print(f'Test Shape: {test.shape}')

display(train.head())
display(train.isna().mean().sort_values(ascending=False))

Step 2 Data Analysis and Data Cleaning 

2.1 Missing Value Overview
To accurately clean data, it's needed to know proportion of missing data of each column. From running the only full cols are Passenger ID and whether they made it. 

In [None]:
missing_frac = train.isna().mean()
missing_frac = missing_frac.sort_values(ascending= False)
print(missing_frac)
full_cols = missing_frac[missing_frac == 0].index.to_list()
#print(full_cols)

2.2 Inspection of indivual columns and their distributions: range and obvious outliers.

In [None]:
numeric_cols = ['Age', 'Spa', 'FoodCourt','RoomService','VRDeck','ShoppingMall']
print('Numeric Column summaries:')
display(train[numeric_cols].describe())

cat_cols = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']
for col in cat_cols:
    print(f'{col}')
    print(train[col].value_counts(dropna= False))
    print()

2.3 Univariate Analysis: Numerical Features and Categorical Features 

In [None]:

for col in numeric_cols:
    fig, (ax1,ax2) = plt.subplots(1,2,figsize=(12,4))
    
    #Histogram
    ax1.hist(train[col].dropna(), bins = 30)
    ax1.set_title(f'{col} distribution')
    ax1.set_label(col)
    
    #Boxplot 
    ax2.boxplot(train[col].dropna(), vert=False)
    ax2.set_title(f'{col} — boxplot')
    ax2.set_xlabel(col)
    
    plt.tight_layout()
    plt.show()
    
    missing_pct = missing_frac[col] * 100
    print(f'{col}: {missing_pct:.1f}% missing\n')


2.4 Bivariate Analysis for both categorical and numerical features
First Code box shows the count plot for the categorical data and then splits the count into Transported vs Not based on the categorical column its in.
Second box is for numerical data shows the distribution of transported vs Not of the specific columns.

In [None]:
for col in cat_cols:
    fig, (ax1,ax2) = plt.subplots(1,2,figsize = (12,5))
    sns.countplot(data = train, x = col, ax = ax1)
    ax1.set_title(f'Passenger Count by {col}')
    
    
    sns.countplot(data=train, x= col, hue='Transported', ax = ax2)
    ax1.set_title(f'Transported Status by {col}')
    
    plt.tight_layout()
    plt.show()


In [None]:
for col in numeric_cols:
    sns.boxplot(data = train, x = 'Transported', y = col)
    plt.title(f'{col} Distribution by Transported Status')
    plt.show()

3. Data Preprocessing and Feature Engineering

3.1 Missing Data Handling and Imputation

In [None]:
numeric_medians = {col: train_raw[col].median() for col in numeric_cols}
cat_modes = {col: train_raw[col].mode().iloc[0] for col in cat_cols}

for df in [train,test]:
    df[numeric_cols] = df[numeric_cols].fillna(value= numeric_medians)
    df[cat_cols] = df[cat_cols].fillna(value = cat_modes)
    

3.2 Feature Engineering and imputation

Splits Cabin into Deck, Room Number and side of ship. Also makes Total Spend column and update the columns.

In [None]:
new_cols = ['Deck','Num','Side']
spending_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

for df in [train,test]:
    #Creation of group category which can be used to calculate group size outside of the for loop 
    df['TotalSpend'] = df[spending_cols].sum(axis = 1)
    df['Group'] = df['PassengerId'].str.split('_').str[0]
    
    #Splitting of Cabin into Deck and Side 
    df[new_cols] = df['Cabin'].str.split('/', expand= True)


cat_cols_extended = cat_cols + new_cols
numeric_cols_extended = numeric_cols + ['TotalSpend']

Calculation of group size and addition to Dataframes

In [None]:
group_sizes = pd.concat([train['Group'], test['Group']]).value_counts().rename('GroupSize')

train['GroupSize'] = train['Group'].map(group_sizes)
test['GroupSize'] = test['Group'].map(group_sizes)

numeric_cols_extended = numeric_cols_extended + ['GroupSize']
cat_cols_extended.remove('Num')

Drops Non-Needed columns 

In [None]:
cols_to_drop = ['Num','Group','PassengerId','Cabin']
test_pass_ids = test['PassengerId']
train_model = train.drop(columns= cols_to_drop)
test_model = test.drop(columns= cols_to_drop)

y = train_model['Transported'].astype(int)
x = train_model.drop(columns=['Transported'])


4. Building and Evaluating a ML model

4.1 Splitting train data into training and validation

In [None]:
from sklearn.model_selection import train_test_split

x_train,x_val,y_train,y_val = train_test_split(
    x,y,
    test_size= 0.2,
    stratify = y,
    random_state= 42
)

4.2 Setting up a preprocessor for Categorical and Numerical features

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,StandardScaler

cat_feats = cat_cols_extended
num_feats = numeric_cols_extended

cat_transformer = OneHotEncoder(handle_unknown= 'ignore',sparse_output= False)
num_transformer = StandardScaler()

preprocessor = ColumnTransformer([
    ('cat',cat_transformer,cat_feats),
    ('num',num_transformer,num_feats)
])


4.3 Testing different Models for the pipeline 

Evaluation Function

In [None]:
from sklearn.metrics import accuracy_score,f1_score,roc_auc_score
from sklearn.model_selection import cross_val_predict

def eval_model(model,x,y):
    y_pred = cross_val_predict(model,x,y, cv =5)
    y_proba = cross_val_predict(model,x,y, cv=5, method = 'predict_proba')[:,1] #Probability of y being True (1)
    
    accuracy = accuracy_score(y,y_pred)
    F1 = f1_score(y,y_pred)
    ROC = roc_auc_score(y,y_proba)
    
    print(f"Accuracy: {accuracy:.4f} | F1: {F1:.4f} | ROC AUC: {ROC:.4f}")


Models and Comparison of each

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

pipeline = Pipeline([
    ('preprocessor',preprocessor),
    ('classifier',None)
])

models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=200),
    'Gradient Boosting': GradientBoostingClassifier(),
    'XGBoost': XGBClassifier( eval_metric='logloss'),
    'LightGBM': LGBMClassifier(verbose = -1),
    'KNN': KNeighborsClassifier(),
    'Dummy Classifier' : DummyClassifier(strategy='most_frequent')
}

for model, clf in models.items():
    pipeline.set_params(classifier = clf)
    print(model)
    eval_model(pipeline,x,y)
    print()

4.4 Tuning of Parameters for the Gradient Boosting Model (best performance determined in prev step)     

Intial parameter sweep with RandomizedSearchCV to find values to build grid around.

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

GB_pipeline = Pipeline([
    ('preprocessor',preprocessor),
    ('classifier',GradientBoostingClassifier())        
])

scoring = {
    'Accuracy': 'accuracy',
    'F1' : 'f1',
    'ROC_AUC':'roc_auc'
}

param_dist = {
    'classifier_n_estimators': randint(100, 500),
    'classifier_learning_rate': uniform(0.001, 1),
    'classifier_max_depth': randint(3, 10),
    'classifier_subsample': uniform(0.6, 0.4),
    'classifier_min_samples_split': randint(2, 20)
}

rand = RandomizedSearchCV(
    GB_pipeline,
    param_distributions= param_dist,
    n_iter = 50,
    scoring = scoring,
    refit= 'F1',
    n_jobs=-1,
    random_state=42
)
rand.fit(x,y)
results = rand.cv_results_
print("Best params (by F1):", rand.best_params_)
print("Mean test ROC AUC for best:", 
      results['mean_test_ROC_AUC'][rand.best_index_])


Best params (by F1): {'classifier__learning_rate': np.float64(0.016966252220214195), 'classifier__max_depth': 4, 'classifier__min_samples_split': 16, 'classifier__n_estimators': 363, 'classifier__subsample': np.float64(0.6137554084460873)}
Mean test ROC AUC for best: 0.8883043549444096


Grid Search Using the Found best parameters from above

In [None]:
from sklearn.model_selection import GridSearchCV

rand_search_best_params = {
    'classifier_learning_rate': np.float64(0.016966252220214195),
    'classifier_max_depth': 4,
    'classifier_min_samples_split': 16,
    'classifier_n_estimators': 363,
    'classifier_subsample': np.float64(0.6137554084460873)
    }

best_learning_rate = rand_search_best_params['classifier_learning_rate']
best_subsample = rand_search_best_params['classifier_subsample']

param_grid = {
    'classifier__n_estimators':[313,363,413], #Number of Trees (boosting rounds) in prediction model 
    'classifier__learning_rate':[best_learning_rate*0.5,
                     best_learning_rate,
                     best_learning_rate*1.5],  #Changes weight of contribution from each tree 
    'classifier__max_depth':[3,4,5], #Controls depth of individual trees 
    'classifier__subsample':[best_subsample-0.1,best_subsample,best_subsample + 0.1], #Fraction of samples used per boosting round (lowers overfitting)
    'classifier__min_samples_split':[12,16,20] #Min samples to split a node
}

grid = GridSearchCV(
    GB_pipeline,
    param_grid= param_grid,
    cv = 5,
    scoring= scoring,
    refit= 'F1',
    n_jobs= -1
)

grid.fit(x,y)

results = grid.cv_results_
best_idx = grid.best_index_

print('Best Paramters:',grid.best_params_)
print('Best Score',grid.best_score_)



Best Paramters: {'classifier__learning_rate': np.float64(0.016966252220214195), 'classifier__max_depth': 4, 'classifier__min_samples_split': 20, 'classifier__n_estimators': 363, 'classifier__subsample': np.float64(0.7137554084460873)}
Best Score 0.8114338266342467


In [None]:
best_f1   = results['mean_test_F1'][best_idx]
best_auc  = results['mean_test_ROC_AUC'][best_idx]
best_acc  = results['mean_test_Accuracy'][best_idx]

print('F1:',best_f1)
print('AUC:',best_auc)
print('Acc:',best_acc)

F1: 0.8114338266342467
AUC: 0.8882161131809347
Acc: 0.8051331697978614


Seaching more to find optimal parameters based on previous output

In [57]:
shifted_best_subsample = best_subsample + 0.15 

shifted_param_grid={
    'classifier__learning_rate': [best_learning_rate],
    'classifier__max_depth': [4],
    'classifier__min_samples_split': [20,24,28],
    'classifier__n_estimators': [363],
    'classifier__subsample': [shifted_best_subsample - 0.1 ,
                              shifted_best_subsample ,
                              min(shifted_best_subsample+ 0.1,1)] 
}

shifted_grid = GridSearchCV(
    GB_pipeline,
    param_grid= shifted_param_grid,
    cv = 5,
    scoring= scoring,
    refit= 'F1',
    n_jobs= -1
)

shifted_grid.fit(x,y)

shifted_results = grid.cv_results_
shifted_best_idx = grid.best_index_

print('Best Paramters:',shifted_grid.best_params_)
print('Best Score',shifted_grid.best_score_)

shifted_best_f1   = results['mean_test_F1'][shifted_best_idx]
shifted_best_auc  = results['mean_test_ROC_AUC'][shifted_best_idx]
shifted_best_acc  = results['mean_test_Accuracy'][shifted_best_idx]

print('F1:',shifted_best_f1)
print('AUC:',shifted_best_auc)
print('Acc:',shifted_best_acc)

Best Paramters: {'classifier__learning_rate': np.float64(0.016966252220214195), 'classifier__max_depth': 4, 'classifier__min_samples_split': 28, 'classifier__n_estimators': 363, 'classifier__subsample': np.float64(0.6637554084460874)}
Best Score 0.8100492831686011
F1: 0.8114338266342467
AUC: 0.8882161131809347
Acc: 0.8051331697978614


In [60]:
final_params = grid.best_params_
print(final_params)

{'classifier__learning_rate': np.float64(0.016966252220214195), 'classifier__max_depth': 4, 'classifier__min_samples_split': 20, 'classifier__n_estimators': 363, 'classifier__subsample': np.float64(0.7137554084460873)}


4.5 Fitting onto Training Data

In [None]:
best_clf = GradientBoostingClassifier(
    n_estimators=363,
    learning_rate=0.016966252220214195,
    max_depth=4,
    subsample=0.7137554084460873,
    min_samples_split= 20
)

#Sets the best found parameters into Pipeline
GB_pipeline.set_params(classifier = best_clf)

#Fits the data into the pipeline
GB_pipeline.fit(x,y)

y_val_pred = GB_pipeline.predict(x_val)

