# Model Development

This is the model development notebook  

Import the required libraries

In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import re
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, OrdinalEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, mean_squared_error, accuracy_score, precision_score, recall_score, f1_score

Set up constant values

In [2]:
# Single seed passed to every data split and model in this notebook so reruns are reproducible.
RANDOM_STATE = 1

## Data dictionary

- **PassengerId** - A unique Id for each passenger. Each Id takes the form ```gggg_pp``` where ```gggg``` indicates a group the passenger is travelling with and ```pp``` is their number within the group. People in a group are often family members, but not always.
- **HomePlanet** - The planet the passenger departed from, typically their planet of permanent residence.
- **CryoSleep** - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.
- **Cabin** - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.
- **Destination** - The planet the passenger will be debarking to.
- **Age** - The age of the passenger.
- **VIP** - Whether the passenger has paid for special VIP service during the voyage.
- **RoomService, FoodCourt, ShoppingMall, Spa, VRDeck** - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.
- **Name** - The first and last names of the passenger.
- **Transported** - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.

## Check the dataframe

Let's load the preprocessed training data.

In [3]:
# Load the preprocessed training set; its numeric columns are already scaled and its
# categorical columns already encoded (see the df.head() output in the next cell).
df = pd.read_csv('../data/processed/train.csv')

In [4]:
# Peek at the first rows to confirm the processed columns loaded as expected.
df.head()

Unnamed: 0,Age,Luxury,GroupSize,Deck,CryoSleep,Side,VIP,TravelingAlone,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,PassengerId,Num,Group,Transported
0,0.711945,-0.514066,-0.648735,2.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0001_01,0,1,0.0
1,-0.334037,-0.251479,-0.648735,6.0,0.0,2.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0002_01,0,2,1.0
2,2.036857,3.190333,-0.022268,1.0,0.0,2.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0003_01,0,3,0.0
3,0.293552,1.332604,-0.022268,1.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0003_02,0,3,0.0
4,-0.891895,-0.124824,-0.648735,6.0,0.0,2.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0004_01,1,4,1.0


In [5]:
# Check dtypes and non-null counts before modelling.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 18 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Age                        8693 non-null   float64
 1   Luxury                     8693 non-null   float64
 2   GroupSize                  8693 non-null   float64
 3   Deck                       8693 non-null   float64
 4   CryoSleep                  8693 non-null   float64
 5   Side                       8693 non-null   float64
 6   VIP                        8693 non-null   float64
 7   TravelingAlone             8693 non-null   float64
 8   HomePlanet_Earth           8693 non-null   float64
 9   HomePlanet_Europa          8693 non-null   float64
 10  HomePlanet_Mars            8693 non-null   float64
 11  Destination_55 Cancri e    8693 non-null   float64
 12  Destination_PSO J318.5-22  8693 non-null   float64
 13  Destination_TRAPPIST-1e    8693 non-null   float

## Separate the training data

In [6]:
# Hand-picked subset of the processed columns used as model inputs.
# NOTE(review): 'Group' appears to be the numeric group id taken from PassengerId;
# confirm it is meant to be a predictor rather than an identifier.
features = [
    'Age', 'Luxury', 'GroupSize', 'CryoSleep', 'Side', 'TravelingAlone',
    'Destination_55 Cancri e', 'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e',
    'Group', 'VIP', 'Deck',
]
# Name the target explicitly instead of relying on it being the last column of the CSV,
# which would silently select the wrong column if the file layout changed.
label = 'Transported'

# Fail fast with a readable message if the processed file is missing an expected column.
missing_columns = [col for col in features + [label] if col not in df.columns]
assert not missing_columns, f'Missing columns in the processed data: {missing_columns}'

print(f'Features: {features}')
print(f'Label or Target: {label}')

Features: ['Age', 'Luxury', 'GroupSize', 'CryoSleep', 'Side', 'TravelingAlone', 'Destination_55 Cancri e', 'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e', 'Group', 'VIP', 'Deck']
Label or Target: Transported


Create the dataframes 

In [7]:
# Feature matrix and target vector, selected by the column names stored in `features` and `label`.
x, y = df[features], df[label]

Split the data into three parts: training (80%), cross-validation (10%) and test (10%) datasets.

In [8]:
# Stage 1: keep 80% of the rows for training and set the remaining 20% aside.
x_train, x_, y_train, y_ = train_test_split(
    x,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=RANDOM_STATE,
)
# Stage 2: halve the held-out rows into a cross-validation set and a test set.
x_cv, x_test, y_cv, y_test = train_test_split(
    x_,
    y_,
    test_size=0.5,
    shuffle=True,
    random_state=RANDOM_STATE,
)
# Report the size of every split as a sanity check.
print("X_train.shape", x_train.shape, "y_train.shape", y_train.shape)
print("X_cv.shape", x_cv.shape, "y_cv.shape", y_cv.shape)
print("X_test.shape", x_test.shape, "y_test.shape", y_test.shape)

X_train.shape (6954, 12) y_train.shape (6954,)
X_cv.shape (869, 12) y_cv.shape (869,)
X_test.shape (870, 12) y_test.shape (870,)


## Model Building

Baseline: 

A simple logistic regression model will be my baseline (score to beat)

In [9]:
# Baseline classifier: L2-penalised logistic regression limited to 100 solver iterations.
# verbose=1 is what produces the fitting log printed under this cell.
log_reg_model = LogisticRegression(
    penalty='l2',
    max_iter=100,
    random_state=RANDOM_STATE,
    verbose=1,
)
# Fit on the training split only; the CV and test splits are kept for evaluation.
log_reg_model.fit(x_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


In [10]:
# Predict once per split and reuse the result; re-running predict() for every metric
# repeats the same work and makes the cell harder to read.
log_reg_pred_train = log_reg_model.predict(x_train)
log_reg_pred_cv = log_reg_model.predict(x_cv)
log_reg_pred_test = log_reg_model.predict(x_test)

# scikit-learn metrics expect (y_true, y_pred) in that order. Accuracy is symmetric, so the
# printed values are unchanged, but the order matters for precision/recall/confusion matrix.
print('Metrics')
print(f'Accuracy in the train dataset: {accuracy_score(y_train, log_reg_pred_train)}')
print(f'Accuracy in the cross validation dataset: {accuracy_score(y_cv, log_reg_pred_cv)}')
print(f'Accuracy in the test dataset: {accuracy_score(y_test, log_reg_pred_test)}')
# Rows are the true classes, columns the predicted classes.
print(f'Confusion matrix of the test dataset:\n{confusion_matrix(y_test, log_reg_pred_test)}')
# On 0/1 labels the mean squared error equals the misclassification rate (1 - accuracy).
print(f'Mean Squared Error: {mean_squared_error(y_test, log_reg_pred_test)}')
print(f'F1 Score: {f1_score(y_test, log_reg_pred_test)}')
print(f'Precision Score: {precision_score(y_test, log_reg_pred_test)}')
print(f'Recall Score: {recall_score(y_test, log_reg_pred_test)}')

Metrics
Accuracy in the train dataset: 0.7165660051768766
Accuracy in the cross validation dataset: 0.7295742232451093
Accuracy in the test dataset: 0.7034482758620689
Confusion matrix of the test dataset:
[[326  95]
 [163 286]]
Mean Squared Error: 0.296551724137931
F1 Score: 0.689156626506024
Precision Score: 0.7506561679790026
Recall Score: 0.6369710467706013


XGBoost: 

First, let's try gradient-boosted decision trees.

In [11]:
# Gradient-boosted trees: at most 150 boosting rounds with L2 regularisation reg_lambda=8.
# early_stopping_rounds=70 ends training once the validation metric has gone 70 rounds
# without improving.
# NOTE(review): with learning_rate=1 the recorded run reaches its lowest validation
# log-loss at round 1 and gets worse afterwards; a smaller learning rate is worth trying.
xgb_model = XGBClassifier(
    n_estimators=150,
    learning_rate=1,
    reg_lambda=8,
    early_stopping_rounds=70,
    random_state=RANDOM_STATE,
    verbosity=1,
)
# The cross-validation split is the validation set monitored by early stopping.
xgb_model.fit(x_train, y_train, eval_set=[(x_cv, y_cv)])

[0]	validation_0-logloss:0.51395
[1]	validation_0-logloss:0.49313
[2]	validation_0-logloss:0.49951
[3]	validation_0-logloss:0.50588
[4]	validation_0-logloss:0.50081
[5]	validation_0-logloss:0.49858
[6]	validation_0-logloss:0.50040
[7]	validation_0-logloss:0.50082
[8]	validation_0-logloss:0.50374
[9]	validation_0-logloss:0.50758
[10]	validation_0-logloss:0.50956
[11]	validation_0-logloss:0.50848
[12]	validation_0-logloss:0.51038
[13]	validation_0-logloss:0.50896
[14]	validation_0-logloss:0.51195
[15]	validation_0-logloss:0.51305
[16]	validation_0-logloss:0.51152
[17]	validation_0-logloss:0.51496
[18]	validation_0-logloss:0.51676
[19]	validation_0-logloss:0.51891
[20]	validation_0-logloss:0.51522
[21]	validation_0-logloss:0.51783
[22]	validation_0-logloss:0.52041
[23]	validation_0-logloss:0.52269
[24]	validation_0-logloss:0.52642
[25]	validation_0-logloss:0.52376
[26]	validation_0-logloss:0.52018
[27]	validation_0-logloss:0.52272
[28]	validation_0-logloss:0.52585
[29]	validation_0-loglos

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):


[50]	validation_0-logloss:0.55783
[51]	validation_0-logloss:0.55573
[52]	validation_0-logloss:0.55555
[53]	validation_0-logloss:0.55729
[54]	validation_0-logloss:0.55857
[55]	validation_0-logloss:0.56152
[56]	validation_0-logloss:0.56199
[57]	validation_0-logloss:0.56369
[58]	validation_0-logloss:0.56246
[59]	validation_0-logloss:0.56433
[60]	validation_0-logloss:0.56396
[61]	validation_0-logloss:0.56382
[62]	validation_0-logloss:0.56444
[63]	validation_0-logloss:0.56337
[64]	validation_0-logloss:0.56508
[65]	validation_0-logloss:0.56522
[66]	validation_0-logloss:0.56696
[67]	validation_0-logloss:0.56801
[68]	validation_0-logloss:0.56796
[69]	validation_0-logloss:0.56820
[70]	validation_0-logloss:0.56725
[71]	validation_0-logloss:0.56677


In [12]:
# Validation log-loss recorded for each boosting round during fit (keyed by eval set name).
xgb_model.evals_result()

{'validation_0': OrderedDict([('logloss',
               [0.513949449663881,
                0.49312558052986344,
                0.499508988352276,
                0.5058805346403216,
                0.5008138137873823,
                0.4985831440758642,
                0.5003980612459993,
                0.5008229241747693,
                0.5037351668576825,
                0.507580591331668,
                0.5095571466819121,
                0.5084756643655233,
                0.51037580765559,
                0.5089612454157194,
                0.5119538813849115,
                0.5130458001744793,
                0.5115195670206919,
                0.514957258432125,
                0.5167626732620341,
                0.5189051335206015,
                0.5152164415577392,
                0.5178280856350281,
                0.5204056173471439,
                0.5226940244032494,
                0.5264238269430407,
                0.5237552750990252,
                0.520178239

In [13]:
print('Metrics')
print(f'Accuracy in the train dataset: {accuracy_score(xgb_model.predict(x_train), y_train)}')
print(f'Accuracy in the cross validation dataset: {accuracy_score(xgb_model.predict(x_cv), y_cv)}')
print(f'Accuracy in the test dataset: {accuracy_score(xgb_model.predict(x_test), y_test)}')
print(f'Confusion matrix of the test dataset:\n{confusion_matrix(y_test, xgb_model.predict(x_test))}')
print(f'Mean Squared Error: {mean_squared_error(y_test, xgb_model.predict(x_test))}')
print(f'F1 Score: {f1_score(y_test, xgb_model.predict(x_test))}')
print(f'Precision Score: {precision_score(y_test, xgb_model.predict(x_test))}')
print(f'Recall Score: {recall_score(y_test, xgb_model.predict(x_test))}')

Metrics
Accuracy in the train dataset: 0.7635893011216566
Accuracy in the cross validation dataset: 0.7422324510932106
Accuracy in the test dataset: 0.7425287356321839
Confusion matrix of the test dataset:
[[353  68]
 [156 293]]
Mean Squared Error: 0.2574712643678161
F1 Score: 0.7234567901234569
Precision Score: 0.8116343490304709
Recall Score: 0.6525612472160356


  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  

Right now the model is overfitting. I'm still going to upload my first entry as is, in order to get some feedback on the real test data.

## Productionize code 

Now let's make a pipeline to chain the previous notebooks into simpler code.

Load the raw training data and the submission (test) dataset

In [14]:
# Reload the raw, unprocessed training file for the pipeline work in this section.
# NOTE: this rebinds `df`, replacing the processed frame loaded at the top of the notebook.
df = pd.read_csv('../data/raw/train.csv')
# Raw submission (test) file.
test_df = pd.read_csv('../data/raw/test.csv')

In [15]:
# Inspect the raw columns: dtypes and how many values are missing in each.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [16]:
# Raw input columns handed to the pipeline, plus the target column.
features = [
    'Age', 'CryoSleep', 'Destination', 'HomePlanet', 'VIP',
    'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'RoomService', 'Cabin',
]
label = 'Transported'
x, y = df[features], df[label]

# Stage 1: keep 80% of the rows for training and set the remaining 20% aside.
x_train, x_, y_train, y_ = train_test_split(
    x,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=RANDOM_STATE,
)
# Stage 2: halve the held-out rows into a cross-validation set and a test set.
x_cv, x_test, y_cv, y_test = train_test_split(
    x_,
    y_,
    test_size=0.5,
    shuffle=True,
    random_state=RANDOM_STATE,
)
# Report the size of every split as a sanity check.
print("X_train.shape", x_train.shape, "y_train.shape", y_train.shape)
print("X_cv.shape", x_cv.shape, "y_cv.shape", y_cv.shape)
print("X_test.shape", x_test.shape, "y_test.shape", y_test.shape)

X_train.shape (6954, 11) y_train.shape (6954,)
X_cv.shape (869, 11) y_cv.shape (869,)
X_test.shape (870, 11) y_test.shape (870,)


Prepare the Imputer_transformer

In [17]:
# Each imputer is declared next to the columns it is meant for.

# Age: fill gaps with the column median.
median_features = ['Age']
s_imputer_median = SimpleImputer(strategy='median')

# Categorical / boolean columns: fill gaps with the most frequent value.
mode_features = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']
s_imputer_mode = SimpleImputer(strategy='most_frequent')

# Amenity bills: fill gaps with 0.
const_features = ['FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'RoomService']
s_imputer_const = SimpleImputer(fill_value=0, strategy='constant')

# Cabin: fill gaps with a placeholder that keeps the deck/num/side shape.
const_cab_features = ['Cabin']
s_imputer_const_cab = SimpleImputer(fill_value='0/0/0', strategy='constant')

In [18]:
# ColumnTransformer takes a list of (step name, transformer, columns) tuples. Each
# transformer is applied to its own columns and the results are concatenated in step order.
# remainder='passthrough' keeps the columns no step mentions (the default, 'drop', discards them).
# verbose_feature_names_out=False keeps the original column names instead of prefixing
# them with the step name.
# set_output(transform='pandas') makes the transformer return a DataFrame rather than a
# NumPy array. Without it the column names are lost, and a later ColumnTransformer step
# that selects columns by name cannot run on this step's output.
imputer_transformer = ColumnTransformer(
    [
        ('Median', s_imputer_median, median_features),
        ('Mode', s_imputer_mode, mode_features),
        ('Const', s_imputer_const, const_features),
        ('Cabin', s_imputer_const_cab, const_cab_features),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False,
).set_output(transform='pandas')

In [19]:
# Make sure every numeric input column is stored as float.
float_features = ['Age', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'RoomService']
df[float_features] = df[float_features].astype('float')

Feature engineering

# Cabin has the form 'deck/num/side'; split it into its three parts.
df[['Deck', 'Num', 'Side']] = df['Cabin'].str.split('/', expand=True)

# Total amenity spend. A missing bill counts as 0, matching the constant-0 imputation
# used for these columns; a plain '+' would turn the whole total into NaN when any
# single bill is missing.
spend_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
df['Luxury'] = df[spend_columns].fillna(0).sum(axis=1)

# PassengerId has the form 'gggg_pp'; the part before the underscore is the travel group.
df['Group'] = df['PassengerId'].str.split('_').str[0]

# Number of passengers sharing each group id. Mapping the counts back onto the rows does
# not depend on what value_counts() names its output column, which differs between
# pandas versions.
df['GroupSize'] = df['Group'].map(df['Group'].value_counts())

# Boolean flag: True when the passenger is the only member of the group.
df['TravelingAlone'] = df['GroupSize'] == 1

# Drop the raw columns that the engineered features replace, plus the identifiers.
df = df.drop(columns=spend_columns + ['Cabin', 'Name', 'PassengerId'])

Data Transformations

In [20]:
# Encoders and scaler used by the feature-transformation step.
oh_encoder = OneHotEncoder(sparse_output=False)  # dense array instead of a sparse matrix
o_encoder = OrdinalEncoder()
s_scaler = StandardScaler()

# Columns handled by each transformer.
# TODO: add 'Luxury' and 'GroupSize' (numeric) and 'Deck', 'Side', 'TravelingAlone'
# (ordinal) once the engineered features are produced inside the pipeline.
num_features = ['Age']
# Only feature columns belong here. 'Transported' is the label and is passed as y, so it
# is not a column of the feature frame; listing it makes the ColumnTransformer fail with
# a missing-column error.
cat_features_ordinal = ['CryoSleep', 'VIP']
cat_features_one_hot = ['HomePlanet', 'Destination']

In [21]:
# Scale / encode step, built from (step name, transformer, columns) tuples.
# Outputs are concatenated in step order; columns not named by any step are kept because
# remainder='passthrough' (the default 'drop' would discard them).
# verbose_feature_names_out=False keeps column names free of the step-name prefix.
# NOTE(review): every listed column must exist in the frame passed to fit, so
# cat_features_ordinal should contain feature columns only.
# NOTE: `transformation_transformer` is rebound by a later cell in this notebook.
transformation_transformer = ColumnTransformer(
    transformers=[
        ('Scaler', s_scaler, num_features),
        ('OrdinalEncoder', o_encoder, cat_features_ordinal),
        ('OneHotEncoder', oh_encoder, cat_features_one_hot),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False,
)

XGBoost model

In [22]:
# Fresh, unfitted XGBoost classifier for the pipeline; same settings as the earlier
# model except that logging is silenced (verbosity=0).
# NOTE(review): early_stopping_rounds presumably requires an eval_set at fit time, which a
# plain Pipeline.fit(X, y) does not supply; confirm how validation data will be passed.
xgb_model = XGBClassifier(
    n_estimators=150,
    learning_rate=1,
    reg_lambda=8,
    early_stopping_rounds=70,
    random_state=RANDOM_STATE,
    verbosity=0,
)

Create the pipeline

# Three chained stages: impute missing values, scale/encode the features, then classify.
# NOTE: `space_pipeline` is rebound by a later cell in this notebook.
space_pipeline = Pipeline(
    steps=[
        ('Imputation', imputer_transformer),
        ('FeatureTransformation', transformation_transformer),
        ('XGBoostModel', xgb_model),
    ]
)

In [23]:
def split_cabin(cabin_values):
    """Split 'deck/num/side' cabin strings into a two-column array of (deck, side).

    Args:
        cabin_values: 2-D array-like with a single column of cabin strings. Missing
            values are expected to have been filled already (the cabin imputer that
            runs before this step fills them with '0/0/0').

    Returns:
        Object ndarray of shape (n_rows, 2) holding the deck and the side of each cabin.
    """
    cabin_parts = pd.DataFrame(cabin_values).iloc[:, 0].astype(str).str.split('/', expand=True)
    # Column 1 is the cabin number; only deck (0) and side (2) are kept as features.
    return cabin_parts[[0, 2]].to_numpy()


# A ColumnTransformer applies every step to the ORIGINAL input columns in parallel and
# concatenates the results; steps do not feed into each other. Listing the imputers and
# the scaler/encoders as sibling steps therefore scales and encodes un-imputed data and
# emits the same column more than once. Each column group instead gets its own small
# pipeline so that imputation runs first and scaling/encoding runs on its output.
# Assumes num_features are the median-imputed columns and that the ordinal and one-hot
# columns together are the mode-imputed ones, as the lists are currently defined.

# The label is passed as y and is not a column of the feature frame, so it cannot be
# transformed here.
ordinal_feature_columns = [col for col in cat_features_ordinal if col != label]

transformation_transformer = ColumnTransformer(
    [
        # Age: median imputation, then standardisation.
        ('Numeric', make_pipeline(s_imputer_median, s_scaler), num_features),
        # Boolean-like columns: most-frequent imputation, then integer codes.
        ('Ordinal',
         make_pipeline(SimpleImputer(strategy='most_frequent'), o_encoder),
         ordinal_feature_columns),
        # Nominal columns: most-frequent imputation, then one indicator column per category.
        ('OneHot',
         make_pipeline(SimpleImputer(strategy='most_frequent'), oh_encoder),
         cat_features_one_hot),
        # Amenity bills: a missing bill is filled with 0.
        ('Const', s_imputer_const, const_features),
        # Cabin: fill gaps with '0/0/0', split out deck and side, then encode them as
        # integers. Categories not seen during fit are encoded as -1 instead of raising.
        ('Cabin',
         make_pipeline(
             s_imputer_const_cab,
             FunctionTransformer(split_cabin),
             OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
         ),
         const_cab_features),
    ],
    # Columns not named by any step are kept unchanged (the default 'drop' discards them).
    remainder='passthrough',
    # Keep output names free of the step-name prefix.
    verbose_feature_names_out=False,
)

In [24]:
# End-to-end pipeline: column preprocessing followed by the XGBoost classifier.
space_pipeline = Pipeline(
    steps=[
        ('ColumnTransformer', transformation_transformer),
        ('XGBoostModel', xgb_model),
    ]
)

In [25]:
# TODO(review): fitting is left disabled, presumably because the pipeline does not yet run
# end to end. Re-enable once preprocessing is verified and the validation data needed by
# early stopping is passed to fit.
# space_pipeline.fit(X=x_train, y=y_train)

In [26]:
# List the raw feature columns that the pipeline receives as input.
x_train.columns

Index(['Age', 'CryoSleep', 'Destination', 'HomePlanet', 'VIP', 'FoodCourt',
       'ShoppingMall', 'Spa', 'VRDeck', 'RoomService', 'Cabin'],
      dtype='object')