# Model Development

This is the model development notebook  

Import the required libraries

In [None]:
import numpy as np
import pandas as pd
import os
import re
from xgboost import XGBClassifier
from sklearn import set_config
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, OrdinalEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, mean_squared_error, accuracy_score, precision_score, recall_score, f1_score



In [None]:
print(pd.__version__)
print(np.__version__)

2.1.0
1.26.0


Set up constant values and config

In [None]:
# Use one fixed random state everywhere so that splits and models are reproducible
# Make every sklearn estimator or pipeline render as a diagram of itself instead of plain text
RANDOM_STATE = 1
set_config(display='diagram')

## Data dictionary

- **PassengerId** - A unique Id for each passenger. Each Id takes the form ```gggg_pp``` where ```gggg``` indicates a group the passenger is travelling with and ```pp``` is their number within the group. People in a group are often family members, but not always.
- **HomePlanet** - The planet the passenger departed from, typically their planet of permanent residence.
- **CryoSleep** - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.
- **Cabin** - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.
- **Destination** - The planet the passenger will be debarking to.
- **Age** - The age of the passenger.
- **VIP** - Whether the passenger has paid for special VIP service during the voyage.
- **RoomService, FoodCourt, ShoppingMall, Spa, VRDeck** - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.
- **Name** - The first and last names of the passenger.
- **Transported** - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.

## Check the dataframe

Let's load the data.

In [None]:
df = pd.read_csv('../data/processed/train.csv')
# df = pd.read_csv('../data/train.csv', dtype_backend='pyarrow')

In [None]:
df.head()

Unnamed: 0,Age,Luxury,GroupSize,Deck,CryoSleep,Side,VIP,TravelingAlone,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,PassengerId,Num,Group,Transported
0,0.711945,-0.514066,-0.648735,2.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0001_01,0,1,0.0
1,-0.334037,-0.251479,-0.648735,6.0,0.0,2.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0002_01,0,2,1.0
2,2.036857,3.190333,-0.022268,1.0,0.0,2.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0003_01,0,3,0.0
3,0.293552,1.332604,-0.022268,1.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0003_02,0,3,0.0
4,-0.891895,-0.124824,-0.648735,6.0,0.0,2.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0004_01,1,4,1.0


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 18 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Age                        8693 non-null   float64
 1   Luxury                     8693 non-null   float64
 2   GroupSize                  8693 non-null   float64
 3   Deck                       8693 non-null   float64
 4   CryoSleep                  8693 non-null   float64
 5   Side                       8693 non-null   float64
 6   VIP                        8693 non-null   float64
 7   TravelingAlone             8693 non-null   float64
 8   HomePlanet_Earth           8693 non-null   float64
 9   HomePlanet_Europa          8693 non-null   float64
 10  HomePlanet_Mars            8693 non-null   float64
 11  Destination_55 Cancri e    8693 non-null   float64
 12  Destination_PSO J318.5-22  8693 non-null   float64
 13  Destination_TRAPPIST-1e    8693 non-null   float

## Separate the training data

In [None]:
# Columns of the processed dataset that are fed to the models.
features = ['Age', 'Luxury', 'GroupSize', 'CryoSleep', 'Side', 'TravelingAlone', 'Destination_55 Cancri e', 
            'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e', 'Group', 'VIP', 'Deck']
# Name the target explicitly instead of assuming it is the last column of the CSV.
label = 'Transported'
print(f'Features: {features}')
print(f'Label or Target: {label}')

Features: ['Age', 'Luxury', 'GroupSize', 'CryoSleep', 'Side', 'TravelingAlone', 'Destination_55 Cancri e', 'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e', 'Group', 'VIP', 'Deck']
Label or Target: Transported


Create the dataframes 

In [None]:
x = df[features]    
y = df[label]

Split the dataframe into 3 parts, train, cross validation and test datasets

In [None]:
x_train, x_, y_train, y_ = train_test_split(x, y, test_size=0.20, random_state=RANDOM_STATE, shuffle=True)
x_cv, x_test, y_cv, y_test = train_test_split(x_, y_, test_size=0.50, random_state=RANDOM_STATE, shuffle=True)
print("X_train.shape", x_train.shape, "y_train.shape", y_train.shape)
print("X_cv.shape", x_cv.shape, "y_cv.shape", y_cv.shape)
print("X_test.shape", x_test.shape, "y_test.shape", y_test.shape)

X_train.shape (6954, 12) y_train.shape (6954,)
X_cv.shape (869, 12) y_cv.shape (869,)
X_test.shape (870, 12) y_test.shape (870,)


## Model Building

Baseline: 

A simple logistic regression model will be my baseline (score to beat)

In [None]:
log_reg_model = LogisticRegression(penalty='l2', random_state=RANDOM_STATE, max_iter=100, verbose=1)
log_reg_model.fit(x_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


In [None]:
# Predict once per split and reuse the result for every metric below.
log_reg_train_pred = log_reg_model.predict(x_train)
log_reg_cv_pred = log_reg_model.predict(x_cv)
log_reg_test_pred = log_reg_model.predict(x_test)

print('Metrics')
# sklearn metrics take (y_true, y_pred) in that order.
print(f'Accuracy in the train dataset: {accuracy_score(y_train, log_reg_train_pred)}')
print(f'Accuracy in the cross validation dataset: {accuracy_score(y_cv, log_reg_cv_pred)}')
print(f'Accuracy in the test dataset: {accuracy_score(y_test, log_reg_test_pred)}')
print(f'Confusion matrix of the test dataset:\n{confusion_matrix(y_test, log_reg_test_pred)}')
# With 0/1 labels the mean squared error is simply the misclassification rate.
print(f'Mean Squared Error: {mean_squared_error(y_test, log_reg_test_pred)}')
print(f'F1 Score: {f1_score(y_test, log_reg_test_pred)}')
print(f'Precision Score: {precision_score(y_test, log_reg_test_pred)}')
print(f'Recall Score: {recall_score(y_test, log_reg_test_pred)}')

Metrics
Accuracy in the train dataset: 0.7165660051768766
Accuracy in the cross validation dataset: 0.7295742232451093
Accuracy in the test dataset: 0.7034482758620689
Confusion matrix of the test dataset:
[[326  95]
 [163 286]]
Mean Squared Error: 0.296551724137931
F1 Score: 0.689156626506024
Precision Score: 0.7506561679790026
Recall Score: 0.6369710467706013


XGBoost: 

I'm going to try gradient-boosted decision trees first.

In [None]:
# First boosted-tree attempt. learning_rate=1 adds every tree at full weight; in the
# recorded log the validation logloss is lowest at round 2 and rises afterwards.
xgb_model = XGBClassifier(n_estimators=200, learning_rate=1, verbosity=1, random_state=RANDOM_STATE, 
                            early_stopping_rounds=100, reg_lambda=2, enable_categorical=True)
# The cross-validation split is the evaluation set that early stopping monitors.
xgb_model.fit(x_train, y_train, eval_set=[(x_cv, y_cv)])

[0]	validation_0-logloss:0.51672
[1]	validation_0-logloss:0.50162
[2]	validation_0-logloss:0.49304
[3]	validation_0-logloss:0.50719
[4]	validation_0-logloss:0.50688
[5]	validation_0-logloss:0.51640
[6]	validation_0-logloss:0.50683
[7]	validation_0-logloss:0.51178
[8]	validation_0-logloss:0.51603
[9]	validation_0-logloss:0.52412
[10]	validation_0-logloss:0.53143
[11]	validation_0-logloss:0.53499
[12]	validation_0-logloss:0.53557
[13]	validation_0-logloss:0.53426
[14]	validation_0-logloss:0.53677
[15]	validation_0-logloss:0.53960
[16]	validation_0-logloss:0.53744
[17]	validation_0-logloss:0.54614
[18]	validation_0-logloss:0.54499
[19]	validation_0-logloss:0.54458
[20]	validation_0-logloss:0.55184
[21]	validation_0-logloss:0.55467
[22]	validation_0-logloss:0.55711
[23]	validation_0-logloss:0.56153
[24]	validation_0-logloss:0.56107
[25]	validation_0-logloss:0.56381
[26]	validation_0-logloss:0.56643
[27]	validation_0-logloss:0.56825
[28]	validation_0-logloss:0.57382
[29]	validation_0-loglos

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):


[52]	validation_0-logloss:0.63030
[53]	validation_0-logloss:0.62893
[54]	validation_0-logloss:0.63441
[55]	validation_0-logloss:0.63677
[56]	validation_0-logloss:0.63756
[57]	validation_0-logloss:0.63630
[58]	validation_0-logloss:0.63722
[59]	validation_0-logloss:0.64026
[60]	validation_0-logloss:0.64268
[61]	validation_0-logloss:0.64211
[62]	validation_0-logloss:0.64083
[63]	validation_0-logloss:0.64391
[64]	validation_0-logloss:0.64496
[65]	validation_0-logloss:0.64326
[66]	validation_0-logloss:0.64470
[67]	validation_0-logloss:0.64267
[68]	validation_0-logloss:0.64464
[69]	validation_0-logloss:0.64425
[70]	validation_0-logloss:0.64298
[71]	validation_0-logloss:0.64582
[72]	validation_0-logloss:0.64349
[73]	validation_0-logloss:0.64559
[74]	validation_0-logloss:0.64590
[75]	validation_0-logloss:0.64478
[76]	validation_0-logloss:0.64742
[77]	validation_0-logloss:0.64695
[78]	validation_0-logloss:0.64825
[79]	validation_0-logloss:0.65195
[80]	validation_0-logloss:0.65964
[81]	validatio

In [None]:
xgb_model.evals_result()

{'validation_0': OrderedDict([('logloss',
               [0.5167241837953671,
                0.5016192479018375,
                0.49303551635118703,
                0.5071913005786297,
                0.5068849395361479,
                0.5164013821374458,
                0.5068306656691253,
                0.5117842241820544,
                0.5160256069184627,
                0.524124786610138,
                0.531434839955433,
                0.5349926890727196,
                0.5355679824129217,
                0.5342568154318342,
                0.5367707491340297,
                0.5396036101204141,
                0.5374438401703282,
                0.5461403984706978,
                0.5449907921195489,
                0.5445774942245816,
                0.5518354710081017,
                0.5546707090942313,
                0.5571121995135054,
                0.5615257092604444,
                0.5610731631278852,
                0.5638145936109464,
                0.56643

In [None]:
xgb_model.score(x_train, y_train)

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


0.7853034224906529

In [None]:
# Predict once per split and reuse the result for every metric below.
xgb_train_pred = xgb_model.predict(x_train)
xgb_cv_pred = xgb_model.predict(x_cv)
xgb_test_pred = xgb_model.predict(x_test)

print('Metrics')
# sklearn metrics take (y_true, y_pred) in that order.
print(f'Accuracy in the train dataset: {accuracy_score(y_train, xgb_train_pred)}')
print(f'Accuracy in the cross validation dataset: {accuracy_score(y_cv, xgb_cv_pred)}')
print(f'Accuracy in the test dataset: {accuracy_score(y_test, xgb_test_pred)}')
print(f'Confusion matrix of the test dataset:\n{confusion_matrix(y_test, xgb_test_pred)}')
# With 0/1 labels the mean squared error is simply the misclassification rate.
print(f'Mean Squared Error: {mean_squared_error(y_test, xgb_test_pred)}')
print(f'F1 Score: {f1_score(y_test, xgb_test_pred)}')
print(f'Precision Score: {precision_score(y_test, xgb_test_pred)}')
print(f'Recall Score: {recall_score(y_test, xgb_test_pred)}')

Metrics
Accuracy in the train dataset: 0.7853034224906529
Accuracy in the cross validation dataset: 0.7548906789413119
Accuracy in the test dataset: 0.7344827586206897
Confusion matrix of the test dataset:
[[339  82]
 [149 300]]
Mean Squared Error: 0.2655172413793103
F1 Score: 0.7220216606498194
Precision Score: 0.7853403141361257
Recall Score: 0.6681514476614699


  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  

Right now the model is overfitting. I'm still going to upload my first entry as is, in order to get some feedback on the real test data.

## Productionize code 

Now let's make a pipeline to chain the previous notebooks into simpler code.

Load the submission(test) dataset

In [None]:
df = pd.read_csv('../data/raw/train.csv')
test_df = pd.read_csv('../data/raw/test.csv')

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [None]:
def change_type(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``dataframe`` with the raw columns cast to modelling dtypes.

    ``PassengerId``, ``HomePlanet`` and ``Cabin`` become ``category`` columns.
    ``CryoSleep`` and ``VIP`` have their non-missing values converted to ``bool``
    while missing values stay missing, so a later imputation step can still see
    them. ``Destination`` is left as read.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Frame holding at least the five columns named above. It is not modified.

    Returns
    -------
    pd.DataFrame
        A new frame with the converted columns. ``CryoSleep``/``VIP`` are ``bool``
        when they have no missing values and ``object`` (True/False/NaN) otherwise.
    """
    converted = dataframe.copy()
    for column in ('PassengerId', 'HomePlanet', 'Cabin'):
        converted[column] = converted[column].astype('category')
    for column in ('CryoSleep', 'VIP'):
        # A plain astype('bool') turns NaN into True (NaN is truthy), which silently
        # invents values and hides the gaps from the imputer. Convert only real values.
        converted[column] = converted[column].map(
            lambda value: value if pd.isna(value) else bool(value)
        )
    return converted

In [None]:
change_type(df)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [None]:
df = change_type(df)
test_df = change_type(test_df)

In [None]:
df['HomePlanet'].value_counts()

HomePlanet
Earth     4602
Europa    2131
Mars      1759
Name: count, dtype: int64

In [None]:
# features = ['Age', 'Luxury', 'GroupSize', 'CryoSleep', 'Side', 'TravelingAlone', 'Destination_55 Cancri e', 
            # 'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e', 'Group', 'VIP', 'Deck']
# features = ['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age', 
#             'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
features = ['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age', 
            'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
label = 'Transported'
x = df[features]
y = df[label]
x_train, x_, y_train, y_ = train_test_split(x, y, test_size=0.20, random_state=RANDOM_STATE, shuffle=True)
x_cv, x_test, y_cv, y_test = train_test_split(x_, y_, test_size=0.50, random_state=RANDOM_STATE, shuffle=True)
print("X_train.shape", x_train.shape, "y_train.shape", y_train.shape)
print("X_cv.shape", x_cv.shape, "y_cv.shape", y_cv.shape)
print("X_test.shape", x_test.shape, "y_test.shape", y_test.shape)

X_train.shape (6954, 12) y_train.shape (6954,)
X_cv.shape (869, 12) y_cv.shape (869,)
X_test.shape (870, 12) y_test.shape (870,)


Prepare the Imputer_transformer

In [None]:
s_imputer_median = SimpleImputer(strategy='median')
s_imputer_mode = SimpleImputer(strategy='most_frequent')
s_imputer_const = SimpleImputer(strategy='constant', fill_value=0)
s_imputer_const_cab = SimpleImputer(strategy='constant', fill_value='0/0/0')
median_features = ['Age']
mode_features = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']
# mode_features = ['CryoSleep', 'Destination', 'VIP']
const_features = ['FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'RoomService']
const_cab_features = ['Cabin']

In [None]:
# The ColumnTransformer class needs a list of transformers, these transformers are tuples of 3 values: 
# Name of the step, the transformer to run, the objects affected by the step
# The remainder parameter controls what to do with the columns not involved in the ColumnTransformer
# Remainder default value = 'drop', drop the others column in the output 
# The columns in the output are ordered by their step, first in first out 
# The verbose parameter makes the ColumnTransformer return the time required to complete their operations
# The verbose_feature_names_out parameter adds a prefix to each column with the stepname that generated it

float_features = ['Age', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'RoomService']
for cols in float_features: 
    df[cols] = df[cols].astype('float')

Feature engineering

Data Transformations

In [None]:
# Prepare ColumnTransformer
oh_encoder = OneHotEncoder(sparse_output=False)
o_encoder = OrdinalEncoder()
s_scaler = StandardScaler()
# num_features = ['Age', 'Luxury', 'GroupSize']
num_features = ['Age']
# cat_features_ordinal = ['Deck', 'Transported', 'CryoSleep', 'Side', 'VIP', 'TravelingAlone']
cat_features_ordinal = ['CryoSleep', 'VIP']
cat_features_one_hot = ['HomePlanet','Destination']

XGBoost model

In [None]:
xgb_model = XGBClassifier(n_estimators=150, learning_rate=1, verbosity=0, random_state=RANDOM_STATE, 
                            early_stopping_rounds=70, reg_lambda=8)

Create the pipeline

In [None]:
# Prepare LabelEncoder
l_encoder = LabelEncoder()

# Prepare the ColumnTransformer: Imputer
s_imputer_median = SimpleImputer(strategy='median')
s_imputer_mode = SimpleImputer(strategy='most_frequent')
s_imputer_const = SimpleImputer(strategy='constant', fill_value=0)
s_imputer_const_cab = SimpleImputer(strategy='constant', fill_value='0/0/0')
median_features = ['Age']
# mode_features = ['CryoSleep', 'Destination', 'VIP']
mode_features = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']
const_features = ['FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'RoomService']
const_cab_features = ['Cabin']

# Prepare ColumnTransformer: FeatureEngineering
feat_eng_input = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Cabin', 'PassengerId']

def new_features(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Derive the engineered features and drop the raw columns they came from.

    Added columns, in this order after the untouched input columns:

    - ``Deck`` and ``Side``: first and third part of ``Cabin`` (``deck/num/side``).
    - ``Luxury``: sum of the five amenity bills (NaN if any bill is NaN).
    - ``Group``: numeric prefix of ``PassengerId`` (``gggg_pp``), as float.
    - ``GroupSize``: number of rows in ``dataframe`` sharing the same ``Group``.
    - ``TravelingAlone``: 1.0 when ``GroupSize`` is 1, otherwise 0.0.

    ``RoomService``, ``FoodCourt``, ``ShoppingMall``, ``Spa``, ``VRDeck``, ``Cabin``
    and ``PassengerId`` are removed from the result.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Frame containing the seven raw columns listed above. It is not modified.

    Returns
    -------
    pd.DataFrame
        A new frame with the same row index as the input.
    """
    spend_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    # Work on a copy so the caller's frame keeps its original columns.
    engineered = dataframe.copy()

    cabin_parts = engineered['Cabin'].str.split('/', expand=True)
    engineered['Deck'] = cabin_parts[0]
    engineered['Side'] = cabin_parts[2]

    engineered['Luxury'] = (engineered['RoomService'] + engineered['FoodCourt'] + engineered['ShoppingMall']
                            + engineered['Spa'] + engineered['VRDeck'])

    engineered['Group'] = engineered['PassengerId'].str.split('_', expand=True)[0].astype(float)
    # Mapping the per-group counts back keeps the row index and does not depend on
    # how value_counts() names its columns in a given pandas version.
    engineered['GroupSize'] = engineered['Group'].map(engineered['Group'].value_counts())
    engineered['TravelingAlone'] = (engineered['GroupSize'] == 1).astype(float)

    return engineered.drop(columns=spend_columns + ['Cabin', 'PassengerId'])


def new_features_cols(transformer: 'FunctionTransformer', input_features: list) -> list:
    """List the column names produced by ``new_features`` for a given input.

    Has the ``(transformer, input_features)`` signature that
    ``FunctionTransformer(feature_names_out=...)`` expects from a callable.

    Parameters
    ----------
    transformer : FunctionTransformer
        The fitted transformer; its ``feature_names_in_`` is used when
        ``input_features`` is None.
    input_features : list or None
        Names of the columns fed to ``new_features``.

    Returns
    -------
    list
        Input columns that ``new_features`` keeps, followed by the columns it adds,
        in the order ``new_features`` emits them.
    """
    # Columns new_features drops from its input ('Num' is created and dropped there,
    # so it is only listed in case the input already carries it).
    consumed = {'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Cabin', 'PassengerId', 'Num'}
    if input_features is None:
        input_features = transformer.feature_names_in_
    kept = [col for col in input_features if col not in consumed]
    return kept + ['Deck', 'Side', 'Luxury', 'Group', 'GroupSize', 'TravelingAlone']

feat_eng_transformer = FunctionTransformer(func=new_features)

# Prepare ColumnTransformer: Scaler
oh_encoder = OneHotEncoder(sparse_output=False)
o_encoder = OrdinalEncoder()
s_scaler = StandardScaler()
num_features = ['Age', 'Luxury', 'GroupSize']
# num_features = ['Age']
cat_features_ordinal = ['Deck', 'CryoSleep', 'Side', 'VIP']
# cat_features_ordinal = ['CryoSleep', 'VIP']
# cat_features_one_hot = ['Destination']
cat_features_one_hot = ['HomePlanet','Destination']

# xgb_model = XGBClassifier(n_estimators=150, learning_rate=1, verbosity=0, random_state=RANDOM_STATE, 
                            # early_stopping_rounds=70, reg_lambda=8, enable_categorical=True)
xgb_model = XGBClassifier(n_estimators=200, learning_rate=1, verbosity=1, random_state=RANDOM_STATE, 
                            early_stopping_rounds=50, reg_lambda=8, enable_categorical=True)

In [None]:
# feat_eng_transformer.fit_transform(df)

In [None]:
# feat_eng_transformer.feature_names_in_

In [None]:
# feat_eng_transformer.get_feature_names_out()

In [None]:
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [None]:
# The ColumnTransformer class needs a list of transformers, these transformers are tuples of 3 values: 
# Name of the step, the transformer to run, the objects affected by the step
# The remainder parameter controls what to do with the columns not involved in the ColumnTransformer
# Remainder default value = 'drop', drop the others column in the output 
# The columns in the output are ordered by their step, first in first out 
# The verbose parameter makes the ColumnTransformer return the time required to complete their operations
# The verbose_feature_names_out parameter adds a prefix to each column with the stepname that generated it
imputer_transformer = ColumnTransformer([
    ('Median', s_imputer_median, median_features), 
    ('Mode', s_imputer_mode, mode_features), 
    ('Const', s_imputer_const, const_features), 
    ('Cabin', s_imputer_const_cab, const_cab_features)],
    remainder='passthrough',
    verbose_feature_names_out=False
    )

# feature_c_transformer = ColumnTransformer([
#     ('FeatureEngineering', feat_eng_transformer, feat_eng_input)],
#     remainder='passthrough', 
#     verbose_feature_names_out=False
#     )

scaler_transformer = ColumnTransformer([
    ('Scaler', s_scaler, num_features),
    ('OrdinalEncoder', o_encoder, cat_features_ordinal),
    ('OneHotEncoder', oh_encoder, cat_features_one_hot)],
    remainder='passthrough', 
    verbose_feature_names_out=False
    )

# main_column_transformer = ColumnTransformer([('Imputers', imputer_transformer, median_features+mode_features+const_features+const_cab_features), 
#                                             ('Scalers', scaler_transformer, num_features+cat_features_one_hot+cat_features_ordinal)
#                                             ], remainder='passthrough')


In [None]:
# df.columns

In [None]:
dimension_pipeline = Pipeline([
    ('Imputation', imputer_transformer),
    # ('FeatureEngineering', feature_c_transformer),
    ('FeatureEngineering', feat_eng_transformer),
    ('FeatureTransformation', scaler_transformer), 
    # ('Model', xgb_model)
])

In [None]:
imputer_transformer.set_output(transform='pandas')
# scaler_transformer.set_output(transform='pandas')
# dimension_pipeline.set_output(transform='pandas')
dimension_pipeline_cv = Pipeline([
    ('Imputation', imputer_transformer),
    ('FeatureEngineering', feat_eng_transformer),
    ('FeatureTransformation', scaler_transformer), 
])

In [None]:
x_cv.info()

<class 'pandas.core.frame.DataFrame'>
Index: 869 entries, 2942 to 2305
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   PassengerId   869 non-null    category
 1   HomePlanet    848 non-null    category
 2   CryoSleep     869 non-null    bool    
 3   Cabin         842 non-null    category
 4   Destination   848 non-null    object  
 5   Age           847 non-null    float64 
 6   VIP           869 non-null    bool    
 7   RoomService   852 non-null    float64 
 8   FoodCourt     853 non-null    float64 
 9   ShoppingMall  852 non-null    float64 
 10  Spa           848 non-null    float64 
 11  VRDeck        853 non-null    float64 
dtypes: bool(2), category(3), float64(6), object(1)
memory usage: 695.6+ KB


In [None]:
dimension_pipeline_cv.fit_transform(x_cv, y_cv)

array([[-6.04323658e-01, -4.57690568e-01, -2.91294026e-01, ...,
         1.00000000e+00,  3.19200000e+03,  1.00000000e+00],
       [ 1.98300238e+00, -2.95962366e-01, -2.91294026e-01, ...,
         0.00000000e+00,  6.11000000e+03,  1.00000000e+00],
       [-2.54685004e-01, -2.22122823e-01, -2.91294026e-01, ...,
         1.00000000e+00,  7.87200000e+03,  1.00000000e+00],
       ...,
       [ 5.84447766e-01, -2.41726241e-01, -2.91294026e-01, ...,
         1.00000000e+00,  9.10800000e+03,  1.00000000e+00],
       [-3.94540465e-01, -2.81913249e-01, -2.91294026e-01, ...,
         0.00000000e+00,  2.53700000e+03,  1.00000000e+00],
       [-5.34395927e-01, -5.05718943e-01, -2.91294026e-01, ...,
         0.00000000e+00,  2.47900000e+03,  1.00000000e+00]])

In [None]:
# dimension_pipeline only preprocesses: its last step is a ColumnTransformer, so it has
# no predict(). The raw splits also still contain category/object columns, which this
# XGBClassifier (no enable_categorical) cannot be fitted on.
# So: fit the preprocessing on the training split, apply it to the other two splits,
# and train/evaluate the model on the prepared matrices. New names are used so the raw
# x_train / x_cv / x_test stay available to the cells below.
x_train_prepared = dimension_pipeline.fit_transform(x_train)
x_cv_prepared = dimension_pipeline.transform(x_cv)
x_test_prepared = dimension_pipeline.transform(x_test)

xgb_model = XGBClassifier(n_estimators=200, learning_rate=0.2, verbosity=0, random_state=RANDOM_STATE, 
                            early_stopping_rounds=80, reg_lambda=4)
# The prepared cross-validation split is the evaluation set early stopping monitors.
xgb_model.fit(x_train_prepared, y_train, eval_set=[(x_cv_prepared, y_cv)])

# Predict once per split and reuse the result for every metric below.
prepared_train_pred = xgb_model.predict(x_train_prepared)
prepared_cv_pred = xgb_model.predict(x_cv_prepared)
prepared_test_pred = xgb_model.predict(x_test_prepared)

print('Metrics')
print(f'Accuracy in the train dataset: {accuracy_score(y_train, prepared_train_pred)}')
print(f'Accuracy in the cross validation dataset: {accuracy_score(y_cv, prepared_cv_pred)}')
print(f'Accuracy in the test dataset: {accuracy_score(y_test, prepared_test_pred)}')
print(f'Confusion matrix of the test dataset:\n{confusion_matrix(y_test, prepared_test_pred)}')
print(f'Mean Squared Error: {mean_squared_error(y_test, prepared_test_pred)}')
print(f'F1 Score: {f1_score(y_test, prepared_test_pred)}')
print(f'Precision Score: {precision_score(y_test, prepared_test_pred)}')
print(f'Recall Score: {recall_score(y_test, prepared_test_pred)}')

In [None]:
x_train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
5776,6118_02,Mars,True,E/391/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0
220,0234_01,Earth,False,F/50/P,TRAPPIST-1e,43.0,False,,888.0,0.0,0.0,0.0
2398,2580_01,Earth,False,G/412/S,TRAPPIST-1e,24.0,False,19.0,676.0,0.0,26.0,0.0
5876,6224_01,Earth,False,F/1191/S,TRAPPIST-1e,16.0,False,0.0,0.0,60.0,0.0,1963.0
192,0213_01,Earth,False,F/46/P,TRAPPIST-1e,21.0,False,367.0,281.0,0.0,0.0,146.0


In [None]:
xgb_model_fn = XGBClassifier(n_estimators=200, learning_rate=0.2, verbosity=1, random_state=RANDOM_STATE, 
                            reg_lambda=4, enable_categorical=True)
final_pipeline = Pipeline([
                        # ('TransformPipeline', dimension_pipeline),
                        ('Model', xgb_model_fn)
                        ])

In [None]:
# final_pipeline.fit(x_cv, y_cv)
# final_pipeline.transform(x_cv)
# final_pipeline.fit(x_train, y_train)
# Encode the target splits as integers; the encoder is fitted on the training labels only.
y_train = l_encoder.fit_transform(y_train)
y_cv = l_encoder.transform(y_cv)
y_test = l_encoder.transform(y_test)
# NOTE: from here on x_train / x_cv / x_test hold the pipeline output instead of the raw
# columns, so this cell cannot be re-run without re-creating the splits first.
# The preprocessing is fitted on the training split and only applied to the other two.
x_train = dimension_pipeline.fit_transform(x_train)
x_cv = dimension_pipeline.transform(x_cv)
x_test = dimension_pipeline.transform(x_test)
# final_pipeline.fit(x_train, y_train, Model__eval_set=[(x_cv, y_cv)])
# Refit the label encoder and the preprocessing on the full dataset (x, y).
# After these two lines dimension_pipeline is fitted on all rows, not just the training split.
y = l_encoder.fit_transform(y)
x = dimension_pipeline.fit_transform(x)

In [None]:
# plot_tree(final_pipeline.named_steps['Model'])
# trees = final_pipeline.named_steps['Model'].get_booster().get_dump()

In [None]:
# for i, tree in enumerate(trees):
    # print(f'TRee {i+1}:\n{tree}')

# Every final_pipeline.fit(...) call in the cells above is commented out, so fit the
# pipeline on the prepared training split before asking it for predictions.
final_pipeline.fit(x_train, y_train)

# Predict once per split and reuse the result for every metric below.
final_train_pred = final_pipeline.predict(x_train)
final_cv_pred = final_pipeline.predict(x_cv)
final_test_pred = final_pipeline.predict(x_test)

print('Metrics')
# sklearn metrics take (y_true, y_pred) in that order.
print(f'Accuracy in the train dataset: {accuracy_score(y_train, final_train_pred)}')
print(f'Accuracy in the cross validation dataset: {accuracy_score(y_cv, final_cv_pred)}')
print(f'Accuracy in the test dataset: {accuracy_score(y_test, final_test_pred)}')
print(f'Confusion matrix of the test dataset:\n{confusion_matrix(y_test, final_test_pred)}')
print(f'Mean Squared Error: {mean_squared_error(y_test, final_test_pred)}')
print(f'F1 Score: {f1_score(y_test, final_test_pred)}')
print(f'Precision Score: {precision_score(y_test, final_test_pred)}')
print(f'Recall Score: {recall_score(y_test, final_test_pred)}')

final_pipeline.named_steps['Model']

In [None]:
# cross_val_score(final_pipeline, x_train, y_train, cv=4, scoring='accuracy').mean()

In [None]:
np.arange(0.2, 0.8, 0.1)

array([0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8])

In [None]:
# Hyper-parameters shared by both searches.
# - The grids are built from integer steps so the values are exact (np.arange with a
#   float step accumulates rounding error and can overshoot its end point).
# - learning_rate starts at 0.05: a rate of 0 would make every boosting round a no-op,
#   wasting a tenth of the grid.
shared_params = {
    'Model__learning_rate': [round(0.05 * step, 2) for step in range(1, 10)],  # 0.05 .. 0.45
    'Model__reg_lambda': [0.5 * step for step in range(16)],                   # 0.0 .. 7.5
    'Model__reg_alpha': [0.5 * step for step in range(16)],                    # 0.0 .. 7.5
    'Model__tree_method': ['hist', 'approx'],
    'Model__random_state': [RANDOM_STATE],
    'Model__device': ['cpu'],
    'Model__n_jobs': [-1],
}

# First search, kept for reference: categorical support on, 50-95 trees.
cv_params_categorical = {
    **shared_params,
    'Model__n_estimators': list(range(50, 100, 5)),
    'Model__enable_categorical': [True],
}

# Second search, the one bound to `grid_search`: 20-95 trees, default categorical handling.
# This is 9 * 16 * 16 * 2 * 16 = 73,728 candidates, each cross-validated, so expect a
# very long run if the fit below is re-enabled.
cv_params = {
    **shared_params,
    'Model__n_estimators': list(range(20, 100, 5)),
}
grid_search = GridSearchCV(final_pipeline, param_grid=cv_params, n_jobs=-1)
# grid_search.fit(x, y)

In [None]:
# grid_search.best_params_

In [None]:
# grid_search.best_score_

{'Model__device': 'cpu',
 'Model__enable_categorical': True,
 'Model__learning_rate': 0.1,
 'Model__n_estimators': 50,
 'Model__n_jobs': -1,
 'Model__objective': 'binary:logistic',
 'Model__random_state': 1,
 'Model__reg_alpha': 5.5,
 'Model__reg_lambda': 6.5,
 'Model__tree_method': 'approx'}
 These parameters reach an accuracy of 0.743 on the test split and a score of 0.76 on Kaggle.

In [None]:
# Standalone classifier built from the hyper-parameters recorded from the
# grid search, trained on the train split and monitored on the
# cross-validation split.
best_model = XGBClassifier(
    # boosting schedule
    n_estimators=50,
    learning_rate=0.1,
    # regularisation
    reg_lambda=6.5,
    reg_alpha=5.5,
    # execution / reproducibility
    tree_method='approx',
    device='cpu',
    n_jobs=-1,
    random_state=RANDOM_STATE,
    verbosity=1,
)
best_model.fit(x_train, y_train, eval_set=[(x_cv, y_cv)])

[0]	validation_0-logloss:0.66355
[1]	validation_0-logloss:0.63872
[2]	validation_0-logloss:0.61895
[3]	validation_0-logloss:0.60260
[4]	validation_0-logloss:0.58935
[5]	validation_0-logloss:0.57623
[6]	validation_0-logloss:0.56551
[7]	validation_0-logloss:0.55678
[8]	validation_0-logloss:0.54825
[9]	validation_0-logloss:0.54109
[10]	validation_0-logloss:0.53505
[11]	validation_0-logloss:0.52884
[12]	validation_0-logloss:0.52362
[13]	validation_0-logloss:0.51845
[14]	validation_0-logloss:0.51426
[15]	validation_0-logloss:0.51032
[16]	validation_0-logloss:0.50671
[17]	validation_0-logloss:0.50364
[18]	validation_0-logloss:0.50087
[19]	validation_0-logloss:0.49819
[20]	validation_0-logloss:0.49589
[21]	validation_0-logloss:0.49380
[22]	validation_0-logloss:0.49210
[23]	validation_0-logloss:0.49048
[24]	validation_0-logloss:0.48893
[25]	validation_0-logloss:0.48781
[26]	validation_0-logloss:0.48669
[27]	validation_0-logloss:0.48576
[28]	validation_0-logloss:0.48491
[29]	validation_0-loglos

In [None]:
# Predict once per split and reuse the result for every metric instead of
# re-running best_model.predict inside each print call.
train_pred = best_model.predict(x_train)
cv_pred = best_model.predict(x_cv)
test_pred = best_model.predict(x_test)

# All metrics are called as metric(y_true, y_pred); accuracy is symmetric, so
# the printed values are unchanged.
print('Metrics')
print(f'Accuracy in the train dataset: {accuracy_score(y_train, train_pred)}')
print(f'Accuracy in the cross validation dataset: {accuracy_score(y_cv, cv_pred)}')
print(f'Accuracy in the test dataset: {accuracy_score(y_test, test_pred)}')
print(f'Confusion matrix of the test dataset:\n{confusion_matrix(y_test, test_pred)}')
# On 0/1 labels the mean squared error is simply the misclassification rate.
print(f'Mean Squared Error: {mean_squared_error(y_test, test_pred)}')
print(f'F1 Score: {f1_score(y_test, test_pred)}')
print(f'Precision Score: {precision_score(y_test, test_pred)}')
print(f'Recall Score: {recall_score(y_test, test_pred)}')

Metrics
Accuracy in the train dataset: 0.7850158182341098
Accuracy in the cross validation dataset: 0.7560414269275029
Accuracy in the test dataset: 0.7436781609195402
Confusion matrix of the test dataset:
[[350  71]
 [152 297]]
Mean Squared Error: 0.25632183908045975
F1 Score: 0.7270501835985311
Precision Score: 0.8070652173913043
Recall Score: 0.6614699331848553


In [None]:
# Show the importance score the fitted model assigns to each input column.
# The model was trained on a plain array, so the scores are positional;
# presumably they follow dimension_pipeline's output column order — confirm.
best_model.feature_importances_

array([0.01540554, 0.32621124, 0.00664581, 0.05600733, 0.04044646,
       0.03442273, 0.01106942, 0.40377322, 0.02690349, 0.01884208,
       0.01673163, 0.00596844, 0.01822541, 0.01934709, 0.        ],
      dtype=float32)

In [None]:
# Preview only the first rows of the preprocessed training features instead
# of dumping the whole array into the notebook output.
x_train[:5]

array([[ 7.06771440e-01, -5.10399804e-01,  1.14454690e-01, ...,
         1.00000000e+00,  6.11800000e+03,  0.00000000e+00],
       [ 9.85951128e-01, -1.97177089e-01, -6.26623443e-01, ...,
         1.00000000e+00,  2.34000000e+02,  1.00000000e+00],
       [-3.40152386e-01, -2.56082712e-01, -6.26623443e-01, ...,
         1.00000000e+00,  2.58000000e+03,  1.00000000e+00],
       ...,
       [-3.40152386e-01, -4.96290673e-01, -6.26623443e-01, ...,
         1.00000000e+00,  9.78000000e+02,  1.00000000e+00],
       [ 1.26513081e+00, -5.10399804e-01,  1.14454690e-01, ...,
         1.00000000e+00,  5.53800000e+03,  0.00000000e+00],
       [ 7.86171444e-02,  4.37363289e-02, -6.26623443e-01, ...,
         1.00000000e+00,  2.52000000e+02,  1.00000000e+00]])

## Predict using the real test data

In [None]:
# Inspect the test set before preprocessing: column dtypes and non-null
# counts. Several columns contain missing values and there is no
# Transported column to predict against.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   PassengerId   4277 non-null   category
 1   HomePlanet    4190 non-null   category
 2   CryoSleep     4277 non-null   bool    
 3   Cabin         4177 non-null   category
 4   Destination   4185 non-null   object  
 5   Age           4186 non-null   float64 
 6   VIP           4277 non-null   bool    
 7   RoomService   4195 non-null   float64 
 8   FoodCourt     4171 non-null   float64 
 9   ShoppingMall  4179 non-null   float64 
 10  Spa           4176 non-null   float64 
 11  VRDeck        4197 non-null   float64 
 12  Name          4183 non-null   object  
dtypes: bool(2), category(3), float64(6), object(2)
memory usage: 613.8+ KB


In [None]:
# Preprocess the test set with the already-fitted dimension_pipeline
# (transform only, nothing is refitted here).
# NOTE(review): the pipeline state used here must be the same fit that
# produced the features best_model was trained on — confirm.
test_df_processed = dimension_pipeline.transform(test_df)

In [None]:
# Predict the encoded target for every test row and wrap it in a
# single-column frame named after the target.
transported = pd.DataFrame({'Transported': best_model.predict(test_df_processed)})

In [None]:
# Sanity check: expect exactly one prediction per row of test_df.
transported.shape

(4277, 1)

In [None]:
# Check the dtype of the raw predictions and that none are missing.
transported.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 1 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   Transported  4277 non-null   int32
dtypes: int32(1)
memory usage: 16.8 KB


In [None]:
# Cast the integer predictions to booleans for the submission file.
# Presumably 0 -> False and 1 -> True matches the label encoding — confirm.
transported = transported.astype(bool)
transported.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 1 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   Transported  4277 non-null   bool 
dtypes: bool(1)
memory usage: 4.3 KB


In [None]:
# Attach the predictions to the test rows by aligning both frames on their
# row index.
submission = pd.merge(
    test_df,
    transported,
    how='inner',
    left_index=True,
    right_index=True,
)

Save the predictions

In [None]:
# Write the two submission columns to disk. The output directory is created
# first so the cell also works when it does not exist yet.
os.makedirs('../data/processed', exist_ok=True)
submission[['PassengerId', 'Transported']].to_csv('../data/processed/submission.csv', index=False)

Save the model

In [None]:
# Persist the trained classifier as JSON. The target directory is created
# first so the save does not fail when it does not exist yet.
os.makedirs('../model', exist_ok=True)
best_model.save_model('../model/spaceship_xgboost_classifier.json')