# Model Developement

This is the model development notebook  

Import the required libraries

In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import re
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, OrdinalEncoder, FunctionTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, mean_squared_error, accuracy_score, precision_score, recall_score, f1_score

Setup contant values

In [2]:
RANDOM_STATE = 2

## Data dictionary

- **PassengerId** - A unique Id for each passenger. Each Id takes the form ```gggg_pp``` where ```gggg``` indicates a group the passenger is travelling with and ```pp``` is their number within the group. People in a group are often family members, but not always.
- **HomePlanet** - The planet the passenger departed from, typically their planet of permanent residence.
- **CryoSleep** - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.
- **Cabin** - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.
- **Destination** - The planet the passenger will be debarking to.
- **Age** - The age of the passenger.
- **VIP** - Whether the passenger has paid for special VIP service during the voyage.
- **RoomService, FoodCourt, ShoppingMall, Spa, VRDeck** - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.
- **Name** - The first and last names of the passenger.
- **Transported** - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.

## Check the dataframe

let's load the data

In [3]:
df = pd.read_csv('../data/processed/train.csv')
# df = pd.read_csv('../data/train.csv', dtype_backend='pyarrow')

In [4]:
df.head()

Unnamed: 0,Age,Luxury,GroupSize,Deck,CryoSleep,Side,VIP,TravelingAlone,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,PassengerId,Num,Group,Transported
0,0.711945,-0.514066,-0.648735,2.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0001_01,0,1,0.0
1,-0.334037,-0.251479,-0.648735,6.0,0.0,2.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0002_01,0,2,1.0
2,2.036857,3.190333,-0.022268,1.0,0.0,2.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0003_01,0,3,0.0
3,0.293552,1.332604,-0.022268,1.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0003_02,0,3,0.0
4,-0.891895,-0.124824,-0.648735,6.0,0.0,2.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0004_01,1,4,1.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 18 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Age                        8693 non-null   float64
 1   Luxury                     8693 non-null   float64
 2   GroupSize                  8693 non-null   float64
 3   Deck                       8693 non-null   float64
 4   CryoSleep                  8693 non-null   float64
 5   Side                       8693 non-null   float64
 6   VIP                        8693 non-null   float64
 7   TravelingAlone             8693 non-null   float64
 8   HomePlanet_Earth           8693 non-null   float64
 9   HomePlanet_Europa          8693 non-null   float64
 10  HomePlanet_Mars            8693 non-null   float64
 11  Destination_55 Cancri e    8693 non-null   float64
 12  Destination_PSO J318.5-22  8693 non-null   float64
 13  Destination_TRAPPIST-1e    8693 non-null   float

## Separate the training data

In [6]:
# features = list(df.columns.values[:14]) + list(df.columns.values[15:-1])
features = ['Age', 'Luxury', 'GroupSize', 'CryoSleep', 'Side', 'TravelingAlone', 'Destination_55 Cancri e', 'Destination_PSO J318.5-22', 
            'Destination_TRAPPIST-1e']
label = df.columns.values[-1]
print(f'Features: {features}')
print(f'Label or Target: {label}')

Features: ['Age', 'Luxury', 'GroupSize', 'CryoSleep', 'Side', 'TravelingAlone', 'Destination_55 Cancri e', 'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e']
Label or Target: Transported


Create the dataframes 

In [7]:
x = df[features]
y = df[label]

Split the dataframe into 3 parts, train, cross validation and test datasets

In [8]:
x_train, x_, y_train, y_ = train_test_split(x, y, test_size=0.20, random_state=RANDOM_STATE, shuffle=True)
x_cv, x_test, y_cv, y_test = train_test_split(x, y, test_size=0.50, random_state=RANDOM_STATE, shuffle=True)
print("X_train.shape", x_train.shape, "y_train.shape", y_train.shape)
print("X_cv.shape", x_cv.shape, "y_cv.shape", y_cv.shape)
print("X_test.shape", x_test.shape, "y_test.shape", y_test.shape)

X_train.shape (6954, 9) y_train.shape (6954,)
X_cv.shape (4346, 9) y_cv.shape (4346,)
X_test.shape (4347, 9) y_test.shape (4347,)


## Model Building

XGBoost: 

Gonna try Gradient boosting decision trees first. 

In [9]:
xgb_model = XGBClassifier(n_estimators=280, learning_rate=0.1, verbosity=1, random_state=RANDOM_STATE, 
                            early_stopping_rounds=70, reg_lambda=25)
xgb_model.fit(x_train, y_train, eval_set=[(x_cv, y_cv)])

[0]	validation_0-logloss:0.67015
[1]	validation_0-logloss:0.65118
[2]	validation_0-logloss:0.63533
[3]	validation_0-logloss:0.62206
[4]	validation_0-logloss:0.61081
[5]	validation_0-logloss:0.60129
[6]	validation_0-logloss:0.59295
[7]	validation_0-logloss:0.58571
[8]	validation_0-logloss:0.57954
[9]	validation_0-logloss:0.57413
[10]	validation_0-logloss:0.56942
[11]	validation_0-logloss:0.56528
[12]	validation_0-logloss:0.56174
[13]	validation_0-logloss:0.55832
[14]	validation_0-logloss:0.55552
[15]	validation_0-logloss:0.55291
[16]	validation_0-logloss:0.55050
[17]	validation_0-logloss:0.54852
[18]	validation_0-logloss:0.54662
[19]	validation_0-logloss:0.54471
[20]	validation_0-logloss:0.54315
[21]	validation_0-logloss:0.54145
[22]	validation_0-logloss:0.54015
[23]	validation_0-logloss:0.53891
[24]	validation_0-logloss:0.53781
[25]	validation_0-logloss:0.53664
[26]	validation_0-logloss:0.53563
[27]	validation_0-logloss:0.53455
[28]	validation_0-logloss:0.53366
[29]	validation_0-loglos

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):


[129]	validation_0-logloss:0.50313
[130]	validation_0-logloss:0.50306
[131]	validation_0-logloss:0.50299
[132]	validation_0-logloss:0.50295
[133]	validation_0-logloss:0.50287
[134]	validation_0-logloss:0.50254
[135]	validation_0-logloss:0.50247
[136]	validation_0-logloss:0.50240
[137]	validation_0-logloss:0.50233
[138]	validation_0-logloss:0.50225
[139]	validation_0-logloss:0.50218
[140]	validation_0-logloss:0.50214
[141]	validation_0-logloss:0.50208
[142]	validation_0-logloss:0.50201
[143]	validation_0-logloss:0.50170
[144]	validation_0-logloss:0.50164
[145]	validation_0-logloss:0.50156
[146]	validation_0-logloss:0.50149
[147]	validation_0-logloss:0.50122
[148]	validation_0-logloss:0.50094
[149]	validation_0-logloss:0.50088
[150]	validation_0-logloss:0.50081
[151]	validation_0-logloss:0.50049
[152]	validation_0-logloss:0.50041
[153]	validation_0-logloss:0.50030
[154]	validation_0-logloss:0.50022
[155]	validation_0-logloss:0.50006
[156]	validation_0-logloss:0.49969
[157]	validation_0-l

In [10]:
print('Metrics')
print(f'Accuracy in the train dataset: {accuracy_score(xgb_model.predict(x_train), y_train)}')
print(f'Accuracy in the cross validation dataset: {accuracy_score(xgb_model.predict(x_cv), y_cv)}')
print(f'Accuracy in the test dataset: {accuracy_score(xgb_model.predict(x_test), y_test)}')
print(f'Confusion matrix of the test dataset:\n{confusion_matrix(y_test, xgb_model.predict(x_test))}')
print(f'Mean Squared Error: {mean_squared_error(y_test, xgb_model.predict(x_test))}')
print(f'F1 Score: {f1_score(y_test, xgb_model.predict(x_test))}')
print(f'Precision Score: {precision_score(y_test, xgb_model.predict(x_test))}')
print(f'Recall Score: {recall_score(y_test, xgb_model.predict(x_test))}')

Metrics
Accuracy in the train dataset: 0.7664653436870865
Accuracy in the cross validation dataset: 0.7676023930050622
Accuracy in the test dataset: 0.7508626639061422
Confusion matrix of the test dataset:
[[1774  362]
 [ 721 1490]]
Mean Squared Error: 0.24913733609385783
F1 Score: 0.7334481909918781
Precision Score: 0.8045356371490281
Recall Score: 0.6739032112166441


  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  

right now the model is overfitting 