In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import sklearn

from tqdm import tqdm

from imblearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer

from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer,  KNNImputer

from sklearn.preprocessing import (MinMaxScaler,
                                   StandardScaler,
                                   RobustScaler,
                                   Normalizer,
                                   PolynomialFeatures,
                                   LabelEncoder,
                                   OneHotEncoder,
                                   OrdinalEncoder)

from sklearn.model_selection import (cross_val_score,
                                     KFold,
                                     StratifiedShuffleSplit,
                                     GridSearchCV,
                                     StratifiedKFold,
                                     cross_validate,
                                     train_test_split)


from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier

from sklearn.metrics import (precision_score,
                             accuracy_score,
                             confusion_matrix,
                             ConfusionMatrixDisplay,
                             PrecisionRecallDisplay)

from IPython.display import display
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the training set; the first CSV column becomes the row index.
train = pd.read_csv("train.csv", index_col=0)

train.head()

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [None]:
# Column dtypes and non-null counts for the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8693 entries, 0001_01 to 9280_02
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    8492 non-null   object 
 1   CryoSleep     8476 non-null   object 
 2   Cabin         8494 non-null   object 
 3   Destination   8511 non-null   object 
 4   Age           8514 non-null   float64
 5   VIP           8490 non-null   object 
 6   RoomService   8512 non-null   float64
 7   FoodCourt     8510 non-null   float64
 8   ShoppingMall  8485 non-null   float64
 9   Spa           8510 non-null   float64
 10  VRDeck        8505 non-null   float64
 11  Name          8493 non-null   object 
 12  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(6)
memory usage: 891.4+ KB


In [None]:
# Number of missing entries in each column.
train.isna().sum()

HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [None]:
# Preview of the rows that contain no missing values. The result is not
# assigned, so `train` itself is left unchanged by this cell.
train.dropna()

Unnamed: 0_level_0,HomePlanet,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0001_01,Europa,False,39.0,False,0.0,0.0,0.0,0.0,0.0,False
0002_01,Earth,False,24.0,False,109.0,9.0,25.0,549.0,44.0,True
0003_01,Europa,False,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False
0003_02,Europa,False,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False
0004_01,Earth,False,16.0,False,303.0,70.0,151.0,565.0,2.0,True
...,...,...,...,...,...,...,...,...,...,...
9276_01,Europa,False,41.0,True,0.0,6819.0,0.0,1643.0,74.0,False
9278_01,Earth,True,18.0,False,0.0,0.0,0.0,0.0,0.0,False
9279_01,Earth,False,26.0,False,0.0,0.0,1872.0,1.0,0.0,True
9280_01,Europa,False,32.0,False,0.0,1049.0,0.0,353.0,3235.0,False


In [None]:
# Columns excluded from the model features.
drop_cols = ['Cabin', 'Destination', 'Name']

# Rebind instead of mutating in place, and ignore labels that are already
# gone, so the cell can be re-run without raising KeyError.
train = train.drop(columns=drop_cols, errors='ignore')

In [None]:
# First five rows of the training frame after the column drop.
train.head(5)

Unnamed: 0_level_0,HomePlanet,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0001_01,Europa,False,39.0,False,0.0,0.0,0.0,0.0,0.0,False
0002_01,Earth,False,24.0,False,109.0,9.0,25.0,549.0,44.0,True
0003_01,Europa,False,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False
0003_02,Europa,False,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False
0004_01,Earth,False,16.0,False,303.0,70.0,151.0,565.0,2.0,True


In [None]:
# Distinct HomePlanet values, missing marker included.
train['HomePlanet'].unique()

array(['Europa', 'Earth', 'Mars', nan], dtype=object)

In [None]:
# One-hot encode the HomePlanet column on its own as a demonstration.
# Fit and transform in a single pass: previously the fit_transform result was
# thrown away and the same data was transformed a second time.
ohe = OneHotEncoder()
train_ohe = ohe.fit_transform(train[['HomePlanet']]).toarray()

In [None]:
# Inspect every attribute stored on the fitted encoder instance.
vars(ohe)

{'categories': 'auto',
 'sparse': 'deprecated',
 'sparse_output': True,
 'dtype': numpy.float64,
 'handle_unknown': 'error',
 'drop': None,
 'min_frequency': None,
 'max_categories': None,
 '_infrequent_enabled': False,
 'n_features_in_': 1,
 'feature_names_in_': array(['HomePlanet'], dtype=object),
 'categories_': [array(['Earth', 'Europa', 'Mars', nan], dtype=object)],
 '_drop_idx_after_grouping': None,
 'drop_idx_': None,
 '_n_features_outs': [4]}

In [None]:
# `categories_` is a list holding one array per encoded feature. Passing the
# list itself makes pandas build a one-level MultiIndex for the columns, so
# select the single HomePlanet array to get plain column labels.
pd.DataFrame(train_ohe, columns=ohe.categories_[0])

Unnamed: 0,Earth,Europa,Mars,NaN
0,0.0,1.0,0.0,0.0
1,1.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0
3,0.0,1.0,0.0,0.0
4,1.0,0.0,0.0,0.0
...,...,...,...,...
8688,0.0,1.0,0.0,0.0
8689,1.0,0.0,0.0,0.0
8690,1.0,0.0,0.0,0.0
8691,0.0,1.0,0.0,0.0


In [None]:
# Separate the target column from the feature columns.
y = train['Transported']
x = train.drop(columns='Transported')

In [None]:
# First five rows of the feature frame.
x.head(5)

Unnamed: 0_level_0,HomePlanet,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0001_01,Europa,False,39.0,False,0.0,0.0,0.0,0.0,0.0
0002_01,Earth,False,24.0,False,109.0,9.0,25.0,549.0,44.0
0003_01,Europa,False,58.0,True,43.0,3576.0,0.0,6715.0,49.0
0003_02,Europa,False,33.0,False,0.0,1283.0,371.0,3329.0,193.0
0004_01,Earth,False,16.0,False,303.0,70.0,151.0,565.0,2.0


In [None]:
# Categorical features are listed by hand; every other column of `x` is
# treated as numeric.
cat_cols = ['HomePlanet', 'CryoSleep', 'VIP']
num_cols = x.drop(columns=cat_cols).columns.to_list()

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown='ignore' lets transform() encode a category that was
# not seen during fit as all zeros instead of raising.
cat_pipe = Pipeline([
    ('imputer_cat', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Numeric branch: fill gaps with the column mean, then standardise.
# NOTE: the step is still called 'imputer_cat' although it imputes numeric
# columns; the name is kept so existing parameter paths keep working.
num_pipe = Pipeline([
    ('imputer_cat', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Route each column group through its own branch; any column not listed in
# either group is passed through untouched.
CT = ColumnTransformer(
    [
        ('cat_pipe', cat_pipe, cat_cols),
        ('num_pipe', num_pipe, num_cols),
    ],
    remainder='passthrough',
)

# Full model: preprocessing followed by a random forest classifier.
pipe = Pipeline([
    ('preprocessing', CT),
    ('classifier', RandomForestClassifier(random_state=42)),
])

In [None]:
# Hyper-parameter candidates for the random forest. The estimator itself is
# listed as a grid entry alongside its tuned settings.
forest_candidates = {
    'classifier': [RandomForestClassifier(random_state=42)],
    'classifier__n_estimators': [30, 50],
    'classifier__max_depth': [3, 5],
}
param_grid = [forest_candidates]

# Three shuffled, stratified folds with a fixed seed.
cross_val = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Exhaustive search over the grid, scored by ROC AUC on all available cores.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=cross_val,
    n_jobs=-1,
    return_train_score=True,
)

grid.fit(x, y)

In [None]:
cv_table = pd.DataFrame(grid.cv_results_)


def _candidate_label(estimator, params):
    """Return a readable label for one grid-search candidate.

    The label is the estimator's class name followed by the tuned
    ``classifier__*`` settings taken from that candidate's ``params`` dict.
    """
    tuned = {
        key.split('__', 1)[1]: value
        for key, value in params.items()
        if key.startswith('classifier__')
    }
    settings = ', '.join(f'{name}={value!r}' for name, value in tuned.items())
    return f'{type(estimator).__name__}({settings})'


# Indexing by the estimator objects themselves gave every candidate the same
# repr in the recorded output, mislabelling the max_depth=3 rows. Build one
# label per candidate from its own parameters instead, and avoid inplace so
# the cell stays re-runnable.
candidate_labels = [
    _candidate_label(estimator, params)
    for estimator, params in zip(cv_table['param_classifier'], cv_table['params'])
]
result = (
    cv_table
    .assign(param_classifier=candidate_labels)
    .set_index('param_classifier')
)

# Best-ranked candidate first, transposed so each candidate is a column.
result.sort_values('rank_test_score').T

param_classifier,"RandomForestClassifier(max_depth=5, n_estimators=50, random_state=42)","RandomForestClassifier(max_depth=5, n_estimators=50, random_state=42).1","RandomForestClassifier(max_depth=5, n_estimators=50, random_state=42).2","RandomForestClassifier(max_depth=5, n_estimators=50, random_state=42).3"
mean_fit_time,0.270462,0.674492,0.890446,0.379193
std_fit_time,0.044266,0.276877,0.07203,0.122646
mean_score_time,0.032283,0.106963,0.168514,0.065534
std_score_time,0.005979,0.109179,0.082133,0.025044
param_classifier__max_depth,5,5,3,3
param_classifier__n_estimators,50,30,50,30
params,{'classifier': RandomForestClassifier(max_dept...,{'classifier': RandomForestClassifier(max_dept...,{'classifier': RandomForestClassifier(max_dept...,{'classifier': RandomForestClassifier(max_dept...
split0_test_score,0.860283,0.860195,0.849965,0.846494
split1_test_score,0.855126,0.855496,0.845466,0.842168
split2_test_score,0.855094,0.854578,0.843531,0.84317


In [None]:
# Ranking view built straight from the raw CV results, best candidate first,
# transposed so each candidate is a column.
ranked = pd.DataFrame(grid.cv_results_).sort_values('rank_test_score')
ranked.T

Unnamed: 0,0,1,2,3
mean_fit_time,0.356849,0.563395,0.52144,0.62852
std_fit_time,0.052606,0.055539,0.11478,0.095021
mean_score_time,0.034792,0.04014,0.028509,0.03553
std_score_time,0.010388,0.011901,0.007577,0.009012
param_classifier__max_depth,3,3,5,5
param_classifier__n_estimators,30,50,30,50
params,"{'classifier__max_depth': 3, 'classifier__n_es...","{'classifier__max_depth': 3, 'classifier__n_es...","{'classifier__max_depth': 5, 'classifier__n_es...","{'classifier__max_depth': 5, 'classifier__n_es..."
split0_test_score,0.87981,0.880254,0.878826,0.877022
split1_test_score,0.868175,0.866962,0.864463,0.861437
split2_test_score,0.871608,0.872257,0.870911,0.868819


In [None]:
# GridSearchCV fits clones of `pipe`, so `pipe` itself is never fitted and its
# ColumnTransformer has no `transformers_` attribute. Read the first fitted
# transformer from the refit best estimator instead.
grid.best_estimator_.named_steps['preprocessing'].transformers_[0]

('encoder', OneHotEncoder(), ['HomePlanet', 'CryoSleep', 'VIP'])