# Introduction


1. Data Exploratory 
2. Encoding and Cleaning Data
3. Training Machine Learning Models
4. Predictions

# 1. Data Exploratory 

In [158]:
import numpy as np 
import pandas as pd 

In [159]:
train = pd.read_csv('/train.csv', index_col='PassengerId')
test = pd.read_csv('/test.csv', index_col='PassengerId')

# Shape and preview
print('Train set shape:', train.shape)
print('Test set shape:', test.shape)
train.head()

Train set shape: (8693, 13)
Test set shape: (4277, 12)


Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [160]:
test_df = pd.read_csv('/test.csv')

In [161]:
train.drop('Name', axis=1, inplace=True)
test.drop('Name', axis=1, inplace=True)

In [162]:
train['Transported'].replace(False, 0, inplace=True)
train['Transported'].replace(True, 1, inplace=True)

In [163]:
train[['deck','num', 'side']] = train['Cabin'].str.split('/', expand=True)
test[['deck','num', 'side']] = test['Cabin'].str.split('/', expand=True)

train.drop('Cabin', axis=1, inplace=True)
test.drop('Cabin', axis=1, inplace=True)

In [164]:
col_to_sum = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

train['SumSpends'] = train[col_to_sum].sum(axis=1)
test['SumSpends'] = test[col_to_sum].sum(axis=1)

In [165]:
train.head()

Unnamed: 0_level_0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,deck,num,side,SumSpends
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0001_01,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,0,B,0,P,0.0
0002_01,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,1,F,0,S,736.0
0003_01,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,0,A,0,S,10383.0
0003_02,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,0,A,0,S,5176.0
0004_01,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,1,F,1,S,1091.0


In [166]:
test.head()

Unnamed: 0_level_0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,deck,num,side,SumSpends
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0013_01,Earth,True,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,G,3,S,0.0
0018_01,Earth,False,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,F,4,S,2832.0
0019_01,Europa,True,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,C,0,S,0.0
0021_01,Europa,False,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,C,1,S,7418.0
0023_01,Earth,False,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,F,5,S,645.0


In [167]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8693 entries, 0001_01 to 9280_02
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    8492 non-null   object 
 1   CryoSleep     8476 non-null   object 
 2   Destination   8511 non-null   object 
 3   Age           8514 non-null   float64
 4   VIP           8490 non-null   object 
 5   RoomService   8512 non-null   float64
 6   FoodCourt     8510 non-null   float64
 7   ShoppingMall  8485 non-null   float64
 8   Spa           8510 non-null   float64
 9   VRDeck        8505 non-null   float64
 10  Transported   8693 non-null   int64  
 11  deck          8494 non-null   object 
 12  num           8494 non-null   object 
 13  side          8494 non-null   object 
 14  SumSpends     8693 non-null   float64
dtypes: float64(7), int64(1), object(7)
memory usage: 1.1+ MB


In [168]:
train.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,SumSpends
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0,8693.0,8693.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791,0.503624,1440.866329
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189,0.500016,2803.045694
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0,1.0,716.0
75%,38.0,47.0,76.0,27.0,59.0,46.0,1.0,1441.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0,1.0,35987.0


In [169]:
train.isna().sum()

HomePlanet      201
CryoSleep       217
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Transported       0
deck            199
num             199
side            199
SumSpends         0
dtype: int64

In [170]:
test.isna().sum()

HomePlanet       87
CryoSleep        93
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
deck            100
num             100
side            100
SumSpends         0
dtype: int64

In [171]:
null_cols = train.isnull().sum().sort_values(ascending=False)
null_cols = list(null_cols[null_cols>1].index)
null_cols

['CryoSleep',
 'ShoppingMall',
 'VIP',
 'HomePlanet',
 'deck',
 'num',
 'side',
 'VRDeck',
 'FoodCourt',
 'Spa',
 'Destination',
 'RoomService',
 'Age']

# 2. Encoding and Cleaning Data

In [172]:
object_cols = [col for col in train.columns if train[col].dtype == 'object' or train[col].dtype == 'category']
numeric_cols = [col for col in train.columns if train[col].dtype == 'float64']

print(f'Object cols -- {object_cols}')
print(f'Numeric cols -- {numeric_cols}')

Object cols -- ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'deck', 'num', 'side']
Numeric cols -- ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'SumSpends']


In [173]:
train[object_cols] = train[object_cols].astype('category')
test[object_cols] = test[object_cols].astype('category')

In [174]:
from sklearn.preprocessing import OrdinalEncoder

oc = OrdinalEncoder()

df_for_encode = pd.concat([train, test])

df_for_encode[object_cols] = df_for_encode[object_cols].astype('category')

df_for_encode[object_cols] = oc.fit_transform(df_for_encode[object_cols])

del train, test

train = df_for_encode.iloc[:8693, :]
test = df_for_encode.iloc[8693: , :]

del df_for_encode

test.drop('Transported', inplace=True, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [175]:
test.head(5)

Unnamed: 0_level_0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,deck,num,side,SumSpends
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0013_01,0.0,1.0,2.0,27.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,1117.0,1.0,0.0
0018_01,0.0,0.0,2.0,19.0,0.0,0.0,9.0,0.0,2823.0,0.0,5.0,1228.0,1.0,2832.0
0019_01,1.0,1.0,0.0,31.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0
0021_01,1.0,0.0,2.0,38.0,0.0,0.0,6652.0,0.0,181.0,585.0,2.0,1.0,1.0,7418.0
0023_01,0.0,0.0,2.0,20.0,0.0,10.0,0.0,635.0,0.0,0.0,5.0,1339.0,1.0,645.0


In [176]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer


ct = ColumnTransformer([("imp", SimpleImputer(strategy='mean'), null_cols)])
    
train[null_cols] = ct.fit_transform(train[null_cols])
test[null_cols] = ct.fit_transform(test[null_cols])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)


In [177]:
test.shape

(4277, 14)

# 3. Training machine learning models

In [178]:
X = train.copy()
y = X.pop('Transported')
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=101)
# from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=23)

In [179]:
!pip3 install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [180]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import HistGradientBoostingClassifier

from sklearn.model_selection import GridSearchCV

In [183]:
# XGBoost

xgb = XGBClassifier()
scores = cross_val_score(xgb,X_train,y_train,scoring='neg_mean_squared_error',cv=5)
xgb_mse = round(abs(scores.mean()), 4)
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_valid)
xgb_acc = round(metrics.accuracy_score(y_valid, y_pred), 4)
print("The accuracy score of XGBoost Classifier is",xgb_acc)

The accuracy score of XGBoost Classifier is 0.8014


In [184]:
# Histogram-based Gradient Boosting Classification Tree

hgb = HistGradientBoostingClassifier()
scores = cross_val_score(hgb,X_train,y_train,scoring='neg_mean_squared_error',cv=5)
hgb_mse = round(abs(scores.mean()), 4)
hgb.fit(X_train, y_train)
y_pred = hgb.predict(X_valid)
hgb_acc = round(metrics.accuracy_score(y_valid, y_pred), 4)
print("The accuracy score of HistGradientBoostingClassifier is",hgb_acc)

The accuracy score of HistGradientBoostingClassifier is 0.8156


In [185]:
# Decision Tree

decision_tree = DecisionTreeClassifier()
scores = cross_val_score(decision_tree,X_train,y_train,scoring='neg_mean_squared_error',cv=5)
decision_tree_mse = round(abs(scores.mean()), 4)
decision_tree.fit(X_train, y_train)
y_pred = decision_tree.predict(X_valid)
decision_tree_acc = round(metrics.accuracy_score(y_valid, y_pred), 4)
print("The accuracy score of Decision tree is",decision_tree_acc)

The accuracy score of Decision tree is 0.7477


In [186]:
# Catboost

catb = CatBoostClassifier(verbose = 0)
scores = cross_val_score(catb,X_train,y_train,scoring='neg_mean_squared_error',cv=5)
catb_mse = round(abs(scores.mean()), 4)
catb.fit(X_train, y_train)
y_pred = catb.predict(X_valid)
catb_acc = round(metrics.accuracy_score(y_valid, y_pred), 4)
print("The accuracy score of CatBoost Classifier is",catb_acc)

The accuracy score of CatBoost Classifier is 0.8179


In [187]:
# Random Forest

random_forest = RandomForestClassifier()
scores = cross_val_score(random_forest,X_train,y_train,scoring='neg_mean_squared_error',cv=5)
random_forest_mse = round(abs(scores.mean()), 4)
random_forest.fit(X_train, y_train)
y_pred = random_forest.predict(X_valid)
random_forest_acc = round(metrics.accuracy_score(y_valid, y_pred), 4)
print("The accuracy score of Random Forest Classifier is",random_forest_acc)

The accuracy score of Random Forest Classifier is 0.811


In [188]:
gb = GradientBoostingClassifier()
scores = cross_val_score(gb,X_train,y_train,scoring='neg_mean_squared_error',cv=5)
gb_mse = round(abs(scores.mean()), 4)
gb.fit(X_train, y_train)
y_pred = gb.predict(X_valid)
gb_acc = round(metrics.accuracy_score(y_valid, y_pred), 4)
print("The accuracy score of Gradient Boosting Classifier is",gb_acc)


The accuracy score of Gradient Boosting Classifier is 0.7998


In [189]:
# Tuned Histboost
hist =HistGradientBoostingClassifier(verbose=0)

In [190]:
test_df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [191]:
test.shape

(4277, 14)

In [192]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [193]:
test_df.shape

(4277, 13)

#### 4.Predictions

In [194]:
model=hist
model.fit(X,y)
predictions = model.predict(test)
output = pd.DataFrame({'PassengerId': test_df.PassengerId, 'Transported': predictions})
output['Transported'] = output['Transported'].astype('bool')
output.to_csv('submission.csv', index=False)
