In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

In [2]:
df = pd.read_csv("train.csv")
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [4]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,8514.0,28.82793,14.489021,0.0,19.0,27.0,38.0,79.0
RoomService,8512.0,224.687617,666.717663,0.0,0.0,0.0,47.0,14327.0
FoodCourt,8510.0,458.077203,1611.48924,0.0,0.0,0.0,76.0,29813.0
ShoppingMall,8485.0,173.729169,604.696458,0.0,0.0,0.0,27.0,23492.0
Spa,8510.0,311.138778,1136.705535,0.0,0.0,0.0,59.0,22408.0
VRDeck,8505.0,304.854791,1145.717189,0.0,0.0,0.0,46.0,24133.0


In [5]:
df.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [6]:
def data_processing(df):
    Expenses_columns = ['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']
    df["Expenses"] = df[Expenses_columns].sum(axis=1)
    df["Cabin"].fillna("Missing_Value",inplace=True)
    df[['Deck', 'Number', 'Side']] = [j for j in (i.split("/") if i!="Missing_Value" else ["Missing","Missing","Missing"] for i in df["Cabin"])] 
    df["Group"] = [i[:4] for i in df["PassengerId"]]
    df["Person"] = [i[6:] for i in df["PassengerId"]]
    df["CryoSleep"].fillna("Missing",inplace=True)
    df["HomePlanet"].fillna("Missing",inplace=True)
    df["Destination"].fillna("Missing",inplace=True)
    df["Age"].fillna(df["Age"].mean(),inplace=True)
    df["RoomService"].fillna(df["RoomService"].mean(),inplace=True)
    df["FoodCourt"].fillna(df["FoodCourt"].mean(),inplace=True)
    df["ShoppingMall"].fillna(df["ShoppingMall"].mean(),inplace=True)
    df["Spa"].fillna(df["Spa"].mean(),inplace=True)
    df["VRDeck"].fillna(df["VRDeck"].mean(),inplace=True)
    df = pd.get_dummies(data=df,columns=["HomePlanet","Destination","Deck"],drop_first=True)
    df.drop(columns=["Name","PassengerId","Cabin","VIP","CryoSleep","Number"],axis=1,inplace=True)
    df["Side"] = LabelEncoder().fit_transform(df["Side"])
    df = df.astype("float")

    return df   

In [7]:
df["Transported"] = LabelEncoder().fit_transform(df["Transported"])

In [8]:
df = data_processing(df=df)

In [9]:
df.head()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Expenses,Side,Group,...,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_Missing,Deck_T
0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,24.0,109.0,9.0,25.0,549.0,44.0,1.0,736.0,2.0,2.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,58.0,43.0,3576.0,0.0,6715.0,49.0,0.0,10383.0,2.0,3.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,33.0,0.0,1283.0,371.0,3329.0,193.0,0.0,5176.0,2.0,3.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,16.0,303.0,70.0,151.0,565.0,2.0,1.0,1091.0,2.0,4.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [10]:
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 25 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Age                        8693 non-null   float64
 1   RoomService                8693 non-null   float64
 2   FoodCourt                  8693 non-null   float64
 3   ShoppingMall               8693 non-null   float64
 4   Spa                        8693 non-null   float64
 5   VRDeck                     8693 non-null   float64
 6   Transported                8693 non-null   float64
 7   Expenses                   8693 non-null   float64
 8   Side                       8693 non-null   float64
 9   Group                      8693 non-null   float64
 10  Person                     8693 non-null   float64
 11  HomePlanet_Europa          8693 non-null   float64
 12  HomePlanet_Mars            8693 non-null   float64
 13  HomePlanet_Missing         8693 non-null   float

In [12]:
X = df.drop(columns="Transported",axis=1)
Y = df["Transported"]

In [13]:
X

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Expenses,Side,Group,Person,...,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_Missing,Deck_T
0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,24.0,109.0,9.0,25.0,549.0,44.0,736.0,2.0,2.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,58.0,43.0,3576.0,0.0,6715.0,49.0,10383.0,2.0,3.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,33.0,0.0,1283.0,371.0,3329.0,193.0,5176.0,2.0,3.0,2.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,16.0,303.0,70.0,151.0,565.0,2.0,1091.0,2.0,4.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,41.0,0.0,6819.0,0.0,1643.0,74.0,8536.0,1.0,9276.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8689,18.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,9278.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
8690,26.0,0.0,0.0,1872.0,1.0,0.0,1873.0,2.0,9279.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
8691,32.0,0.0,1049.0,0.0,353.0,3235.0,4637.0,2.0,9280.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [14]:
Y

0       0.0
1       1.0
2       0.0
3       0.0
4       1.0
       ... 
8688    0.0
8689    0.0
8690    1.0
8691    0.0
8692    1.0
Name: Transported, Length: 8693, dtype: float64

In [15]:
x_train ,x_test ,y_train ,y_test = train_test_split(X,Y,train_size=0.9)
print(x_train.shape,x_test.shape,y_train.shape,y_test.shape,sep="\n")

(7823, 24)
(870, 24)
(7823,)
(870,)


In [16]:
min_max_scaler = StandardScaler()
Scaled_features = ["Age","RoomService","FoodCourt","ShoppingMall","Spa","VRDeck","Expenses"]

In [17]:
x_train[Scaled_features] = min_max_scaler.fit_transform(x_train[Scaled_features])

In [18]:
x_test[Scaled_features] = min_max_scaler.transform(x_test[Scaled_features])

In [19]:
log_reg = LogisticRegression()
dtc_model = DecisionTreeClassifier()
rfc_model = RandomForestClassifier()
xgb_model = XGBClassifier()

In [20]:
logistic_score = cross_val_score(log_reg,X,Y,cv=5).mean()
dtc_score = cross_val_score(dtc_model,X,Y,cv=5).mean()
rfc_score = cross_val_score(rfc_model,X,Y,cv=5).mean()
xgb_score = cross_val_score(xgb_model,X,Y,cv=5).mean()
print(f"Acc of logistig : {logistic_score}",f"Acc of dtc : {dtc_score}",f"Acc of rfc : {rfc_score}",f"Acc of xgb : {xgb_score}",sep="\n")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Acc of logistig : 0.77004825994861
Acc of dtc : 0.6592823144129365
Acc of rfc : 0.7692439274717755
Acc of xgb : 0.6251081431797834


In [21]:
xgb_model.fit(x_train,y_train)

In [22]:
y_pred = xgb_model.predict(x_test)

In [23]:
xgb_model.score(x_test,y_test)

0.8149425287356322

In [24]:
xgb_model.score(x_train,y_train)

0.9293110060079254

In [25]:
xgb_model.get_params()

{'objective': 'binary:logistic',
 'use_label_encoder': None,
 'base_score': 0.5,
 'booster': 'gbtree',
 'callbacks': None,
 'colsample_bylevel': 1,
 'colsample_bynode': 1,
 'colsample_bytree': 1,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'gamma': 0,
 'gpu_id': -1,
 'grow_policy': 'depthwise',
 'importance_type': None,
 'interaction_constraints': '',
 'learning_rate': 0.300000012,
 'max_bin': 256,
 'max_cat_threshold': 64,
 'max_cat_to_onehot': 4,
 'max_delta_step': 0,
 'max_depth': 6,
 'max_leaves': 0,
 'min_child_weight': 1,
 'missing': nan,
 'monotone_constraints': '()',
 'n_estimators': 100,
 'n_jobs': 0,
 'num_parallel_tree': 1,
 'predictor': 'auto',
 'random_state': 0,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'sampling_method': 'uniform',
 'scale_pos_weight': 1,
 'subsample': 1,
 'tree_method': 'exact',
 'validate_parameters': 1,
 'verbosity': None}

In [26]:
parameters = {
    'max_depth': range (2, 10, 1),
    'n_estimators': range(60, 220, 40),
    'learning_rate': [0.1, 0.01, 0.05]
}

In [27]:
xgb_grid = GridSearchCV(estimator=xgb_model,
                        param_grid=parameters,
                        scoring="roc_auc",
                        n_jobs=5,
                        cv=5,
                        verbose=2)

In [28]:
xgb_grid.fit(x_train,y_train)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


In [29]:
xgb_grid.best_params_

{'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 180}

In [30]:
xgb_grid.score(x_test,y_test)

0.9096996152707901

In [31]:
xgb_grid.score(x_train,y_train)

0.9351662838909465

In [32]:
grid_pred = xgb_grid.predict(x_test)

In [33]:
new_data = pd.DataFrame({"y_true":y_test,
                         "pred":grid_pred},dtype=float)
new_data

Unnamed: 0,y_true,pred
2777,1.0,1.0
7151,0.0,0.0
1045,0.0,0.0
7385,0.0,0.0
6296,1.0,1.0
...,...,...
3654,1.0,1.0
6001,1.0,1.0
1943,0.0,0.0
3144,1.0,1.0


In [34]:
new_data["bool_deger"] = [True if i==j else False for i,j in zip(new_data.y_true,new_data.pred)]

In [35]:
new_data

Unnamed: 0,y_true,pred,bool_deger
2777,1.0,1.0,True
7151,0.0,0.0,True
1045,0.0,0.0,True
7385,0.0,0.0,True
6296,1.0,1.0,True
...,...,...,...
3654,1.0,1.0,True
6001,1.0,1.0,True
1943,0.0,0.0,True
3144,1.0,1.0,True


In [36]:
new_data["bool_deger"].value_counts()

True     718
False    152
Name: bool_deger, dtype: int64

In [37]:
df_test = pd.read_csv("test.csv")
df_test

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,Earth,True,G/1496/S,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,0.0,0.0,Jeron Peter
4273,9269_01,Earth,False,,TRAPPIST-1e,42.0,False,0.0,847.0,17.0,10.0,144.0,Matty Scheron
4274,9271_01,Mars,True,D/296/P,55 Cancri e,,False,0.0,0.0,0.0,0.0,0.0,Jayrin Pore
4275,9273_01,Europa,False,D/297/P,,,False,0.0,2680.0,0.0,0.0,523.0,Kitakan Conale


In [38]:
df_test.isnull().sum()

PassengerId       0
HomePlanet       87
CryoSleep        93
Cabin           100
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
Name             94
dtype: int64

In [39]:
idx = df_test["PassengerId"]

In [40]:
df_test = data_processing(df=df_test)

In [41]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 24 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Age                        4277 non-null   float64
 1   RoomService                4277 non-null   float64
 2   FoodCourt                  4277 non-null   float64
 3   ShoppingMall               4277 non-null   float64
 4   Spa                        4277 non-null   float64
 5   VRDeck                     4277 non-null   float64
 6   Expenses                   4277 non-null   float64
 7   Side                       4277 non-null   float64
 8   Group                      4277 non-null   float64
 9   Person                     4277 non-null   float64
 10  HomePlanet_Europa          4277 non-null   float64
 11  HomePlanet_Mars            4277 non-null   float64
 12  HomePlanet_Missing         4277 non-null   float64
 13  Destination_Missing        4277 non-null   float

In [42]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 24 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Age                        4277 non-null   float64
 1   RoomService                4277 non-null   float64
 2   FoodCourt                  4277 non-null   float64
 3   ShoppingMall               4277 non-null   float64
 4   Spa                        4277 non-null   float64
 5   VRDeck                     4277 non-null   float64
 6   Expenses                   4277 non-null   float64
 7   Side                       4277 non-null   float64
 8   Group                      4277 non-null   float64
 9   Person                     4277 non-null   float64
 10  HomePlanet_Europa          4277 non-null   float64
 11  HomePlanet_Mars            4277 non-null   float64
 12  HomePlanet_Missing         4277 non-null   float64
 13  Destination_Missing        4277 non-null   float

In [43]:
df_test.isna().sum()

Age                          0
RoomService                  0
FoodCourt                    0
ShoppingMall                 0
Spa                          0
VRDeck                       0
Expenses                     0
Side                         0
Group                        0
Person                       0
HomePlanet_Europa            0
HomePlanet_Mars              0
HomePlanet_Missing           0
Destination_Missing          0
Destination_PSO J318.5-22    0
Destination_TRAPPIST-1e      0
Deck_B                       0
Deck_C                       0
Deck_D                       0
Deck_E                       0
Deck_F                       0
Deck_G                       0
Deck_Missing                 0
Deck_T                       0
dtype: int64

In [44]:
df_test[Scaled_features] = min_max_scaler.transform(df_test[Scaled_features])
df_test.head()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Expenses,Side,Group,Person,...,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_Missing,Deck_T
0,-0.124707,-0.336913,-0.28589,-0.295791,-0.276034,-0.263727,-0.513931,2.0,13.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,-0.682835,-0.336913,-0.280306,-0.295791,2.247802,-0.263727,0.506194,2.0,18.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.154357,-0.336913,-0.28589,-0.295791,-0.276034,-0.263727,-0.513931,2.0,19.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.642719,-0.336913,3.841173,-0.295791,-0.114215,0.269249,2.158133,2.0,21.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.613069,-0.321964,-0.28589,0.783926,-0.276034,-0.263727,-0.281593,2.0,23.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [45]:
test_pred = xgb_grid.predict(df_test)

In [46]:
pred_df = pd.DataFrame({"Transported":test_pred},index=idx)
pred_df["Transported"] = [True if i==1 else False for i in pred_df["Transported"]]
pred_df

Unnamed: 0_level_0,Transported
PassengerId,Unnamed: 1_level_1
0013_01,False
0018_01,False
0019_01,True
0021_01,True
0023_01,False
...,...
9266_02,True
9269_01,True
9271_01,True
9273_01,True
