In [125]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import xgboost as xgb
import eli5

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.utils import shuffle
from eli5.sklearn import PermutationImportance



In [126]:
# Load the raw train and test splits from the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv'))


In [127]:
# Print column dtypes and non-null counts, first for the train split and then for the test split.
for split_frame in (train, test):
    split_frame.info(verbose=True)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 13 columns):
 #   Column        Non-Null Count  

In [128]:
# Tag every row with its origin (1 = train, 0 = test) so the stacked frame
# can be split back into the two parts later.
train['set'] = 1
test['set'] = 0

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat stacks the two frames the same way on every pandas version.
# reset_index(drop=False) keeps each row's position within its source file
# in an 'index' column and gives the combined frame a fresh 0..n-1 index.
train_test = pd.concat([train, test]).reset_index(drop=False)

In [129]:
# Count the missing values in each column of the combined frame.
train_test.isna().sum()


index              0
PassengerId        0
HomePlanet       288
CryoSleep        310
Cabin            299
Destination      274
Age              270
VIP              296
RoomService      263
FoodCourt        289
ShoppingMall     306
Spa              284
VRDeck           268
Name             294
Transported     4277
set                0
dtype: int64

In [130]:
# Reconcile CryoSleep with the billing columns, in both directions.
non_sleeping_features = ['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']

# Rows flagged CryoSleep == True get every billing column set to 0.
# A missing flag compares as False here, so those rows are left untouched.
# (Boolean-mask assignment replaces a row-wise apply() whose lambda returned
# either a scalar or the whole row, which only worked by column alignment.)
is_asleep = train_test['CryoSleep'].eq(True)
train_test.loc[is_asleep, non_sleeping_features] = 0

# Total spend per passenger; sum() skips NaN, so a row whose bills are all
# missing totals 0.
train_test['Expenses'] = train_test[non_sleeping_features].sum(axis=1)

# Fill a missing CryoSleep flag from the total spend: True when nothing was
# spent, False otherwise. astype(object) keeps plain Python bools so the
# column's existing object dtype is not disturbed.
cryo_missing = train_test['CryoSleep'].isna()
train_test.loc[cryo_missing, 'CryoSleep'] = (
    train_test.loc[cryo_missing, 'Expenses'].eq(0).astype(object)
)

In [131]:
# Extract which group the passenger was from: the first four characters of
# PassengerId. The vectorized .str slice replaces a Python-level apply().
train_test['group'] = train_test['PassengerId'].str[:4]

# Dump every passenger whose group has more than one member, sorted by group,
# so the groups can be inspected by hand.
df = train_test[train_test.duplicated('group', keep=False)].sort_values('group')
df.to_csv('group_inspect.csv', index=False)

# Split Name on spaces once and reuse the result: the first token becomes
# FirstName, the second LastName.
name_parts = train_test['Name'].str.split(" ", expand=True)
train_test['FirstName'] = name_parts.iloc[:, 0]
train_test['LastName'] = name_parts.iloc[:, 1]

In [132]:
# Inspecting the group dump suggests that members of the same group come from
# the same planet and, with high probability, belong to the same family
# (i.e. same last name) and share the same cabin.

# For each attribute, take the first non-missing value seen in each group ...
group_Cabin     = train_test.loc[:,['group','Cabin']].dropna().drop_duplicates('group')
group_LastName = train_test.loc[:,['group','LastName']].dropna().drop_duplicates('group')
group_HomePlanet  = train_test.loc[:,['group','HomePlanet']].dropna().drop_duplicates('group')
# ... and attach it to every row of that group as a '<column>_y' helper column.
train_test      = pd.merge(train_test,group_Cabin,how="left",on='group',suffixes=('','_y'))
train_test      = pd.merge(train_test,group_LastName,how="left",on='group',suffixes=('','_y'))
train_test      = pd.merge(train_test,group_HomePlanet,how="left",on='group',suffixes=('','_y'))

# Fill in the missing values from the group-level helper columns.
# NOTE(review): each lambda returns either a scalar (the '_y' value) or the
# whole row `x`; the assignment then relies on pandas aligning the apply()
# result on column names. Confirm before simplifying this (e.g. to fillna):
# the hard-coded one-hot names used later in the notebook (such as
# 'Cabin_side_None') may depend on how the leftover missing values are
# represented after this step.
train_test.loc[:,['Cabin']]=train_test.apply(lambda x:  x.Cabin_y if pd.isna(x.Cabin) else x,axis=1)
train_test.loc[:,['HomePlanet']]=train_test.apply(lambda x:  x.HomePlanet_y if pd.isna(x.HomePlanet) else x,axis=1)
train_test.loc[:,['LastName']]=train_test.apply(lambda x:  x.LastName_y if pd.isna(x.LastName) else x,axis=1)

In [133]:

# Split Cabin on "/" once and reuse the three pieces (deck, number, side)
# instead of re-running the same split for every new column.
cabin_parts = train_test['Cabin'].str.split("/", expand=True)
train_test['Cabin_deck'] = cabin_parts.iloc[:, 0]

# Rows without a cabin number get the sentinel -1 so the column can be cast
# to int. Building the column in a single expression avoids the chained
# `df[col].fillna(..., inplace=True)` pattern, which is not guaranteed to
# write back to the frame under pandas copy-on-write.
train_test['Cabin_num'] = cabin_parts.iloc[:, 1].fillna(value=-1).astype(dtype='int')

train_test['Cabin_side'] = cabin_parts.iloc[:, 2]


In [134]:
# Rewrite the multi-member-group inspection file from the current state of
# the combined frame.
shares_group = train_test.duplicated('group', keep=False)
df = train_test.loc[shares_group].sort_values(by='group')
df.to_csv('group_inspect.csv', index=False)

In [135]:
# Column roles for the modelling frame.
num_cols = ['ShoppingMall','FoodCourt','RoomService','Spa','VRDeck','Expenses','Age']
cat_cols = ['CryoSleep','Cabin_deck', 'Cabin_side','VIP','HomePlanet','Destination']
transported=['Transported']
# Work on a copy restricted to the model inputs, the target and the train/test flag.
train_test_1 = train_test[num_cols+cat_cols+transported+['Cabin_num','set']].copy()

num_imp = SimpleImputer(strategy='mean')
cat_imp = SimpleImputer(strategy='most_frequent')
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name. Try the current spelling first and fall back
# to the legacy one so the cell runs on either side of the rename.
try:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
le = LabelEncoder()

# Filling other missing value unrelated to group
# NOTE(review): the imputed values are written into `train_test`, but the
# encoding below and every later cell read `train_test_1`, which is left
# un-imputed (hence the '*_nan' one-hot columns and blank Age values in the
# displayed frame). Left as-is because the later hard-coded drop_list names
# those columns -- confirm whether this is intended.
train_test[num_cols] = pd.DataFrame(num_imp.fit_transform(train_test_1[num_cols]),columns=num_cols)
train_test[cat_cols] = pd.DataFrame(cat_imp.fit_transform(train_test_1[cat_cols]),columns=cat_cols)

# One-hot encode the categorical columns and swap them in for the originals.
temp_train = pd.DataFrame(ohe.fit_transform(train_test_1[cat_cols]),columns= ohe.get_feature_names_out())
train_test_1 = train_test_1.drop(cat_cols,axis=1)
train_test_1 = pd.concat([train_test_1,temp_train],axis=1)




In [136]:
def get_score(model, X, y, cv=20, scoring='accuracy'):
    """Cross-validate ``model`` on ``(X, y)`` and return the per-fold scores.

    Parameters
    ----------
    model : estimator handed to ``cross_val_score``.
    X, y : features and target handed to ``cross_val_score``.
    cv : cross-validation setting forwarded as ``cv``; defaults to 20, the
        value this notebook has always used.
    scoring : metric forwarded as ``scoring``; defaults to ``'accuracy'``.

    Returns
    -------
    Whatever ``cross_val_score`` returns (the callers in this notebook take
    ``.mean()`` of it).
    """
    return cross_val_score(model, X, y, scoring=scoring, cv=cv)

In [137]:
# Display the assembled modelling frame: numeric columns, target, Cabin_num,
# the train/test flag and the one-hot columns.
train_test_1

Unnamed: 0,ShoppingMall,FoodCourt,RoomService,Spa,VRDeck,Expenses,Age,Transported,Cabin_num,set,...,VIP_True,VIP_nan,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,HomePlanet_nan,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Destination_nan
0,0.0,0.0,0.0,0.0,0.0,0.0,39.0,False,0,1,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,25.0,9.0,109.0,549.0,44.0,736.0,24.0,True,0,1,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,3576.0,43.0,6715.0,49.0,10383.0,58.0,False,0,1,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,371.0,1283.0,0.0,3329.0,193.0,5176.0,33.0,False,0,1,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,151.0,70.0,303.0,565.0,2.0,1091.0,16.0,True,1,1,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12965,0.0,0.0,0.0,0.0,0.0,0.0,34.0,,1496,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
12966,17.0,847.0,0.0,10.0,144.0,1018.0,42.0,,-1,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
12967,0.0,0.0,0.0,0.0,0.0,0.0,,,296,0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
12968,0.0,2680.0,0.0,0.0,523.0,3203.0,,,297,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [138]:
# Split the encoded frame back into its train and test parts using the 'set' flag.
from_train = train_test['set'] == 1
from_test = train_test['set'] == 0

train = train_test_1[from_train].copy()
train['Transported'] = train['Transported'].astype('int')  # target as 0/1 integers
test = train_test_1[from_test].drop(columns="Transported")

In [139]:
# Features: everything except the train/test flag and the target.
X = train.drop(columns=['set', 'Transported'])
# Target: the 0/1 Transported column.
y = train['Transported']

In [140]:
# Shuffle the rows of X and y together before cross-validation.
# A fixed random_state makes the shuffle -- and therefore the scores computed
# in the following cells -- reproducible across kernel restarts.
X, y = shuffle(X, y, random_state=1)
# Drop the shuffled original index so both objects are numbered 0..n-1 again.
X = X.reset_index(drop=True)
y = y.reset_index(drop=True)

In [141]:
# Hyper-parameters passed to every XGBClassifier built in this notebook.
params_XGB_best = {
    'lambda': 3.0610042624477543,
    'alpha': 4.581902571574289,
    'colsample_bytree': 0.9241969052729379,
    'subsample': 0.9527591724824661,
    'learning_rate': 0.06672065863100594,
    'n_estimators': 800,
    'max_depth': 5,
    'min_child_weight': 1,
    'num_parallel_tree': 1,
}
# Mean cross-validated score of the tuned XGBoost model on the current feature set.
baseline_scores = get_score(xgb.XGBClassifier(**params_XGB_best), X, y)
print(baseline_scores.mean())

0.8101848614863076


In [149]:
# Fit eli5's PermutationImportance around a fresh XGBoost model
# (n_iter=10, cv=5, fixed random_state) and show the top 50 feature weights.
importance_estimator = PermutationImportance(
    xgb.XGBClassifier(**params_XGB_best),
    random_state=1,
    n_iter=10,
    cv=5,
)
perm = importance_estimator.fit(X, y)
eli5.show_weights(perm, feature_names=X.columns.tolist(), top=50)

Weight,Feature
0.0384  ± 0.0158,Spa
0.0343  ± 0.0138,VRDeck
0.0306  ± 0.0130,FoodCourt
0.0283  ± 0.0131,CryoSleep_False
0.0221  ± 0.0134,RoomService
0.0212  ± 0.0127,Expenses
0.0211  ± 0.0141,Cabin_num
0.0081  ± 0.0116,ShoppingMall
0.0065  ± 0.0154,Age
0.0061  ± 0.0056,Cabin_deck_E


In [152]:
# Feature columns removed from X and test before the final fit.
drop_list = [
    'ShoppingMall',
    'Age',
    'Cabin_deck_E',
    'Destination_TRAPPIST-1e',
    'Cabin_side_S',
    'Cabin_side_P',
    'Cabin_deck_C',
    'Cabin_deck_F',
    'HomePlanet_Mars',
    'HomePlanet_Earth',
    'HomePlanet_Europa',
    'CryoSleep_True',
    'Cabin_deck_G',
    'Cabin_deck_D',
    'Cabin_deck_A',
    'VIP_False',
    'Cabin_deck_B',
    'VIP_nan',
    'Cabin_deck_T',
    'Cabin_deck_nan',
    'Cabin_side_None',
    'VIP_True',
    'Destination_nan',
    'Destination_55 Cancri e',
    'HomePlanet_nan',
    'Destination_PSO J318.5-22',
]

In [153]:
# Remove the listed columns from both the training features and the test frame.
X = X.drop(columns=drop_list)
test = test.drop(columns=drop_list)

In [145]:
# Mean cross-validated score of the tuned XGBoost model on the current X.
current_scores = get_score(xgb.XGBClassifier(**params_XGB_best), X, y)
print(current_scores.mean())

In [146]:
# Display the test-side feature frame that will be fed to the model.
test

Unnamed: 0,ShoppingMall,FoodCourt,RoomService,Spa,VRDeck,Expenses,Age,Cabin_num,set,CryoSleep_False,...,VIP_True,VIP_nan,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,HomePlanet_nan,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Destination_nan
8693,0.0,0.0,0.0,0.0,0.0,0.0,27.0,3,0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8694,0.0,9.0,0.0,2823.0,0.0,2832.0,19.0,4,0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8695,0.0,0.0,0.0,0.0,0.0,0.0,31.0,0,0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
8696,0.0,6652.0,0.0,181.0,585.0,7418.0,38.0,1,0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
8697,635.0,0.0,10.0,0.0,0.0,645.0,20.0,5,0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12965,0.0,0.0,0.0,0.0,0.0,0.0,34.0,1496,0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
12966,17.0,847.0,0.0,10.0,144.0,1018.0,42.0,-1,0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
12967,0.0,0.0,0.0,0.0,0.0,0.0,,296,0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
12968,0.0,2680.0,0.0,0.0,523.0,3203.0,,297,0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [148]:
# The train/test flag is not a feature; drop it before predicting.
test = test.drop(columns='set')

# Fit the tuned model on the training data and predict the test rows.
final_model = xgb.XGBClassifier(**params_XGB_best).fit(X, y)
pred_XGB_best = final_model.predict(test)

# Write the predictions, as booleans, into the sample submission layout.
sample = pd.read_csv('./sample_submission.csv')
sample['Transported'] = pred_XGB_best > 0.5
sample.to_csv('311605002.csv', index=False)