In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.impute import SimpleImputer
from sklearn.utils import shuffle
import eli5
from eli5.sklearn import PermutationImportance
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, recall_score,roc_auc_score
import optuna
from xgboost import XGBClassifier

In [2]:
test = pd.read_csv('test.csv')
sample = pd.read_csv('sample_submission.csv')
train = pd.read_csv('train.csv')

In [3]:
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [5]:
train.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [6]:
train.isnull().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [None]:
combine=train.append(test)

In [8]:
px.box(combine,x="FoodCourt",template="plotly_dark")

In [9]:
Expenses_columns = ['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']

In [10]:
# if passenger was in CryoSleep, expenses must be 0

combine.loc[:,Expenses_columns]=combine.apply(lambda x: 0 if x.CryoSleep == True else x,axis =1)

In [11]:
combine['Expenses'] = combine.loc[:,Expenses_columns].sum(axis=1)
combine.loc[:,['CryoSleep']]=combine.apply(lambda x: True if x.Expenses == 0 and pd.isna(x.CryoSleep) else x,axis =1)

In [12]:
combine[combine.Cabin.isnull()]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Expenses
15,0012_01,Earth,False,,TRAPPIST-1e,31.0,False,32.0,0.0,876.0,0.0,0.0,Justie Pooles,False,908.0
93,0101_01,Mars,True,,TRAPPIST-1e,31.0,False,0.0,0.0,0.0,0.0,0.0,Book Trad,True,0.0
103,0110_01,Europa,False,,TRAPPIST-1e,32.0,False,0.0,410.0,6.0,3929.0,764.0,Graviph Aloubtled,False,5109.0
222,0239_01,Mars,False,,TRAPPIST-1e,37.0,False,637.0,0.0,0.0,92.0,319.0,Diedow Resty,False,1048.0
227,0244_01,Mars,True,,TRAPPIST-1e,43.0,False,0.0,0.0,0.0,0.0,0.0,Froos Sad,True,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4209,9138_01,Europa,,,TRAPPIST-1e,41.0,False,0.0,1998.0,0.0,1023.0,867.0,Misamak Trupistic,,3888.0
4248,9223_01,Mars,True,,TRAPPIST-1e,24.0,False,0.0,0.0,0.0,0.0,0.0,Weessh Sun,,0.0
4249,9223_02,Mars,True,,TRAPPIST-1e,17.0,False,0.0,0.0,0.0,0.0,0.0,Perit Sun,,0.0
4258,9238_05,Earth,True,,TRAPPIST-1e,14.0,False,0.0,0.0,0.0,0.0,0.0,Caseye Emenez,,0.0


In [13]:
surnames=combine.Name.map(lambda x: "".join(x.split(' ')[-1]) if pd.notnull(x) else np.nan)
surnames.value_counts()[surnames.value_counts() >10]

Buckentry      19
Belley         19
Hinglendez     18
Fowlesterez    18
Casonston      18
               ..
Wolffy         11
Rigginsen      11
Coolerson      11
Hamberterry    11
Dal            11
Name: Name, Length: 168, dtype: int64

In [14]:
b=combine.Cabin.apply(lambda x: "".join(x.split('/')[1]) if pd.notnull(x)  else np.nan)
b.value_counts() 

82      34
4       28
56      28
95      27
31      27
        ..
1848     1
1847     1
1846     1
1844     1
1890     1
Name: Cabin, Length: 1894, dtype: int64

In [15]:
combine["ID"] =combine.PassengerId.str.split("_",expand=True)[1]
combine.loc[:,['Cabin_1']] = combine.Cabin.str.split("/",expand=True).iloc[:,0]
combine.loc[:,['Cabin_3']] = combine.Cabin.str.split("/",expand=True).iloc[:,2]

In [16]:
combine.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Expenses,ID,Cabin_1,Cabin_3
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,0.0,1,B,P
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,736.0,1,F,S
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,10383.0,1,A,S
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,5176.0,2,A,S
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,1091.0,1,F,S


In [17]:
combine.drop(["PassengerId","Name","Cabin"],axis=1,inplace=True)

In [18]:
pd.crosstab(combine.HomePlanet,combine.Transported) # dont need mars

Transported,False,True
HomePlanet,Unnamed: 1_level_1,Unnamed: 2_level_1
Earth,2651,1951
Europa,727,1404
Mars,839,920


In [19]:
pd.crosstab(combine.CryoSleep,combine.Transported)

Transported,False,True
CryoSleep,Unnamed: 1_level_1,Unnamed: 2_level_1
False,3650,1789
True,581,2554


In [20]:
pd.crosstab(combine.Destination,combine.Transported) #drop PSO J318.5-22

Transported,False,True
Destination,Unnamed: 1_level_1,Unnamed: 2_level_1
55 Cancri e,702,1098
PSO J318.5-22,395,401
TRAPPIST-1e,3128,2787


In [21]:
pd.crosstab(combine.VIP,combine.Transported) #drop

Transported,False,True
VIP,Unnamed: 1_level_1,Unnamed: 2_level_1
False,4093,4198
True,123,76


In [22]:
pd.crosstab(combine.Cabin_1,combine.Transported) # drop A,G,T

Transported,False,True
Cabin_1,Unnamed: 1_level_1,Unnamed: 2_level_1
A,129,127
B,207,572
C,239,508
D,271,207
E,563,313
F,1565,1229
G,1238,1321
T,4,1


In [23]:
pd.crosstab(combine.Cabin_3,combine.Transported)

Transported,False,True
Cabin_3,Unnamed: 1_level_1,Unnamed: 2_level_1
P,2308,1898
S,1908,2380


In [24]:
pd.crosstab(combine.ID,combine.Transported) # drop 7,8 comb 2,3,4,5,6

Transported,False,True
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3258,2959
2,623,789
3,218,353
4,94,137
5,58,70
6,34,41
7,23,23
8,7,6


In [25]:
combine.isnull().sum()

HomePlanet       288
CryoSleep        174
Destination      274
Age              270
VIP              296
RoomService      170
FoodCourt        180
ShoppingMall     175
Spa              177
VRDeck           177
Transported     4277
Expenses           0
ID                 0
Cabin_1          299
Cabin_3          299
dtype: int64

In [26]:
num_cols = ['ShoppingMall','FoodCourt','RoomService','Spa','VRDeck','Expenses','Age']
cat_cols = ['CryoSleep','Cabin_1','Cabin_3','VIP','HomePlanet','Destination']
transported=['Transported']

num_imp = SimpleImputer(strategy='mean')
cat_imp = SimpleImputer(strategy='most_frequent')


combine[num_cols] = pd.DataFrame(num_imp.fit_transform(combine[num_cols]),columns=num_cols)
combine[cat_cols] = pd.DataFrame(cat_imp.fit_transform(combine[cat_cols]),columns=cat_cols)

In [27]:
combine.isnull().sum()

HomePlanet         0
CryoSleep          0
Destination        0
Age                0
VIP                0
RoomService        0
FoodCourt          0
ShoppingMall       0
Spa                0
VRDeck             0
Transported     4277
Expenses           0
ID                 0
Cabin_1            0
Cabin_3            0
dtype: int64

In [28]:
combine.ID = combine.ID.replace(["02","03","04","05","06"],"02")

In [29]:
combine.drop("VIP",axis=1,inplace=True)
combine.ID.value_counts()

01    9280
02    3605
07      66
08      19
Name: ID, dtype: int64

In [30]:
combine.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12970 entries, 0 to 4276
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    12970 non-null  object 
 1   CryoSleep     12970 non-null  object 
 2   Destination   12970 non-null  object 
 3   Age           12970 non-null  float64
 4   RoomService   12970 non-null  float64
 5   FoodCourt     12970 non-null  float64
 6   ShoppingMall  12970 non-null  float64
 7   Spa           12970 non-null  float64
 8   VRDeck        12970 non-null  float64
 9   Transported   8693 non-null   object 
 10  Expenses      12970 non-null  float64
 11  ID            12970 non-null  object 
 12  Cabin_1       12970 non-null  object 
 13  Cabin_3       12970 non-null  object 
dtypes: float64(7), object(7)
memory usage: 1.5+ MB


In [31]:
cat_cols =['CryoSleep', 'Cabin_1', 'Cabin_3', 'HomePlanet', 'Destination',"ID"]
dummy=pd.get_dummies(combine[cat_cols])
combine = combine.drop(cat_cols,axis=1)
combine = pd.concat([combine,dummy],axis=1)

In [32]:
combine.columns

Index(['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Transported', 'Expenses', 'CryoSleep_False', 'CryoSleep_True',
       'Cabin_1_A', 'Cabin_1_B', 'Cabin_1_C', 'Cabin_1_D', 'Cabin_1_E',
       'Cabin_1_F', 'Cabin_1_G', 'Cabin_1_T', 'Cabin_3_P', 'Cabin_3_S',
       'HomePlanet_Earth', 'HomePlanet_Europa', 'HomePlanet_Mars',
       'Destination_55 Cancri e', 'Destination_PSO J318.5-22',
       'Destination_TRAPPIST-1e', 'ID_01', 'ID_02', 'ID_07', 'ID_08'],
      dtype='object')

In [33]:
drop_cols= ['HomePlanet_Mars','Destination_PSO J318.5-22','Cabin_1_A','Cabin_1_G','Cabin_1_T',"CryoSleep_True","ID_07","ID_08"]
combine.drop(drop_cols,axis=1,inplace=True)

In [34]:
combine.head()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Expenses,CryoSleep_False,Cabin_1_B,...,Cabin_1_E,Cabin_1_F,Cabin_3_P,Cabin_3_S,HomePlanet_Earth,HomePlanet_Europa,Destination_55 Cancri e,Destination_TRAPPIST-1e,ID_01,ID_02
0,39.0,0.0,0.0,0.0,0.0,0.0,False,0.0,1,1,...,0,0,1,0,0,1,0,1,1,0
1,24.0,109.0,9.0,25.0,549.0,44.0,True,736.0,1,0,...,0,1,0,1,1,0,0,1,1,0
2,58.0,43.0,3576.0,0.0,6715.0,49.0,False,10383.0,1,0,...,0,0,0,1,0,1,0,1,1,0
3,33.0,0.0,1283.0,371.0,3329.0,193.0,False,5176.0,1,0,...,0,0,0,1,0,1,0,1,0,1
4,16.0,303.0,70.0,151.0,565.0,2.0,True,1091.0,1,0,...,0,1,0,1,1,0,0,1,1,0


In [None]:
combine.corr()[combine.corr()>0.5]

In [None]:
combine.corr()[combine.corr()<-0.5]

In [37]:
combine.drop(["FoodCourt","ShoppingMall"],axis=1,inplace=True)

In [None]:
train = combine[combine['Transported'].notnull()]
train.Transported =train.Transported.astype('int')
test = combine[combine['Transported'].isnull()].drop("Transported",axis=1)

In [39]:
train

Unnamed: 0,Age,RoomService,Spa,VRDeck,Transported,Expenses,CryoSleep_False,Cabin_1_B,Cabin_1_C,Cabin_1_D,Cabin_1_E,Cabin_1_F,Cabin_3_P,Cabin_3_S,HomePlanet_Earth,HomePlanet_Europa,Destination_55 Cancri e,Destination_TRAPPIST-1e,ID_01,ID_02
0,39.0,0.0,0.0,0.0,0,0.0,1,1,0,0,0,0,1,0,0,1,0,1,1,0
1,24.0,109.0,549.0,44.0,1,736.0,1,0,0,0,0,1,0,1,1,0,0,1,1,0
2,58.0,43.0,6715.0,49.0,0,10383.0,1,0,0,0,0,0,0,1,0,1,0,1,1,0
3,33.0,0.0,3329.0,193.0,0,5176.0,1,0,0,0,0,0,0,1,0,1,0,1,0,1
4,16.0,303.0,565.0,2.0,1,1091.0,1,0,0,0,0,1,0,1,1,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,41.0,0.0,1643.0,74.0,0,8536.0,1,0,0,0,0,0,1,0,0,1,1,0,1,0
8689,18.0,0.0,0.0,0.0,0,0.0,0,0,0,0,0,0,0,1,1,0,0,0,1,0
8690,26.0,0.0,1.0,0.0,1,1873.0,1,0,0,0,0,0,0,1,1,0,0,1,1,0
8691,32.0,0.0,353.0,3235.0,0,4637.0,1,0,0,0,1,0,0,1,0,1,1,0,1,0


In [40]:
X = train.drop('Transported',axis=1)
y = train.Transported

X,y = shuffle(X,y)
X = X.reset_index(drop=True)
y = y.reset_index(drop=True)

In [41]:
#linear
from sklearn.feature_selection import SelectKBest, f_classif

selector = SelectKBest(f_classif,k = "all")
selector.fit(X,y)
f_values = pd.Series(selector.scores_,index=X.columns)
f_values.sort_values()

Cabin_1_D                    10.085725
Age                          48.178294
Cabin_1_F                    65.304962
ID_01                        67.355319
ID_02                        68.931629
Destination_TRAPPIST-1e      81.384065
Cabin_1_E                    84.216695
Cabin_3_S                    90.284121
Cabin_3_P                    90.284121
Cabin_1_C                   102.939703
Destination_55 Cancri e     103.961250
Cabin_1_B                   185.951263
HomePlanet_Earth            255.040494
HomePlanet_Europa           280.812746
Expenses                    360.295256
VRDeck                      385.621161
Spa                         442.283959
RoomService                 546.825540
CryoSleep_False            2427.125458
dtype: float64

In [42]:
#non linear
from sklearn.feature_selection import SelectKBest,f_regression,mutual_info_classif

selector = SelectKBest(mutual_info_classif,k="all")

selector.fit(X,y)

m_scores=pd.Series(selector.scores_,index=X.columns)
m_scores = m_scores.sort_values(ascending=False)
m_scores

Expenses                   0.134300
CryoSleep_False            0.114472
RoomService                0.075410
Spa                        0.069998
VRDeck                     0.065670
HomePlanet_Europa          0.021062
Age                        0.016519
HomePlanet_Earth           0.015152
Cabin_1_C                  0.013654
Destination_55 Cancri e    0.009982
Cabin_1_B                  0.009306
Destination_TRAPPIST-1e    0.007833
Cabin_3_P                  0.007281
Cabin_1_F                  0.005749
Cabin_1_D                  0.005466
ID_02                      0.004757
Cabin_1_E                  0.000544
Cabin_3_S                  0.000000
ID_01                      0.000000
dtype: float64

In [43]:
from sklearn.feature_selection import VarianceThreshold

selector = VarianceThreshold(0.01)

selector.fit(X)

X_new = selector.transform(X)
print(X_new.shape)

(8693, 19)


In [44]:
X.shape

(8693, 19)

In [45]:
y.value_counts()

1    4378
0    4315
Name: Transported, dtype: int64

In [46]:
X.CryoSleep_False.value_counts()

1    5558
0    3135
Name: CryoSleep_False, dtype: int64

In [47]:
def objective(trial):
    
    param = { 
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'lambda': trial.suggest_float('lambda', 0, 10.0),
        'alpha': trial.suggest_float('alpha', 0, 10.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1,1.0),
        'subsample': trial.suggest_float('subsample', 0.2,1.0),
        'learning_rate': trial.suggest_float('learning_rate', 0.0001,0.1),
        'n_estimators': trial.suggest_int('n_estimators', 100,3000),
        'max_depth': trial.suggest_categorical('max_depth', [2,3,4,5,6,7,8,9,10]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        
        }
    
    xgb = XGBClassifier(**param,tree_method='gpu_hist', gpu_id=0
        )

    score = cross_val_score(xgb, X, y, n_jobs=-1, cv=10)
    accuracy = score.mean()
    return accuracy

study = optuna.create_study(pruner=optuna.pruners.HyperbandPruner(),
                                direction='maximize')
study.optimize(objective, n_trials=20)

[32m[I 2023-02-08 23:14:25,059][0m A new study created in memory with name: no-name-78787605-b520-4fbc-a363-2c9b8e18da7b[0m
[32m[I 2023-02-08 23:14:45,180][0m Trial 0 finished with value: 0.8049002023729217 and parameters: {'lambda': 3.362318891825198, 'alpha': 5.504155847691829, 'colsample_bytree': 0.44227451340212887, 'subsample': 0.4260190812571811, 'learning_rate': 0.09642305521792646, 'n_estimators': 2479, 'max_depth': 6, 'min_child_weight': 2}. Best is trial 0 with value: 0.8049002023729217.[0m
[32m[I 2023-02-08 23:14:53,132][0m Trial 1 finished with value: 0.8045549779770644 and parameters: {'lambda': 2.196081117484945, 'alpha': 1.8222269469352081, 'colsample_bytree': 0.709340240497011, 'subsample': 0.47072192002310703, 'learning_rate': 0.0713283747995777, 'n_estimators': 539, 'max_depth': 10, 'min_child_weight': 5}. Best is trial 0 with value: 0.8049002023729217.[0m
[32m[I 2023-02-08 23:15:11,287][0m Trial 2 finished with value: 0.8061649669986641 and parameters: {'l

In [48]:
trial = study.best_trial
print(trial.value)
print(trial.params)

0.8091550599843922
{'lambda': 6.807045586066151, 'alpha': 8.644191187246989, 'colsample_bytree': 0.9614573014669137, 'subsample': 0.9881195675289182, 'learning_rate': 0.031099990330155036, 'n_estimators': 2764, 'max_depth': 9, 'min_child_weight': 7}


In [49]:
xgbc = XGBClassifier(**trial.params)

In [50]:
from sklearn.feature_selection import RFE

selector = RFE(xgbc,n_features_to_select=1,step=1)


selector.fit(X,y)
ranks = pd.Series(selector.ranking_,index = X.columns)
ranks = ranks.sort_values()
ranks

CryoSleep_False             1
HomePlanet_Earth            2
Cabin_1_E                   3
Expenses                    4
HomePlanet_Europa           5
Cabin_3_P                   6
VRDeck                      7
Spa                         8
RoomService                 9
Cabin_1_C                  10
Cabin_3_S                  11
Cabin_1_F                  12
Destination_TRAPPIST-1e    13
Cabin_1_B                  14
Destination_55 Cancri e    15
Age                        16
Cabin_1_D                  17
ID_02                      18
ID_01                      19
dtype: int32

In [51]:
perm = PermutationImportance(xgbc, random_state=1,n_iter =10,cv=5).fit(X, y)
eli5.show_weights(perm, feature_names = X.columns.tolist())

Weight,Feature
0.0683  ± 0.0189,Spa
0.0647  ± 0.0135,VRDeck
0.0620  ± 0.0121,RoomService
0.0587  ± 0.0180,CryoSleep_False
0.0466  ± 0.0233,Expenses
0.0111  ± 0.0084,Cabin_3_P
0.0103  ± 0.0068,Cabin_1_E
0.0084  ± 0.0095,Age
0.0069  ± 0.0068,Cabin_1_C
0.0044  ± 0.0084,HomePlanet_Europa


In [52]:
params2= {'lambda': 3.0610042624477543, 
             'alpha': 4.581902571574289, 
             'colsample_bytree': 0.9241969052729379, 
             'subsample': 0.9527591724824661, 
             'learning_rate': 0.06672065863100594, 
             'n_estimators': 730,
             'max_depth': 5, 
             'min_child_weight': 1, 
             'num_parallel_tree': 1}

In [53]:
xgbc = XGBClassifier(**params2)

In [54]:
scores = cross_val_score(xgbc,X,y,cv=10,n_jobs=-1,scoring="recall")
scores.mean()

0.819322800748148

In [55]:
x2= X[["Destination_TRAPPIST-1e","Destination_55 Cancri e"]]
x3= X.drop(["Destination_TRAPPIST-1e","Destination_55 Cancri e"],axis=1).reset_index(drop=True)
from sklearn.decomposition import PCA
pca = PCA(n_components=1)
pca.fit(x2)
x2 = pca.transform(x2)
x2 = pd.DataFrame(x2, columns=["a"])
x2 = pd.concat([x3,x2],axis=1)
x2

Unnamed: 0,Age,RoomService,Spa,VRDeck,Expenses,CryoSleep_False,Cabin_1_B,Cabin_1_C,Cabin_1_D,Cabin_1_E,Cabin_1_F,Cabin_3_P,Cabin_3_S,HomePlanet_Earth,HomePlanet_Europa,ID_01,ID_02,a
0,56.0,0.000000,15.0,26.0,908.0,1,0,0,0,0,0,1,0,1,0,1,0,-0.361512
1,36.0,0.000000,4664.0,89.0,5600.0,1,0,0,0,1,0,0,1,0,1,0,1,1.048479
2,14.0,0.000000,0.0,0.0,0.0,0,0,0,0,0,0,0,1,1,0,0,1,-0.361512
3,38.0,1736.000000,5.0,115.0,1979.0,1,0,0,0,0,1,0,1,1,0,1,0,-0.361512
4,73.0,799.000000,68.0,0.0,1057.0,1,0,0,1,0,0,0,1,0,0,1,0,-0.361512
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,30.0,0.000000,1737.0,4987.0,10641.0,1,0,0,0,0,0,1,0,0,1,1,0,1.048479
8689,17.0,0.000000,0.0,0.0,0.0,0,0,0,0,1,0,0,1,0,0,0,1,-0.361512
8690,24.0,0.000000,0.0,0.0,718.0,1,0,0,0,0,1,0,1,1,0,0,1,-0.361512
8691,27.0,0.000000,778.0,587.0,3222.0,1,0,1,0,0,0,0,1,0,1,0,1,-0.361512


In [56]:
x2 = x2.drop(["ID_02","ID_01","Cabin_1_D"],axis=1)

In [57]:
scores = cross_val_score(xgbc,x2,y,cv=10,n_jobs=-1,scoring="accuracy")
scores.mean()

0.8066252661931405

In [58]:
scores = cross_val_score(xgbc,x2,y,cv=10,n_jobs=-1,scoring="recall")
scores.mean()

0.8181833380353802