In [80]:
import pandas as pd


train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

print(train_data.info())
print(train_data.head())
print(test_data.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB
None
  PassengerId HomePlanet CryoSleep  Cabin  Destination   Age    VIP  \
0     0001_01     Europa     False  B/0/P  TRAPPIST-1e  39.0  Fals

In [81]:
print(train_data["Destination"].value_counts())
print(train_data["Cabin"].value_counts())
print(train_data["HomePlanet"].value_counts())

Destination
TRAPPIST-1e      5915
55 Cancri e      1800
PSO J318.5-22     796
Name: count, dtype: int64
Cabin
G/734/S     8
G/109/P     7
B/201/P     7
G/1368/P    7
G/981/S     7
           ..
G/556/P     1
E/231/S     1
G/545/S     1
G/543/S     1
F/947/P     1
Name: count, Length: 6560, dtype: int64
HomePlanet
Earth     4602
Europa    2131
Mars      1759
Name: count, dtype: int64


In [82]:
train_data.isna().sum()

train_data.dropna()

test_data.isna().sum()


PassengerId       0
HomePlanet       87
CryoSleep        93
Cabin           100
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
Name             94
dtype: int64

In [83]:
binary_mapping = {False: 0, True: 1}
cabin_deck_mapping = {'A': 0, 'B': 1,'C': 2,'D': 3,'E': 4,'F': 5,'G': 6,'T': 7}
cabin_side_mapping = {'S': 0, 'P': 1}
def preprocess(data: pd.DataFrame, train = True):
    data['HomePlanet'].fillna(data['HomePlanet'].mode()[0], inplace=True)
    data['CryoSleep'].fillna(data['CryoSleep'].mode()[0], inplace=True)
    data['Destination'].fillna(data['Destination'].mode()[0], inplace=True)
    data['VIP'].fillna(data['VIP'].mode()[0], inplace=True)

    # Fill numerical
    data['Age'].fillna(data['Age'].median(), inplace=True)

    spending_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    for col in spending_cols:
        data[col].fillna(0, inplace=True)
    
    data = pd.get_dummies(data, columns=['Destination', 'HomePlanet'], drop_first=False)
    data['CryoSleep'] = data['CryoSleep'].map(binary_mapping)
    data['VIP'] = data['VIP'].map(binary_mapping) 
    data[['Cabin_Deck', 'Cabin_Num', 'Cabin_Side']] = data['Cabin'].str.split('/', expand=True)
    data.drop(columns = ['Name', 'Cabin'], inplace = True)
    data['Cabin_Deck'] = data['Cabin_Deck'].map(cabin_deck_mapping) 
    data['Cabin_Side'] = data['Cabin_Side'].map(cabin_side_mapping) 

    if train:
        y = data['Transported']
        data.drop(columns = ['Transported', 'PassengerId'], inplace = True)
        return data, y
    else:
        y = data['PassengerId']
        data.drop(columns = ['PassengerId'], inplace = True)
        return data, y

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
import xgboost as xgb



X_train, y_train = preprocess(train_data)



X_test, passenger_id = preprocess(test_data, False)

scaler = StandardScaler()

print(X_train.head())

scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


clf = xgb.XGBClassifier(tree_method="hist")

param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    cv=5,                  
    scoring='accuracy',    
    n_jobs=-1,             
    verbose=2
)

grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)

submission = pd.DataFrame({
    "PassengerId": passenger_id,
    "Transported": y_pred
})
submission.to_csv("submission.csv", index=False)

   CryoSleep   Age  VIP  RoomService  FoodCourt  ShoppingMall     Spa  VRDeck  \
0          0  39.0    0          0.0        0.0           0.0     0.0     0.0   
1          0  24.0    0        109.0        9.0          25.0   549.0    44.0   
2          0  58.0    1         43.0     3576.0           0.0  6715.0    49.0   
3          0  33.0    0          0.0     1283.0         371.0  3329.0   193.0   
4          0  16.0    0        303.0       70.0         151.0   565.0     2.0   

   Destination_55 Cancri e  Destination_PSO J318.5-22  \
0                    False                      False   
1                    False                      False   
2                    False                      False   
3                    False                      False   
4                    False                      False   

   Destination_TRAPPIST-1e  HomePlanet_Earth  HomePlanet_Europa  \
0                     True             False               True   
1                     True          

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['HomePlanet'].fillna(data['HomePlanet'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['CryoSleep'].fillna(data['CryoSleep'].mode()[0], inplace=True)
  data['CryoSleep'].fillna(data['CryoSleep'].mode()[0], inplace=True)
The behavior will change

[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.0s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.0s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0; total time=   0.0s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0; total time=   0.0s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0; total time=   0.0s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.8; total time=   0.0s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.8; total time=   0.0s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.8; total time=   0.0s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estima

ValueError: 
All the 360 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
360 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/ardakabadayi/Desktop/Code/kaggle/myenv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/ardakabadayi/Desktop/Code/kaggle/myenv/lib/python3.10/site-packages/xgboost/core.py", line 729, in inner_f
    return func(**kwargs)
  File "/Users/ardakabadayi/Desktop/Code/kaggle/myenv/lib/python3.10/site-packages/xgboost/sklearn.py", line 1682, in fit
    self._Booster = train(
  File "/Users/ardakabadayi/Desktop/Code/kaggle/myenv/lib/python3.10/site-packages/xgboost/core.py", line 729, in inner_f
    return func(**kwargs)
  File "/Users/ardakabadayi/Desktop/Code/kaggle/myenv/lib/python3.10/site-packages/xgboost/training.py", line 184, in train
    if cb_container.after_iteration(bst, i, dtrain, evals):
  File "/Users/ardakabadayi/Desktop/Code/kaggle/myenv/lib/python3.10/site-packages/xgboost/callback.py", line 267, in after_iteration
    ret = any(c.after_iteration(model, epoch, self.history) for c in self.callbacks)
  File "/Users/ardakabadayi/Desktop/Code/kaggle/myenv/lib/python3.10/site-packages/xgboost/callback.py", line 267, in <genexpr>
    ret = any(c.after_iteration(model, epoch, self.history) for c in self.callbacks)
  File "/Users/ardakabadayi/Desktop/Code/kaggle/myenv/lib/python3.10/site-packages/xgboost/callback.py", line 463, in after_iteration
    raise ValueError(msg)
ValueError: Must have at least 1 validation dataset for early stopping.
