# **1) Initial instructions**

In [None]:
!pip install optuna
!pip install catboost
!pip install xgboost

Collecting line_profiler
  Downloading line_profiler-4.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (34 kB)
Downloading line_profiler-4.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (717 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/717.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m717.6/717.6 kB[0m [31m38.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: line_profiler
Successfully installed line_profiler-4.1.3


In [None]:
from google.colab import drive

# Mount Google Drive; the CSV files read later live under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import optuna
import os

from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.decomposition import PCA

from catboost import CatBoostClassifier
from xgboost import XGBClassifier

# **2) Data preprocessing**

In [None]:
# Read the training CSV from the mounted Drive. `data` holds the frame exactly
# as loaded; `df` is the frame the following cells work on.
train_csv_path = '/content/drive/MyDrive/train_spaceship.csv'
data = pd.read_csv(train_csv_path)
df = pd.DataFrame(data)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [None]:
# Drop the identifier columns, which are not used as features. Deriving `df`
# from the untouched `data` frame (instead of overwriting `df` with itself)
# keeps this cell idempotent: re-running it no longer raises a KeyError for
# columns that a previous run already removed.
df = data.drop(columns=['PassengerId', 'Name'])

# Target vector and feature matrix.
y = df['Transported']
X = df.drop(columns=['Transported'])

In [None]:
# Columns handled as categorical features.
cat_col = [
    'HomePlanet',
    'CryoSleep',
    'Destination',
    'VIP',
    'Cabin',
]

# Columns handled as numeric features.
num_col = [
    'Age',
    'RoomService',
    'FoodCourt',
    'ShoppingMall',
    'Spa',
    'VRDeck',
]

In [None]:
# Numeric branch: replace missing values with the column mean, then standardize.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: replace missing values with the most frequent one, then
# one-hot encode; categories not seen during fit are ignored at transform time.
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])


In [None]:
# Route each column group through its own preprocessing pipeline. Columns in
# neither list are dropped (ColumnTransformer's default `remainder`).
column_branches = [
    ('cat_pipeline', cat_pipeline, cat_col),
    ('num_pipeline', num_pipeline, num_col),
]
col_transformer = ColumnTransformer(column_branches)

# **3) Estimators evaluation**

In [None]:
# Shared CV splitter: 5 stratified, shuffled folds with a fixed seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Candidate estimators, left at their defaults apart from the seed
# (and silenced logging for CatBoost).
list_of_models = [
    LogisticRegression(random_state=42),
    RandomForestClassifier(random_state=42),
    CatBoostClassifier(random_state=42, verbose=0),
    XGBClassifier(random_state=42),
    SVC(random_state=42),
]

# One result list per estimator; each receives that model's mean CV accuracy.
acc_lr, rfc_acc, svc_acc, cat_acc, xgb_acc = [], [], [], [], []

# Estimator class -> list collecting its score, checked in this order.
score_buckets = (
    (LogisticRegression, acc_lr),
    (RandomForestClassifier, rfc_acc),
    (CatBoostClassifier, cat_acc),
    (XGBClassifier, xgb_acc),
    (SVC, svc_acc),
)

for model in list_of_models:
  # Preprocessing and estimator are bundled so both are refit inside each fold.
  pipeline = make_pipeline(col_transformer, model)
  scores = cross_val_score(pipeline, X, y, cv=skf, scoring='accuracy').mean()

  # The first matching class receives the score.
  for model_cls, bucket in score_buckets:
    if isinstance(model, model_cls):
      bucket.append(scores)
      break

In [None]:
# Report each estimator's mean cross-validated accuracy as a percentage.
report_rows = [
    ('Logistic Regression', acc_lr),
    ('Random Forest Classifier', rfc_acc),
    ('SVC', svc_acc),
    ('CatBoostClassifier', cat_acc),
    ('XGBClassifier', xgb_acc),
]
for label, accuracies in report_rows:
  print(f'{label} score: {accuracies[0]*100:.2f} %')

Logistic Regression score: 78.64 %
Random Forest Classifier score: 79.16 %
SVC score: 78.89 %
CatBoostClassifier score: 79.34 %
XGBClassifier score: 78.38 %


# **4) CatBoost optimization**

In [None]:
def objective(trial):
  """Optuna objective: cross-validated accuracy of a CatBoost pipeline.

  Args:
    trial: Optuna trial used to sample the CatBoost hyperparameters.

  Returns:
    Mean accuracy returned by `cross_val_score` over the folds of the shared
    `skf` splitter; the study is expected to maximize this value.
  """
  # `iterations` is not part of the search space, so CatBoost's default applies.
  params = {
      "depth": trial.suggest_int("depth", 4, 10),
      "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
      "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-3, 10.0, log=True),
      "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 1.0),
      "border_count": trial.suggest_int("border_count", 32, 255),
      "random_strength": trial.suggest_float("random_strength", 0.0, 10.0),
  }

  # Every sampled value is forwarded to the model. `random_strength` used to be
  # sampled but never passed on, so the search wasted a dimension on a setting
  # that was never evaluated. The fixed seed keeps trial scores comparable.
  model = CatBoostClassifier(**params, random_state=42, verbose=0)

  optuna_pipeline = make_pipeline(col_transformer, model)
  score = cross_val_score(optuna_pipeline, X, y, cv=skf, scoring='accuracy').mean()

  return score



In [None]:
# Maximize the cross-validated accuracy returned by `objective`.
# The sampler is seeded so the search is repeatable across runs, as far as the
# objective itself is deterministic.
study = optuna.create_study(direction='maximize',
                            sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=100)

[I 2024-11-18 22:07:03,077] A new study created in memory with name: no-name-729904bc-b87e-4027-ac5f-62d74dbfe85e
[W 2024-11-18 22:07:50,250] Trial 0 failed with parameters: {'depth': 6, 'learning_rate': 0.12560169140477773, 'l2_leaf_reg': 0.005014544842250196, 'bagging_temperature': 0.6457012666508211, 'border_count': 145, 'random_strength': 6.380128626613493} because of the following error: KeyboardInterrupt('').
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "<ipython-input-13-810637b86446>", line 21, in objective
    score = cross_val_score(optuna_pipeline, X, y, cv=skf, scoring='accuracy').mean()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 213, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 712, in cross_val_score
    c

KeyboardInterrupt: 

In [None]:
# Hyperparameters of the best trial, keyed by the names used in `objective`.
best_model = study.best_trial.params

# Build the final estimator from whatever the study actually tuned. Reading
# fixed keys (e.g. 'iterations') raised a KeyError whenever a parameter was not
# part of the search space; unpacking the dict stays in sync with `objective`.
catboost_best_model = CatBoostClassifier(**best_model,
                                         random_state=42,
                                         verbose=0)


# **5) Submission data**

In [None]:
# Test features: same identifier columns removed as for the training frame.
data_test = pd.read_csv('/content/drive/MyDrive/test_spaceship.csv')
df_test = pd.DataFrame(data_test).drop(columns=['PassengerId', 'Name'])

# Sample submission file; used as the template for the output frame.
data_submission = pd.read_csv('/content/drive/MyDrive/sample_submission_spaceship.csv')
df_submission = pd.DataFrame(data_submission)

# **6) Prediction and submission save**

In [None]:
# Fit preprocessing + tuned CatBoost on the full training data
# (Pipeline.fit returns the fitted pipeline itself).
test_pipe = make_pipeline(col_transformer, catboost_best_model).fit(X, y)

# Predict the target for the test frame.
prediction = test_pipe.predict(df_test)

In [None]:
# Put the predictions into the submission frame, then build the output path
# inside the current working directory.
df_submission['Transported'] = prediction
filepath = os.path.join(os.getcwd(), 'spaceship_submission.csv')

In [None]:
# Write the submission to `filepath` as CSV, leaving out the DataFrame index.
df_submission.to_csv(filepath, index=False)