In [None]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SequentialFeatureSelector

In [2]:
# Read the preprocessed training table, then detach the 'Transported' label
# column from the feature frame and keep it as an integer target series.
train_data = pd.read_csv('handled_train.csv')
target = train_data.pop('Transported').astype('int')

In [3]:
# Hold out 20% of the rows for evaluation. The fixed random_state keeps the
# partition identical across kernel restarts, so results can be reproduced.
x_train, x_test, y_train, y_test = train_test_split(
    train_data, target, test_size=0.2, random_state=42
)

In [4]:
# Keyword arguments passed to CatBoostClassifier wherever a model is built
# in this notebook.
parameters = dict(
    depth=5,
    iterations=2000,
    learning_rate=0.01,
    verbose=False,
)

In [5]:
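# Backward sequential feature selection. Left commented out (presumably because
# it is slow to run -- confirm); its result is hard-coded in the next cell.
# cbc = CatBoostClassifier(**parameters)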

# sfs = SequentialFeatureSelector(cbc, 
#                                 scoring='accuracy', 
#                                 direction = 'backward')
# sfs.fit(x_train, y_train)

# final_features = list(sfs.get_feature_names_out())
# print(final_features)

In [6]:
# Result of the backward sequential feature selection (the commented-out cell
# above), hard-coded so the selection does not have to be re-run.
final_features = ['Deck', 'Num', 'Side', 'RoomService', 'Spa', 'VRDeck', 'SpendedMoney']

# Keep only the selected columns and redo the 80/20 split on the reduced frame.
train_data = train_data.loc[:, final_features]
# Fixed random_state: without it the split (and every score computed from it)
# changes on each run.
x_train, x_test, y_train, y_test = train_test_split(
    train_data, target, test_size=0.2, random_state=42
)

In [7]:
# Train `training_attempts` CatBoost models that differ only in their random
# seed and keep the one with the highest F1 score on the hold-out split.
#
# NOTE(review): the winner is picked using x_test, so any score later reported
# on x_test is optimistically biased. A separate validation split would be
# needed for an unbiased estimate.
training_attempts = 100
max_perf = 0
best_cbc = None  # bound up front so it exists even if every score is 0
for i in range(training_attempts):
    # Give each attempt its own explicit seed. With no seed, CatBoost falls
    # back to its fixed default (see the CatBoost parameter docs), so every
    # attempt would retrain effectively the same model. The dict merge lets
    # the per-attempt seed override any seed already present in `parameters`.
    cbc = CatBoostClassifier(**{**parameters, 'random_seed': i})
    cbc.fit(x_train, y_train)
    predictions = cbc.predict(x_test)
    # scikit-learn metric signature is (y_true, y_pred).
    perf = metrics.f1_score(y_test, predictions)
    if best_cbc is None or perf > max_perf:
        max_perf = perf
        best_cbc = cbc

In [9]:
# F1 score of the selected model on the rows it was fitted on.
# The metric is F1, so the printed label says F1 rather than accuracy.
# Arguments follow scikit-learn's (y_true, y_pred) order.
best_train_perf = metrics.f1_score(y_train, best_cbc.predict(x_train))
print(f'Best Train F1: {best_train_perf}')

Best Train Accuracy: 0.8415211801326296


In [12]:
# F1 score of the selected model on the hold-out rows.
# The metric is F1, so the printed label says F1 rather than accuracy.
# Arguments follow scikit-learn's (y_true, y_pred) order.
best_test_perf = metrics.f1_score(y_test, best_cbc.predict(x_test))
print(f'Best Test F1: {best_test_perf}')

Best Test Accuracy: 0.8223896663078579


In [13]:
# Build the submission file: predict on the preprocessed test set and pair each
# prediction with the PassengerId taken from the raw test file.
test_data = pd.read_csv('handled_test.csv')
test_passengers_id = pd.read_csv('test.csv')['PassengerId']

# Both files are joined by row position below, so they must have the same
# length; a mismatch would otherwise produce silent NaN rows in the output.
assert len(test_data) == len(test_passengers_id), \
    'handled_test.csv and test.csv have different numbers of rows'

# The model was fitted on `final_features` only, so feed it exactly those
# columns in the same order instead of the full preprocessed frame.
test_features = test_data.loc[:, final_features]

test_predictions = pd.Series(best_cbc.predict(test_features).astype('bool'))
submission_df = pd.DataFrame({'PassengerId': test_passengers_id,
                              'Transported': test_predictions})
submission_df.to_csv(f'CatBoostClassifier with f1 metric and {training_attempts} training attempts.csv', index=False)