In [53]:
import pandas as pd
from numpy.random import choice
from catboost import CatBoostClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SequentialFeatureSelector

train = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')

In [54]:
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [55]:
for column in train.columns:
    print(f'{column}\n{train[column].unique()}')

PassengerId
['0001_01' '0002_01' '0003_01' ... '9279_01' '9280_01' '9280_02']
HomePlanet
['Europa' 'Earth' 'Mars' nan]
CryoSleep
[False True nan]
Cabin
['B/0/P' 'F/0/S' 'A/0/S' ... 'G/1499/S' 'G/1500/S' 'E/608/S']
Destination
['TRAPPIST-1e' 'PSO J318.5-22' '55 Cancri e' nan]
Age
[39. 24. 58. 33. 16. 44. 26. 28. 35. 14. 34. 45. 32. 48. 31. 27.  0.  1.
 49. 29. 10.  7. 21. 62. 15. 43. 47.  2. 20. 23. 30. 17. 55.  4. 19. 56.
 nan 25. 38. 36. 22. 18. 42. 37. 13.  8. 40.  3. 54.  9.  6. 64. 67. 61.
 50. 41. 57. 11. 52. 51. 46. 60. 63. 59.  5. 79. 68. 74. 12. 53. 65. 71.
 75. 70. 76. 78. 73. 66. 69. 72. 77.]
VIP
[False True nan]
RoomService
[   0.  109.   43. ... 1569. 8586.  745.]
FoodCourt
[   0.    9. 3576. ... 3208. 6819. 4688.]
ShoppingMall
[   0.   25.  371. ... 1085.  510. 1872.]
Spa
[   0.  549. 6715. ... 2868. 1107. 1643.]
VRDeck
[   0.   44.   49. ... 1164.  971. 3235.]
Name
['Maham Ofracculy' 'Juanna Vines' 'Altark Susent' ... 'Fayey Connon'
 'Celeon Hontichre' 'Propsh Hontichre']

In [56]:
# Filling nulls:
train.isnull().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [57]:
filt = train['CryoSleep'] == True
print(train.loc[filt, 'RoomService':'VRDeck'])

      RoomService  FoodCourt  ShoppingMall  Spa  VRDeck
7             0.0        0.0           0.0  0.0     NaN
9             0.0        0.0           0.0  0.0     0.0
10            0.0        0.0           NaN  0.0     0.0
18            0.0        0.0           0.0  0.0     0.0
21            0.0        0.0           0.0  0.0     0.0
...           ...        ...           ...  ...     ...
8679          0.0        0.0           0.0  0.0     0.0
8680          0.0        0.0           0.0  0.0     0.0
8681          0.0        0.0           0.0  0.0     0.0
8684          0.0        0.0           0.0  0.0     0.0
8689          0.0        0.0           0.0  0.0     0.0

[3037 rows x 5 columns]


In [58]:
for column in train.loc[:, 'RoomService':'VRDeck'].columns:
    train[column].fillna(train.loc[:, 'RoomService':'VRDeck'].mean(axis=1), inplace=True)

In [59]:
train.insert(12, 'SpendedMoney', train.loc[:, 'RoomService':'VRDeck'].sum(axis=1))

In [60]:
train.loc[(train['CryoSleep'].isnull()) & (train['SpendedMoney'] == 0), 'CryoSleep'] = True
train.loc[(train['CryoSleep'].isnull()) & (train['SpendedMoney'] > 0), 'CryoSleep'] = False

In [61]:
train['Age'].fillna(train['Age'].mean().round(), inplace=True)

In [62]:
mean_VIP_SpendedMoney = train.loc[train['VIP'] == True, 'SpendedMoney'].mean()
train.loc[train['VIP'].isnull() & (train['SpendedMoney'] > mean_VIP_SpendedMoney), 'VIP'] = True
train.loc[train['VIP'].isnull() & (train['SpendedMoney'] <= mean_VIP_SpendedMoney), 'VIP'] = False

In [63]:
columns_to_fill_nulls = ['HomePlanet', 'Cabin', 'Destination']
for column in columns_to_fill_nulls:
    column_probabilities = [train[column].value_counts()[unique_value]/train[column].value_counts().sum()
                           for unique_value in train[column].value_counts().index]
    for i, cell in enumerate(train[column].isnull()):
        if cell:
            train.loc[i, column] = choice(train[column].value_counts().index, p=column_probabilities)

In [65]:
splitted_cabin = train['Cabin'].str.split('/')
splitted_cabin_df = pd.DataFrame.from_dict(dict(zip(splitted_cabin.index, splitted_cabin.values))).T
splitted_cabin_df.rename(columns= {0: 'Deck',
                                   1: 'Num',
                                   2: 'Side'}, inplace=True)

In [65]:
splitted_cabin_df
train_part1 = train.iloc[:, 0:4]
train_part2 = train.iloc[:, 4:]
train = pd.concat([train_part1, splitted_cabin_df, train_part2], axis=1)
train.drop(columns=['PassengerId',
                    'Cabin',
                    'Name'], inplace=True)

In [65]:
train.isnull().sum()

In [66]:
train.to_csv('/kaggle/working/filled_nans_train.csv', index=False)

In [67]:
# Mapping:
train = pd.read_csv('/kaggle/working/filled_nans_train.csv')
test = pd.read_csv('/kaggle/working/filled_nans_test.csv')
columns_to_map = ['HomePlanet', 'Deck', 'Side', 'Destination']
for column in columns_to_map:
    column_data = train[column]
    unique_values = column_data.unique()
    categories = {unique_values: i for i, unique_values in enumerate(unique_values)}
    train[column] = column_data.map(categories)
    test[column] = test[column].map(categories)
train.to_csv('/kaggle/working/preprocessed_train.csv', index=False)
test.to_csv('/kaggle/working/preprocessed_test.csv', index=False)


In [68]:
train_data = pd.read_csv('/kaggle/working/preprocessed_train.csv')
target = train_data['Transported'].astype('int')
train_data.drop(columns = 'Transported', inplace=True)

In [69]:
parameters =  {'depth': 5,
               'iterations': 2000,
               'learning_rate': 0.01, 
               'verbose': False}

In [70]:
# # Commented after final_features was found
# cbc = CatBoostClassifier(**parameters)

# sfs = SequentialFeatureSelector(cbc, 
#                                 scoring='accuracy', 
#                                 direction = 'backward')
# sfs.fit(x_train, y_train)

# final_features = list(sfs.get_feature_names_out())
# print(final_features)

In [71]:
# Result: 
final_features = ['Deck', 'Num', 'Side', 'RoomService', 'Spa', 'VRDeck', 'SpendedMoney']
train_data = train_data.loc[:, final_features]
x_train, x_test, y_train, y_test = train_test_split(train_data, target, test_size=0.2)

In [72]:
training_attempts = 100
max_perf = 0
for i in range(training_attempts):
    cbc = CatBoostClassifier(**parameters)
    cbc.fit(x_train, y_train)
    predictions = cbc.predict(x_test)
    perf = metrics.f1_score(predictions, y_test)
    if perf > max_perf:
        max_perf = perf
        best_cbc = cbc

In [73]:
best_train_perf = metrics.f1_score(best_cbc.predict(x_train), y_train)
print(f'Best Train Accuracy: {best_train_perf}')

Best Train Accuracy: 0.8433865296186097


In [74]:
best_test_perf = metrics.f1_score(best_cbc.predict(x_test), y_test)
print(f'Best Test Accuracy: {best_test_perf}')

Best Test Accuracy: 0.8130790190735695


In [76]:
test_data = pd.read_csv('/kaggle/working/preprocessed_test.csv')
test_passengers_id = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')['PassengerId']

test_predictions = pd.Series(best_cbc.predict(test_data).astype('bool'))
submission_df = pd.DataFrame({'PassengerId': test_passengers_id,
                              'Transported': test_predictions})
submission_df.to_csv(f'/kaggle/working/CatBoostClassifier with f1 metric and {training_attempts} training attempts.csv', index=False)