<a href="https://colab.research.google.com/github/Eserhimas/Basics/blob/main/Catboost_Basics_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2-cp310-cp310-manylinux2014_x86_64.whl (98.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.6/98.6 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2


In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier

In [13]:
train_path = "https://raw.githubusercontent.com/imarranz/spaceship-titanic/master/data/train.csv"
test_path = "https://raw.githubusercontent.com/imarranz/spaceship-titanic/master/data/test.csv"

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [14]:
train.isnull().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [15]:
train.shape

(8693, 14)

In [16]:
train.dtypes

PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool
dtype: object

In [17]:
# drop name col
train = train.drop('Name', axis=1)
test = test.drop('Name', axis=1)

# formatting the target variable
train['Transported'] = train['Transported'].apply(lambda x: int(x))

# variable list
target = ['Transported']
cat_features =['HomePlanet', 'CryoSleep', 'Destination', 'VIP']
num_features = [v for v in train.columns.tolist() if v not in target + cat_features + ['PassengerId', 'Cabin']]

In [21]:
# fill the age with median
train.Age = train.Age.fillna(train.Age.median())
test.Age = test.Age.fillna(test.Age.median())

# fill the rest of numericals with 0
for v in (num_features):
  train['missing_'+str(v)] = np.where(train[v].isnull(), 1, 0)
  train[v] = train[v].fillna(0)
  test['missing_'+str(v)] = np.where(test[v].isnull(), 1 ,0)
  test[v] = test[v].fillna(0)

# save null track features to list

na_features = [v for v in train.columns if 'missing' in v]

In [23]:
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,missing_Age,missing_RoomService,missing_FoodCourt,missing_ShoppingMall,missing_Spa,missing_VRDeck
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,1,0,0,0,0,0,0
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,0,0,0,0,0,0,0
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,0,0,0,0,0,0,0
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,1,0,0,0,0,0,0


In [25]:
# obtain group number
train[['group_num', 'id']] = train['PassengerId'].str.split('_', expand=True)
test[['group_num', 'id']] = test['PassengerId'].str.split('_', expand=True)

# split the cabin deck,num and side
train[['deck', 'num', 'side']] = train['Cabin'].str.split('/', expand=True)
test[['deck', 'num', 'side']] = test['Cabin'].str.split('/', expand=True)

# drop unique ID
train = train.drop(columns=['id'])
test = test.drop(columns=['id'])

# calculate number of people per group
train_group_count = train.group_num.value_counts().sort_index().reset_index().rename(columns = {'index': 'group_num', 'group_num':'group_size'})
test_group_count = test.group_num.value_counts().sort_index().reset_index().rename(columns = {'index': 'group_num', 'group_num':'group_size'})

# merge with df
train = train.merge(train_group_count, how='left', on='group_num')
test = test.merge(test_group_count, how='left', on='group_num')

# update cat_features list
cat_features = cat_features + ['deck', 'num', 'side', 'group_num', 'group_size']

In [28]:
money_features = [v for v in num_features if 'Age' not in v]

# Total spend
train['totalspend'] = train[money_features].sum(axis=1)
test['totalspend'] = test[money_features].sum(axis=1)

# percentage of spend
for v in (money_features):
  train['pct_' + str(v)] = train[v]/train['totalspend']
  test['pct_'+str(v)] = test[v]/test['totalspend']

# save percentage features to list
pct_features = [v for v in train.columns if 'pct' in v]

# update numerical list
num_features = num_features + pct_features + ['totalspend'] + na_features

In [31]:
train.dtypes

PassengerId              object
HomePlanet               object
CryoSleep                object
Cabin                    object
Destination              object
Age                     float64
VIP                      object
RoomService             float64
FoodCourt               float64
ShoppingMall            float64
Spa                     float64
VRDeck                  float64
Transported               int64
missing_Age               int64
missing_RoomService       int64
missing_FoodCourt         int64
missing_ShoppingMall      int64
missing_Spa               int64
missing_VRDeck            int64
group_num                object
deck                     object
num                      object
side                     object
group_size                int64
totalspend              float64
pct_RoomService         float64
pct_FoodCourt           float64
pct_ShoppingMall        float64
pct_Spa                 float64
pct_VRDeck              float64
dtype: object

In [32]:
# predictor list
predictors = num_features + cat_features


In [33]:
# cast categorical to string
train[cat_features] = train[cat_features].astype(str)
test[cat_features] = test[cat_features].astype(str)

train.dtypes

PassengerId              object
HomePlanet               object
CryoSleep                object
Cabin                    object
Destination              object
Age                     float64
VIP                      object
RoomService             float64
FoodCourt               float64
ShoppingMall            float64
Spa                     float64
VRDeck                  float64
Transported               int64
missing_Age               int64
missing_RoomService       int64
missing_FoodCourt         int64
missing_ShoppingMall      int64
missing_Spa               int64
missing_VRDeck            int64
group_num                object
deck                     object
num                      object
side                     object
group_size               object
totalspend              float64
pct_RoomService         float64
pct_FoodCourt           float64
pct_ShoppingMall        float64
pct_Spa                 float64
pct_VRDeck              float64
dtype: object

In [34]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(train[predictors],
                                                    train[target], test_size=0.2, random_state=1642)

In [37]:
clf = CatBoostClassifier(iterations=10000, eval_metric='Accuracy', verbose=500)

clf.fit(X_train[predictors], y_train, eval_set=(X_test, y_test), cat_features=cat_features)

Learning rate set to 0.018791
0:	learn: 0.7828588	test: 0.7734330	best: 0.7734330 (0)	total: 169ms	remaining: 28m 9s
500:	learn: 0.8285879	test: 0.7901093	best: 0.7918344 (187)	total: 16.7s	remaining: 5m 16s
1000:	learn: 0.8485764	test: 0.7906843	best: 0.7924094 (603)	total: 31.7s	remaining: 4m 45s
1500:	learn: 0.8681334	test: 0.7941346	best: 0.7952846 (1489)	total: 47.2s	remaining: 4m 27s
2000:	learn: 0.8856773	test: 0.7929845	best: 0.7952846 (1489)	total: 1m 2s	remaining: 4m 10s
2500:	learn: 0.8983319	test: 0.7958597	best: 0.7970098 (2434)	total: 1m 19s	remaining: 3m 59s
3000:	learn: 0.9099799	test: 0.7958597	best: 0.7975848 (2569)	total: 1m 35s	remaining: 3m 43s
3500:	learn: 0.9209088	test: 0.7941346	best: 0.7981599 (3119)	total: 1m 51s	remaining: 3m 26s
4000:	learn: 0.9301122	test: 0.7958597	best: 0.7981599 (3119)	total: 2m 6s	remaining: 3m 9s
4500:	learn: 0.9371585	test: 0.7947096	best: 0.7981599 (3119)	total: 2m 23s	remaining: 2m 55s
5000:	learn: 0.9444924	test: 0.7889592	best: 0

<catboost.core.CatBoostClassifier at 0x79a625cc3370>

In [38]:
# make predictions
pred = clf.predict(test[predictors])
pred = [True if i == 1 else False for i in pred]

# add to df
sample_submission = pd.DataFrame({"PassengerId":test.PassengerId,
                                  "Transported":pred})

#sample_submission.to_csv("path", index=False)
sample_submission.head()

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True
