# Predicting the Number of Appearances of Marvel Characters using Catboost

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor, EShapCalcType, EFeaturesSelectionAlgorithm

In [16]:
df = pd.read_csv('characters_cleaned.csv', index_col = [0])
df.head(5)

Unnamed: 0,Name,Gender,Marital_Status,Height,Weight,Eye_Color,Hair_Color,Living_Status,Reality,Identity,Appearances,Year_Introduced,Average_Annual_Appearances
0,6-Ball (Earth-616),Male,Single,,,,,Alive,Earth-616,Secret,1.0,1991,0.032258
1,6R (Earth-616),Agender,,,,,,Deceased,Earth-616,,1.0,1969,0.018868
2,762 (Legion Personality) (Earth-616),Male,Single,,,,Black,Alive,Earth-616,Secret,1.0,2010,0.083333
3,627 (Skullbot) (Earth-12041),Agender,,,,Blue,No hair at all,Deceased,Earth-12041,Public,1.0,2017,0.2
4,749 (Legion Personality) (Earth-616),Male,Single,,,,Black,Alive,Earth-616,Secret,1.0,2011,0.090909


In [17]:
cat_features = ['Gender', 'Marital_Status', 'Eye_Color', 'Hair_Color', 'Living_Status', 'Reality', 'Identity']
all_features = ['Height', 'Weight', 'Year_Introduced']
all_features.extend(cat_features)
X = df[all_features].copy()
y = df['Average_Annual_Appearances'].copy()
print(X.shape, y.shape)

(64064, 10) (64064,)


In [18]:
X[cat_features] = X[cat_features].astype('str')
X['Year_Introduced'] = X['Year_Introduced'].astype('float64')
X.dtypes

Height             float64
Weight             float64
Year_Introduced    float64
Gender              object
Marital_Status      object
Eye_Color           object
Hair_Color          object
Living_Status       object
Reality             object
Identity            object
dtype: object

In [19]:
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size = 0.20)
train_X, val_X, train_y, val_y = train_test_split(train_X, train_y, test_size = 0.20)
print(train_X.shape, val_X.shape, test_X.shape, train_y.shape, val_y.shape, test_y.shape)

(41000, 10) (10251, 10) (12813, 10) (41000,) (10251,) (12813,)


In [25]:
cat_reg = CatBoostRegressor()
cat_reg.fit(X = train_X, y = train_y, cat_features = cat_features, use_best_model = True, eval_set = (val_X, val_y), 
            early_stopping_rounds = 30)

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

Learning rate set to 0.091223
0:	learn: 3.5903568	test: 2.9701603	best: 2.9701603 (0)	total: 103ms	remaining: 1m 43s
1:	learn: 3.5275119	test: 2.9578484	best: 2.9578484 (1)	total: 187ms	remaining: 1m 33s
2:	learn: 3.4576049	test: 2.9428442	best: 2.9428442 (2)	total: 263ms	remaining: 1m 27s
3:	learn: 3.3858643	test: 2.9326668	best: 2.9326668 (3)	total: 328ms	remaining: 1m 21s
4:	learn: 3.3213155	test: 2.9190619	best: 2.9190619 (4)	total: 424ms	remaining: 1m 24s
5:	learn: 3.2569911	test: 2.9219033	best: 2.9190619 (4)	total: 495ms	remaining: 1m 21s
6:	learn: 3.2098241	test: 2.9217168	best: 2.9190619 (4)	total: 558ms	remaining: 1m 19s
7:	learn: 3.1576158	test: 2.9262415	best: 2.9190619 (4)	total: 657ms	remaining: 1m 21s
8:	learn: 3.1243336	test: 2.9254832	best: 2.9190619 (4)	total: 756ms	remaining: 1m 23s
9:	learn: 3.0793773	test: 2.9227936	best: 2.9190619 (4)	total: 906ms	remaining: 1m 29s
10:	learn: 3.0333211	test: 2.9257192	best: 2.9190619 (4)	total: 1.08s	remaining: 1m 36s
11:	learn: 2

<catboost.core.CatBoostRegressor at 0x20d033d0>

In [26]:
cat_reg.score(test_X, test_y)

0.13007648405314554

In [33]:
cat_reg.best_score_

{'learn': {'RMSE': 2.3744930055062548},
 'validation': {'RMSE': 2.9190618960509824}}

In [28]:
cat_reg_rfe = CatBoostRegressor(cat_features = cat_features)
rfe_results = cat_reg_rfe.select_features(X = train_X, y = train_y, eval_set = (val_X, val_y), features_for_select = [i for i in range(10)], 
                        num_features_to_select = 6, steps = 3, algorithm = EFeaturesSelectionAlgorithm.RecursiveByShapValues,
                        shap_calc_type = EShapCalcType.Regular, logging_level='Silent')

In [30]:
rfe_results['selected_features_names']

['Height',
 'Weight',
 'Year_Introduced',
 'Eye_Color',
 'Living_Status',
 'Reality']

In [31]:
rfe_results['eliminated_features_names']

['Marital_Status', 'Identity', 'Hair_Color', 'Gender']

In [32]:
cat_reg_rfe.score(test_X, test_y)

0.1540277405099837

In [34]:
cat_reg_rfe.best_score_

{'learn': {'RMSE': 1.0229286841074865},
 'validation': {'RMSE': 2.6919874500152408}}

In [None]:
# To do: train cat_reg_rfe with hyperparameter tuning in mind then compare results