# Predicting the Number of Appearances of Marvel Characters using Catboost

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor, EShapCalcType, EFeaturesSelectionAlgorithm

In [2]:
df = pd.read_csv('characters_cleaned.csv', index_col = [0])
df.head(5)

Unnamed: 0,Name,Gender,Marital_Status,Height,Weight,Eye_Color,Hair_Color,Living_Status,Reality,Identity,Year_Introduced,Appearances,Average_Annual_Appearances
0,A'Lars (Earth-9997),Male,Married,,,Blue,Black,Alive,Earth-9997,Secret,2000.0,2.0,0.090909
1,A'Kurru U'mbaya (Earth-616),Male,Single,,,White,No hair at all,Alive,Earth-616,Secret,1992.0,2.0,0.066667
2,A'Lars (Earth-616),Male,Married,1.85,95.25,Blue,Black,Alive,Earth-616,Public,1972.0,65.0,1.3
3,'Lectron (Earth-12772),Male,,,,,,Alive,Earth-12772,Secret,1997.0,1.0,0.04
4,A'Kane (Earth-616),Female,Widowed,,,,Black,Deceased,Earth-616,No Dual Identity,1995.0,1.0,0.037037


In [3]:
cat_features = ['Gender', 'Marital_Status', 'Eye_Color', 'Hair_Color', 'Living_Status', 'Reality', 'Identity']
all_features = ['Height', 'Weight', 'Year_Introduced']
all_features.extend(cat_features)
X = df[all_features].copy()
y = df['Average_Annual_Appearances'].copy()
print(X.shape, y.shape)

(64193, 10) (64193,)


In [4]:
X[cat_features] = X[cat_features].astype('str')
X.dtypes

Height             float64
Weight             float64
Year_Introduced    float64
Gender              object
Marital_Status      object
Eye_Color           object
Hair_Color          object
Living_Status       object
Reality             object
Identity            object
dtype: object

In [5]:
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size = 0.20)
train_X, val_X, train_y, val_y = train_test_split(train_X, train_y, test_size = 0.20)
print(train_X.shape, val_X.shape, test_X.shape, train_y.shape, val_y.shape, test_y.shape)

(41083, 10) (10271, 10) (12839, 10) (41083,) (10271,) (12839,)


In [6]:
cat_reg = CatBoostRegressor()
cat_reg.fit(X = train_X, y = train_y, cat_features = cat_features, use_best_model = True, eval_set = (val_X, val_y), 
            early_stopping_rounds = 30)

Learning rate set to 0.091252
0:	learn: 1.2958810	test: 1.2063255	best: 1.2063255 (0)	total: 288ms	remaining: 4m 48s
1:	learn: 1.2799105	test: 1.1915980	best: 1.1915980 (1)	total: 498ms	remaining: 4m 8s
2:	learn: 1.2647888	test: 1.1781478	best: 1.1781478 (2)	total: 672ms	remaining: 3m 43s
3:	learn: 1.2521944	test: 1.1664152	best: 1.1664152 (3)	total: 816ms	remaining: 3m 23s
4:	learn: 1.2430409	test: 1.1557187	best: 1.1557187 (4)	total: 971ms	remaining: 3m 13s
5:	learn: 1.2333710	test: 1.1469008	best: 1.1469008 (5)	total: 1.12s	remaining: 3m 5s
6:	learn: 1.2242849	test: 1.1396765	best: 1.1396765 (6)	total: 1.25s	remaining: 2m 57s
7:	learn: 1.2176004	test: 1.1319584	best: 1.1319584 (7)	total: 1.41s	remaining: 2m 54s
8:	learn: 1.2114107	test: 1.1263988	best: 1.1263988 (8)	total: 1.64s	remaining: 3m 1s
9:	learn: 1.2054172	test: 1.1232895	best: 1.1232895 (9)	total: 1.78s	remaining: 2m 55s
10:	learn: 1.2000427	test: 1.1192335	best: 1.1192335 (10)	total: 1.91s	remaining: 2m 51s
11:	learn: 1.1

<catboost.core.CatBoostRegressor at 0x1aaa69d0>

In [7]:
cat_reg.score(test_X, test_y)

0.2681235834862088

In [8]:
cat_reg.best_score_

{'learn': {'RMSE': 1.0830684452868142},
 'validation': {'RMSE': 1.0617558462619412}}

In [9]:
cat_reg_rfe = CatBoostRegressor(cat_features = cat_features)
rfe_results = cat_reg_rfe.select_features(X = train_X, y = train_y, eval_set = (val_X, val_y), features_for_select = [i for i in range(10)], 
                        num_features_to_select = 6, steps = 3, algorithm = EFeaturesSelectionAlgorithm.RecursiveByShapValues,
                        shap_calc_type = EShapCalcType.Regular, logging_level='Silent')

In [10]:
rfe_results['selected_features_names']

['Height',
 'Year_Introduced',
 'Gender',
 'Marital_Status',
 'Reality',
 'Identity']

In [11]:
rfe_results['eliminated_features_names']

['Hair_Color', 'Weight', 'Eye_Color', 'Living_Status']

In [12]:
cat_reg_rfe.score(test_X, test_y)

0.2519105114984882

In [13]:
cat_reg_rfe.best_score_

{'learn': {'RMSE': 0.9196034604436184},
 'validation': {'RMSE': 1.0430938433805355}}

In [None]:
# To do: use a for loop to iterate through all num_features_to_select from 3 to 9 and get the best result. tune both the old and the rfe models.