## Imports

In [1]:
import numpy as np
import pandas as pd
import sklearn as sk
import catboost as cb

## Processing

In [3]:
df = (pd
    .read_csv("./data/superset.csv")
    .set_axis([df["Unnamed: 0"]]) 
    .drop(["Unnamed: 0"], axis=1)
)

In [6]:
df.Subgroup = df.Subgroup.replace(
    {cat:num for num, cat in enumerate(df.Subgroup.unique())}
    ) # Create numerical target (easier for the models to use)

## Splitting

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
X_ = df.drop(columns=["Subgroup"])
y_ = df.Subgroup

# Initial Datasets
X_train, X_test, y_train, y_test = train_test_split(X_, y_, test_size=0.25, random_state=42)

## Building

### CatBoost

https://catboost.ai/docs/concepts/about.html

In [10]:
print(cb.__version__) # Consistency

0.25.1


In [13]:
X_trainset, X_val, y_trainset, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

# Apply a validation-like training set
train_set = cb.Pool(X_trainset, label=y_trainset)
eval_set = cb.Pool(X_val, label=y_val)

entire_train = cb.Pool(X_train, label=y_train)

In [16]:
# Set grid search params
cbc_params = {
    'random_seed': 88,
    'verbose': 10,
    'loss_function': 'MultiClass'}

CBC = cb.CatBoostClassifier(**cbc_params)

grid_params ={
    'iterations': [100,500,1000],
    'learning_rate': [0.01,0.05,0.1],
    'l2_leaf_reg': [1,3,5,7,9,11,20],
    'depth': [6, 8, 10],
    'random_strength': [0.01, 0.05, 0.1]}

# Grid Search does automatic CV. So we only need to parse the initial training data
CBC_grid = CBC.grid_search(grid_params, X=entire_train, partition_random_seed=12)

330:	learn: 0.0366226	test: 0.5515693	best: 0.5515693 (330)	total: 25.6s	remaining: 13.1s
340:	learn: 0.0354084	test: 0.5485445	best: 0.5484990 (339)	total: 26.4s	remaining: 12.3s
350:	learn: 0.0341918	test: 0.5462328	best: 0.5457299 (347)	total: 27.2s	remaining: 11.5s
360:	learn: 0.0330496	test: 0.5466947	best: 0.5457299 (347)	total: 28s	remaining: 10.8s
370:	learn: 0.0321555	test: 0.5439941	best: 0.5439941 (370)	total: 28.8s	remaining: 10s
380:	learn: 0.0311775	test: 0.5422310	best: 0.5418757 (378)	total: 29.5s	remaining: 9.23s
390:	learn: 0.0304271	test: 0.5439746	best: 0.5418757 (378)	total: 30.3s	remaining: 8.44s
400:	learn: 0.0296431	test: 0.5433384	best: 0.5418757 (378)	total: 31.1s	remaining: 7.67s
410:	learn: 0.0288073	test: 0.5429789	best: 0.5418757 (378)	total: 31.9s	remaining: 6.9s
420:	learn: 0.0281301	test: 0.5415842	best: 0.5415842 (420)	total: 32.7s	remaining: 6.13s
430:	learn: 0.0274486	test: 0.5412600	best: 0.5405381 (426)	total: 33.4s	remaining: 5.35s
440:	learn: 0.0

In [None]:
CBC_grid.params

Unnamed: 0,Feature Id,Importances
0,2184,28.928688
1,672,1.648304
2,1687,1.583237
3,833,1.455189
4,1656,1.437726
...,...,...
2829,2826,0.000000
2830,2827,0.000000
2831,2828,0.000000
2832,2830,0.000000


In [None]:
FCBC = cb.CatBoostClassifier(**CBC_grid.params)
FCBC.fit(train_set)

In [None]:
final_y_preds = FCBC.predict(eval_set)
final_y_preds = np.reshape(y_preds, y_val.values.shape)

In [None]:
f"{round(100 - len(*np.nonzero(y_preds - y_val.values)) / len(y_preds) * 100)}% Accuracy"

'79% Accuracy'

In [None]:
# my_best_model.save_model('catboost_model.bin')

### Default CatBoost Test

In [None]:
default_CBC = cb.CatBoostClassifier()

In [None]:
default_CBC.fit(
    train_set,
    eval_set=(X_val, y_val),
    verbose=False,
)

KeyboardInterrupt: 

In [None]:
def_y_preds = default_CBC.predict(eval_set)
def_y_preds = np.reshape(def_y_preds, y_val.values.shape)

In [None]:
f"{round(100 - len(*np.nonzero(y_preds - y_val.values)) / len(y_preds) * 100)}% Accuracy"

'79% Accuracy'

### Default CatBoost Decomp

In [None]:
y_trainset.values.shape

(56,)

In [None]:
y_val.values.shape

(19,)

In [None]:
avg_expected_loss_CBC, avg_bias_CBC, avg_var_CBC = bias_variance_decomp(
        default_CBC, X_trainset.values, y_trainset.values, X_val, y_val, # Used .values since mlxtend extracts better from numpy arrays than pandas dfs
        loss='0-1_loss',
        random_seed=66)

print(f'Average expected loss: {avg_expected_loss_CBC}')
print(f'Average bias: {avg_bias_CBC}')
print(f'Average variance: {avg_var_CBC}')

1:	learn: 1.0426174	total: 151ms	remaining: 1m 15s
2:	learn: 1.0160193	total: 230ms	remaining: 1m 16s
3:	learn: 1.0011996	total: 294ms	remaining: 1m 13s
4:	learn: 0.9774458	total: 381ms	remaining: 1m 15s
5:	learn: 0.9461042	total: 456ms	remaining: 1m 15s
6:	learn: 0.9244746	total: 529ms	remaining: 1m 15s
7:	learn: 0.9097368	total: 600ms	remaining: 1m 14s
8:	learn: 0.8876313	total: 676ms	remaining: 1m 14s
9:	learn: 0.8647059	total: 750ms	remaining: 1m 14s
10:	learn: 0.8488065	total: 829ms	remaining: 1m 14s
11:	learn: 0.8320039	total: 899ms	remaining: 1m 13s
12:	learn: 0.8142801	total: 981ms	remaining: 1m 14s
13:	learn: 0.7965193	total: 1.06s	remaining: 1m 14s
14:	learn: 0.7805144	total: 1.13s	remaining: 1m 14s
15:	learn: 0.7692651	total: 1.22s	remaining: 1m 14s
16:	learn: 0.7545986	total: 1.28s	remaining: 1m 13s
17:	learn: 0.7379340	total: 1.35s	remaining: 1m 13s
18:	learn: 0.7162706	total: 1.43s	remaining: 1m 14s
19:	learn: 0.7063683	total: 1.51s	remaining: 1m 14s
20:	learn: 0.6923661	

ValueError: could not broadcast input array from shape (19,1) into shape (19)

### SKlearn Based GBC

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
hard_params = {'n_estimators': 1000, 'max_leaf_nodes': 4, 'max_depth': None, 'random_state': 2,
                'min_samples_split': 5}

# intermediate params

params = dict(hard_params)
# params.update(inter_params)

In [None]:
# X_trainset, X_val, y_trainset, y_val
GBC = GradientBoostingClassifier(**params)
GBC.fit(X_trainset, y_trainset)

GradientBoostingClassifier(max_depth=None, max_leaf_nodes=4,
                           min_samples_split=5, n_estimators=1000,
                           random_state=2)

In [None]:
GBC.score(X_val, y_val)

0.8421052631578947

## attempt to plot the training vs validation to identify overfitting


In [None]:
# WIP
# attempt to plot the training process(https://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_regression.html)
# training vs validation deviance


y_val_list = y_val.to_numpy()



test_score = np.zeros((params['n_estimators'],), dtype=np.float64)
for i, y_pred in enumerate(GBC.staged_predict(X_val)):
    test_score[i] = GBC.loss_(y_val_list, y_pred)

fig = plt.figure(figsize=(6, 6))
plt.subplot(1, 1, 1)
plt.title('Deviance')
plt.plot(np.arange(params['n_estimators']) + 1, reg.train_score_, 'b-',
         label='Training Set Deviance')
plt.plot(np.arange(params['n_estimators']) + 1, test_score, 'r-',
         label='Test Set Deviance')
plt.legend(loc='upper right')
plt.xlabel('Boosting Iterations')
plt.ylabel('Deviance')
fig.tight_layout()
plt.show()


ValueError: operands could not be broadcast together with shapes (19,3) (19,) 

### SKlearn decomposition

In [None]:
from mlxtend.evaluate import bias_variance_decomp

In [None]:
def_GBC = GradientBoostingClassifier(**params)

avg_expected_loss_GBC, avg_bias_GBC, avg_var_GBC = bias_variance_decomp(
        def_GBC, X_trainset.values, y_trainset.values, X_val, y_val, # Used .values since mlxtend extracts better from numpy arrays than pandas dfs
        loss='0-1_loss',
        random_seed=56)

print(f'Average expected loss: {avg_expected_loss_GBC}')
print(f'Average bias: {avg_bias_GBC}')
print(f'Average variance: {avg_var_GBC}')

Average expected loss: 0.22289473684210523
Average bias: 0.15789473684210525
Average variance: 0.16210526315789475


## k-fold cross-validation:
It seems like you can implement the cross-validation for the train-validation split in this way. (Use to evaluate different models using k-fold CV on the training set
 & pick the final model, which is then used to predict the test set!)

 *Question*: 

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

model = GBC
kfold = KFold(n_splits=7,shuffle = True, random_state=10)
results = cross_val_score(model, X_trainset, y_trainset, cv=kfold)
print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Accuracy: 76.79% (12.37%)


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=c915e4f9-60c2-40b5-a522-8a90cb3fd50a' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>