## Imports

In [1]:
import numpy as np
import pandas as pd
import sklearn as sk
import catboost as cb

In [2]:
# Data
df = pd.read_csv("./data/superset.csv")

## Processing

In [3]:
df = (
    df.set_axis([df["Unnamed: 0"]]) 
    .drop(["Unnamed: 0"], axis=1)
) if "Unnamed: 0" in df.columns else df # So we can rerun the notebook everytime

In [19]:
cat_2_num = {cat:num for num, cat in enumerate(df.Subgroup.unique())}

In [21]:
df.Subgroup = df.Subgroup.replace(cat_2_num)

## Splitting

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
X_ = df.drop(columns=["Subgroup"])
y_ = df.Subgroup

X_train, X_test, y_train, y_test = train_test_split(X_, y_, test_size=0.25, random_state=42)

## Building

### CatBoost

https://catboost.ai/docs/concepts/about.html

In [24]:
print(cb.__version__)

0.25.1


In [31]:
X_trainset, X_val, y_trainset, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

train_set = cb.Pool(X_trainset, label=y_trainset)
eval_set = cb.Pool(X_val, label=y_val)

In [38]:
# Start the CatBoost Object
cbc_params = {
    'iterations':150, 
    'learning_rate':0.01, 
    'l2_leaf_reg':30, 
    'random_seed':44,
    'verbose':10,
    'loss_function':'MultiClass'}

cbc_alt_params = {
    'iterations':1000,
    'learning_rate':0.1,
    'random_strength':0.1,
    'depth':8,
    'loss_function':'MultiClass',
    'eval_metric':'Accuracy',
    'leaf_estimation_method':'Newton'
}

cbc_altered_params = {
    'iterations':1000,
    'verbose':10,
    'loss_function':'MultiClass',
}

CBC = cb.CatBoostClassifier(**cbc_altered_params)

In [39]:
CBC.fit(
    train_set,
    eval_set=(X_val, y_val),
    verbose=True,
    # plot=True # Not possible within DeepNote (Does not support IPywidgets yet)
)

6:	learn: 0.9293837	test: 1.0860747	best: 1.0713970 (5)	total: 403ms	remaining: 57.1s
7:	learn: 0.9102750	test: 1.0912555	best: 1.0713970 (5)	total: 444ms	remaining: 55s
8:	learn: 0.8910764	test: 1.0924986	best: 1.0713970 (5)	total: 499ms	remaining: 55s
9:	learn: 0.8561106	test: 1.0606149	best: 1.0606149 (9)	total: 538ms	remaining: 53.3s
10:	learn: 0.8381138	test: 1.0593808	best: 1.0593808 (10)	total: 593ms	remaining: 53.3s
11:	learn: 0.8177010	test: 1.0568947	best: 1.0568947 (11)	total: 631ms	remaining: 51.9s
12:	learn: 0.7955919	test: 1.0388640	best: 1.0388640 (12)	total: 693ms	remaining: 52.6s
13:	learn: 0.7777126	test: 1.0360278	best: 1.0360278 (13)	total: 732ms	remaining: 51.5s
14:	learn: 0.7657189	test: 1.0392075	best: 1.0360278 (13)	total: 789ms	remaining: 51.8s
15:	learn: 0.7537612	test: 1.0400279	best: 1.0360278 (13)	total: 828ms	remaining: 50.9s
16:	learn: 0.7410338	test: 1.0386712	best: 1.0360278 (13)	total: 889ms	remaining: 51.4s
17:	learn: 0.7276988	test: 1.0321857	best: 1

<catboost.core.CatBoostClassifier at 0x7f0f02914d30>

In [40]:
CBC.get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,2184,28.928688
1,672,1.648304
2,1687,1.583237
3,833,1.455189
4,1656,1.437726
...,...,...
2829,2826,0.000000
2830,2827,0.000000
2831,2828,0.000000
2832,2830,0.000000


In [41]:
y_preds = CBC.predict(eval_set)
y_preds = np.reshape(y_preds, y_val.values.shape)

In [64]:
f"{round(100 - len(*np.nonzero(y_preds - y_val.values)) / len(y_preds) * 100)}% Accuracy"

'79% Accuracy'

In [None]:
# my_best_model.save_model('catboost_model.bin')

### Default CatBoost Test

In [66]:
default_CBC = cb.CatBoostClassifier()

In [71]:
default_CBC.fit(
    train_set,
    eval_set=(X_val, y_val),
    verbose=False,
)

<catboost.core.CatBoostClassifier at 0x7f0efe595ca0>

In [72]:
def_y_preds = default_CBC.predict(eval_set)
def_y_preds = np.reshape(def_y_preds, y_val.values.shape)

In [73]:
f"{round(100 - len(*np.nonzero(y_preds - y_val.values)) / len(y_preds) * 100)}% Accuracy"

'79% Accuracy'

### SKlearn Based GBC

In [74]:
from sklearn.ensemble import GradientBoostingClassifier

In [75]:
hard_params = {'n_estimators': 1000, 'max_leaf_nodes': 4, 'max_depth': None, 'random_state': 2,
                'min_samples_split': 5}

# intermediate params

params = dict(hard_params)
# params.update(inter_params)

In [76]:
# X_trainset, X_val, y_trainset, y_val
GBC = GradientBoostingClassifier(**params)
GBC.fit(X_trainset, y_trainset)

GradientBoostingClassifier(max_depth=None, max_leaf_nodes=4,
                           min_samples_split=5, n_estimators=1000,
                           random_state=2)

In [77]:
GBC.score(X_val, y_val)

0.8421052631578947

## k-fold cross-validation:
It seems like you can implement the cross-validation for the train-validation split in this way. (Use to evaluate different models using k-fold CV on the training set
 & pick the final model, which is then used to predict the test set!)

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

model = GBC
kfold = KFold(n_splits=7,shuffle = True, random_state=10)
results = cross_val_score(model, X_trainset, y_trainset, cv=kfold)
print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Accuracy: 81.17% (7.44%)


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=c915e4f9-60c2-40b5-a522-8a90cb3fd50a' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>