In [None]:
!pip install optuna

In [None]:
!pip install catboost

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score, balanced_accuracy_score, confusion_matrix, ConfusionMatrixDisplay, f1_score, roc_auc_score, matthews_corrcoef
from sklearn.model_selection import train_test_split, KFold

import optuna
from optuna.integration import CatBoostPruningCallback

from catboost import CatBoostClassifier

In [None]:
# Load the dataset from the CSV file (path is relative to the working directory)
# and display the resulting frame.
df = pd.read_csv('sample_data/final_df_v0.csv')
df

In [None]:
# Count how many rows carry each distinct value of the 'target' column.
df['target'].value_counts()

In [None]:
# Annotated heatmap of the pairwise correlations between the numeric columns.
# numeric_only=True keeps this cell working while non-numeric columns are still
# present in df: pandas >= 2.0 raises on them instead of silently skipping them.
plt.figure(figsize = (20, 20))
sns.heatmap(df.corr(numeric_only = True), annot = True, cmap = 'viridis');

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Remove the 'time' column. errors='ignore' makes the cell idempotent: re-running
# it after the column is already gone no longer raises KeyError.
df = df.drop(columns = ['time'], errors = 'ignore')
df

In [None]:
# Separate the label column from the feature columns.
y = df['target']
X = df.drop(columns = 'target')

In [None]:
# Histogram of every feature column, laid out on a grid three plots wide.
# The row count grows with the number of features (never below the original
# three rows), so the cell no longer fails once there are more than 9 columns.
N_HIST_COLS = 3
n_hist_rows = max(3, -(-len(X.columns) // N_HIST_COLS))  # ceil division

# Keep the original 20x20 canvas for three rows and grow the height beyond that.
plt.figure(figsize = (20, 20 * n_hist_rows / 3))

for i, x in enumerate(X.columns, start = 1):
    plt.subplot(n_hist_rows, N_HIST_COLS, i)
    sns.histplot(X[x])
    plt.title(x)

# CatBoost + Optuna

In [None]:
# Shuffle the rows and hold out 30% of them as the test split (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    shuffle = True,
    random_state = 42,
    test_size = 0.3,
)

In [None]:
def objective_catboost(trial, X, y):
    """Optuna objective: mean 5-fold cross-validated accuracy of a CatBoostClassifier.

    A quarter of ``X``/``y`` is set aside as the evaluation set used for early
    stopping and pruning. The remaining rows are split into five folds; for each
    fold a model is trained on the fold's training part and scored on its
    test part.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample hyperparameters and report intermediate values.
    X : pandas.DataFrame
        Feature matrix (rows are selected positionally with ``.iloc``).
    y : pandas.Series
        Labels aligned row-by-row with ``X``.

    Returns
    -------
    float
        Accuracy averaged over the five folds.

    Raises
    ------
    optuna.TrialPruned
        Raised by ``check_pruned()`` when the pruning callback stopped the trial.
    """
    # Fixed seed so every trial is compared on the same evaluation split.
    train_x, valid_x, train_y, valid_y = train_test_split(X, y, test_size = 0.25, random_state = 42)

    param_grid_catboost = { 'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
                            'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.01, 0.1, log = True),
                            'boosting_type': trial.suggest_categorical('boosting_type', ['Ordered', 'Plain']),
                            'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS']),
                            'iterations': trial.suggest_int('iterations', 100, 1200),
                            'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-5, 10, log = True),
                            'border_count': trial.suggest_int('border_count', 32, 255),
                            'random_strength': trial.suggest_float("random_strength", 1e-8, 10.0, log = True),
                            'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
                            'od_wait': trial.suggest_int('od_wait', 10, 50),
                            'depth': trial.suggest_int('depth', 1, 12),
                            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf',1, 30),
                            'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations', 1, 15),
                            'eval_metric': 'Accuracy',
                            'task_type': 'CPU',
                           }

    # These two parameters are only sampled for the bootstrap type that uses them.
    if param_grid_catboost['bootstrap_type'] == 'Bayesian':
        param_grid_catboost['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0, 10)
    elif param_grid_catboost['bootstrap_type'] == 'Bernoulli':
        param_grid_catboost['subsample'] = trial.suggest_float('subsample', 0.1, 1, log = True)

    pruning_callback = CatBoostPruningCallback(trial, 'Accuracy')

    cv = KFold(n_splits = 5, shuffle = True, random_state = 42)

    fold_scores = []
    for fold_idx, (train_idx, test_idx) in enumerate(cv.split(train_x, train_y)):
        # The indices are positions within train_x / train_y (what cv.split was
        # given), so they must be applied to those frames and not to the full X / y.
        fold_train_x, fold_test_x = train_x.iloc[train_idx], train_x.iloc[test_idx]
        fold_train_y, fold_test_y = train_y.iloc[train_idx], train_y.iloc[test_idx]

        catboost_model = CatBoostClassifier(**param_grid_catboost)

        # Only the first fold reports to the pruner: the callback reports one value
        # per boosting iteration, and later folds would re-report the same steps.
        is_first_fold = fold_idx == 0

        # NOTE(review): early_stopping_rounds presumably takes precedence over the
        # sampled od_type / od_wait during this fit — confirm against the CatBoost docs.
        catboost_model.fit(fold_train_x,
                           fold_train_y,
                           eval_set = [(valid_x, valid_y)],
                           early_stopping_rounds = 100,
                           callbacks = [pruning_callback] if is_first_fold else None)

        if is_first_fold:
            # Abort right away instead of training the remaining folds of a pruned trial.
            pruning_callback.check_pruned()

        fold_scores.append(accuracy_score(fold_test_y, catboost_model.predict(fold_test_x)))

    # Average over all folds rather than returning only the last fold's score.
    return float(np.mean(fold_scores))

In [None]:
# Maximise the objective; the median pruner starts judging trials after 5 warm-up steps.
study_catboost = optuna.create_study(
    direction = 'maximize',
    pruner = optuna.pruners.MedianPruner(n_warmup_steps = 5),
)


def func_catboost(trial):
    """Run the CatBoost objective for ``trial`` on the training split."""
    return objective_catboost(trial, X_train, y_train)


study_catboost.optimize(func_catboost, n_trials = 100)

In [None]:
# Plot the optimization history of the finished study.
optuna.visualization.plot_optimization_history(study_catboost)

In [None]:
# Plot the hyperparameter importances estimated from the study's trials.
optuna.visualization.plot_param_importances(study_catboost)

In [None]:
# Hyperparameter values of the study's best trial.
study_catboost.best_params

In [None]:
# Cross-validate the tuned hyperparameters on the training split only, so the
# hold-out split (X_test / y_test) stays untouched and is not overwritten by
# the fold variables.
cv = KFold(n_splits = 5, shuffle = True, random_state = 42)

fold_scores = []
for fold_train_idx, fold_valid_idx in cv.split(X_train, y_train):
    fold_model = CatBoostClassifier(**study_catboost.best_params)
    fold_model.fit(X_train.iloc[fold_train_idx], y_train.iloc[fold_train_idx])

    fold_predictions = fold_model.predict(X_train.iloc[fold_valid_idx])
    fold_scores.append(accuracy_score(y_train.iloc[fold_valid_idx], fold_predictions))

# Report every fold, not just the last one.
print('CV accuracy: %.4f +/- %.4f' % (np.mean(fold_scores), np.std(fold_scores)))

# Final model: refit on the whole training split and score it on the hold-out
# rows, which neither the tuning nor this fit has seen.
catboost_model = CatBoostClassifier(**study_catboost.best_params).fit(X_train, y_train)

print(accuracy_score(y_test, catboost_model.predict(X_test)))

In [None]:
# Persist the fitted model to disk.
# NOTE(review): no `format` argument is passed, so the file is written in
# save_model's default format even though its name ends in .json — confirm
# whether format='json' was intended before relying on the extension.
catboost_model.save_model('catboost_steps.json')

In [None]:
# confusion_matrix expects (y_true, y_pred): rows are the true labels and columns
# the predictions, which is how ConfusionMatrixDisplay labels its axes. Passing
# the predictions first would yield the transposed matrix.
cm = confusion_matrix(y_test, catboost_model.predict(X_test))
cm

In [None]:
# Inspect the class labels stored on the fitted classifier.
catboost_model.classes_

In [None]:
# Render the confusion matrix with human-readable activity names on the axes.
activity_labels = ['No activity', 'Walk', 'Run']
disp = ConfusionMatrixDisplay(display_labels = activity_labels, confusion_matrix = cm)
disp.plot()
plt.show()