# Explanations

- This notebook focuses on CatB

- The previous notebook showed that anomaly detection models work quite well.

- Looking in the pyod document, I found the benchmarks with this paper:
  - Benchmark: https://pyod.readthedocs.io/en/latest/benchmark.html
  - Paper: https://arxiv.org/abs/2206.09426

- Here are some conclusions from the paper:
  - The performance of unsupervised algorithms depends on the data (no absolute winners).
  - semi-supervised methods can outperform unsupervised methods with a small amount of labelled data (common)
  - unsupervised may outperform supervised under certain conditions (not common)
  - Models that I will be using (Best performing models from the paper):
    - Unsupervised: IForest, COPOD, KNN, CBLOF
    - Supervised: CatB, LGBM

In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import yfinance as yf
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
import plotly.graph_objects as go
import catboost as cb
import optuna


import scripts.feature as feat
import scripts.utility as util


In [2]:
class FirmEncoder:
    def __init__(self, firms):
        self.encoder = LabelEncoder()
        self.encoder.fit(firms)

        self.scaler = StandardScaler()
        self.scaler.fit(self.encoder.transform(firms).reshape(-1, 1))
    
    def encode(self, firm):
        code = self.encoder.transform([firm]).reshape(-1, 1)
        return self.scaler.transform(code)[0][0]

In [3]:
def multi_ticker_dataset_pipeline(ticker_list, obj=[5, 0.05], normalize=False, extend=None):
    encoder = FirmEncoder(ticker_list)
    X_train, y_train, X_val, y_val = [], [], [], []

    for ticker in ticker_list:
        df = yf.download(ticker, period="5y", interval="1d", progress=False)
        df = df.drop(columns=["Adj Close"])
        firm = encoder.encode(ticker)

        Xtrain, ytrain, Xval, yval = feat.dataset_pipeline(df, 
                                                           obj=obj, 
                                                           norm=normalize, 
                                                           seq_len=None, 
                                                           extend=extend, 
                                                           firm=firm)
            
        X_train.append(Xtrain)
        y_train.append(ytrain)
        X_val.append(Xval)
        y_val.append(yval)

    X_train = np.vstack(X_train)
    y_train = np.hstack(y_train)
    X_val = np.vstack(X_val)
    y_val = np.hstack(y_val)

    return X_train, y_train, X_val, y_val

def data_prep_for_tuning(ticker_list, normalize, obj=[5, 0.05]):
    X_train, y_train, X_val, y_val = multi_ticker_dataset_pipeline(ticker_list, obj=obj, normalize=normalize, extend=3)
    print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)

    y_train = np.where(y_train <= 1, 0, 1)
    y_val = np.where(y_val <= 1, 0, 1)

    print(np.unique(y_train, return_counts=True))
    print(np.unique(y_val, return_counts=True))

    return X_train, y_train, X_val, y_val

def data_prep_for_tuning_multi(ticker_list, normalize, obj=[5, 0.05]):
    X_train, y_train, X_val, y_val = multi_ticker_dataset_pipeline(ticker_list, obj=obj, normalize=normalize, extend=3)
    print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)

    print(np.unique(y_train, return_counts=True))
    print(np.unique(y_val, return_counts=True))

    return X_train, y_train, X_val, y_val

In [4]:
def single_class_report(y_true, y_pred, label):
    instance = np.where(y_true == label)[0]
    predict = np.where(y_pred == label)[0]
    correct = np.intersect1d(instance, predict) 

    precision = len(correct) / (len(predict) + 0.01)
    recall = len(correct) / (len(instance)+ 0.01)
    
    def fbeta_score(b):
        return (1+b**2) * (precision * recall) / (b**2 * precision + recall + 0.01)
    f1 = fbeta_score(b=1)
    fbeta = fbeta_score(b=0.5)  # 0.5 beta focus more on precision

    # print(f"----- For Class {label} -----")
    # print(f"Recall: {recall}")
    # print(f"Precision: {precision}")
    # print(f"f1_score: {f1}")
    # print(f"f0.5_score: {fbeta}")

    return [recall, precision, f1, fbeta]


def quick_model_evaluation(model, X_train, y_train, X_val, y_val, label):
    model.fit(X_train, y_train)

    print("train set")
    y_pred = model.predict(X_train)
    train_result = single_class_report(y_train, y_pred, label=label)

    print("val set")
    y_pred = model.predict(X_val)
    val_result = single_class_report(y_val, y_pred, label=label)

    return np.round(train_result + val_result, 3)


def quick_unsupervised_evaluation(model, X_train, y_train, X_val, y_val, label):
    model.fit(X_train)

    # class 1 is normal, -1 is anomaly for sklearn
    # class 0 is normal,  1 is anomaly for pyod
    # print("train set")
    y_pred = model.predict(X_train)
    # y_pred = np.where(y_pred > 0, 0, 1)
    train_result = single_class_report(y_train, y_pred, label=label)

    # print("val set")
    y_pred = model.predict(X_val)
    # y_pred = np.where(y_pred > 0, 0, 1)
    val_result = single_class_report(y_val, y_pred, label=label)

    return np.round(train_result + val_result, 3)

def update_result_table(df, result, idx_name):
    new = pd.DataFrame(data=[result],
                       columns=["train_recall", "train_precision", "train_f1", "train_fbeta",
                                "val_recall", "val_precision", "val_f1", "val_fbeta"],
                       index=[idx_name])
    df = pd.concat([df, new])
    return df


# optuna tuning


In [19]:
def objective(trial):
    param = {
        "loss_function": "Logloss",
        "depth": trial.suggest_int("depth", 4, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "l2_leaf_reg": trial.suggest_uniform("l2_leaf_reg", 0.1, 10),
        "subsample": trial.suggest_float("subsample", 0.3, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.3, 1.0),
        "bagging_temperature": trial.suggest_uniform("bagging_temperature", 0.0, 1.0),
        "border_count": trial.suggest_int("border_count", 32, 255),
        "random_strength": trial.suggest_uniform("random_strength", 0.0, 10.0),
        "verbose": 0
    }

    gbm = cb.CatBoostClassifier(**param)
    gbm.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=50)
    # gbm.fit(X_train, y_train)

    y_pred = gbm.predict(X_val)
    score = single_class_report(y_val, y_pred, label=1)

    return score[3]


def multi_objective(trial):
    param = {
        "loss_function": "MultiClass",
        "depth": trial.suggest_int("depth", 4, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "l2_leaf_reg": trial.suggest_uniform("l2_leaf_reg", 0.1, 10),
        # "subsample": trial.suggest_float("subsample", 0.3, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.3, 1.0),
        "bagging_temperature": trial.suggest_uniform("bagging_temperature", 0.0, 1.0),
        "border_count": trial.suggest_int("border_count", 32, 255),
        "random_strength": trial.suggest_uniform("random_strength", 0.0, 10.0),
        "verbose": 0
    }

    gbm = cb.CatBoostClassifier(**param)
    gbm.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=50)

    y_pred = gbm.predict(X_val)
    score = single_class_report(y_val, y_pred, label=2)

    return score[3]


# Tickers (Settings)

In [6]:
ticker_list = ["TSLA", "NVDA", "AMZN", "AAPL", "MSFT", "AMD", "GOOG", "META", "NFLX"]
# ticker_list = ["TSLA", "NVDA", "AMZN"]

n_trials = 100

# Binary classification w raw data

## goal: 5 day 5% gain (high anomaly %)

In [7]:
X_train, y_train, X_val, y_val = data_prep_for_tuning(ticker_list, normalize=False, obj=[5, 0.05])

(8793, 101) (8793,) (2205, 101) (2205,)
(array([0, 1]), array([5895, 2898], dtype=int64))
(array([0, 1]), array([1640,  565], dtype=int64))


In [8]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=n_trials)

[I 2024-07-24 00:28:27,892] A new study created in memory with name: no-name-5ca2dc2e-0f68-4905-9563-17b3156565e0
  "l2_leaf_reg": trial.suggest_uniform("l2_leaf_reg", 0.1, 10),
  "bagging_temperature": trial.suggest_uniform("bagging_temperature", 0.0, 1.0),
  "random_strength": trial.suggest_uniform("random_strength", 0.0, 10.0),
[I 2024-07-24 00:28:28,754] Trial 0 finished with value: 0.1636116398114601 and parameters: {'depth': 4, 'learning_rate': 0.2232029712539639, 'l2_leaf_reg': 4.1662936779032576, 'subsample': 0.8914964177381322, 'colsample_bylevel': 0.4497577408706829, 'bagging_temperature': 0.8248913374890957, 'border_count': 189, 'random_strength': 3.532084085207984}. Best is trial 0 with value: 0.1636116398114601.
[I 2024-07-24 00:28:30,084] Trial 1 finished with value: 0.2590179227067643 and parameters: {'depth': 8, 'learning_rate': 0.2897433854905231, 'l2_leaf_reg': 6.036223129232943, 'subsample': 0.5124248701014514, 'colsample_bylevel': 0.9241597045802843, 'bagging_temper

In [9]:
optuna.visualization.plot_optimization_history(**{"study": study})

## goal: 3 day 5% gain (mid anomaly %)

In [10]:
X_train, y_train, X_val, y_val = data_prep_for_tuning(ticker_list, normalize=False, obj=[3, 0.05])

(8793, 101) (8793,) (2205, 101) (2205,)
(array([0, 1]), array([6781, 2012], dtype=int64))
(array([0, 1]), array([1862,  343], dtype=int64))


In [11]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=n_trials)

[I 2024-07-24 00:33:47,333] A new study created in memory with name: no-name-16794f90-8a0b-4077-aeef-d2cea04c3d74

suggest_uniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float instead.


suggest_uniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float instead.


suggest_uniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float instead.

[I 2024-07-24 00:33:50,022] Trial 0 finished with value: 0.1234217257281999 and parameters: {'depth': 10, 'learning_rate': 0.26175520250791723, 'l2_leaf_reg': 3.7991870219772013, 'subsample': 0.5786171033111902, 'colsample_bylevel': 0.3140591024916058, 'bagging_temperature': 0.6225633143614729, 'border_count': 123, 'random_strength': 1.1705118937276981}. Best is

In [12]:
optuna.visualization.plot_optimization_history(**{"study": study})

## goal: 3 day 7% gain (low anomaly %)

In [13]:
X_train, y_train, X_val, y_val = data_prep_for_tuning(ticker_list, normalize=False, obj=[3, 0.07])

(8793, 101) (8793,) (2205, 101) (2205,)
(array([0, 1]), array([7689, 1104], dtype=int64))
(array([0, 1]), array([2027,  178], dtype=int64))


In [14]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=n_trials)

[I 2024-07-24 00:41:44,974] A new study created in memory with name: no-name-04837a7f-483f-4174-8ee5-5f128cf45fa2

suggest_uniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float instead.


suggest_uniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float instead.


suggest_uniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float instead.

[I 2024-07-24 00:41:49,741] Trial 0 finished with value: 0.0 and parameters: {'depth': 6, 'learning_rate': 0.0799426751815176, 'l2_leaf_reg': 1.6873451580087753, 'subsample': 0.7367059877574562, 'colsample_bylevel': 0.6690832400676734, 'bagging_temperature': 0.29593221252360147, 'border_count': 146, 'random_strength': 6.950972494226173}. Best is trial 0 with val

In [15]:
optuna.visualization.plot_optimization_history(**{"study": study})

# Multi classification w raw data

## goal: 5 day 5% gain (high anomaly %)

In [16]:
X_train, y_train, X_val, y_val = data_prep_for_tuning_multi(ticker_list, normalize=False, obj=[5, 0.05])

(8793, 101) (8793,) (2205, 101) (2205,)
(array([0, 1, 2], dtype=int64), array([2332, 3563, 2898], dtype=int64))
(array([0, 1, 2], dtype=int64), array([ 437, 1203,  565], dtype=int64))


In [20]:
study = optuna.create_study(direction="maximize")
study.optimize(multi_objective, n_trials=n_trials)

[I 2024-07-24 01:06:02,683] A new study created in memory with name: no-name-cdf21934-fb2d-4e32-aef6-6cfd281e05c0

suggest_uniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float instead.


suggest_uniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float instead.


suggest_uniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float instead.

[I 2024-07-24 01:06:07,796] Trial 0 finished with value: 0.3748742970941386 and parameters: {'depth': 7, 'learning_rate': 0.08173884827735055, 'l2_leaf_reg': 8.438926957159211, 'colsample_bylevel': 0.7617875730114663, 'bagging_temperature': 0.6494093326699012, 'border_count': 69, 'random_strength': 9.740466098011598}. Best is trial 0 with value: 0.37487429709413

### best

In [21]:
optuna.visualization.plot_optimization_history(**{"study": study})

## goal: 3 day 5% gain (mid anomaly %)

In [22]:
X_train, y_train, X_val, y_val = data_prep_for_tuning_multi(ticker_list, normalize=False, obj=[3, 0.05])

(8793, 101) (8793,) (2205, 101) (2205,)
(array([0, 1, 2], dtype=int64), array([1675, 5106, 2012], dtype=int64))
(array([0, 1, 2], dtype=int64), array([ 277, 1585,  343], dtype=int64))


In [23]:
study = optuna.create_study(direction="maximize")
study.optimize(multi_objective, n_trials=n_trials)

[I 2024-07-24 01:14:34,569] A new study created in memory with name: no-name-9b1cbcfa-0d4d-4943-9aa6-cabcea39a770

suggest_uniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float instead.


suggest_uniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float instead.


suggest_uniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float instead.

[I 2024-07-24 01:14:41,797] Trial 0 finished with value: 0.25759879013504894 and parameters: {'depth': 7, 'learning_rate': 0.19295210847089975, 'l2_leaf_reg': 8.840660271473656, 'colsample_bylevel': 0.7642032284377576, 'bagging_temperature': 0.547929470503602, 'border_count': 222, 'random_strength': 6.783274653498738}. Best is trial 0 with value: 0.2575987901350

In [24]:
optuna.visualization.plot_optimization_history(**{"study": study})

## goal: 3 day 7% gain (low anomaly %)

In [25]:
X_train, y_train, X_val, y_val = data_prep_for_tuning_multi(ticker_list, normalize=False, obj=[3, 0.07])

(8793, 101) (8793,) (2205, 101) (2205,)
(array([0, 1, 2], dtype=int64), array([ 948, 6741, 1104], dtype=int64))
(array([0, 1, 2], dtype=int64), array([ 115, 1912,  178], dtype=int64))


In [28]:
study = optuna.create_study(direction="maximize")
study.optimize(multi_objective, n_trials=30)

[I 2024-07-24 01:54:48,555] A new study created in memory with name: no-name-a27fa1ec-fc3d-45bb-b9e4-b2f29a40df0e

suggest_uniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float instead.


suggest_uniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float instead.


suggest_uniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float instead.

[I 2024-07-24 01:54:51,677] Trial 0 finished with value: 0.10845602696540751 and parameters: {'depth': 5, 'learning_rate': 0.22200828137228326, 'l2_leaf_reg': 8.85048141410085, 'colsample_bylevel': 0.505932916063614, 'bagging_temperature': 0.2708889307125124, 'border_count': 238, 'random_strength': 5.27474149190278}. Best is trial 0 with value: 0.108456026965407

In [29]:
optuna.visualization.plot_optimization_history(**{"study": study})