# Explanations

- This notebook focuses on lightgbm

- The previous notebook showed that anomaly detection models work quite well.

- Looking in the pyod document, I found the benchmarks with this paper:
  - Benchmark: https://pyod.readthedocs.io/en/latest/benchmark.html
  - Paper: https://arxiv.org/abs/2206.09426

- Here are some conclusions from the paper:
  - The performance of unsupervised algorithms depends on the data (no absolute winners).
  - semi-supervised methods can outperform unsupervised methods with a small amount of labelled data (common)
  - unsupervised may outperform supervised under certain conditions (not common)
  - Models that I will be using (Best performing models from the paper):
    - Unsupervised: IForest, COPOD, KNN, CBLOF
    - Supervised: CatB, LGBM

In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import yfinance as yf
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
import plotly.graph_objects as go
import lightgbm as lgb
import optuna


import scripts.feature as feat
import scripts.utility as util


In [2]:
class FirmEncoder:
    def __init__(self, firms):
        self.encoder = LabelEncoder()
        self.encoder.fit(firms)

        self.scaler = StandardScaler()
        self.scaler.fit(self.encoder.transform(firms).reshape(-1, 1))
    
    def encode(self, firm):
        code = self.encoder.transform([firm]).reshape(-1, 1)
        return self.scaler.transform(code)[0][0]

In [3]:
def multi_ticker_dataset_pipeline(ticker_list, obj=[5, 0.05], normalize=False, extend=None):
    encoder = FirmEncoder(ticker_list)
    X_train, y_train, X_val, y_val = [], [], [], []

    for ticker in ticker_list:
        df = yf.download(ticker, period="5y", interval="1d", progress=False)
        df = df.drop(columns=["Adj Close"])
        firm = encoder.encode(ticker)

        Xtrain, ytrain, Xval, yval = feat.dataset_pipeline(df, 
                                                           obj=obj, 
                                                           norm=normalize, 
                                                           seq_len=None, 
                                                           extend=extend, 
                                                           firm=firm)
            
        X_train.append(Xtrain)
        y_train.append(ytrain)
        X_val.append(Xval)
        y_val.append(yval)

    X_train = np.vstack(X_train)
    y_train = np.hstack(y_train)
    X_val = np.vstack(X_val)
    y_val = np.hstack(y_val)

    return X_train, y_train, X_val, y_val

def data_prep_for_tuning(ticker_list, normalize, obj=[5, 0.05]):
    X_train, y_train, X_val, y_val = multi_ticker_dataset_pipeline(ticker_list, obj=obj, normalize=normalize, extend=3)
    print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)

    y_train = np.where(y_train <= 1, 0, 1)
    y_val = np.where(y_val <= 1, 0, 1)

    print(np.unique(y_train, return_counts=True))
    print(np.unique(y_val, return_counts=True))

    return X_train, y_train, X_val, y_val

def data_prep_for_tuning_multi(ticker_list, normalize, obj=[5, 0.05]):
    X_train, y_train, X_val, y_val = multi_ticker_dataset_pipeline(ticker_list, obj=obj, normalize=normalize, extend=3)
    print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)

    print(np.unique(y_train, return_counts=True))
    print(np.unique(y_val, return_counts=True))

    return X_train, y_train, X_val, y_val

In [4]:
def single_class_report(y_true, y_pred, label):
    instance = np.where(y_true == label)[0]
    predict = np.where(y_pred == label)[0]
    correct = np.intersect1d(instance, predict) 

    precision = len(correct) / (len(predict) + 0.01)
    recall = len(correct) / (len(instance)+ 0.01)
    
    def fbeta_score(b):
        return (1+b**2) * (precision * recall) / (b**2 * precision + recall + 0.01)
    f1 = fbeta_score(b=1)
    fbeta = fbeta_score(b=0.5)  # 0.5 beta focus more on precision

    # print(f"----- For Class {label} -----")
    # print(f"Recall: {recall}")
    # print(f"Precision: {precision}")
    # print(f"f1_score: {f1}")
    # print(f"f0.5_score: {fbeta}")

    return [recall, precision, f1, fbeta]


def quick_model_evaluation(model, X_train, y_train, X_val, y_val, label):
    model.fit(X_train, y_train)

    print("train set")
    y_pred = model.predict(X_train)
    train_result = single_class_report(y_train, y_pred, label=label)

    print("val set")
    y_pred = model.predict(X_val)
    val_result = single_class_report(y_val, y_pred, label=label)

    return np.round(train_result + val_result, 3)


def quick_unsupervised_evaluation(model, X_train, y_train, X_val, y_val, label):
    model.fit(X_train)

    # class 1 is normal, -1 is anomaly for sklearn
    # class 0 is normal,  1 is anomaly for pyod
    # print("train set")
    y_pred = model.predict(X_train)
    # y_pred = np.where(y_pred > 0, 0, 1)
    train_result = single_class_report(y_train, y_pred, label=label)

    # print("val set")
    y_pred = model.predict(X_val)
    # y_pred = np.where(y_pred > 0, 0, 1)
    val_result = single_class_report(y_val, y_pred, label=label)

    return np.round(train_result + val_result, 3)

def update_result_table(df, result, idx_name):
    new = pd.DataFrame(data=[result],
                       columns=["train_recall", "train_precision", "train_f1", "train_fbeta",
                                "val_recall", "val_precision", "val_f1", "val_fbeta"],
                       index=[idx_name])
    df = pd.concat([df, new])
    return df


# optuna tuning


In [5]:
def objective(trial):
    param = {
        "objective": "binary",
        "metric": "binary_logloss",
        "verbosity": -1,
        # controls tree structure
        "num_leaves": trial.suggest_int("num_leaves", 16, 512),
        "max_depth": trial.suggest_int("max_depth", 3, 16),
        # for accuracy
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        # controls overfitting
        "lambda_l1": trial.suggest_int("lambda_l1", 0.1, 100),
        "lambda_l2": trial.suggest_int("lambda_l2", 0.1, 100),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.3, 1),
        "feature_fraction": trial.suggest_float("bagging_fraction", 0.3, 1),
    }

    gbm = lgb.LGBMClassifier(**param)
    gbm.fit(X_train, y_train)

    y_pred = gbm.predict(X_val)
    score = single_class_report(y_val, y_pred, label=1)

    return score[3]


def multi_objective(trial):
    param = {
        "objective": "multiclass",
        "metric": "multi_logloss",
        "num_class": 3,
        "verbosity": -1,
        # controls tree structure
        "num_leaves": trial.suggest_int("num_leaves", 16, 512),
        "max_depth": trial.suggest_int("max_depth", 3, 16),
        # for accuracy
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        # controls overfitting
        "lambda_l1": trial.suggest_int("lambda_l1", 0.1, 100),
        "lambda_l2": trial.suggest_int("lambda_l2", 0.1, 100),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.3, 1),
        "feature_fraction": trial.suggest_float("bagging_fraction", 0.3, 1),
    }

    gbm = lgb.LGBMClassifier(**param)
    gbm.fit(X_train, y_train)

    y_pred = gbm.predict(X_val)
    score = single_class_report(y_val, y_pred, label=2)

    return score[3]


# Tickers (Settings)

In [9]:
ticker_list = ["TSLA", "NVDA", "AMZN", "AAPL", "MSFT", "AMD", "GOOG", "META", "NFLX"]
# ticker_list = ["TSLA", "NVDA", "AMZN"]

n_trials = 100

# Binary classification w raw data

## goal: 5 day 5% gain (high anomaly %)

In [10]:
X_train, y_train, X_val, y_val = data_prep_for_tuning(ticker_list, normalize=False, obj=[5, 0.05])

(8793, 101) (8793,) (2205, 101) (2205,)
(array([0, 1]), array([5895, 2898], dtype=int64))
(array([0, 1]), array([1640,  565], dtype=int64))


In [11]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=n_trials)

[I 2024-07-23 23:09:35,543] A new study created in memory with name: no-name-1b93f7f8-58d5-416f-85c1-4c5d1cebf143
[I 2024-07-23 23:09:35,759] Trial 0 finished with value: 0.04255990041876114 and parameters: {'num_leaves': 130, 'max_depth': 13, 'learning_rate': 0.026099789965358847, 'lambda_l1': 44, 'lambda_l2': 57, 'bagging_fraction': 0.6835243625639664}. Best is trial 0 with value: 0.04255990041876114.
[I 2024-07-23 23:09:36,024] Trial 1 finished with value: 0.2955433870699944 and parameters: {'num_leaves': 455, 'max_depth': 16, 'learning_rate': 0.17441250542087267, 'lambda_l1': 24, 'lambda_l2': 33, 'bagging_fraction': 0.8772941374675947}. Best is trial 1 with value: 0.2955433870699944.
[I 2024-07-23 23:09:36,175] Trial 2 finished with value: 0.09259837033195141 and parameters: {'num_leaves': 206, 'max_depth': 9, 'learning_rate': 0.15262675117564936, 'lambda_l1': 73, 'lambda_l2': 65, 'bagging_fraction': 0.7371262737867691}. Best is trial 1 with value: 0.2955433870699944.
[I 2024-07-23

In [12]:
optuna.visualization.plot_optimization_history(**{"study": study})

## goal: 3 day 5% gain (mid anomaly %)

In [13]:
X_train, y_train, X_val, y_val = data_prep_for_tuning(ticker_list, normalize=False, obj=[3, 0.05])

(8793, 101) (8793,) (2205, 101) (2205,)
(array([0, 1]), array([6781, 2012], dtype=int64))
(array([0, 1]), array([1862,  343], dtype=int64))


In [14]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=n_trials)

[I 2024-07-23 23:10:28,687] A new study created in memory with name: no-name-c33eee06-9f29-493a-942e-03e8152dd906
[I 2024-07-23 23:10:28,776] Trial 0 finished with value: 0.02112195597287959 and parameters: {'num_leaves': 322, 'max_depth': 7, 'learning_rate': 0.227914052519964, 'lambda_l1': 49, 'lambda_l2': 43, 'bagging_fraction': 0.6491677477814222}. Best is trial 0 with value: 0.02112195597287959.
[I 2024-07-23 23:10:28,870] Trial 1 finished with value: 0.0 and parameters: {'num_leaves': 79, 'max_depth': 11, 'learning_rate': 0.20908924513758553, 'lambda_l1': 59, 'lambda_l2': 62, 'bagging_fraction': 0.6252768743031439}. Best is trial 0 with value: 0.02112195597287959.
[I 2024-07-23 23:10:28,965] Trial 2 finished with value: 0.02385837656220654 and parameters: {'num_leaves': 299, 'max_depth': 8, 'learning_rate': 0.20646594059146162, 'lambda_l1': 62, 'lambda_l2': 55, 'bagging_fraction': 0.39749272061139185}. Best is trial 2 with value: 0.02385837656220654.
[I 2024-07-23 23:10:29,129] Tr

In [15]:
optuna.visualization.plot_optimization_history(**{"study": study})

## goal: 3 day 7% gain (low anomaly %)

In [16]:
X_train, y_train, X_val, y_val = data_prep_for_tuning(ticker_list, normalize=False, obj=[3, 0.07])

(8793, 101) (8793,) (2205, 101) (2205,)
(array([0, 1]), array([7689, 1104], dtype=int64))
(array([0, 1]), array([2027,  178], dtype=int64))


In [17]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=n_trials)

[I 2024-07-23 23:11:01,060] A new study created in memory with name: no-name-d57fd282-1958-47c8-8929-45299a88fba8
[I 2024-07-23 23:11:01,218] Trial 0 finished with value: 0.0 and parameters: {'num_leaves': 415, 'max_depth': 7, 'learning_rate': 0.018099721494321444, 'lambda_l1': 50, 'lambda_l2': 47, 'bagging_fraction': 0.34464380735287664}. Best is trial 0 with value: 0.0.
[I 2024-07-23 23:11:01,344] Trial 1 finished with value: 0.0 and parameters: {'num_leaves': 389, 'max_depth': 9, 'learning_rate': 0.27257582952919723, 'lambda_l1': 50, 'lambda_l2': 51, 'bagging_fraction': 0.41725378683119574}. Best is trial 0 with value: 0.0.
[I 2024-07-23 23:11:01,490] Trial 2 finished with value: 0.0 and parameters: {'num_leaves': 499, 'max_depth': 14, 'learning_rate': 0.18201005187508884, 'lambda_l1': 86, 'lambda_l2': 32, 'bagging_fraction': 0.9681655732327965}. Best is trial 0 with value: 0.0.
[I 2024-07-23 23:11:01,632] Trial 3 finished with value: 0.0 and parameters: {'num_leaves': 26, 'max_dept

In [18]:
optuna.visualization.plot_optimization_history(**{"study": study})

# Multi classification w raw data

## goal: 5 day 5% gain (high anomaly %)

In [19]:
X_train, y_train, X_val, y_val = data_prep_for_tuning_multi(ticker_list, normalize=False, obj=[5, 0.05])

(8793, 101) (8793,) (2205, 101) (2205,)
(array([0, 1, 2], dtype=int64), array([2332, 3563, 2898], dtype=int64))
(array([0, 1, 2], dtype=int64), array([ 437, 1203,  565], dtype=int64))


In [20]:
study = optuna.create_study(direction="maximize")
study.optimize(multi_objective, n_trials=n_trials)

[I 2024-07-23 23:11:48,903] A new study created in memory with name: no-name-30f4ca28-af2e-4e04-bdfd-b37a0e6df117
[I 2024-07-23 23:11:50,316] Trial 0 finished with value: 0.35631245001182765 and parameters: {'num_leaves': 44, 'max_depth': 8, 'learning_rate': 0.23535584517650454, 'lambda_l1': 1, 'lambda_l2': 88, 'bagging_fraction': 0.8898445057808082}. Best is trial 0 with value: 0.35631245001182765.
[I 2024-07-23 23:11:52,117] Trial 1 finished with value: 0.3754341123962292 and parameters: {'num_leaves': 410, 'max_depth': 10, 'learning_rate': 0.1027752333022908, 'lambda_l1': 5, 'lambda_l2': 93, 'bagging_fraction': 0.7048780118227373}. Best is trial 1 with value: 0.3754341123962292.
[I 2024-07-23 23:11:52,357] Trial 2 finished with value: 0.41399814334641444 and parameters: {'num_leaves': 416, 'max_depth': 9, 'learning_rate': 0.28651013234290873, 'lambda_l1': 55, 'lambda_l2': 42, 'bagging_fraction': 0.7765467608257597}. Best is trial 2 with value: 0.41399814334641444.
[I 2024-07-23 23:1

### best

In [21]:
optuna.visualization.plot_optimization_history(**{"study": study})

## goal: 3 day 5% gain (mid anomaly %)

In [22]:
X_train, y_train, X_val, y_val = data_prep_for_tuning_multi(ticker_list, normalize=False, obj=[3, 0.05])

(8793, 101) (8793,) (2205, 101) (2205,)
(array([0, 1, 2], dtype=int64), array([1675, 5106, 2012], dtype=int64))
(array([0, 1, 2], dtype=int64), array([ 277, 1585,  343], dtype=int64))


In [23]:
study = optuna.create_study(direction="maximize")
study.optimize(multi_objective, n_trials=n_trials)

[I 2024-07-23 23:12:21,511] A new study created in memory with name: no-name-6e477ea9-daff-46ca-9b30-6adc06df5056
[I 2024-07-23 23:12:21,677] Trial 0 finished with value: 0.28662165374113546 and parameters: {'num_leaves': 225, 'max_depth': 6, 'learning_rate': 0.23178236612988726, 'lambda_l1': 71, 'lambda_l2': 21, 'bagging_fraction': 0.6509519218671602}. Best is trial 0 with value: 0.28662165374113546.
[I 2024-07-23 23:12:21,817] Trial 1 finished with value: 0.29665130659487104 and parameters: {'num_leaves': 114, 'max_depth': 3, 'learning_rate': 0.22630950648639028, 'lambda_l1': 59, 'lambda_l2': 62, 'bagging_fraction': 0.35120514889871585}. Best is trial 1 with value: 0.29665130659487104.
[I 2024-07-23 23:12:22,441] Trial 2 finished with value: 0.24621321982582278 and parameters: {'num_leaves': 105, 'max_depth': 14, 'learning_rate': 0.21397627333279365, 'lambda_l1': 23, 'lambda_l2': 97, 'bagging_fraction': 0.9807077709850351}. Best is trial 1 with value: 0.29665130659487104.
[I 2024-07-

In [24]:
optuna.visualization.plot_optimization_history(**{"study": study})

## goal: 3 day 7% gain (low anomaly %)

In [25]:
X_train, y_train, X_val, y_val = data_prep_for_tuning_multi(ticker_list, normalize=False, obj=[3, 0.07])

(8793, 101) (8793,) (2205, 101) (2205,)
(array([0, 1, 2], dtype=int64), array([ 948, 6741, 1104], dtype=int64))
(array([0, 1, 2], dtype=int64), array([ 115, 1912,  178], dtype=int64))


In [26]:
study = optuna.create_study(direction="maximize")
study.optimize(multi_objective, n_trials=n_trials)

[I 2024-07-23 23:12:53,538] A new study created in memory with name: no-name-518322dd-3041-4d20-965c-cd69b6d742af
[I 2024-07-23 23:12:53,889] Trial 0 finished with value: 0.0 and parameters: {'num_leaves': 461, 'max_depth': 13, 'learning_rate': 0.2910287150161726, 'lambda_l1': 78, 'lambda_l2': 44, 'bagging_fraction': 0.9826615361021804}. Best is trial 0 with value: 0.0.
[I 2024-07-23 23:12:54,100] Trial 1 finished with value: 0.06043954830737273 and parameters: {'num_leaves': 109, 'max_depth': 5, 'learning_rate': 0.2982461837850387, 'lambda_l1': 36, 'lambda_l2': 96, 'bagging_fraction': 0.70515696734947}. Best is trial 1 with value: 0.06043954830737273.
[I 2024-07-23 23:12:54,406] Trial 2 finished with value: 0.03522014680306622 and parameters: {'num_leaves': 237, 'max_depth': 5, 'learning_rate': 0.16538643809356585, 'lambda_l1': 24, 'lambda_l2': 27, 'bagging_fraction': 0.7504972901079152}. Best is trial 1 with value: 0.06043954830737273.
[I 2024-07-23 23:12:54,603] Trial 3 finished wit

In [27]:
optuna.visualization.plot_optimization_history(**{"study": study})