### Libraries

In [None]:
# %% [markdown]
# ### Libraries

# %%
import os
import psutil

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import requests_cache


from datetime import datetime, timedelta
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from plotly.subplots import make_subplots
from pybroker import YFinance, StrategyConfig, Strategy
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

import indicators  # own .py file

# %% [markdown]
# ### Fetch data
# %%
import yfinance as yf

# Wall-clock start of the run; printed next to the end time after training.
start_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")


TICKERS = ["AAPL", "EVO.ST", "HM-B.ST", "CRM", "IBM", "ASSA-B.ST"]


# Route the download through a requests_cache session backed by a local cache file.
session = requests_cache.CachedSession("magic/.pybrokercache/yfinance.cache")
session.headers["User-agent"] = "my-program/1.0"


# End of the download window: yesterday (not today).
date = datetime.today() - timedelta(days=1)

# Format the end date as YYYY-MM-DD
date_string = date.strftime("%Y-%m-%d")

data = yf.download(
    TICKERS, start="2018-01-01", end=date_string, group_by="ticker", session=session
)


# With a single ticker the downloaded frame is used as-is; with several tickers
# each ticker's sub-frame is selected from the ticker-grouped columns.
if len(TICKERS) == 1:
    raw_frames = {TICKERS[0]: data}
else:
    raw_frames = {ticker: data[ticker] for ticker in TICKERS}

# Build a fresh frame per ticker. reset_index() returns a new object, so neither
# `data` nor a slice of it is modified in place (the in-place variant risks
# SettingWithCopy problems on the slices).
split_data = {}
for ticker, raw_frame in raw_frames.items():
    ticker_frame = raw_frame.reset_index()
    # Turn columns to lowercase (the former index becomes a regular column too)
    ticker_frame.columns = [col.lower() for col in ticker_frame.columns]
    split_data[ticker] = ticker_frame


# Print number of rows and columns for each DataFrame
for ticker in TICKERS:
    print(f"{ticker}: {split_data[ticker].shape}")


# %% [markdown]
# ### TA indicators

# %%




### Fetch data

In [None]:
import yfinance as yf

# Alternative ticker presets (uncomment one to replace the active list):
# TICKERS = ["AAPL", "EVO.ST", "GOOGL", "MSFT", "TSLA"]
TICKERS = ["AAPL", "EVO.ST", "HM-B.ST", "CRM", "IBM", "ASSA-B.ST"]
# TICKERS = ["AAPL", "EVO.ST"]
# TICKERS = ["EVO.ST"]

# Route the download through a requests_cache session backed by a local cache file.
session = requests_cache.CachedSession('magic/.pybrokercache/yfinance.cache')
session.headers['User-agent'] = 'my-program/1.0'


# End of the download window: yesterday (not today).
date = datetime.today() - timedelta(days=1)
# Format the end date as YYYY-MM-DD
date_string = date.strftime("%Y-%m-%d")

data = yf.download(TICKERS, start="2018-01-01", end=date_string, group_by="ticker", session=session)


# With a single ticker the downloaded frame is used as-is; with several tickers
# each ticker's sub-frame is selected from the ticker-grouped columns.
if len(TICKERS) == 1:
    raw_frames = {TICKERS[0]: data}
else:
    raw_frames = {ticker: data[ticker] for ticker in TICKERS}

# Build a fresh frame per ticker. reset_index() returns a new object, so neither
# `data` nor a slice of it is modified in place (the in-place variant risks
# SettingWithCopy problems on the slices).
split_data = {}
for ticker, raw_frame in raw_frames.items():
    ticker_frame = raw_frame.reset_index()
    # Turn columns to lowercase (the former index becomes a regular column too)
    ticker_frame.columns = [col.lower() for col in ticker_frame.columns]
    split_data[ticker] = ticker_frame

# Print number of rows and columns for each DataFrame
for ticker in TICKERS:
    print(f"{ticker}: {split_data[ticker].shape}")

### TA indicators

In [None]:
# Function to calculate TA indicators
def calculate_ta_indicators(data):
    """Append moving averages, technical indicators and derived features to a price frame.

    Rows containing NaN are dropped first. The function then adds, in order:
    EMA/SMA/WMA columns for the periods 5, 8, 21, 55, 89 and 200; the indicator
    columns produced by the project-local ``indicators`` module; differences
    between pairs of those columns; and five derived features.

    Args:
        data: DataFrame for one ticker. This function reads the ``close`` and
            ``volume`` columns directly; what the ``indicators`` helpers need
            beyond that is not visible here.

    Returns:
        A new DataFrame holding the NaN-free input rows plus all added columns.
        The frame passed in is left unmodified.
    """
    # dropna() returns a new frame, so the caller's object is not mutated.
    data = data.dropna()
    periods = [5, 8, 21, 55, 89, 200]
    ema_cols = {}
    sma_cols = {}
    wma_cols = {}

    for period in periods:
        ema_cols[f"EMA_{period}"] = indicators.calculate_ema(data, period=period)
        sma_cols[f"SMA_{period}"] = indicators.calculate_sma(data, period=period)
        wma_cols[f"WMA_{period}"] = indicators.calculate_wma(data, period=period)

    # Add moving averages to data
    data = pd.concat(
        [data, pd.DataFrame(ema_cols), pd.DataFrame(sma_cols), pd.DataFrame(wma_cols)],
        axis=1,
    )

    # Multi-output indicators are computed once and indexed below instead of
    # being recomputed for every component.
    # NOTE(review): assumes the indicator helpers are pure functions of `data` - confirm.
    bbands = indicators.calculate_bbands(data)
    macd = indicators.calculate_macd(data)
    obv_55 = indicators.calculate_obv(data, ema_period=55)
    ht_sine = indicators.calculate_ht_sine(data)
    apz = indicators.calculate_apz(data)

    # Calculate TA's
    ta_cols = {
        "BB_upper": bbands[0],
        "BB_middle": bbands[1],
        "BB_lower": bbands[2],
        "ADX": indicators.calculate_adx(data),
        "MACD": macd[0],
        "MACD_signal": macd[1],
        "MACD_hist": macd[2],
        "RSI": indicators.calculate_rsi(data),
        "ADR": indicators.calculate_adr(data),
        # The two pivot columns use different settings: defaults for highs,
        # rb=21 / lb=21 for lows.
        "PivotHigh": indicators.find_pivots(data, return_boolean=True)[0],
        "PivotLow": indicators.find_pivots(data, return_boolean=True, rb=21, lb=21)[1],
        "RealStrength": indicators.enhanced_real_time_strength_index(data),
        "SAR": indicators.calculate_sar(data),
        "OBV": obv_55[0],
        "OBV_EMA_55": obv_55[1],
        "OBV_EMA_8": indicators.calculate_obv(data, ema_period=8)[1],
        "OBV_EMA_21": indicators.calculate_obv(data, ema_period=21)[1],
        "OBV_EMA_144": indicators.calculate_obv(data, ema_period=144)[1],
        "ATR": indicators.calculate_atr(data),
        "Hammer": indicators.calculate_hammer(data),
        "VWAP": indicators.calculate_vwap(data),
        "ADOSC": indicators.calculate_adosc(data, fastperiod=3, slowperiod=10),
        "HT_SINE": ht_sine[0],
        "HT_LEADSINE": ht_sine[1],
        "HT_TRENDMODE": indicators.calculate_ht_trendmode(data),
        "ROC": indicators.calculate_roc(data, period=10),
        "MOM": indicators.calculate_mom(data, period=10),
        "APZ_upper": apz[0],
        "APZ_lower": apz[1],
    }

    # Add TA's to data
    data = pd.concat([data, pd.DataFrame(ta_cols)], axis=1)

    # Calculate short-term and long-term differences for EMA, SMA and WMA
    diff_cols = {
        "SMA_diff_short_5_8": data["SMA_5"] - data["SMA_8"],
        "SMA_diff_short_8_21": data["SMA_8"] - data["SMA_21"],
        "SMA_diff_short_21_55": data["SMA_21"] - data["SMA_55"],
        "EMA_diff_short_5_8": data["EMA_5"] - data["EMA_8"],
        "EMA_diff_short_8_21": data["EMA_8"] - data["EMA_21"],
        "EMA_diff_short_21_55": data["EMA_21"] - data["EMA_55"],
        "WMA_diff_short_5_8": data["WMA_5"] - data["WMA_8"],
        "WMA_diff_short_8_21": data["WMA_8"] - data["WMA_21"],
        "WMA_diff_short_21_55": data["WMA_21"] - data["WMA_55"],
        "SMA_diff_long_55_89": data["SMA_55"] - data["SMA_89"],
        "SMA_diff_long_55_200": data["SMA_55"] - data["SMA_200"],
        "EMA_diff_long_55_89": data["EMA_55"] - data["EMA_89"],
        "EMA_diff_long_55_200": data["EMA_55"] - data["EMA_200"],
        "WMA_diff_long_55_89": data["WMA_55"] - data["WMA_89"],
        "WMA_diff_long_55_200": data["WMA_55"] - data["WMA_200"],
        # NOTE(review): unlike the MA differences above, the OBV columns are raw
        # OBV minus a single OBV EMA (8, 21, 144); the names suggest EMA pairs.
        "OBV_diff_short_8_21": data["OBV"] - data["OBV_EMA_8"],
        "OBV_diff_short_21_55": data["OBV"] - data["OBV_EMA_21"],
        "OBV_diff_long_55_144": data["OBV"] - data["OBV_EMA_144"],
    }

    # Add differences to data
    data = pd.concat([data, pd.DataFrame(diff_cols)], axis=1)

    # Additional features
    data["RSI_divergence"] = data["RSI"] - data["RSI"].rolling(window=5).mean()
    data["MACD_acceleration"] = data["MACD"] - data["MACD"].shift(1)
    # Position of the close inside the Bollinger band (0 = lower, 1 = upper).
    data["BB_percentage"] = (data["close"] - data["BB_lower"]) / (
        data["BB_upper"] - data["BB_lower"]
    )
    data["volume_price_trend"] = data["volume"] * (
        data["close"] - data["close"].shift(1)
    )
    data["trend_strength"] = abs(data["EMA_21"] - data["SMA_21"]) / data["SMA_21"]

    return data.copy()


# %% [markdown]
# ### Feature engineering


# %%


### Feature engineering

In [None]:
# Function to perform feature engineering
def perform_feature_engineering(data):
    """Turn one ticker's indicator frame into a labelled table of lagged features.

    Works on a copy of ``data``: the ``date`` column becomes a datetime index,
    lags 1-8 of every listed column are appended as ``<column>_lag_<n>``,
    ``Label`` is set to 1 where ``PivotLow`` is true (0 elsewhere),
    ``PivotLow`` is removed, ``PivotHigh`` is renamed to ``PivotHigh_lag_1``,
    rows containing NaN are dropped, and two calendar columns are added.

    Args:
        data: DataFrame with a ``date`` column, boolean ``PivotLow`` and
            ``PivotHigh`` columns, and every column named in the lag list below.

    Returns:
        The engineered DataFrame, indexed by date.
    """
    lag_source_columns = (
        "open", "high", "low", "close", "volume",
        "RealStrength", "ADX",
        "MACD", "MACD_signal", "MACD_hist",
        "RSI", "ADR",
        "BB_upper", "BB_middle", "BB_lower",
        "APZ_upper", "APZ_lower",
        "SAR", "OBV", "ATR", "Hammer", "VWAP",
        "OBV_EMA_21", "OBV_EMA_55",
        "OBV_diff_short_21_55", "OBV_diff_short_8_21", "OBV_diff_long_55_144",
        "ADOSC", "HT_SINE", "HT_LEADSINE", "HT_TRENDMODE", "ROC", "MOM",
        "SMA_5", "SMA_8", "SMA_21", "SMA_55", "SMA_89", "SMA_200",
        "EMA_5", "EMA_8", "EMA_21", "EMA_55", "EMA_89", "EMA_200",
        "WMA_5", "WMA_8", "WMA_21", "WMA_55", "WMA_89", "WMA_200",
        "SMA_diff_short_5_8", "SMA_diff_short_8_21", "SMA_diff_short_21_55",
        "EMA_diff_short_5_8", "EMA_diff_short_8_21", "EMA_diff_short_21_55",
        "WMA_diff_short_5_8", "WMA_diff_short_8_21", "WMA_diff_short_21_55",
        "SMA_diff_long_55_89", "SMA_diff_long_55_200",
        "EMA_diff_long_55_89", "EMA_diff_long_55_200",
        "WMA_diff_long_55_89", "WMA_diff_long_55_200",
        "RSI_divergence", "MACD_acceleration", "BB_percentage",
        "volume_price_trend", "trend_strength",
    )

    # set_index returns a new frame, so the argument itself is never touched.
    frame = data.set_index("date")
    frame.index = pd.to_datetime(frame.index)

    # Lags 1..8 of every source column (same as the pivot look window).
    lagged_columns = [
        frame[name].shift(lag).rename(f"{name}_lag_{lag}")
        for name in lag_source_columns
        for lag in range(1, 9)
    ]
    frame = pd.concat([frame, pd.concat(lagged_columns, axis=1)], axis=1)

    # Target: 1 = buy signal on a pivot low, 0 = do nothing.
    frame["Label"] = 0
    frame.loc[frame["PivotLow"], "Label"] = 1

    # PivotLow is now encoded in Label. PivotHigh is kept under a name that
    # contains "lag".
    # NOTE(review): the column is only renamed, not shifted - confirm that the
    # unshifted pivot flag is really meant to be used as a feature.
    # Rows with any NaN (e.g. the first rows, which have no lagged values) go last.
    # (A variant that also labelled the neighbours of each pivot low is disabled.)
    frame = (
        frame.drop(columns=["PivotLow"])
        .rename(columns={"PivotHigh": "PivotHigh_lag_1"})
        .dropna()
    )

    # Calendar features: weekday as 1..7 and the calendar month.
    # NOTE(review): "day_of_month_lag_1" holds index.month, not the day of the
    # month - confirm which one is intended.
    return frame.assign(
        day_of_week_lag_1=frame.index.dayofweek + 1,
        day_of_month_lag_1=frame.index.month,
    )


# Run both preparation stages for every ticker and store the result back in
# `split_data`. Re-running this cell on the already-processed frames is not
# supported: the second stage turns the "date" column into the index.
for ticker in TICKERS:
    with_indicators = calculate_ta_indicators(split_data[ticker].copy())
    split_data[ticker] = perform_feature_engineering(with_indicators.copy())


# %% [markdown]


### Standardize the data

In [None]:
# ### Standardize the data

# %%
# Step 1: Perform train-test split first
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import mutual_info_classif

# Assuming `split_data` and `TICKERS` are already defined earlier in your code

# Step 1: split every ticker 80/20 without shuffling, so the last rows of each
# frame form the test partition.
train_data = {}
test_data = {}
for ticker in TICKERS:
    train_part, test_part = train_test_split(
        split_data[ticker], test_size=0.2, shuffle=False
    )
    train_data[ticker] = train_part
    test_data[ticker] = test_part

# Step 2: gather the training features (everything except the label) of all tickers
all_train_features = [train_data[ticker].drop(columns=["Label"]) for ticker in TICKERS]
combined_train_features = pd.concat(all_train_features, ignore_index=True)

# Step 3: fit one scaler on the pooled training features only
scaler = StandardScaler()
scaler.fit(combined_train_features)


# Step 4: scale each partition with that scaler
def _scale_partition(partition):
    """Scale all non-label columns of ``partition`` and re-attach ``Label``.

    The result has a fresh 0..n-1 index; the original (date) index is not kept.
    """
    feature_frame = partition.drop(columns=["Label"])
    scaled = pd.DataFrame(
        scaler.transform(feature_frame), columns=feature_frame.columns
    )
    scaled["Label"] = partition["Label"].reset_index(drop=True)
    return scaled


normalized_train_data = {
    ticker: _scale_partition(train_data[ticker]) for ticker in TICKERS
}
normalized_test_data = {
    ticker: _scale_partition(test_data[ticker]) for ticker in TICKERS
}


# Step 5: stack the per-ticker frames one after another (in TICKERS order)
combined_train_data = pd.concat(normalized_train_data.values(), ignore_index=True)
combined_test_data = pd.concat(normalized_test_data.values(), ignore_index=True)


# Model inputs are the columns whose name contains "lag"; the target is "Label".
features = [col for col in combined_train_data.columns if "lag" in col]
target = "Label"
X_train_all = combined_train_data[features]
y_train = combined_train_data[target]




### Prepare data for training

In [None]:
# # Step 7: Feature selection function, half of the features
# def select_features(X, y, correlation_threshold=0.90, n_features=233):
#     # Remove highly correlated features
#     corr_matrix = X.corr().abs()
#     upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
#     to_drop = [
#         column for column in upper.columns if any(upper[column] > correlation_threshold)
#     ]
#     X = X.drop(to_drop, axis=1)

#     # Select top features based on mutual information
#     mi_scores = mutual_info_classif(X, y)
#     mi_scores = pd.Series(mi_scores, index=X.columns)
#     top_features = mi_scores.nlargest(n_features).index.tolist()

#     return X[top_features], top_features


# def stable_feature_selection(X, y, n_rounds=10):
#     feature_counts = {col: 0 for col in X.columns}
#     for _ in range(n_rounds):
#         _, selected_features = select_features(X, y)
#         for feature in selected_features:
#             feature_counts[feature] += 1

#     stable_features = [
#         feature for feature, count in feature_counts.items() if count >= n_rounds // 2
#     ]
#     return X[stable_features], stable_features


# # Step 8: Apply feature selection to the entire dataset
# # Step 9: Apply feature selection to training data
# X_train, selected_columns = stable_feature_selection(X_train_all, y_train)

# # Step 11: Apply feature selection to test data
# X_test = combined_test_data[selected_columns]
# y_test = combined_test_data[target]

# # %% [markdown]
# # ### model



import numpy as np
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

RANDOM_STATE = 42


def remove_correlated_features(X, correlation_threshold=0.95):
    """Drop every column that correlates too strongly with an earlier column.

    Args:
        X: DataFrame of candidate features.
        correlation_threshold: A column is dropped when its absolute
            correlation with any column positioned before it exceeds this value.

    Returns:
        A copy of ``X`` without the dropped columns.
    """
    abs_corr = X.corr().abs()
    # Keep only the strict upper triangle so every pair is looked at once and
    # the first column of a correlated pair always survives.
    above_diagonal = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    pairwise = abs_corr.where(above_diagonal)
    exceeds = (pairwise > correlation_threshold).any(axis=0).to_numpy()
    redundant = pairwise.columns[exceeds].tolist()
    return X.drop(columns=redundant)

def rfe_feature_selection(X, y, n_features_to_select=100, cv=5):
    """Select features that recursive feature elimination keeps in most folds.

    Highly correlated columns are removed first. RFE with a random forest is
    then fitted on the training part of each stratified fold, and a feature is
    kept when RFE selected it in at least half of the folds.

    Args:
        X: DataFrame of candidate features.
        y: Series of labels, positionally aligned with ``X``.
        n_features_to_select: Number of features RFE keeps in each fold.
        cv: Number of stratified folds.

    Returns:
        Tuple ``(X_selected, selected_features)``: the reduced frame and the
        list of kept column names.
    """
    # Remove highly correlated features first
    X = remove_correlated_features(X)

    rf = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE, n_jobs=-1)
    rfe = RFE(estimator=rf, n_features_to_select=n_features_to_select, step=10)

    # NOTE(review): the folds are shuffled, so rows from different dates are
    # mixed - confirm this is acceptable for feature selection on time series.
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=RANDOM_STATE)

    # How many folds selected each feature.
    selection_counts = np.zeros(X.shape[1], dtype=int)

    # Only the training part of each fold is needed: the former per-fold
    # validation refit produced predictions that were never used.
    for train_index, _ in skf.split(X, y):
        rfe.fit(X.iloc[train_index], y.iloc[train_index])
        selection_counts += rfe.support_.astype(int)

    # "At least half of the folds": compare against cv / 2. Integer division
    # (cv // 2) would already accept 2 out of 5 folds.
    selected_features = X.columns[selection_counts >= cv / 2].tolist()

    return X[selected_features], selected_features

# Target size for RFE: 100 features, or half of the candidate columns when
# that is smaller.
n_features_to_select = min(100, X_train_all.shape[1] // 2)
X_train, selected_columns = rfe_feature_selection(
    X_train_all, y_train, n_features_to_select=n_features_to_select
)

# Restrict the test data to the columns chosen on the training data.
X_test = combined_test_data.loc[:, selected_columns]
y_test = combined_test_data[target]

print(f"Number of selected features: {len(selected_columns)}")
print("Selected features:", selected_columns, sep="\n")

### model

In [None]:
# %%
import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    precision_score,
    f1_score,
    average_precision_score,
    roc_auc_score,
)
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from imblearn.combine import SMOTETomek
from sklearn.calibration import CalibratedClassifierCV
import optuna
from imblearn.combine import SMOTEENN
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

# Search and cross-validation settings
RANDOM_STATE = 42  # seed shared by the models and resamplers
RESAMPLE = False  # when True, each training fold is resampled with SMOTEENN
N_TRIALS = 15  # Optuna trials per study
NB_CV = 8  # number of cross-validation splits
THREADS = psutil.cpu_count(logical=True)  # logical CPU count, passed to CatBoost

# Forward-chaining splitter used by every tuning objective.
# NOTE(review): the training rows are the tickers stacked one after another,
# so the splits follow row order rather than calendar time - confirm intended.
tscv = TimeSeriesSplit(n_splits=NB_CV)

# Resamplers for the imbalanced labels (the tuning loops use `smoteenn`).
smoteenn = SMOTEENN(sampling_strategy=0.5, random_state=RANDOM_STATE)
smote_tomek = SMOTETomek(random_state=RANDOM_STATE, sampling_strategy=0.5)


# Function to calculate multiple metrics
def calculate_metrics(y_true, y_pred, y_pred_proba):
    """Compute threshold-based and ranking-based scores for binary predictions.

    Args:
        y_true: True binary labels.
        y_pred: Predicted class labels.
        y_pred_proba: Predicted scores or probabilities of the positive class.

    Returns:
        Dict with the keys ``precision``, ``f1``, ``average_precision`` and
        ``roc_auc``.
    """
    return {
        # zero_division=0 matches the tuning objectives: a model that predicts
        # no positives scores 0 without emitting a warning.
        "precision": precision_score(y_true, y_pred, zero_division=0),
        "f1": f1_score(y_true, y_pred, zero_division=0),
        "average_precision": average_precision_score(y_true, y_pred_proba),
        "roc_auc": roc_auc_score(y_true, y_pred_proba),
    }


# Shared cross-validation loop for the Optuna objectives below
def _mean_cv_precision(model):
    """Return the mean precision of ``model`` over the ``tscv`` folds of the training data.

    For every fold the model is refitted on the training part (resampled with
    ``smoteenn`` when ``RESAMPLE`` is true) and scored with ``precision_score``
    on the untouched validation part. Folds without any positive prediction
    score 0 (``zero_division=0``).
    """
    fold_scores = []
    for train_index, test_index in tscv.split(X_train):
        X_fold_train, X_fold_test = X_train.iloc[train_index], X_train.iloc[test_index]
        y_fold_train, y_fold_test = y_train.iloc[train_index], y_train.iloc[test_index]

        # Only the training part is ever resampled.
        if RESAMPLE:
            X_fold_train, y_fold_train = smoteenn.fit_resample(X_fold_train, y_fold_train)

        model.fit(X_fold_train, y_fold_train)
        fold_scores.append(
            precision_score(y_fold_test, model.predict(X_fold_test), zero_division=0)
        )

    return np.mean(fold_scores)


# Tune XGBoost Classifier
def tune_xgb(trial):
    """Optuna objective: mean fold precision of an XGBClassifier with sampled parameters.

    NOTE(review): every tree_method / grow_policy / max_leaves / max_bin
    combination is sampled - confirm the installed XGBoost accepts them all
    (e.g. "exact" together with "lossguide").
    """
    params = {
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 1e-8, 5, log=True),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.5, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 20),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "n_estimators": trial.suggest_int("n_estimators", 100, 3000),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "scale_pos_weight": trial.suggest_float("scale_pos_weight", 10, 100),
        "max_leaves": trial.suggest_int("max_leaves", 0, 1000),
        "grow_policy": trial.suggest_categorical(
            "grow_policy", ["depthwise", "lossguide"]
        ),
        "max_bin": trial.suggest_int("max_bin", 128, 512),
        "tree_method": trial.suggest_categorical(
            "tree_method", ["auto", "exact", "approx", "hist"]
        ),
    }

    model = XGBClassifier(
        **params, random_state=RANDOM_STATE, eval_metric="aucpr", n_jobs=-1
    )
    return _mean_cv_precision(model)


# Tune RandomForest Classifier
def tune_rf(trial):
    """Optuna objective: mean fold precision of a RandomForestClassifier with sampled parameters."""
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 3000),
        "max_depth": trial.suggest_int("max_depth", 3, 30),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
        "max_features": trial.suggest_float("max_features", 0.1, 1.0),
        "max_samples": trial.suggest_float("max_samples", 0.5, 1.0),
        "class_weight": trial.suggest_categorical(
            "class_weight", ["balanced", "balanced_subsample"]
        ),
        "min_impurity_decrease": trial.suggest_float("min_impurity_decrease", 0, 0.1),
        "ccp_alpha": trial.suggest_float("ccp_alpha", 0, 0.1),
    }

    model = RandomForestClassifier(**params, random_state=RANDOM_STATE, n_jobs=-1)
    return _mean_cv_precision(model)


# Tune CatBoost Classifier
def tune_catboost(trial):
    """Optuna objective: mean fold precision of a CatBoostClassifier with sampled parameters.

    NOTE(review): ``subsample`` and ``bagging_temperature`` are sampled
    together, and ``min_data_in_leaf`` is sampled for every ``grow_policy`` -
    confirm that the installed CatBoost accepts these combinations.
    """
    params = {
        "iterations": trial.suggest_int("iterations", 500, 5000),
        "depth": trial.suggest_int("depth", 4, 10),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10),
        "border_count": trial.suggest_int("border_count", 32, 255),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "scale_pos_weight": trial.suggest_float("scale_pos_weight", 10, 100),
        "random_strength": trial.suggest_float("random_strength", 0.1, 10),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0, 10),
        "grow_policy": trial.suggest_categorical(
            "grow_policy", ["SymmetricTree", "Depthwise", "Lossguide"]
        ),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 300),
    }

    model = CatBoostClassifier(
        **params, random_seed=RANDOM_STATE, verbose=0, thread_count=THREADS
    )
    return _mean_cv_precision(model)


# Tune SVM Classifier
def tune_svm(trial):
    """Optuna objective: mean fold precision of an SVC with sampled parameters."""
    params = {
        "C": trial.suggest_float("C", 1, 100, log=True),
        "kernel": trial.suggest_categorical("kernel", ["rbf", "poly"]),
        "gamma": trial.suggest_float("gamma", 1e-5, 1, log=True),
        "class_weight": trial.suggest_categorical("class_weight", ["balanced", None]),
    }

    # probability=True is kept so the tuned configuration matches the final
    # SVC, which is built with the same setting.
    model = SVC(**params, random_state=RANDOM_STATE, probability=True)
    return _mean_cv_precision(model)


# Optimize models: one Optuna study per base learner, each maximising the
# mean fold precision returned by its objective. The samplers are seeded so
# the sequence of suggested trials is repeatable across runs.
study_svm = optuna.create_study(
    direction="maximize",
    study_name="SVM",
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
)
study_svm.optimize(tune_svm, n_trials=N_TRIALS, show_progress_bar=True)

study_xgb = optuna.create_study(
    direction="maximize",
    study_name="XGBoost",
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
)
study_xgb.optimize(tune_xgb, n_trials=N_TRIALS, show_progress_bar=True)

study_rf = optuna.create_study(
    direction="maximize",
    study_name="RandomForest",
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
)
study_rf.optimize(tune_rf, n_trials=N_TRIALS, show_progress_bar=True)

study_catboost = optuna.create_study(
    direction="maximize",
    study_name="CatBoost",
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
)
study_catboost.optimize(tune_catboost, n_trials=N_TRIALS, show_progress_bar=True)


# Initialize models with best parameters
svm_model = SVC(**study_svm.best_params, random_state=RANDOM_STATE, probability=True)

xgb_model = XGBClassifier(
    **study_xgb.best_params,
    random_state=RANDOM_STATE,
    eval_metric="aucpr",
    n_jobs=-1,
)
rf_model = RandomForestClassifier(
    **study_rf.best_params, random_state=RANDOM_STATE, n_jobs=-1
)
catboost_model = CatBoostClassifier(
    **study_catboost.best_params,
    random_seed=RANDOM_STATE,
    verbose=0,
    thread_count=THREADS,
)

# The meta-model starts from the tuned XGBoost parameters but must be its own
# object: later code calls set_params() on `stacked_model.final_estimator`,
# and sharing the instance with the "xgb" base estimator would overwrite the
# base learner's tuned parameters as well.
meta_model = XGBClassifier(
    **study_xgb.best_params,
    random_state=RANDOM_STATE,
    eval_metric="aucpr",
    n_jobs=-1,
)


# Create StackingClassifier
stacked_model = StackingClassifier(
    estimators=[
        ("xgb", xgb_model),
        ("rf", rf_model),
        ("catboost", catboost_model),
        ("svm", svm_model),
    ],
    final_estimator=meta_model,
    passthrough=True,  # the meta-model also sees the original features
    n_jobs=-1,
    cv=NB_CV,
)


# Tune meta-model (the XGBoost final estimator of the stack)
def tune_meta_model(trial):
    """Optuna objective: mean fold precision of the stack with sampled meta-model parameters.

    The sampled parameters are written into ``stacked_model.final_estimator``
    in place, then the whole stack is refitted and scored on every ``tscv``
    fold (training folds are resampled with ``smoteenn`` when ``RESAMPLE`` is
    true).

    NOTE(review): if ``final_estimator`` is the same object as one of the base
    estimators, ``set_params`` changes that base estimator too.
    """
    params = {
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 1e-8, 5, log=True),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.5, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 20),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "n_estimators": trial.suggest_int("n_estimators", 100, 3000),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "scale_pos_weight": trial.suggest_float("scale_pos_weight", 10, 100),
        "max_leaves": trial.suggest_int("max_leaves", 0, 1000),
        "grow_policy": trial.suggest_categorical(
            "grow_policy", ["depthwise", "lossguide"]
        ),
        "max_bin": trial.suggest_int("max_bin", 128, 512),
        "tree_method": trial.suggest_categorical(
            "tree_method", ["auto", "exact", "approx", "hist"]
        ),
    }

    stacked_model.final_estimator.set_params(**params)

    scores = []
    for train_index, test_index in tscv.split(X_train):
        X_fold_train, X_fold_test = X_train.iloc[train_index], X_train.iloc[test_index]
        y_fold_train, y_fold_test = y_train.iloc[train_index], y_train.iloc[test_index]

        if RESAMPLE:
            X_resampled, y_resampled = smoteenn.fit_resample(X_fold_train, y_fold_train)
        else:
            X_resampled, y_resampled = X_fold_train, y_fold_train

        stacked_model.fit(X_resampled, y_resampled)
        # Only hard predictions are scored; the probability pass over the
        # whole stack was computed but never used, so it is gone.
        score = precision_score(
            y_fold_test, stacked_model.predict(X_fold_test), zero_division=0
        )
        scores.append(score)

    return np.mean(scores)


# Tune the meta-model; the sampler is seeded so the trial sequence is repeatable.
study_meta = optuna.create_study(
    direction="maximize",
    study_name="MetaModel",
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
)
study_meta.optimize(tune_meta_model, n_trials=N_TRIALS, show_progress_bar=True)

# Set best parameters for meta-model
stacked_model.final_estimator.set_params(**study_meta.best_params)


# Train final model on the full training set, resampled the same way as the
# tuning folds when RESAMPLE is enabled (no change while RESAMPLE is False).
if RESAMPLE:
    X_final_train, y_final_train = smoteenn.fit_resample(X_train, y_train)
else:
    X_final_train, y_final_train = X_train, y_train
stacked_model.fit(X_final_train, y_final_train)

# Calibrate the already-fitted model (cv="prefit" only fits the isotonic
# calibrator, the stack itself is not refitted).
# NOTE(review): the calibrator is fitted on X_test / y_test, so the test set
# can no longer serve as an unbiased evaluation set for the calibrated model -
# consider a separate calibration split.
calibrated_model = CalibratedClassifierCV(
    estimator=stacked_model, method="isotonic", cv="prefit", n_jobs=-1
)
calibrated_model.fit(X_test, y_test)


# Nothing has been written to disk at this point, so report training rather
# than saving. The end time is formatted outside the f-string: reusing double
# quotes inside the replacement field is a SyntaxError before Python 3.12.
end_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print("Model trained successfully!")
print(f"Start time: {start_time}")
print(f"End time: {end_time}")
# %%




In [None]:
# Save selected features
import joblib

# joblib.dump does not create missing directories, so make sure the target exists.
os.makedirs("../saved_models", exist_ok=True)

# Selected feature names (needed to pick the same columns at inference time)
joblib.dump(selected_columns, "../saved_models/selected_columns_v6.pkl")

# Save the scaler
joblib.dump(scaler, "../saved_models/scaler_v6.pkl")

# Save the model
joblib.dump(stacked_model, "../saved_models/stacked_model_v6.pkl")

# Save the calibrated wrapper around the stacked model
joblib.dump(calibrated_model, "../saved_models/calibrated_model_v6.pkl")