In [None]:
# Imports
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import openpyxl
import warnings
import sys
import os

np.random.seed(42)

In [None]:
# Silence warnings for the whole notebook, unless the interpreter was started
# with explicit -W options (in that case the user's choice is left alone).
if len(sys.warnoptions) == 0:
    # Blanket "ignore" filters for the current process
    warnings.simplefilter("ignore")
    warnings.filterwarnings("ignore")
    # Same policy exported through the environment variable that Python reads at start-up
    os.environ["PYTHONWARNINGS"] = "ignore"

In [None]:
def transform_numeric_columns(df, exclude_cols=None):
    """
    Apply a Yeo-Johnson power transform to the numeric columns of a DataFrame.

    :param df: Input data; it is copied, never modified
    :type df: pandas.DataFrame
    :param exclude_cols: Column names that must keep their original values; ``None`` excludes nothing
    :type exclude_cols: list or None
    :return: Copy of ``df`` in which every int64/float64 column that is not excluded holds the transformed values
    :rtype: pandas.DataFrame
    """
    # Work on a copy so the caller's DataFrame stays untouched
    df_transformed = df.copy()

    # Treat a missing exclusion list as "exclude nothing"
    excluded = () if exclude_cols is None else exclude_cols

    # Candidate columns: int64/float64 only, minus the excluded ones
    num_cols = df_transformed.select_dtypes(include=['int64', 'float64']).columns
    cols_to_transform = [col for col in num_cols if col not in excluded]

    # PowerTransformer cannot be fitted on zero columns or zero rows;
    # in that case there is nothing to transform and the plain copy is returned
    if not cols_to_transform or len(df_transformed) == 0:
        return df_transformed

    # Fit and apply the transformer on the selected columns in one step
    pt = PowerTransformer(method='yeo-johnson')
    df_transformed[cols_to_transform] = pt.fit_transform(df_transformed[cols_to_transform])

    return df_transformed


In [None]:
def sbgrp_performance(df, groupby_col, label_map, true_col='Movement', pred_col='prediction_label'):
    """
    Draw one confusion-matrix heatmap per unique value of ``groupby_col``, side by side in a single
    figure, and put accuracy, AUC, precision, recall and F1 of that subgroup in each subplot title.

    :param df: DataFrame containing true labels, predicted labels and the groupby column
    :type df: pandas.DataFrame
    :param groupby_col: Column name to group data by
    :type groupby_col: str
    :param label_map: Dictionary that maps the integer labels 0..len(label_map)-1 to string labels
    :type label_map: dict
    :param true_col: Column holding the true labels
    :type true_col: str
    :param pred_col: Column holding the predicted labels
    :type pred_col: str
    :return: None; the figure is shown with ``plt.show()``. Nothing is drawn when there are no groups.
    """

    # Unique, non-missing group values in ascending order
    groups = sorted(df[groupby_col].dropna().unique())

    # A figure with zero columns cannot be created, so there is nothing to plot
    if not groups:
        return

    # Class ids and their display names are the same for every group
    class_ids = list(range(len(label_map)))
    class_names = [label_map[class_id] for class_id in class_ids]

    # squeeze=False keeps `axs` two-dimensional even when there is a single group
    fig, axs = plt.subplots(1, len(groups), figsize=(5 * len(groups), 5), squeeze=False)

    for ax, group in zip(axs[0], groups):
        # Rows that belong to the current group
        df_group = df[df[groupby_col] == group]
        y_true = df_group[true_col]
        y_pred = df_group[pred_col]

        # Passing `labels` fixes the matrix shape, even if a class is missing from this group
        cm = confusion_matrix(y_true, y_pred, labels=class_ids)
        df_cm = pd.DataFrame(cm, columns=class_names, index=class_names)

        # Heatmap of the confusion matrix on the subplot of this group
        sns.heatmap(df_cm, annot=True, fmt='d', annot_kws={"size": 12}, cmap='Greens', cbar=False, ax=ax)

        # Performance metrics for the current group
        acc = accuracy_score(y_true, y_pred)
        try:
            auc = roc_auc_score(y_true, y_pred)
        except ValueError:
            # AUC cannot be computed for this group (e.g. only one class present); show "nan"
            auc = float('nan')
        precision = precision_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred)

        # Title with the metrics, plus axis labels
        ax.set_title(f'Confusion Matrix for {groupby_col} {group}\n Accuracy: {acc:.2f}, AUC: {auc:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}, F1: {f1:.2f}')
        ax.set_xlabel('Predicted Label')
        ax.set_ylabel('True Label')

    # Show the figure with all subplots
    plt.show()

### Data Import and Preparation

In [None]:
# Load the prepared dataset; the file is "~"-separated and UTF-8 encoded
df = pd.read_csv("../Data/FinalDF/FinalDF.csv", sep="~", encoding="utf-8")

In [None]:
# Columns that are passed through without being power-transformed
cols_to_exclude = [
    'Company', 'year', "day", "month", 'Is Trading Day',
    "day_of_week", "day_of_year", "quarter", 'cos_day', 'sin_day',
]

# Power-transform every other numeric feature of the dataset.
# NOTE(review): the transformer is fitted on the complete frame, i.e. before the
# train/validation split made later in the notebook, so validation rows take part in the fit.
df = transform_numeric_columns(df, cols_to_exclude)

In [None]:
# Parse the "Date" strings (YYYY-MM-DD) into pandas datetime values
df["Date"] = pd.to_datetime(df["Date"], format="%Y-%m-%d")

In [None]:
# Integer encodings for companies and price movements, together with the
# inverted dictionaries that are used later to turn codes back into names
company_map = {'AMAZON': 0, 'APPLE': 1, 'TESLA': 2, 'MICROSOFT': 3}
movement_map = {"Up": 1, "Down": 0}

inverted_company_map = dict(zip(company_map.values(), company_map.keys()))
inverted_movement_map = dict(zip(movement_map.values(), movement_map.keys()))

# Encode the label columns; a value that is missing from its map becomes NaN
df['Company'] = df['Company'].map(company_map)
df['Movement'] = df['Movement'].map(movement_map)
df['PWD Movement'] = df['PWD Movement'].map(movement_map)

In [None]:
# Hold out a random 30% of the rows as validation data.
# random_state pins the shuffle, so re-running this cell reproduces the same split
# instead of depending on the current state of NumPy's global random generator.
# NOTE(review): train_test_split shuffles the rows, so this is not a chronological
# (time-series) split - confirm that this is intended for dated data.
val_size = 0.3

train_data, val_data = train_test_split(df, test_size=val_size, random_state=42)

In [None]:
target = "Movement"

# Remove the date and the price-derived columns from both splits to avoid data leakage;
# the target column itself stays in the frames
class_df_train, class_df_val = (
    split.drop(columns=["Date", 'Close', "Adj Close", "Price Change"])
    for split in (train_data, val_data)
)

## Classification

In [None]:
from pycaret.classification import *

In [None]:
# Create the classification experiment environment.
# The result gets its own name: binding it to `setup` would overwrite pycaret's
# setup() function, and running this cell a second time would then fail.
clf_experiment = setup(class_df_train, target=target, session_id=124, n_jobs=-1, fold=20, use_gpu=True,
                       feature_selection=True, fix_imbalance=True,
                       keep_features=['Company', 'year', "day", "month", "day_of_week", "day_of_year", "quarter", 'cos_day', 'sin_day'])

In [None]:
# Train the listed model families and keep the 8 best ones, ranked by accuracy
top_models = compare_models(
    include=["knn", "mlp", "rf", "et", "lr", "lightgbm", 'ridge', 'rbfsvm'],
    sort='acc',
    n_select=8,
)

In [None]:
# Fetch the latest score grid (the model comparison) and store it as a spreadsheet
top_models_table = pull()
top_models_table.to_excel("../Predictions/top_models.xlsx")

In [None]:
# Score every selected model with predict_model() and collect the resulting metric grids
metric_columns = ['Model', 'Accuracy', 'AUC', 'Recall', 'Prec.', 'F1', 'Kappa', 'MCC']
holdout_grids = []

for model in top_models:
    # The returned predictions are not needed here; the call produces the grid that pull() fetches
    predict_model(model)
    holdout_grids.append(pull())

# Concatenate once, after the loop: this avoids re-copying the table on every iteration
# and the deprecated concatenation with an empty frame
if holdout_grids:
    test_df = pd.concat(holdout_grids, ignore_index=True, sort=False)
    # Expected metric columns first, any additional columns after them
    extra_columns = [col for col in test_df.columns if col not in metric_columns]
    test_df = test_df.reindex(columns=metric_columns + extra_columns)
else:
    test_df = pd.DataFrame(columns=metric_columns)

test_df.to_excel("../Predictions/test_top_models.xlsx")

In [None]:
# Tune top 3 models
#tuned_top3 = [tune_model(i, optimize = 'acc', early_stopping=True) for i in top_models]

In [None]:
# Ensemble Top Model
#bagged_model=ensemble_model(top_models[0], method = 'Bagging', choose_better=True, optimize='auc')

In [None]:
# Blend top 3 models
#blender = blend_models(top_models, choose_better=True, optimize='auc')

In [None]:
# Find the best model of the classification experiment
#best_model=automl(optimize = 'acc')

In [None]:
# Retrain every selected model so that the test data is used for training as well
# (earlier single-model variant kept for reference:)
#final_model=finalize_model(best_model)

# Finalized models, in the same order as top_models
list_of_models = []

for model in top_models:
    final_model = finalize_model(model)
    list_of_models.append(final_model)
# NOTE: after the loop `final_model` refers to the LAST finalized model, not the first one.
# Copy the most recent score grid (the final training results)
trainning_results=pull()

In [None]:
# Display the first row of the score grid that was pulled after finalizing the models
trainning_results.iloc[0]

In [None]:
# Show the hyper-parameters of the first finalized model - the one whose predictions
# and feature importances are analysed in the rest of the notebook.
# (`final_model` is not used here: after the finalize loop it refers to the last model in the list.)
parameters = plot_model(list_of_models[0], plot='parameter')

In [None]:
# Use the model to predict on the validation set
#predictions = predict_model(final_model, data=class_df_val)

In [None]:
#evaluate_model(final_model)
# Score every finalized model on the validation split and collect the metric grids
metric_columns = ['Model', 'Accuracy', 'AUC', 'Recall', 'Prec.', 'F1', 'Kappa', 'MCC']
validation_grids = []

for position, model in enumerate(list_of_models):
    predictions = predict_model(model, data=class_df_val)
    validation_grids.append(pull())
    # Keep the predictions of the first model for the detailed analysis in the following cells
    if position == 0:
        predictions2 = predictions

# Concatenate once, after the loop: this avoids re-copying the table on every iteration
# and the deprecated concatenation with an empty frame
if validation_grids:
    validation_df = pd.concat(validation_grids, ignore_index=True, sort=False)
    # Expected metric columns first, any additional columns after them
    extra_columns = [col for col in validation_df.columns if col not in metric_columns]
    validation_df = validation_df.reindex(columns=metric_columns + extra_columns)
else:
    validation_df = pd.DataFrame(columns=metric_columns)

validation_df.to_excel("../Predictions/validation_top_models.xlsx")

In [None]:
# Plot the feature importance of the first finalized model
# NOTE(review): presumably this plot needs an estimator that exposes importances or
# coefficients - confirm that the first model in list_of_models supports it.
plot_model(list_of_models[0], plot='feature')


In [None]:
# Keep only the trading-day rows of the first model's validation predictions.
# .copy() makes `predictions` an independent frame, so the column assignment done in a
# later cell does not write into a slice of `predictions2` (SettingWithCopyWarning).
predictions = predictions2[predictions2['Is Trading Day'] == 1].copy()

In [None]:
# Turn the integer company codes back into company names for the confusion matrices.
# Values that are not codes (for example names that were already translated by an earlier
# run of this cell) are kept as they are instead of being replaced by NaN.
predictions['Company'] = predictions['Company'].map(lambda code: inverted_company_map.get(code, code))

In [None]:
# Confusion matrix and headline metrics for all validation predictions
y_true = predictions['Movement']
y_pred = predictions['prediction_label']

# Fix the class order explicitly so the matrix always has one row/column per label name,
# even if a class does not occur in the data
class_ids = list(range(len(inverted_movement_map)))
columns = [inverted_movement_map[class_id] for class_id in class_ids]

cm = confusion_matrix(y_true, y_pred, labels=class_ids)
df_cm = pd.DataFrame(cm, columns=columns, index=columns)

# Performance metrics
acc = accuracy_score(y_true, y_pred)
auc = roc_auc_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

# Heatmap of the confusion matrix on an explicitly created axes
fig, ax = plt.subplots()
sns.heatmap(df_cm, annot=True, fmt='d', annot_kws={"size": 12}, cmap='Greens', cbar=False, ax=ax)

# Title with the metrics, plus axis labels
ax.set_title(f'Confusion matrix for unseen data\n Accuracy: {acc:.2f}  AUC: {auc:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}, F1: {f1:.2f}')
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')

# Show the heatmap
plt.show()

In [None]:
# One confusion matrix (with metrics in the title) per company in the validation predictions

# Class order and display names are the same for every company
class_ids = list(range(len(inverted_movement_map)))
columns = [inverted_movement_map[class_id] for class_id in class_ids]

for company in predictions['Company'].unique():
    predictions_per_company = predictions[predictions['Company'] == company]
    y_true = predictions_per_company['Movement']
    y_pred = predictions_per_company['prediction_label']

    # Passing `labels` fixes the matrix shape, even if a class is missing for this company
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)
    df_cm = pd.DataFrame(cm, columns=columns, index=columns)

    # Performance metrics for the current company
    acc = accuracy_score(y_true, y_pred)
    try:
        auc = roc_auc_score(y_true, y_pred)
    except ValueError:
        # AUC cannot be computed for this subset (e.g. only one class present); show "nan"
        auc = float('nan')
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    # A fresh figure per company, drawn on its own axes
    fig, ax = plt.subplots()
    sns.heatmap(df_cm, annot=True, fmt='d', annot_kws={"size": 12}, cmap='Greens', cbar=False, ax=ax)

    # Title with the metrics, plus axis labels
    ax.set_title(f'Confusion matrix for unseen data - {company} \n Accuracy: {acc:.2f}  AUC: {auc:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}, F1: {f1:.2f}')
    ax.set_xlabel('Predicted Label')
    ax.set_ylabel('True Label')

    # Show the heatmap (plt.show() returns None, so it is not printed) and release the figure
    plt.show()
    plt.close(fig)

In [None]:
# One confusion matrix (with metrics in the title) per year in the validation predictions

# Class order and display names are the same for every year
class_ids = list(range(len(inverted_movement_map)))
columns = [inverted_movement_map[class_id] for class_id in class_ids]

# Years are processed in ascending order so the figures appear chronologically
for year in sorted(predictions['year'].unique()):
    predictions_per_year = predictions[predictions['year'] == year]
    y_true = predictions_per_year['Movement']
    y_pred = predictions_per_year['prediction_label']

    # Passing `labels` fixes the matrix shape, even if a class is missing for this year
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)
    df_cm = pd.DataFrame(cm, columns=columns, index=columns)

    # Performance metrics for the current year
    acc = accuracy_score(y_true, y_pred)
    try:
        auc = roc_auc_score(y_true, y_pred)
    except ValueError:
        # AUC cannot be computed for this subset (e.g. only one class present); show "nan"
        auc = float('nan')
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    # A fresh figure per year, drawn on its own axes
    fig, ax = plt.subplots()
    sns.heatmap(df_cm, annot=True, fmt='d', annot_kws={"size": 12}, cmap='Greens', cbar=False, ax=ax)

    # Title with the metrics, plus axis labels
    ax.set_title(f'Confusion matrix for unseen data - {year} \n Accuracy: {acc:.2f}  AUC: {auc:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}, F1: {f1:.2f}')
    ax.set_xlabel('Predicted Label')
    ax.set_ylabel('True Label')

    # Show the heatmap (plt.show() returns None, so it is not printed) and release the figure
    plt.show()
    plt.close(fig)

In [None]:
# Save the trading-day validation predictions as a spreadsheet
predictions.to_excel("../Predictions/predictions.xlsx")

In [None]:
# Display the predictions frame as the cell output
predictions

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, precision_score, f1_score, cohen_kappa_score, matthews_corrcoef, classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer

# Actual and predicted classes of the trading-day validation rows
y_true = predictions['Movement']
y_pred = predictions['prediction_label']

# Per-class and averaged precision / recall / F1 as a nested dictionary
report = classification_report(y_true, y_pred, output_dict=True)
macro_avg = report['macro avg']
weighted_avg = report['weighted avg']

# Headline metrics; 'micro' calculates the metric globally over all samples
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='micro')
recall = recall_score(y_true, y_pred, average='micro')
f1 = f1_score(y_true, y_pred, average='micro')
kappa = cohen_kappa_score(y_true, y_pred)
mcc = matthews_corrcoef(y_true, y_pred)

# ROC AUC is computed on binarized labels (the form needed for the multiclass case);
# the binarizer is fitted on the true labels and then applied to the predictions
lb = LabelBinarizer()
y_true_bin = lb.fit_transform(y_true)
y_pred_bin = lb.transform(y_pred)

auc, auc_micro, auc_weighted = (
    roc_auc_score(y_true_bin, y_pred_bin, average=averaging)
    for averaging in ('macro', 'micro', 'weighted')
)

# Single-row table with all metrics
results = pd.DataFrame([{
    "Model": "lgbm",  # Replace with your model name
    "Accuracy": accuracy,
    "AUC": auc,
    "AUC Micro": auc_micro,
    "AUC Weighted": auc_weighted,
    "Recall": recall,
    "Precision": precision,
    "F1": f1,
    "Kappa": kappa,
    "MCC": mcc,
    "Macro Precision": macro_avg['precision'],
    "Macro Recall": macro_avg['recall'],
    "Macro F1": macro_avg['f1-score'],
    "Weighted Recall": weighted_avg['recall'],
    "Weighted Precision": weighted_avg['precision'],
    "Weighted F1": weighted_avg['f1-score'],
}])

results.to_excel("../Predictions/trading_days_validation_lgbm_results.xlsx")

In [None]:
# Display the single-row metrics table as the cell output
results

In [None]:
# Check class distribution: relative frequency of each value in y_true
# (y_true holds the 'Movement' column of `predictions`, as assigned in the metrics cell)
class_distribution = y_true.value_counts(normalize=True)

print(class_distribution)
