In [None]:
import pandas as pd
import pandas as pd
import numpy as np
import warnings
import os
import sys
import plotly.express as px
import plotly.figure_factory as ff
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from pycaret.regression import *
np.random.seed(42)

In [None]:
# Silence every warning, but only when the interpreter was started without
# explicit -W options (sys.warnoptions is empty in that case).
if len(sys.warnoptions) == 0:
    warnings.simplefilter(action="ignore")
    warnings.filterwarnings(action="ignore")
    # Also record the setting in the process environment
    os.environ.update(PYTHONWARNINGS="ignore")

In [None]:
def confusion_matrix_per_year(df):
    """
    Show one annotated Plotly confusion-matrix heatmap for every year found in ``df``.

    For each distinct value of the ``year`` column, the rows of that year are
    selected, ``Movement`` (true labels) is compared against
    ``prediction_label`` (predicted labels) and the resulting matrix is displayed.

    :param df: DataFrame with the columns ``year``, ``Movement`` and ``prediction_label``
    :type df: pandas.DataFrame
    :return: None; the figures are displayed as a side effect
    """
    for current_year in df['year'].unique():
        # Rows belonging to the year being processed
        rows = df.loc[df['year'] == current_year]

        # Confusion matrix of true vs. predicted labels for that year
        matrix = confusion_matrix(rows['Movement'], rows['prediction_label'])

        # Annotated heatmap titled with the year, then rendered
        heatmap = ff.create_annotated_heatmap(matrix)
        heatmap.update_layout(title=f'Confusion Matrix for Year {current_year}')
        heatmap.show()

In [None]:
def confusion_matrix_per_company(df):
    """
    Show one annotated Plotly confusion-matrix heatmap per company.

    For each distinct, non-missing value of the ``Company`` column, the rows of
    that company are selected, ``Movement`` (true labels) is compared against
    ``prediction_label`` (predicted labels) and the resulting matrix is displayed.

    :param df: DataFrame with the columns ``Company``, ``Movement`` and ``prediction_label``
    :type df: pandas.DataFrame
    :return: None; the figures are displayed as a side effect
    """

    # Distinct companies. Missing values are skipped: a comparison with NaN
    # never matches a row, so there would be nothing to evaluate for them.
    companies = df['Company'].dropna().unique()

    # Loop through each company
    for company in companies:
        # Keep only the rows of the current company. The comparison must use
        # the scalar loop variable, not the whole ``companies`` array.
        df_company = df[df['Company'] == company]

        # Get the true and predicted labels for the current company
        y_true = df_company['Movement']
        y_pred = df_company['prediction_label']

        # Calculate the confusion matrix for the current company
        cm = confusion_matrix(y_true, y_pred)

        # Create a Plotly heatmap to visualize the confusion matrix
        fig = ff.create_annotated_heatmap(cm)

        # Update the layout of the heatmap to include a title
        fig.update_layout(title=f'Confusion Matrix for Company {company}')

        # Show the heatmap
        fig.show()

## Data Import and Preparation

In [None]:
# Load the final dataset: a UTF-8 encoded file whose fields are separated by "~"
df = pd.read_csv(
    "../Data/FinalDF/FinalDF.csv",
    sep="~",
    encoding="utf-8",
)

In [None]:
# Keep only trading days.
# .copy() turns the filtered result into an independent DataFrame, so the
# column assignments in the following cells modify it directly instead of a
# slice of the loaded frame (which triggers pandas' SettingWithCopyWarning).
df = df[df['Is Trading Day'] == 1].copy()
# Optional experiment, disabled: exclude one company from the data
#df = df[df['Company'] != "TESLA"]

In [None]:
# Convert the "Date" column to pandas datetime values; the input is expected
# to be formatted as YYYY-MM-DD (see the explicit format string)
df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')

In [None]:
# Encode the categorical columns as integers.
# Note: Series.map yields NaN for every value that is not a key of the mapping,
# so re-running this cell on already-encoded columns turns them into NaN.
company_map = dict(AMAZON=0, APPLE=1, TESLA=2, MICROSOFT=3)
movement_map = dict(Up=0, Down=1)

df['Company'] = df['Company'].map(company_map)

# Both movement columns share the same Up/Down encoding
for movement_column in ('Movement', 'PWD Movement'):
    df[movement_column] = df[movement_column].map(movement_map)

In [None]:
# Randomly split the rows into a training set and a validation set
# (val_size is the fraction of rows that goes to validation).
# NOTE(review): train_test_split shuffles by default, so this is a random
# split, not a chronological / TimeSeriesSplit-style one - confirm that this
# is intended for time-ordered data.
val_size=0.3

# A fixed random_state makes the split identical every time this cell runs,
# instead of depending on the current state of the global NumPy RNG.
train_data, val_data = train_test_split(df, test_size=val_size, random_state=42)

In [None]:
# Name of the label column the classifier has to predict
target = "Movement"

# Columns removed from both splits before modelling; the original note gives
# data-leakage avoidance as the reason. The target column itself is kept.
dropped_columns = ["Date", 'Close', "Adj Close", "Price Change", 'Is Trading Day']

class_df_train, class_df_val = (
    split.drop(columns=dropped_columns) for split in (train_data, val_data)
)

## Classification

In [None]:
from pycaret.classification import *

In [None]:
# Create the classification experiment environment.
# The result is stored under its own name: assigning it to `setup` would
# overwrite pycaret's setup() function and make this cell fail when re-run.
clf_experiment = setup(
    class_df_train,
    target=target,
    session_id=123,
    n_jobs=-1,
    fold=20,
    use_gpu=True,
    remove_multicollinearity=True,
    keep_features=['Company', 'year', "day", "month"],
    transformation=True,
)

In [None]:
# Compare the listed candidate estimators and keep the three best ones, ranked by AUC
top3 = compare_models(
    include=["knn", "svm", "rf", "et", "xgboost", "lightgbm"],
    sort='auc',
    n_select=3,
)

In [None]:
# Tune each of the three selected models, optimising AUC
tuned_top3 = [
    tune_model(
        candidate,
        optimize='auc',
        search_library="scikit-optimize",
        early_stopping=True,
    )
    for candidate in top3
]

In [None]:
# Select the best model of the classification experiment, judged by AUC
best_model= automl(optimize = 'auc')

In [None]:
# Find the optimal classification threshold for the best model; the returned
# model is the one that gets finalized next
optimal_threshold_model = optimize_threshold(best_model)

In [None]:
# Retrain the threshold-optimized model on the data including pycaret's
# hold-out (test) split
final_model = finalize_model(optimal_threshold_model)

# Copy the metrics table of the most recent pycaret call (the training results)
# NOTE(review): pull() depends on which pycaret call ran last - run the cell as a whole
trainning_results = pull()

In [None]:
# Display the training results captured with pull() after finalize_model
trainning_results

In [None]:
# Use the finalized model to predict on the validation set (the split that was
# not passed to setup)
predictions = predict_model(final_model, data=class_df_val)

In [None]:
# Capture the metrics table of the most recent pycaret call, expected to be the
# predict_model call on the validation set - run this cell right after it
#evaluate_model(final_model)
validaton_results = pull()

In [None]:
# Display the validation metrics captured with pull()
validaton_results

In [None]:
# Show the parameters of the final model
plot_model(final_model, plot='parameter')

In [None]:
# Plot the feature importance for all features
plot_model(final_model, plot='feature_all')

In [None]:
# Plot the precision-recall curve
plot_model(final_model, plot='pr')

In [None]:
# Plot the confusion matrix of the final model
# NOTE(review): the original comment calls this the "CV" confusion matrix;
# confirm which data plot_model scores on (cross-validation folds vs. hold-out
# split), since the finalized model was also trained on the hold-out data
plot_model(final_model, plot='confusion_matrix')

In [None]:
# Plot the classification report
plot_model(final_model, plot='class_report')

In [None]:
# Plot the learning curve
plot_model(final_model, plot='learning')

In [None]:
# Interpret the last element of final_model (presumably the fitted estimator
# at the end of the pipeline)
# NOTE(review): interpret_model may not support every estimator type that
# compare_models can select - confirm for the chosen model
interpret_model(final_model[-1])

In [None]:
# Alias for the finalized model; not referenced again in the cells shown here
pipeline = final_model

In [None]:
# Display the validation-set predictions returned by predict_model
predictions

In [None]:
# Per-year confusion matrices for the validation predictions, written inline.
# y_true and y_pred stay notebook-level names because later cells display
# them; after the loop they hold the labels of the last year processed.
years = predictions['year'].unique()

for year in years:
    # Restrict the predictions to the year being processed
    predictions_year = predictions.loc[predictions['year'] == year]

    # True vs. predicted labels of that year
    y_true, y_pred = predictions_year['Movement'], predictions_year['prediction_label']

    # Confusion matrix -> annotated heatmap with a year-specific title
    cm = confusion_matrix(y_true, y_pred)
    fig = ff.create_annotated_heatmap(cm)
    fig.update_layout(title=f'Confusion Matrix for Year {year}')

    # Render the figure
    fig.show()

In [None]:
# Show one confusion-matrix heatmap per company for the validation predictions
confusion_matrix_per_company(predictions)

In [None]:
# Show one confusion-matrix heatmap per year for the validation predictions
confusion_matrix_per_year(predictions)

In [None]:
# Inspect y_true as left behind by the inline per-year loop cell
# (the true labels of the last year it processed)
y_true

In [None]:
# Inspect y_pred as left behind by the inline per-year loop cell
# (the predicted labels of the last year it processed)
y_pred

In [None]:
# Count the missing values in each column of the predictions frame
predictions.isnull().sum()

In [None]:
# Count rows with a missing Company; company names that are not keys of
# company_map become NaN in the mapping cell
df['Company'].isnull().sum()

In [None]:
# Display the validation-set predictions again (same frame as shown earlier)
predictions

In [None]:
# Open the dashboard for the last element of final_model (presumably the
# fitted estimator at the end of the pipeline)
dashboard(final_model[-1])