In [1]:
import sys
import os
from datetime import datetime
import mlflow
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
import mlflow
import os
import json
sys.path.insert(1, '../')


from src.utils import my_yf_download, get_sp500_tickers
from src.models.predictive_sma20_crossover_model import PredictiveSma20CrossoverModel
from src.models.predictive_macd_crossover_model import PredictiveMacdCrossoverModel
from src.models.bollinger_bands_metalabel import BollingerBandsMetalabel
from src.models.rolling_precision_recall_model import RollingPrecisionRecallModel
from src.data_processing import check_if_today_starts_with_vertical_green_overlay
from src.cache_utils import load_model
from src.plotting_utils import (
    plot_candlesticks,
)

In [2]:
# Ticker universe for this notebook: the download cell iterates over all of it,
# and the scan cell uses only the first 50 entries.
# NOTE(review): presumably a list of S&P 500 symbols — confirm in src.utils.
tickers = get_sp500_tickers()

### Download data up front so training runs faster

In [3]:
# Use today's date (YYYY-MM-DD) as the end of the download window.
today = datetime.today()
end_date = today_str = today.strftime('%Y-%m-%d')

# Fetch every ticker once, passing "../cache" as the second argument.
# NOTE(review): presumably my_yf_download stores its result there so later
# cells can reuse it — confirm in src.utils.
for ticker in tickers:
    my_yf_download(ticker, "../cache", end=end_date)

### Select strategy

In [4]:
# Names of the strategy classes imported in the first cell.
model_names = [
    "RollingPrecisionRecallModel",
    "BollingerBandsMetalabel",
    "PredictiveMacdCrossoverModel",
    "PredictiveSma20CrossoverModel",
]

# Pick the strategy by position in model_names and resolve the name to the
# class object through the notebook's global namespace.
notebook_namespace = globals()
selected_model_class = notebook_namespace[model_names[3]]

In [5]:
# Second argument handed to model.run_test(...) in the scan cell.
# NOTE(review): this is a string, not an int — confirm run_test expects that.
last_n_days = "30"
# Second argument handed to the model constructor in the scan cell.
# NOTE(review): presumably the cutoff date for the training data — confirm.
train_until = "2019-01-01"

### Scan

In [6]:

# Directory for saving artifacts of this run
os.makedirs("mlflow_artifacts", exist_ok=True)
# Tickers that pass the score thresholds and the overlay check below.
mlist = []

# too large text to log
keys_to_exclude = ['x_train', 'x_test', 'df_test']

# Result keys that must all exceed 0.6 before a ticker is test-scanned.
# The order matches the original short-circuit evaluation order.
score_keys = ("train_accuracy", "test_accuracy", "test_precision", "train_precision")

# Label the parent run with the strategy that is actually trained
# (selected_model_class) rather than a fixed entry of model_names, which
# mislabeled the run whenever another strategy was selected.
with mlflow.start_run(run_name=f'Main Training Run for model {selected_model_class.__name__}') as parent_run:
    # only first 50 stocks
    for so in tickers[:50]:
        with mlflow.start_run(run_name=f'Training for {so}', nested=True):
            model = selected_model_class(so, train_until, data_source="yf")
            # Train model results
            model_results = model.run_train()
            mlflow.log_param("Stock", so)

            # Log the individual components of model_results; non-dict results
            # are converted through their to_dict() method first.
            if not isinstance(model_results, dict):
                model_results = model_results.to_dict()
            for key, value in model_results.items():
                if key in keys_to_exclude:
                    continue
                if isinstance(value, (pd.Series, pd.DataFrame)):
                    # Only the first 10 entries of a pandas object are logged.
                    array_str = np.array2string(value.values[:10], separator=', ')
                    mlflow.log_param(key, array_str)
                else:
                    mlflow.log_param(key, value)

            # Write one classification report per split and attach it to the nested run.
            for phase, prefix in [("y_train", "train"), ("y_test", "test")]:
                y_true = model_results[phase]
                y_pred = model_results[f"{phase}_pred"]
                report = classification_report(y_true, y_pred)
                # The timestamp keeps report files from different runs apart.
                timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
                report_file = f"mlflow_artifacts/classification_report_{prefix}_{so}_{timestamp}.txt"
                with open(report_file, "w") as f:
                    f.write(report)
                mlflow.log_artifact(report_file)

            # Only tickers whose train and test scores all clear 0.6 are test-scanned.
            if all(model_results[score_key] > 0.6 for score_key in score_keys):
                df_test = model.run_test(so, last_n_days, data_source="yf")
                if check_if_today_starts_with_vertical_green_overlay(df_test):
                    mlist.append(so)

    # Record the shortlisted tickers on the parent run.
    mlflow.log_param("mlist", mlist)
            
# Render and log a candlestick chart of the test window for each scanned ticker.
# The run is labeled with the strategy actually in use (selected_model_class)
# rather than a fixed entry of model_names, which mislabeled the run.
with mlflow.start_run(run_name=f'Main Testing Run for model {selected_model_class.__name__}') as parent_run:
    for so in tickers[:50]:
        # A fresh model instance is built per ticker and run_test is called
        # without run_train in this loop.
        # NOTE(review): presumably run_test reuses a previously trained/cached
        # model — confirm in the model classes.
        model = selected_model_class(so, train_until, data_source="yf")
        df_test = model.run_test(so, last_n_days, data_source="yf")
        # The timestamp keeps chart files from different runs apart.
        timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        chart_file = f"mlflow_artifacts/plot_test_{so}_{timestamp}.html"
        plot_candlesticks(df_test, chart_file)
        mlflow.log_artifact(chart_file)


Column 'Date' converted to datetime.


2024-01-19 23:12:16.602 
  command:

    streamlit run C:\Users\doraemon\anaconda3\envs\rfenv\lib\site-packages\ipykernel_launcher.py [ARGUMENTS]


Column 'Date' converted to datetime.
Column 'Date' converted to datetime.
Column 'Date' converted to datetime.
Column 'Date' converted to datetime.
Column 'Date' converted to datetime.
Column 'Date' converted to datetime.
Column 'Date' converted to datetime.
Column 'Date' converted to datetime.
Column 'Date' converted to datetime.
Column 'Date' converted to datetime.
Column 'Date' converted to datetime.
Column 'Date' converted to datetime.
Column 'Date' converted to datetime.
Column 'Date' converted to datetime.
Column 'Date' converted to datetime.
Column 'Date' converted to datetime.
Column 'Date' converted to datetime.
Column 'Date' converted to datetime.
Column 'Date' converted to datetime.
Column 'Date' converted to datetime.
Column 'Date' converted to datetime.
Column 'Date' converted to datetime.
Column 'Date' converted to datetime.
Column 'Date' converted to datetime.
Column 'Date' converted to datetime.
Column 'Date' converted to datetime.
Column 'Date' converted to datetime.
C