# Imports

In [None]:
# !pip install yfinance
# !pip install pmdarima
# !pip install hyperopt
# !pip install xgboost
# !pip install numpy -U

In [2]:
import os
from functools import partial
from datetime import datetime

import typing
from typing import Dict
from typing import Any
from typing import Tuple

import numpy as np
import pandas as pd
import sklearn

import utils
import pipelines
import processing
import evaluate
import predict
import train
import preprocessing

# Download stock daily prices & indexes

In [2]:
### Build the ticker universe, filtered by market-cap bounds and with the listed industries removed.
### The bounds are presumably in millions (per the `_mm` suffix) - TODO confirm against utils.get_ticker_names.
market_cap_min_mm = 1000
# None is passed through as the upper bound; presumably meaning "no upper limit" - TODO confirm
market_cap_max_mm = None

ticker_list = utils.get_ticker_names(
    market_cap_min_mm=market_cap_min_mm,
    market_cap_max_mm=market_cap_max_mm,
    remove_industries=[
        # "diversified_financials",
        "pharmaceuticals_biotechnology_and_life_sciences",
    ],
)

In [3]:
### Option A (disabled): pin a specific end date, given as (year, month, day) - the example below is 18 January 2021
# date_to = datetime(2021, 1, 18)
### Option B (active): use today's date as the end date
date_to = datetime.today()
### How many years of data to download, counting backwards from date_to. May be a fractional number of years
period_years = 5.1

In [4]:
# Full ticker universe first; no fname_prefix is passed, so the downloader's default is used.
_, _ = utils.download_stonk_prices(
    ticker_list.index, period_years=period_years, date_to=date_to
)

# Market indexes and futures as (symbol, fname_prefix) pairs, downloaded in this order.
_index_downloads = [
    ("^VIX", "vix"),
    ("^GSPC", "sp500"),
    ("CL=F", "oil"),
    ("DX=F", "usd"),
    ("^TNX", "yield"),
    ("HG=F", "copper"),
]
for _symbol, _prefix in _index_downloads:
    # The two return values are intentionally discarded, as for the main download above.
    _, _ = utils.download_stonk_prices(
        [_symbol], period_years=period_years, date_to=date_to, fname_prefix=_prefix
    )

[*********************100%***********************]  2442 of 2442 completed

25 Failed downloads:
- JOBS: No data found, symbol may be delisted
- FB: No data found, symbol may be delisted
- RLGY: No data found, symbol may be delisted
- POST WI: No data found, symbol may be delisted
- BLL: No data found, symbol may be delisted
- BIP.PRB: No data found, symbol may be delisted
- MIME: No data found, symbol may be delisted
- O.WI: No data found, symbol may be delisted
- OCDX: No data found, symbol may be delisted
- BIP.PRA: No data found, symbol may be delisted
- SNX.WI: No data found, symbol may be delisted
- APSG: No data found, symbol may be delisted
- TSC: No data found, symbol may be delisted
- EPAY: No data found, symbol may be delisted
- ANAT: No data found, symbol may be delisted
- RXN WI: No data found, symbol may be delisted
- T WD: No data found, symbol may be delisted
- NCBS: No data found, symbol may be delisted
- DELL WI: No data found, symbol may be delisted
- CERN: No data f

# Run data pipeline

In [19]:
# Industries to process in the live pipeline run below. Commented-out entries are
# kept as a menu of known industry names; uncomment to include them.
industries = [
    # 'health_care_equipment_and_services',
    # 'software_and_services',
    # 'retailing',
    # 'telecommunication_services',
    # "capital_goods",
    # "energy",
    # # 'pharmaceuticals_biotechnology_and_life_sciences',
    # 'consumer_staples',
    # 'banks',
    # 'diversified_financials',
    # 'metals_and_mining',
    # 'technology_hardware_and_equipment',
    # 'utilities',
    # 'chemicals',
    # 'automobiles_and_components',
    # "semiconductors_and_semiconductor_equipment",
    # 'media_and_entertainment',
    # 'real_estate',
    # 'consumer_services',
    # 'consumer_durables_and_apparel',
    # 'insurance',
    # 'transportation',
    # 'commercial_and_professional_services',
    "paper_and_forest_products",
    "containers_and_packaging",
    "construction_materials",
]

# Feature-pipeline settings, forwarded unchanged to
# pipelines.process_features_from_price_data in the loop cell below.
# NOTE(review): units and meaning are defined by that function - confirm there before tuning.
l_reg = 3
l_roll = 2
dt = 10
last_residual_cutoff = 2.5
adf_pval_cutoff = 0.1
adf_pass_rate_filter = 0.5
mean_max_residual_dt = 21
arima_forecast_months = 3
arima_eval_models = 5

# Output directory name encodes the market-cap range, e.g. "pipeline_run_1000_to_max_cap".
# market_cap_min_mm / market_cap_max_mm come from the ticker-universe cell above.
market_cap_max_string = "max" if market_cap_max_mm is None else str(market_cap_max_mm)
pipeline_dir = (
    "pipeline_run_" + str(market_cap_min_mm) + "_to_" + market_cap_max_string + "_cap"
)
output_dir = os.path.join("data", pipeline_dir)

# Prediction model wrapper, constructed with its default arguments
stonk_model = predict.XGBStonkModel()

# Market index data; indexed by name (e.g. "vix") in the loop cell below
market_indexes = utils.get_market_indexes()

In [22]:
# Run the feature pipeline and the prediction model for every configured industry,
# writing one set of CSV artifacts per industry into `output_dir`.

# DataFrame.to_csv does not create missing parent directories, so make sure the
# output directory exists before the first write.
os.makedirs(output_dir, exist_ok=True)

datasets = []
total_industries = len(industries)
for i, industry in enumerate(industries, start=1):
    stonks = utils.get_stonk_data(
        market_cap_min_mm,
        market_cap_max_mm,
        remove_industries=[industry],
        filter_industries=True,
    )
    X, Y = processing.combine_stonk_pairs(stonks)

    print("Industry ({0}/{1}): {2}".format(i, total_industries, industry))

    features = pipelines.process_features_from_price_data(
        X=X,
        Y=Y,
        market_indexes=market_indexes,
        l_reg=l_reg,
        l_roll=l_roll,
        dt=dt,
        last_residual_cutoff=last_residual_cutoff,
        adf_pval_cutoff=adf_pval_cutoff,
        adf_pass_rate_filter=adf_pass_rate_filter,
        mean_max_residual_dt=mean_max_residual_dt,
        arima_forecast_months=arima_forecast_months,
        arima_eval_models=arima_eval_models,
    )

    # An empty result means the pipeline produced nothing to trade for this industry.
    if len(features) == 0:
        print("No trades")
        continue

    print(
        "Mean max value for {0}: {1}".format(industry, features["residuals_max_mean"])
    )
    dataset = utils.build_dataset_from_live_data_by_industry(
        std_residuals=features["std_residuals"],
        adfs=features["adfs"],
        subindustry=industry,
        mean_max_residual=features["residuals_max_mean"],
        # VIX value at the last column label of the price table
        vix_index=market_indexes["vix"].loc[stonks.columns[-1]],
        betas_stability_rsquared=features["beta_stability_rsquared_vals"],
        arima_forecasts=features["arima_forecasts"],
    )

    predictions, df_processed = stonk_model.predict(dataset)
    datasets.append((dataset, df_processed))
    predictions = pd.DataFrame(predictions)
    predictions.index = features["adfs"].index

    # Prepend the dates as the first column of the residuals and betas tables.
    features["residuals"].insert(0, "dates", features["dates_index"].values)
    features["betas"].insert(0, "dates", features["dates_index"].values)

    # (table, file suffix, write header?) - written in this order; only the
    # correlations table keeps its header row.
    csv_outputs = [
        (features["residuals"], "_residuals.csv", False),
        (features["betas"], "_betas.csv", False),
        (features["adfs_raw"], "_adfs_raw.csv", False),
        (predictions, "_predictions.csv", False),
        (features["arima_forecasts"], "_arima.csv", False),
        (features["beta_stability_rsquared_vals"], "_rsquared.csv", False),
        (features["market_correlations"], "_correlations.csv", True),
    ]
    for frame, suffix, with_header in csv_outputs:
        frame.to_csv(
            os.path.join(output_dir, industry + suffix),
            header=with_header,
            index=True,
        )

print("*** All done ***")

Industry (1/3): paper_and_forest_products
Industry (2/3): containers_and_packaging
Mean max value for containers_and_packaging: 2.8289999961853027
Industry (3/3): construction_materials
*** All done ***


# Data collection

In [58]:
# Load the unfiltered price table and keep only the columns up to and including
# the "2019-07-19" label (label-based .loc slices include the end label).
stonks = utils.get_stonk_data(disable_filter=True).loc[:, :"2019-07-19"]

In [None]:
# Run the rolling data-collection pipeline on the truncated price history.
# NOTE(review): the literals below repeat the values assigned in the "Run data pipeline"
# configuration cell (l_reg, l_roll, dt, cutoffs, ARIMA settings) - keep the two in sync.
pipelines.data_collection_rolling_pipeline(
    stonk_prices=stonks,
    l_reg=3,
    l_roll=2,
    dt=10,
    market_cap_min_mm=1000,
    market_cap_max_mm=None,
    last_residual_cutoff=2.5,
    mean_max_residual_dt=21,
    adf_pval_cutoff=0.1,
    adf_pass_rate_filter=0.5,
    arima_forecast_months=3,
    arima_eval_models=5,
    trade_length_months=3,
    trading_interval_weeks=2,
    remove_industries=[
        "pharmaceuticals_biotechnology_and_life_sciences",
    ],
    # presumably limits the run to the first 10 data windows - TODO confirm in pipelines
    first_n_windows=10,
)

Total data windows: 10


In [2]:
# Collect the data-collection pipeline outputs into a single table.
# NOTE(review): `dataset` is also assigned inside the live-pipeline loop cell above; this rebinding replaces it.
dataset = utils.ingest_trade_pipeline_outputs(
    data_dir="data/data_collection_pipeline/1000_to_max/"
)

# First row of the VIX price table; looked up by trade date below
vix = utils.get_stonk_data(fname_prefix="vix", disable_filter=True).iloc[0]
# Attach the VIX value for each trade date; the label lookup raises KeyError if a trade date is missing from `vix`
dataset["vix"] = dataset["trade_date"].apply(lambda x: vix.loc[x])

# Persist the dataset that the "Model development" section reads back in
dataset.to_csv("data/dataset_bigcap.csv", header=True, index=False)

# Model development

In [None]:
def _evaluate_scored_trades(df_scores, top_n_trades, min_industry_score):
    """Run the three evaluation routines on a scored trades table and merge their results.

    Args:
        df_scores: Table with at least the "score", "prediction" and "label" columns.
        top_n_trades: Forwarded to ``evaluate.performance_on_trading_use_case``.
        min_industry_score: Forwarded to ``evaluate.performance_on_trading_use_case``.

    Returns:
        pd.Series holding the merged result entries. Later routines overwrite
        earlier ones on key collisions (dict.update semantics).
    """
    merged_results = {}

    # Only the second return value (the results mapping) is used here.
    _, results = evaluate.returns_on_predictions(df_scores)
    merged_results.update(results)

    merged_results.update(
        evaluate.performance_summary(
            y_score=df_scores["score"],
            y_preds=df_scores["prediction"],
            y_true=df_scores["label"],
            auc_cutoff=0.5,
        )
    )

    merged_results.update(
        evaluate.performance_on_trading_use_case(
            df=df_scores,
            top_n_trades=top_n_trades,
            min_industry_score=min_industry_score,
        )
    )

    return pd.Series(merged_results)


def model_validation_pipeline(
    dataset,
    data_window_train_size,
    data_window_test_size,
    data_window_gap_size,
    hp_model_evals,
    top_n_best_trades=5,
    min_industry_confidence=0.4,
    random_noise=0.005,
    hp_nth_best_model=10,
    data_dir="data",
    outputs_dir="experiments",
    random_state=None,
):
    """Walk-forward validation: tune on a validation window, retrain, score the held-out test dates.

    The sorted unique trade dates are covered by sliding windows, newest first.
    Each window holds ``train + gap + test`` dates for hyperparameter search plus
    a further ``gap + test`` dates for the final evaluation. For every window the
    function runs a hyperparameter search, picks the n-th best trial, trains a
    production model, reloads it from disk, scores the test dates and records
    evaluation results for the whole test period and for each trade date.

    Args:
        dataset: Trades table with a "trade_date" column (plus whatever the
            preprocessing/train/evaluate modules require, e.g. "label").
        data_window_train_size: Number of trade dates used for training.
        data_window_test_size: Number of trade dates used for validation/testing.
        data_window_gap_size: Number of trade dates left between train and test.
        hp_model_evals: Number of hyperparameter-search evaluations per window.
        top_n_best_trades: Forwarded to ``evaluate.performance_on_trading_use_case``.
        min_industry_confidence: Forwarded to the same call as ``min_industry_score``.
        random_noise: Noise level used in the hyperparameter search and training.
        hp_nth_best_model: 1-based rank of the search trial whose hyperparameters are used.
        data_dir: Base data directory.
        outputs_dir: Sub-directory of ``data_dir`` for search artifacts and the saved model.
        random_state: Seed used for shuffling, splitting, search and training.
            When None (the default) a seed is drawn at random, as before.

    Returns:
        pd.DataFrame with one column per evaluation key (``<prefix>_all`` and
        ``<prefix>_<trade date>``) and one row per result entry.

    Raises:
        AssertionError: If there are fewer unique trade dates than one full window,
            or if one of the internal consistency checks fails.
    """
    if random_state is None:
        random_state = np.random.randint(133742069)
    pipeline_dir = os.path.join(data_dir, outputs_dir)

    # Dates reserved at the end of each window for the final (test) evaluation.
    holdout_size = data_window_gap_size + data_window_test_size

    # Total data window size is the size of the standard data window for
    # validation/training plus another gap and test size for final validation.
    total_data_window_size = (data_window_train_size + holdout_size) + holdout_size

    dates_sorted = np.sort(dataset["trade_date"].unique())
    total_date_count = len(dates_sorted)

    assert total_date_count >= total_data_window_size

    # End indexes of the sliding windows, from the most recent window backwards.
    data_windows = range(total_date_count, total_data_window_size - 1, -1)

    print("Total data windows: " + str(len(data_windows)))

    all_evaluation_results = {}
    for index_end in data_windows:
        index_start = index_end - total_data_window_size

        current_data_window = dates_sorted[index_start:index_end]

        assert len(current_data_window) == total_data_window_size

        print(
            "Period "
            + str(current_data_window[0])
            + " to "
            + str(current_data_window[-1])
        )

        # Separating validation/test data windows
        dataset_window_test = (
            dataset[dataset.trade_date.isin(current_data_window)]
            .copy()
            .sample(frac=1, random_state=random_state)
        )

        # The validation window is the full window minus the trailing hold-out dates.
        current_data_window_validation = current_data_window[:-holdout_size]

        assert len(current_data_window_validation) + holdout_size == len(
            current_data_window
        )

        dataset_window_validation = (
            dataset_window_test[
                dataset_window_test.trade_date.isin(current_data_window_validation)
            ]
            .copy()
            .sample(frac=1, random_state=random_state)
        )

        validation_splits = preprocessing.split_data(
            dataset_window_validation,
            date_count_train=data_window_train_size,
            date_count_valid=data_window_test_size,
            date_count_gap=data_window_gap_size,
            random_state=random_state,
        )

        test_splits = preprocessing.split_data(
            dataset_window_test,
            date_count_train=data_window_train_size,
            date_count_valid=data_window_test_size,
            date_count_gap=data_window_gap_size,
            random_state=random_state,
        )

        # Both split sets must cover the same number of dates in each part.
        assert len(validation_splits["validation"].trade_date.unique()) == len(
            test_splits["validation"].trade_date.unique()
        )
        assert len(validation_splits["train"].trade_date.unique()) == len(
            test_splits["train"].trade_date.unique()
        )

        # Hyperparameter tuning/search; also writes an artifact CSV with the
        # results, which is not read back here.
        hp_trial_name = "validation_{}-{}_until_{}".format(
            current_data_window_validation[0],
            current_data_window_validation[-1],
            current_data_window[-1],
        )
        df_trial_results = train.model_hp_search(
            validation_splits,
            n_evals=hp_model_evals,
            trial_name=hp_trial_name,
            additive_random_noise=random_noise,
            write_csv=True,
            random_state=random_state,
            data_dir=data_dir,
            output_dir=outputs_dir,
        )

        selected_hps = _select_nth_best_trial(
            df_trial_results, nth_best=hp_nth_best_model
        )

        # Fixed settings first; the searched hyperparameters override on collision.
        model_params = {
            "colsample_bylevel": 1,
            "learning_rate": 0.1,
            "subsample": 1,
            "tree_method": "hist",
            "enable_categorical": True,
            "max_cat_to_onehot": 1,
            "eval_metric": ["logloss"],
            "random_state": random_state,
        }
        model_params.update(selected_hps)

        # Live/production model training with given hyperparameters, outputs
        # model and scalers JSONs
        clf_test, scalers_test = train.train_production_xgb(
            dataset=test_splits["train"],
            params=model_params,
            noise_level=random_noise,
            verbose=False,
            data_dir=pipeline_dir,
        )

        # Live/production model evaluation on test set, using the model reloaded from disk
        live_model = predict.XGBStonkModel(model_dir=pipeline_dir)

        predictions, test_dataset_processed = live_model.predict(
            test_splits["validation"]
        )

        # Save/load round-trip check: swap in the in-memory model and scalers and
        # require identical predictions. A direct `live_model._model == clf_test`
        # comparison is not used: unless the model class defines __eq__, `==`
        # compares object identity, so a model reloaded from disk would never
        # equal the in-memory one.
        live_model._model = clf_test
        live_model._scalers = scalers_test
        predictions_fortest, _ = live_model.predict(test_splits["validation"])
        assert np.all(predictions == predictions_fortest)

        df_test_scores = test_splits["validation"].copy()
        df_test_scores["score"] = predictions
        df_test_scores["prediction"] = predictions > 0.5

        # Aggregate evaluation results, as a whole and by each trade date (as separate entries)
        current_period_trade_dates = np.sort(df_test_scores.trade_date.unique())
        # NOTE(review): first and last date are concatenated without a separator;
        # kept as-is so existing result keys do not change.
        current_evaluation_period_row_prefix = str(current_period_trade_dates[0]) + str(
            current_period_trade_dates[-1]
        )

        all_evaluation_results[
            current_evaluation_period_row_prefix + "_all"
        ] = _evaluate_scored_trades(
            df_test_scores,
            top_n_trades=top_n_best_trades,
            min_industry_score=min_industry_confidence,
        )

        # By trade date
        for date, trades in df_test_scores.groupby("trade_date"):
            all_evaluation_results[
                current_evaluation_period_row_prefix + "_" + str(date)
            ] = _evaluate_scored_trades(
                trades,
                top_n_trades=top_n_best_trades,
                min_industry_score=min_industry_confidence,
            )

    return pd.DataFrame(all_evaluation_results)

In [None]:
def _select_nth_best_trial(df_trials: pd.DataFrame, nth_best: int) -> Dict[str, float]:
    assert nth_best > 0 and nth_best <= len(df_trials)
    return dict(
        df_trials.iloc[nth_best - 1].drop(
            columns=["f1_score", "precision", "ap", "auc", "pos_preds"]
        )
    )

In [24]:
import xgboost as xgb
from hyperopt import STATUS_OK, STATUS_FAIL, Trials, fmin, hp, tpe, atpe, rand
import pickle

## Dataset ingest

In [4]:
# Load the collected trades, keep rows with a positive beta and a last residual
# of at least 2 in absolute value, then attach labels.
df_all_trades = pd.read_csv("data/dataset_bigcap.csv")
keep_rows = (df_all_trades.beta > 0) & (df_all_trades.last_residual.abs() >= 2)
df = preprocessing.assign_labels(df_all_trades[keep_rows])

In [4]:
# updated_ticker_list = utils.get_ticker_names(
#     1000, None, remove_industries=["pharmaceuticals_biotechnology_and_life_sciences"]
# )
# df = df[df.ticker_x.isin(updated_ticker_list.index)]
# df = df[df.ticker_y.isin(updated_ticker_list.index)]

## Production model training

In [16]:
# Number of earliest trade dates to exclude from the production training set
drop_dates = 26
# Sorted unique trade dates with the first `drop_dates` entries removed
selected_dates = np.sort(df["trade_date"].unique())[drop_dates:]
# NOTE(review): sample(frac=1) shuffles without a random_state, so row order differs between runs
df_prod = df[df.trade_date.isin(selected_dates)].sample(frac=1)
print(len(df_prod))
print(df_prod["label"].value_counts())

71910
0    55982
1    15928
Name: label, dtype: int64


In [None]:
# NOTE(review): `params` is only assigned further down, in the "Model training experiments"
# section, so this cell raises NameError on a fresh top-to-bottom run - define or load the
# hyperparameters before this point.
clf_prod, scalers_prod = train.train_production_xgb(df_prod, params, noise_level=0.005)

## Model training experiments

In [42]:
# import importlib
# importlib.reload(evaluate)

<module 'evaluate' from '/home/jupyter/stonk-rank/evaluate.py'>

In [21]:
# Date-based split with a fixed seed.
splits = preprocessing.split_data(
    df,
    date_count_train=61,
    date_count_valid=2,
    date_count_gap=6,
    random_state=330544,
)

# Report row counts first, then label balance - train before validation each time.
for _split_name in ("train", "validation"):
    print(len(splits[_split_name]))
for _split_name in ("train", "validation"):
    print(splits[_split_name]["label"].value_counts())

70130
2306
0    54974
1    15156
Name: label, dtype: int64
0    1817
1     489
Name: label, dtype: int64


In [22]:
# Transform the training split with noise_level=0.005 and keep the returned scalers
X_train, scalers = preprocessing.transform_features(splits["train"], noise_level=0.005)

# Transform the validation split with the training scalers and no added noise
X_valid, _ = preprocessing.transform_features(
    splits["validation"], scalers=scalers, noise_level=0
)

# Targets are the "label" column of each split
y_train = splits["train"]["label"]
y_valid = splits["validation"]["label"]

In [8]:
# Hyperparameter search on the train/validation splits (50 evaluations).
# The search routine lives in the `train` module - the notebook never imports it
# as a bare name, so it must be called through the module.
df_trials = train.model_hp_search(
    data_splits=splits, n_evals=50, trial_name="test", additive_random_noise=0.005
)

100%|██████████| 50/50 [00:17<00:00,  2.93trial/s, best loss: -0.41860886570489053]


In [None]:
# Load a saved trial-results CSV (this replaces any `df_trials` from the search cell) and preview the first 20 rows
df_trials = pd.read_csv("data/experiments/data-window-8#2.csv")
df_trials.head(20)

In [None]:
# NOTE(review): this cell is an exact duplicate of the preceding one - candidate for removal
df_trials = pd.read_csv("data/experiments/data-window-8#2.csv")
df_trials.head(20)

In [25]:
# Hyperparameters for a single experimental fit.
# NOTE(review): "random_state" is drawn at random on every execution, so repeated runs of this cell are not reproducible.
params = {
    "gamma": 3.740318,
    "scale_pos_weight": 4.19,
    "max_depth": 7,
    "min_child_weight": 7,
    "max_delta_step": 3,
    "colsample_bylevel": 1,
    "n_estimators": 32,
    "learning_rate": 0.1,
    "subsample": 1,
    "tree_method": "hist",
    "enable_categorical": True,
    "max_cat_to_onehot": 1,
    "eval_metric": ["logloss"],
    "random_state": np.random.randint(999929),
}

# Fit on the training features; eval_set order means the log reports the validation
# set as validation_0 and the training set as validation_1.
clf = xgb.XGBClassifier(**params)
clf = clf.fit(X_train, y_train, eval_set=[(X_valid, y_valid), (X_train, y_train)])

[0]	validation_0-logloss:0.68477	validation_1-logloss:0.68377
[1]	validation_0-logloss:0.67760	validation_1-logloss:0.67579
[2]	validation_0-logloss:0.67145	validation_1-logloss:0.66911
[3]	validation_0-logloss:0.66613	validation_1-logloss:0.66324
[4]	validation_0-logloss:0.66070	validation_1-logloss:0.65848
[5]	validation_0-logloss:0.65791	validation_1-logloss:0.65415
[6]	validation_0-logloss:0.65539	validation_1-logloss:0.65043
[7]	validation_0-logloss:0.64899	validation_1-logloss:0.64680
[8]	validation_0-logloss:0.64530	validation_1-logloss:0.64352
[9]	validation_0-logloss:0.63902	validation_1-logloss:0.63850
[10]	validation_0-logloss:0.63629	validation_1-logloss:0.63635
[11]	validation_0-logloss:0.63146	validation_1-logloss:0.63226
[12]	validation_0-logloss:0.62623	validation_1-logloss:0.62907
[13]	validation_0-logloss:0.62337	validation_1-logloss:0.62682
[14]	validation_0-logloss:0.62020	validation_1-logloss:0.62336
[15]	validation_0-logloss:0.61407	validation_1-logloss:0.62074
[1

In [43]:
print("**Validation**")
# Positive-class probability (column 1 of predict_proba)
y_score = clf.predict_proba(X_valid)[:, 1]
# Decision threshold on the positive-class probability
thres = 0.5
y_preds = y_score > thres

# Copy of the validation split with the model's score and thresholded prediction attached
df_results_valid = splits["validation"].copy()
df_results_valid["score"] = y_score
df_results_valid["prediction"] = y_preds

evaluate.performance_summary(
    y_score=y_score, y_preds=y_preds, y_true=y_valid, auc_cutoff=0.5
)

# Keep the returned table (first return value); later cells filter it on its "result" column
df_results_valid, _ = evaluate.returns_on_predictions(df_results_valid)

# Last expression of the cell: performance broken down by the "subindustry" column
evaluate.performance_on_slice(df_results_valid, "subindustry")

**Validation**
Precision: 0.3192771084337349
PR-AUC/AP score: 0.5393574019093184
ROC-AUC score: 0.592475292989523
Total positive predictions: 332
Total positive labels: 489

Totals:
        prediction
result            
FN             383
FP             226
TN            1591
TP             106

Means:
        return_one_month  return_two_month  return_three_month
result                                                        
FN              0.058138          0.106154            0.109757
FP              0.001075          0.003973            0.003894
TN             -0.006720         -0.002602           -0.015820
TP              0.036755          0.067575            0.089925

Std:
        return_one_month  return_two_month  return_three_month
result                                                        
FN              0.093194          0.066535            0.073164
FP              0.044058          0.050160            0.059183
TN              0.043610          0.054788            0.0699

  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]


In [46]:
# pd.set_option("display.max_rows", 200)
# evaluate.performance_on_trading_use_case(
#         df_results_valid, top_n_trades=5, min_industry_score=0.4
#     )

In [86]:
# Print each input feature name next to its importance from the fitted classifier
for name, importance in zip(clf.feature_names_in_, clf.feature_importances_):
    print(name, importance)

adf_pass_rate 0.05284967
last_residual 0.061928246
residual_mean_max 0.10744238
vix 0.33485457
betas_rsquared 0.09239178
arima_forecast 0.13933738
industry 0.14954282
residual_inter 0.061653122


In [44]:
# Allow up to 200 rows when DataFrames are displayed in later cells
pd.set_option("display.max_rows", 200)

In [None]:
# Show up to the first 100 false-positive trades, without the bookkeeping columns.
_false_positive_rows = df_results_valid[df_results_valid.result == "FP"]
_hidden_columns = ["beta", "intercept", "data_window_start", "label", "prediction"]
_false_positive_rows.drop(columns=_hidden_columns).head(100)

In [None]:
# df_results_valid[df_results_valid.return_three_month < -0.2]

In [None]:
# df_results_valid[df_results_valid.subindustry == 'consumer_services'].iloc[0:100]