# Imports

In [None]:
# !pip install yfinance
# !pip install pmdarima
# !pip install hyperopt
# !pip install xgboost
# !pip install numpy -U

In [1]:
import os
from functools import partial
from datetime import datetime

import typing
from typing import Dict
from typing import Any
from typing import Tuple

import numpy as np
import pandas as pd
import sklearn

import utils
import pipelines
import processing
import evaluate
import predict
import train
import preprocessing

# Download stock daily prices & indexes

In [44]:
### Build the ticker universe: utils.get_ticker_names is called with a
### market-cap range and a list of industries to remove.
### NOTE(review): the "_mm" suffix presumably means millions, and None
### presumably means "no upper bound" - confirm in utils.get_ticker_names.
market_cap_min_mm = 1000
market_cap_max_mm = None

ticker_list = utils.get_ticker_names(
    market_cap_min_mm=market_cap_min_mm,
    market_cap_max_mm=market_cap_max_mm,
    remove_industries=[
        "pharmaceuticals_biotechnology_and_life_sciences",
        "health_care_equipment_and_services"
    ],
)

In [4]:
### Option A: a fixed end date, written as datetime(year, month, day),
### e.g. 18 January 2021:
# date_to = datetime(2021, 1, 18)
### Option B (active): use today's date as the end date
date_to = datetime.today()
### How many years of data to download, going backwards from date_to.
### NOTE(review): fractional years are said to be accepted - confirm in utils.download_stonk_prices.
period_years = 11

In [5]:
# Daily prices for every ticker in the universe (default file-name prefix).
_, _ = utils.download_stonk_prices(
    ticker_list.index, period_years=period_years, date_to=date_to
)

# Market index / macro series, downloaded one at a time in the same order as
# before: (ticker symbol, file-name prefix).
for index_symbol, index_prefix in (
    ("^VIX", "vix"),
    ("^GSPC", "sp500"),
    ("CL=F", "oil"),
    ("DX=F", "usd"),
    ("^TNX", "yield"),
    ("HG=F", "copper"),
):
    _, _ = utils.download_stonk_prices(
        [index_symbol],
        period_years=period_years,
        date_to=date_to,
        fname_prefix=index_prefix,
    )

[*********************100%***********************]  2289 of 2289 completed

29 Failed downloads:
- BIP.PRA: No data found, symbol may be delisted
- GGPI: No data found, symbol may be delisted
- T WD: No data found, symbol may be delisted
- COHR: No data found, symbol may be delisted
- FOE: No data found, symbol may be delisted
- PLAN: No data found, symbol may be delisted
- POST WI: No data found, symbol may be delisted
- MGP: No data found, symbol may be delisted
- ZNGA: No data found, symbol may be delisted
- DELL WI: No data found, symbol may be delisted
- OAS: No data found, symbol may be delisted
- RXN WI: No data found, symbol may be delisted
- SNX.WI: No data found, symbol may be delisted
- BIP.PRB: No data found, symbol may be delisted
- ENIA: No data found, symbol may be delisted
- MIME: No data found, symbol may be delisted
- O.WI: No data found, symbol may be delisted
- TSC: No data found, symbol may be delisted
- EPAY: No data found, symbol may be delisted
- FB: No data fou

# Run data pipeline

In [45]:
# Industries processed by the live pipeline, one at a time. The two
# commented-out entries are the same industries that are removed from the
# ticker universe when it is built.
industries = [
    # 'health_care_equipment_and_services',
    "software_and_services",
    "retailing",
    "telecommunication_services",
    "capital_goods",
    "energy",
    # 'pharmaceuticals_biotechnology_and_life_sciences',
    "consumer_staples",
    "banks",
    "diversified_financials",
    "metals_and_mining",
    "technology_hardware_and_equipment",
    "utilities",
    "chemicals",
    "automobiles_and_components",
    "semiconductors_and_semiconductor_equipment",
    "media_and_entertainment",
    "real_estate",
    "consumer_services",
    "consumer_durables_and_apparel",
    "insurance",
    "transportation",
    "commercial_and_professional_services",
    "paper_and_forest_products",
    "containers_and_packaging",
    "construction_materials",
]

# Feature-pipeline settings, forwarded unchanged to
# pipelines.process_features_from_price_data.
l_reg = 3
l_roll = 2
dt = 10
last_residual_cutoff = 2.5
adf_pval_cutoff = 0.1
adf_pass_rate_filter = 0.5
mean_max_residual_dt = 21
arima_forecast_months = 3
arima_eval_models = 5

# Output directory name encodes the market-cap range; an open upper bound
# (None) is spelled "max".
if market_cap_max_mm is None:
    market_cap_max_string = "max"
else:
    market_cap_max_string = str(market_cap_max_mm)
pipeline_dir = "pipeline_run_{0}_to_{1}_cap".format(
    market_cap_min_mm, market_cap_max_string
)
output_dir = os.path.join("data", pipeline_dir)

# Model used to score each industry's dataset, and the market index data
# consumed by the feature pipeline.
stonk_model = predict.XGBStonkModel()

market_indexes = utils.get_market_indexes()

In [46]:
# Create the output directory up front: DataFrame.to_csv does not create
# missing parent directories, so a fresh checkout would otherwise fail on the
# first write.
os.makedirs(output_dir, exist_ok=True)

# One (dataset, df_processed) pair is collected per industry that produced
# features.
datasets = []
total_industries = len(industries)
for i, industry in enumerate(industries, start=1):
    # NOTE(review): the industry is passed through `remove_industries`
    # together with filter_industries=True; presumably this selects (rather
    # than removes) that industry - confirm in utils.get_stonk_data.
    stonks = utils.get_stonk_data(
        market_cap_min_mm,
        market_cap_max_mm,
        remove_industries=[industry],
        filter_industries=True,
    )
    X, Y = processing.combine_stonk_pairs(stonks)

    print("Industry ({0}/{1}): {2}".format(i, total_industries, industry))

    features = pipelines.process_features_from_price_data(
        X=X,
        Y=Y,
        market_indexes=market_indexes,
        l_reg=l_reg,
        l_roll=l_roll,
        dt=dt,
        last_residual_cutoff=last_residual_cutoff,
        adf_pval_cutoff=adf_pval_cutoff,
        adf_pass_rate_filter=adf_pass_rate_filter,
        mean_max_residual_dt=mean_max_residual_dt,
        arima_forecast_months=arima_forecast_months,
        arima_eval_models=arima_eval_models,
    )

    # An empty result means nothing passed the pipeline's filters for this
    # industry; skip scoring and writing.
    if len(features) == 0:
        print("No trades")
        continue

    print(
        "Mean max value for {0}: {1}".format(industry, features["residuals_max_mean"])
    )
    dataset = utils.build_dataset_from_live_data_by_industry(
        std_residuals=features["std_residuals"],
        adfs=features["adfs"],
        subindustry=industry,
        mean_max_residual=features["residuals_max_mean"],
        residual_quantile=features["residuals_quantile"],
        # VIX value looked up at the last column label of the price data.
        vix_index=market_indexes["vix"].loc[stonks.columns[-1]],
        betas_stability_rsquared=features["beta_stability_rsquared_vals"],
        arima_forecasts=features["arima_forecasts"],
    )

    predictions, df_processed = stonk_model.predict(dataset)
    datasets.append((dataset, df_processed))
    # Re-index the predictions so they line up with the ADF results.
    predictions = pd.DataFrame(predictions)
    predictions.index = features["adfs"].index

    # Put the dates in the first column of the residual and beta tables.
    features["residuals"].insert(0, "dates", features["dates_index"].values)
    features["betas"].insert(0, "dates", features["dates_index"].values)

    # (frame, file-name suffix, write header?) - written in this order; only
    # the correlations file keeps its header row.
    csv_outputs = [
        (features["residuals"], "_residuals.csv", False),
        (features["betas"], "_betas.csv", False),
        (features["adfs_raw"], "_adfs_raw.csv", False),
        (predictions, "_predictions.csv", False),
        (features["arima_forecasts"], "_arima.csv", False),
        (features["beta_stability_rsquared_vals"], "_rsquared.csv", False),
        (features["market_correlations"], "_correlations.csv", True),
    ]
    for frame, suffix, write_header in csv_outputs:
        frame.to_csv(
            os.path.join(output_dir, industry + suffix),
            header=write_header,
            index=True,
        )

print("*** All done ***")

Industry (1/24): software_and_services
Mean max value for software_and_services: 3.934999942779541
Industry (2/24): retailing
Mean max value for retailing: 4.3429999351501465
Industry (3/24): telecommunication_services
Mean max value for telecommunication_services: 3.015000104904175
Industry (4/24): capital_goods
Mean max value for capital_goods: 4.150000095367432
Industry (5/24): energy
Mean max value for energy: 4.099999904632568
Industry (6/24): consumer_staples
Mean max value for consumer_staples: 3.26200008392334
Industry (7/24): banks
Mean max value for banks: 4.0229997634887695
Industry (8/24): diversified_financials
Mean max value for diversified_financials: 3.365000009536743
Industry (9/24): metals_and_mining
Mean max value for metals_and_mining: 3.052999973297119
Industry (10/24): technology_hardware_and_equipment
Mean max value for technology_hardware_and_equipment: 4.176000118255615
Industry (11/24): utilities
Mean max value for utilities: 3.4040000438690186
Industry (12/24

# Data collection

In [6]:
# Load the stored price data with the loader's filter disabled, then keep only
# the columns up to and including the "2022-09-05" label (.loc label slices
# include the end label).
# NOTE(review): this assumes the columns are date labels - confirm.
stonks = utils.get_stonk_data(disable_filter=True)
stonks = stonks.loc[:, :"2022-09-05"]

In [None]:
# Run the rolling data-collection pipeline over the price data loaded above.
# The numeric settings and the removed industries repeat the literal values
# used in the "Run data pipeline" and download sections of this notebook;
# keep them in sync when tuning.
# NOTE(review): first_n_windows=1 presumably limits the run to a single
# window - confirm in pipelines.data_collection_rolling_pipeline.
pipelines.data_collection_rolling_pipeline(
    stonk_prices=stonks,
    l_reg=3,
    l_roll=2,
    dt=10,
    market_cap_min_mm=1000,
    market_cap_max_mm=None,
    last_residual_cutoff=2.5,
    mean_max_residual_dt=21,
    adf_pval_cutoff=0.1,
    adf_pass_rate_filter=0.5,
    arima_forecast_months=3,
    arima_eval_models=5,
    trade_length_months=3,
    trading_interval_weeks=2,
    remove_industries=[
        "pharmaceuticals_biotechnology_and_life_sciences",
        "health_care_equipment_and_services"
    ],
    first_n_windows=1,
)

In [8]:
# Collect the per-window outputs written by the data-collection pipeline.
dataset = utils.ingest_trade_pipeline_outputs(
    data_dir="data/data_collection_pipeline/1000_to_max/"
)

# First row of the stored VIX data, looked up by each trade's date.
vix = utils.get_stonk_data(fname_prefix="vix", disable_filter=True).iloc[0]
dataset["vix"] = dataset["trade_date"].apply(lambda x: vix.loc[x])

# Row-wise derived columns; the helpers come from the already-imported `utils`
# module, so no extra import is needed in the middle of the notebook.
dataset.loc[:, "industry"] = dataset.apply(
    utils.map_subindustries_to_industries, axis=1
)
dataset.loc[:, "arima_forecast_normalized"] = dataset.apply(
    utils.normalize_arima_forecast, axis=1
)

# For every (trade_date, subindustry) group, attach the 0.9 quantile of the
# absolute last residual as a constant column. `assign` returns a new frame,
# which avoids writing into a groupby slice (SettingWithCopyWarning).
new_dataset = []
for trade_date, date_group in dataset.groupby(by="trade_date"):
    for subindustry, industry_group in date_group.groupby(by="subindustry"):
        residual_quantile = industry_group["last_residual"].abs().quantile(q=0.9)
        new_dataset.append(
            industry_group.assign(residual_quantile=residual_quantile)
        )

new_dataset = pd.concat(new_dataset)

new_dataset.to_csv("data/dataset_bigcap.csv", header=True, index=False)

# Model development

In [26]:
import xgboost as xgb
from hyperopt import STATUS_OK, STATUS_FAIL, Trials, fmin, hp, tpe, atpe, rand
import pickle

In [None]:
# Run the model validation pipeline on the labelled dataset.
# NOTE: `df` is only defined further down, in the "Dataset ingest" section,
# so those cells must be run before this one.
validation_results = pipelines.model_validation_pipeline(
    dataset=df,
    filename_prefix="dynamic-train-window",
    fixed_train_window_size=False,
    data_window_max_train_size=62,
    data_window_test_size=2,
    data_window_gap_size=6,
    hp_model_evals=1000,
    top_n_best_trades=5,
    min_industry_confidence=0.4,
    random_noise=0.005,
    hp_nth_best_model=1,
    verbose=False
)

## Dataset ingest

In [2]:
# Load the collected dataset, keep rows with a positive beta whose last
# residual is at least 2.5 in absolute value, then attach the labels.
df = pd.read_csv("data/dataset_bigcap.csv")
df = df.loc[(df["beta"] > 0) & (df["last_residual"].abs() >= 2.5)]
df = preprocessing.assign_labels(df)

In [3]:
# Current ticker universe, built with the same market-cap range and removed
# industries as at the top of the notebook.
updated_ticker_list = utils.get_ticker_names(
    1000,
    None,
    remove_industries=[
        "health_care_equipment_and_services",
        "pharmaceuticals_biotechnology_and_life_sciences",
    ],
)
# Keep only pairs where both legs are still in that universe.
df = df.loc[
    df["ticker_x"].isin(updated_ticker_list.index)
    & df["ticker_y"].isin(updated_ticker_list.index)
]

## Production model training

In [18]:
# Production training window: the rows belonging to the `train_size` most
# recent distinct trade dates.
train_size = 62
selected_dates = np.sort(df["trade_date"].unique())[-train_size:]
# Shuffle the rows with a fixed seed so a re-run yields the same row order
# (the unseeded shuffle was different on every execution).
df_prod = df[df.trade_date.isin(selected_dates)].sample(frac=1, random_state=420)
print(len(df_prod))
print(df_prod["label"].value_counts())

69062
0    54330
1    14732
Name: label, dtype: int64


In [42]:
# Train the production classifier on the shuffled production window.
# NOTE: `params` is defined further down, in the "Model training experiments"
# section, so that cell must be run before this one.
clf_prod, scalers_prod = train.train_production_xgb(df_prod, params, noise_level=0.005)

[0]	validation_0-logloss:0.68622
[1]	validation_0-logloss:0.68057
[2]	validation_0-logloss:0.67586
[3]	validation_0-logloss:0.67193
[4]	validation_0-logloss:0.66873
[5]	validation_0-logloss:0.66597
[6]	validation_0-logloss:0.66365
[7]	validation_0-logloss:0.66159
[8]	validation_0-logloss:0.65990
[9]	validation_0-logloss:0.65797
[10]	validation_0-logloss:0.65592
[11]	validation_0-logloss:0.65473
[12]	validation_0-logloss:0.65231
[13]	validation_0-logloss:0.65062
[14]	validation_0-logloss:0.64942
[15]	validation_0-logloss:0.64762
[16]	validation_0-logloss:0.64649
[17]	validation_0-logloss:0.64503
[18]	validation_0-logloss:0.64430
[19]	validation_0-logloss:0.64356
[20]	validation_0-logloss:0.64252
[21]	validation_0-logloss:0.64161
[22]	validation_0-logloss:0.64093
[23]	validation_0-logloss:0.63976
[24]	validation_0-logloss:0.63859
[25]	validation_0-logloss:0.63817
[26]	validation_0-logloss:0.63742
[27]	validation_0-logloss:0.63664
[28]	validation_0-logloss:0.63616
[29]	validation_0-loglos

## Model training experiments

In [13]:
# Re-import the local `train` module so that edits to train.py take effect
# without restarting the kernel.
import importlib
importlib.reload(train)

<module 'train' from '/home/jupyter/stonk-rank/train.py'>

In [22]:
# Date-based train/validation split with a fixed seed.
splits = preprocessing.split_data(
    df,
    date_count_train=62,
    date_count_valid=2,
    date_count_gap=6,
    random_state=3303544,
)
# Report the split sizes first, then the label balance of each split.
for split_name in ("train", "validation"):
    print(len(splits[split_name]))
for split_name in ("train", "validation"):
    print(splits[split_name]["label"].value_counts())

66799
1834
0    52660
1    14139
Name: label, dtype: int64
0    1459
1     375
Name: label, dtype: int64


In [23]:
# Transform the training split (with additive noise); this also returns the
# scalers that are reused below.
X_train, scalers = preprocessing.transform_features(
    splits["train"],
    noise_level=0.005,
)

# Transform the validation split with the training scalers and no noise.
X_valid, _ = preprocessing.transform_features(
    splits["validation"],
    scalers=scalers,
    noise_level=0,
)

# Targets for both splits.
y_train, y_valid = splits["train"]["label"], splits["validation"]["label"]

In [15]:
# Hyper-parameter search over `df`; results are written to CSV under the
# trial name given here.
df_trial_results = train.model_hp_search(
    df,
    n_evals=1000,
    fixed_train_window_size=False,
    max_train_window_size=62,
    trial_name="live-test-window-auto",
    additive_random_noise=0.005,
    write_csv=True,
    random_state=420,
    data_dir="data",
    output_dir="experiments",
)

 25%|██▍       | 249/1000 [01:28<04:08,  3.02trial/s, best loss: -0.4559006323055327] 

  recall = tps / tps[-1]



100%|██████████| 1000/1000 [06:52<00:00,  2.43trial/s, best loss: -0.4952553926923142]


In [21]:
# Ten best trials of the "live-test-window-62" experiment, ranked by AP.
df_trials = pd.read_csv("data/experiments/live-test-window-62.csv")
df_trials = df_trials.sort_values(by="ap", ascending=False)
df_trials.head(10)

Unnamed: 0,gamma,max_delta_step,max_depth,min_child_weight,n_estimators,scale_pos_weight,train_window_size,f1_score,precision,ap,auc,pos_preds
0,2.151161,2.0,3.0,7.0,71.0,3.438988,64.0,0.339523,0.337731,0.484781,0.614284,379
1,1.838679,2.0,3.0,5.0,67.0,3.703935,64.0,0.336709,0.320482,0.469522,0.620283,415
2,4.867665,2.0,3.0,7.0,69.0,3.360048,64.0,0.338939,0.329146,0.467604,0.611812,398
3,1.876985,2.0,3.0,8.0,61.0,3.83994,64.0,0.345253,0.321101,0.464392,0.616144,436
4,4.041675,2.0,3.0,8.0,72.0,3.344612,64.0,0.342412,0.333333,0.463714,0.607019,396
5,4.73111,2.0,3.0,7.0,67.0,3.811074,64.0,0.343558,0.318182,0.463383,0.617907,440
6,4.748602,2.0,3.0,6.0,67.0,3.813046,64.0,0.343558,0.318182,0.463383,0.617882,440
7,1.913054,2.0,4.0,8.0,58.0,3.923929,64.0,0.350515,0.307229,0.463215,0.627234,498
8,2.230551,2.0,4.0,6.0,64.0,3.393895,64.0,0.349045,0.334146,0.463033,0.618548,410
9,4.75895,3.0,3.0,6.0,70.0,3.618292,64.0,0.346106,0.322581,0.460869,0.609632,434


In [20]:
# Ten best trials of the "live-test-window-auto" experiment, ranked by AP.
df_trials = pd.read_csv("data/experiments/live-test-window-auto.csv")
df_trials = df_trials.sort_values(by="ap", ascending=False)
df_trials.head(10)

Unnamed: 0,gamma,max_delta_step,max_depth,min_child_weight,n_estimators,scale_pos_weight,train_window_size,f1_score,precision,ap,auc,pos_preds
0,4.15832,4.0,4.0,1.0,44.0,3.957271,64.0,0.347407,0.317181,0.495255,0.634554,454
1,3.938605,2.0,6.0,7.0,39.0,3.533844,64.0,0.349206,0.346457,0.476587,0.632716,381
2,4.61819,2.0,6.0,8.0,42.0,3.631451,64.0,0.360736,0.334091,0.471571,0.632089,440
3,3.499165,2.0,6.0,8.0,42.0,3.550919,64.0,0.358839,0.355091,0.465618,0.633836,383
4,4.275422,3.0,4.0,1.0,52.0,3.686806,64.0,0.35705,0.346734,0.461158,0.640228,398
5,3.532749,2.0,6.0,8.0,41.0,3.622206,64.0,0.338028,0.325123,0.460918,0.620507,406
6,3.681902,2.0,6.0,7.0,37.0,3.615334,64.0,0.352318,0.35,0.459819,0.641036,380
7,3.970753,4.0,6.0,8.0,40.0,3.655311,64.0,0.348356,0.320628,0.459043,0.624522,446
8,3.625991,3.0,5.0,2.0,41.0,3.785602,64.0,0.373626,0.344595,0.457879,0.634987,444
9,3.679454,2.0,6.0,8.0,44.0,3.594501,64.0,0.361596,0.339578,0.456662,0.63601,427


In [27]:
# XGBoost settings. The tuned values (gamma, scale_pos_weight, max_depth,
# min_child_weight, max_delta_step, n_estimators) are those of the top-ranked
# row in the "live-test-window-auto" trial table shown above.
params = {
    # Tuned by the hyper-parameter search
    "gamma": 4.158320,
    "scale_pos_weight": 3.957271,
    "max_depth": 4,
    "min_child_weight": 1,
    "max_delta_step": 4,
    "n_estimators": 44,
    # Fixed settings
    "colsample_bylevel": 1,
    "learning_rate": 0.1,
    "subsample": 1,
    "tree_method": "hist",
    "enable_categorical": True,
    "max_cat_to_onehot": 1,
    "eval_metric": ["logloss"],
    # A fresh seed is drawn every time this cell runs.
    "random_state": np.random.randint(999929),
}

# Fit on the training split. The validation set is listed first in eval_set,
# so it is reported as validation_0 and the training set as validation_1.
clf = xgb.XGBClassifier(**params).fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid), (X_train, y_train)],
)

[0]	validation_0-logloss:0.68807	validation_1-logloss:0.68667
[1]	validation_0-logloss:0.68309	validation_1-logloss:0.68124
[2]	validation_0-logloss:0.67699	validation_1-logloss:0.67476
[3]	validation_0-logloss:0.67151	validation_1-logloss:0.66948
[4]	validation_0-logloss:0.66326	validation_1-logloss:0.66582
[5]	validation_0-logloss:0.65913	validation_1-logloss:0.66176
[6]	validation_0-logloss:0.65548	validation_1-logloss:0.65859
[7]	validation_0-logloss:0.65166	validation_1-logloss:0.65578
[8]	validation_0-logloss:0.64495	validation_1-logloss:0.65358
[9]	validation_0-logloss:0.64179	validation_1-logloss:0.65136
[10]	validation_0-logloss:0.64021	validation_1-logloss:0.64983
[11]	validation_0-logloss:0.63566	validation_1-logloss:0.64809
[12]	validation_0-logloss:0.62924	validation_1-logloss:0.64672
[13]	validation_0-logloss:0.62666	validation_1-logloss:0.64540
[14]	validation_0-logloss:0.62502	validation_1-logloss:0.64409
[15]	validation_0-logloss:0.62457	validation_1-logloss:0.64311
[1

In [41]:
print("**Validation**")
# Positive-class probability, turned into a hard prediction at `thres`.
thres = 0.5
y_score = clf.predict_proba(X_valid)[:, 1]
y_preds = y_score > thres

# Copy of the validation split with the score and prediction attached.
df_results_valid = splits["validation"].assign(score=y_score, prediction=y_preds)

_ = evaluate.performance_summary(
    y_score=y_score,
    y_preds=y_preds,
    y_true=y_valid,
    auc_cutoff=0.5,
    verbose=True,
)

_ = evaluate.returns_on_predictions(df_results_valid, verbose=True)

# _ = evaluate.performance_on_slice(df_results_valid, "subindustry")

**Validation**
Precision: 0.3170731707317073
PR-AUC/AP score: 0.43869202769171994
ROC-AUC score: 0.6358071738633768
Total positive predictions: 451
Total positive labels: 375

Totals:
        prediction
result            
FN             232
FP             308
TN            1151
TP             143

Means:
        return_one_month  return_two_month  return_three_month
result                                                        
FN              0.079418          0.100478            0.094750
FP              0.013308          0.004192           -0.014159
TN              0.005002         -0.007922           -0.029189
TP              0.040881          0.075776            0.100469

Std:
        return_one_month  return_two_month  return_three_month
result                                                        
FN              0.064267          0.054784            0.064799
FP              0.043222          0.058507            0.078811
TN              0.049983          0.066082            0.07

In [46]:
# pd.set_option("display.max_rows", 200)
# evaluate.performance_on_trading_use_case(
#         df_results_valid, top_n_trades=5, min_industry_score=0.4
#     )

In [31]:
# Print each input feature next to its importance in the fitted classifier.
feature_scores = zip(clf.feature_names_in_, clf.feature_importances_)
for feature_name, feature_score in feature_scores:
    print(feature_name, feature_score)

adf_pass_rate 0.029732222
last_residual 0.08531821
residual_mean_max 0.0812026
industry 0.118948996
vix 0.26275504
betas_rsquared 0.14753708
arima_forecast_normalized 0.21244633
residual_inter 0.062059514


In [44]:
# Let pandas display up to 200 rows so the inspection tables below are not
# truncated.
pd.set_option("display.max_rows", 200)

In [None]:
# First 100 false-positive rows of the validation results, without the
# columns that are not needed for inspection.
(
    df_results_valid.loc[df_results_valid["result"] == "FP"]
    .drop(columns=["beta", "intercept", "data_window_start", "label", "prediction"])
    .head(100)
)

In [None]:
# df_results_valid[df_results_valid.return_three_month < -0.2]

In [None]:
# df_results_valid[df_results_valid.subindustry == 'consumer_services'].iloc[0:100]