# Imports

In [None]:
# !pip install yfinance
# !pip install pmdarima
# !pip install hyperopt
# !pip install xgboost
# !pip install numpy -U

In [14]:
import os
from functools import partial
from datetime import datetime

import typing
from typing import Dict
from typing import Any
from typing import Tuple

import numpy as np
import pandas as pd
import sklearn

import utils
import pipelines
import processing
import evaluate
import predict
import train
import preprocessing

# Download stock daily prices & indexes

In [2]:
### Market-cap bounds for the ticker universe (the `_mm` suffix presumably means
### millions -- TODO confirm). The upper bound is left as None here.
market_cap_min_mm = 1000
market_cap_max_mm = None

### Fetch the ticker names inside the market-cap bounds, leaving out the listed industries
ticker_list = utils.get_ticker_names(
    market_cap_min_mm=market_cap_min_mm,
    market_cap_max_mm=market_cap_max_mm,
    remove_industries=[
        # "diversified_financials",
        "pharmaceuticals_biotechnology_and_life_sciences",
    ],
)

In [3]:
### Specific end date, e.g. 18th of January 2021 -- datetime(Y, M, D)
# date_to = datetime(2021, 1, 18)
### Date of today
date_to = datetime.today()
### How many years of data to download (going backwards from date_to). May be a floating point number
period_years = 5.1

In [4]:
# (symbol, file-name prefix) for every market index / futures series downloaded
# alongside the stock universe.
index_series = (
    ("^VIX", "vix"),
    ("^GSPC", "sp500"),
    ("CL=F", "oil"),
    ("DX=F", "usd"),
    ("^TNX", "yield"),
    ("HG=F", "copper"),
)

# Stock universe first (default file-name prefix), then one download per index series.
_, _ = utils.download_stonk_prices(
    ticker_list.index, period_years=period_years, date_to=date_to
)
for index_symbol, index_prefix in index_series:
    _, _ = utils.download_stonk_prices(
        [index_symbol],
        period_years=period_years,
        date_to=date_to,
        fname_prefix=index_prefix,
    )

[*********************100%***********************]  2442 of 2442 completed

25 Failed downloads:
- JOBS: No data found, symbol may be delisted
- FB: No data found, symbol may be delisted
- RLGY: No data found, symbol may be delisted
- POST WI: No data found, symbol may be delisted
- BLL: No data found, symbol may be delisted
- BIP.PRB: No data found, symbol may be delisted
- MIME: No data found, symbol may be delisted
- O.WI: No data found, symbol may be delisted
- OCDX: No data found, symbol may be delisted
- BIP.PRA: No data found, symbol may be delisted
- SNX.WI: No data found, symbol may be delisted
- APSG: No data found, symbol may be delisted
- TSC: No data found, symbol may be delisted
- EPAY: No data found, symbol may be delisted
- ANAT: No data found, symbol may be delisted
- RXN WI: No data found, symbol may be delisted
- T WD: No data found, symbol may be delisted
- NCBS: No data found, symbol may be delisted
- DELL WI: No data found, symbol may be delisted
- CERN: No data f

# Run data pipeline

In [19]:
# Industries processed by the live pipeline run; commented-out entries are skipped.
industries = [
    # 'health_care_equipment_and_services',
    # 'software_and_services',
    # 'retailing',
    # 'telecommunication_services',
    # "capital_goods",
    # "energy",
    # # 'pharmaceuticals_biotechnology_and_life_sciences',
    # 'consumer_staples',
    # 'banks',
    # 'diversified_financials',
    # 'metals_and_mining',
    # 'technology_hardware_and_equipment',
    # 'utilities',
    # 'chemicals',
    # 'automobiles_and_components',
    # "semiconductors_and_semiconductor_equipment",
    # 'media_and_entertainment',
    # 'real_estate',
    # 'consumer_services',
    # 'consumer_durables_and_apparel',
    # 'insurance',
    # 'transportation',
    # 'commercial_and_professional_services',
    "paper_and_forest_products",
    "containers_and_packaging",
    "construction_materials",
]

# Settings forwarded unchanged to pipelines.process_features_from_price_data.
l_reg = 3
l_roll = 2
dt = 10
last_residual_cutoff = 2.5
adf_pval_cutoff = 0.1
adf_pass_rate_filter = 0.5
mean_max_residual_dt = 21
arima_forecast_months = 3
arima_eval_models = 5

# The output directory name encodes the market-cap range; a missing upper bound
# is spelled "max".
market_cap_max_string = (
    str(market_cap_max_mm) if market_cap_max_mm is not None else "max"
)
pipeline_dir = "pipeline_run_{0}_to_{1}_cap".format(
    market_cap_min_mm, market_cap_max_string
)
output_dir = os.path.join("data", pipeline_dir)

stonk_model = predict.XGBStonkModel()

market_indexes = utils.get_market_indexes()

In [22]:
# Runs the feature pipeline and the model for every entry of `industries`.
# For each industry that yields features, the (model input, processed frame) pair
# is appended to `datasets` and the intermediate tables are written as CSV files
# named "<industry>_<suffix>.csv" inside `output_dir`.
datasets = []
total_industries = len(industries)

# DataFrame.to_csv does not create missing directories, so make sure the target
# directory exists before the first write.
os.makedirs(output_dir, exist_ok=True)

for i, industry in enumerate(industries, start=1):
    # NOTE(review): the industry is passed through `remove_industries` together with
    # filter_industries=True; presumably this selects only that industry -- confirm
    # against utils.get_stonk_data.
    stonks = utils.get_stonk_data(
        market_cap_min_mm,
        market_cap_max_mm,
        remove_industries=[industry],
        filter_industries=True,
    )
    X, Y = processing.combine_stonk_pairs(stonks)

    print("Industry ({0}/{1}): {2}".format(i, total_industries, industry))

    features = pipelines.process_features_from_price_data(
        X=X,
        Y=Y,
        market_indexes=market_indexes,
        l_reg=l_reg,
        l_roll=l_roll,
        dt=dt,
        last_residual_cutoff=last_residual_cutoff,
        adf_pval_cutoff=adf_pval_cutoff,
        adf_pass_rate_filter=adf_pass_rate_filter,
        mean_max_residual_dt=mean_max_residual_dt,
        arima_forecast_months=arima_forecast_months,
        arima_eval_models=arima_eval_models,
    )

    # An empty result means there is nothing to score or save for this industry.
    if len(features) == 0:
        print("No trades")
        continue

    print(
        "Mean max value for {0}: {1}".format(industry, features["residuals_max_mean"])
    )
    dataset = utils.build_dataset_from_live_data_by_industry(
        std_residuals=features["std_residuals"],
        adfs=features["adfs"],
        subindustry=industry,
        mean_max_residual=features["residuals_max_mean"],
        # VIX entry looked up by the label of the last column of `stonks`.
        vix_index=market_indexes["vix"].loc[stonks.columns[-1]],
        betas_stability_rsquared=features["beta_stability_rsquared_vals"],
        arima_forecasts=features["arima_forecasts"],
    )

    predictions, df_processed = stonk_model.predict(dataset)
    datasets.append((dataset, df_processed))
    # Give the predictions the same index as the ADF table so the CSV rows line up.
    predictions = pd.DataFrame(predictions)
    predictions.index = features["adfs"].index

    # Prepend the date column to the two tables that are saved without a header row.
    features["residuals"].insert(0, "dates", features["dates_index"].values)
    features["betas"].insert(0, "dates", features["dates_index"].values)

    # (file suffix, table, write a header row?) -- written in this order.
    csv_outputs = [
        ("residuals", features["residuals"], False),
        ("betas", features["betas"], False),
        ("adfs_raw", features["adfs_raw"], False),
        ("predictions", predictions, False),
        ("arima", features["arima_forecasts"], False),
        ("rsquared", features["beta_stability_rsquared_vals"], False),
        ("correlations", features["market_correlations"], True),
    ]
    for csv_suffix, csv_frame, csv_header in csv_outputs:
        csv_frame.to_csv(
            os.path.join(output_dir, "{0}_{1}.csv".format(industry, csv_suffix)),
            header=csv_header,
            index=True,
        )

print("*** All done ***")

Industry (1/3): paper_and_forest_products
Industry (2/3): containers_and_packaging
Mean max value for containers_and_packaging: 2.8289999961853027
Industry (3/3): construction_materials
*** All done ***


# Data collection

In [58]:
# Unfiltered price table, restricted to the columns labelled up to and including 2019-07-19.
stonks = utils.get_stonk_data(disable_filter=True).loc[:, :"2019-07-19"]

In [None]:
# Run the rolling data-collection pipeline over the historical prices in `stonks`.
# NOTE(review): the numeric settings below repeat, as literals, the values assigned to
# l_reg, l_roll, dt, last_residual_cutoff, ... in the "Run data pipeline" configuration
# cell; if one side changes, the other has to be updated by hand.
pipelines.data_collection_rolling_pipeline(
    stonk_prices=stonks,
    l_reg=3,
    l_roll=2,
    dt=10,
    market_cap_min_mm=1000,
    market_cap_max_mm=None,
    last_residual_cutoff=2.5,
    mean_max_residual_dt=21,
    adf_pval_cutoff=0.1,
    adf_pass_rate_filter=0.5,
    arima_forecast_months=3,
    arima_eval_models=5,
    trade_length_months=3,
    trading_interval_weeks=2,
    remove_industries=[
        "pharmaceuticals_biotechnology_and_life_sciences",
    ],
    # Only the first 10 windows are processed in this run.
    first_n_windows=10,
)

Total data windows: 10


In [2]:
# Load the data-collection pipeline outputs found under data_dir.
dataset = utils.ingest_trade_pipeline_outputs(
    data_dir="data/data_collection_pipeline/1000_to_max/"
)

# First row of the saved VIX table; used below as a lookup keyed by trade date.
vix = utils.get_stonk_data(fname_prefix="vix", disable_filter=True).iloc[0]
# Look up the VIX entry for each row's trade_date. `.loc` raises KeyError when a
# trade date has no VIX entry.
dataset["vix"] = dataset["trade_date"].apply(lambda x: vix.loc[x])

# Persist the combined table; it is read back in the "Dataset ingest" section.
dataset.to_csv("data/dataset_bigcap.csv", header=True, index=False)

# Model development

In [5]:
import xgboost as xgb
from hyperopt import STATUS_OK, STATUS_FAIL, Trials, fmin, hp, tpe, atpe, rand
import pickle

## Dataset ingest

In [6]:
df = pd.read_csv("data/dataset_bigcap.csv")

# Keep rows with a positive beta whose last residual is at least 2 in absolute value,
# then attach the training labels.
keep_rows = (df["beta"] > 0) & (df["last_residual"].abs() >= 2)
df = preprocessing.assign_labels(df[keep_rows])

In [4]:
# updated_ticker_list = utils.get_ticker_names(
#     1000, None, remove_industries=["pharmaceuticals_biotechnology_and_life_sciences"]
# )
# df = df[df.ticker_x.isin(updated_ticker_list.index)]
# df = df[df.ticker_y.isin(updated_ticker_list.index)]

## Production model training

In [8]:
# Number of earliest trade dates left out of the production training set.
drop_dates = 26
# Sorted unique trade dates with the first `drop_dates` of them removed.
selected_dates = np.sort(df["trade_date"].unique())[drop_dates:]
# sample(frac=1) shuffles all selected rows. No random_state is passed, so the row
# order differs from run to run.
df_prod = df[df.trade_date.isin(selected_dates)].sample(frac=1)
# Size and class balance of the production training set.
print(len(df_prod))
print(df_prod["label"].value_counts())

71910
0    55982
1    15928
Name: label, dtype: int64


In [16]:
# NOTE: `params` is the hyperparameter dict assigned further down, in the
# "Model training experiments" section. That cell has to be executed before this one,
# so a top-to-bottom run of the notebook raises NameError here.
clf_prod, scalers_prod = train.train_production_xgb(
    df_prod, params, noise_level=0.005
)

[0]	validation_0-logloss:0.68368
[1]	validation_0-logloss:0.67577
[2]	validation_0-logloss:0.66949
[3]	validation_0-logloss:0.66392
[4]	validation_0-logloss:0.65914
[5]	validation_0-logloss:0.65530
[6]	validation_0-logloss:0.65217
[7]	validation_0-logloss:0.64924
[8]	validation_0-logloss:0.64645
[9]	validation_0-logloss:0.64403
[10]	validation_0-logloss:0.64015
[11]	validation_0-logloss:0.63761
[12]	validation_0-logloss:0.63535
[13]	validation_0-logloss:0.63384
[14]	validation_0-logloss:0.63228
[15]	validation_0-logloss:0.63020
[16]	validation_0-logloss:0.62808
[17]	validation_0-logloss:0.62593
[18]	validation_0-logloss:0.62493
[19]	validation_0-logloss:0.62267
[20]	validation_0-logloss:0.62133
[21]	validation_0-logloss:0.61940
[22]	validation_0-logloss:0.61779
[23]	validation_0-logloss:0.61630
[24]	validation_0-logloss:0.61492
[25]	validation_0-logloss:0.61356
[26]	validation_0-logloss:0.61236
[27]	validation_0-logloss:0.61067
[28]	validation_0-logloss:0.60889
[29]	validation_0-loglos

## Model training experiments

In [5]:
# import importlib
# importlib.reload(preprocessing)

<module 'preprocessing' from '/home/jupyter/stonk-rank/preprocessing.py'>

In [49]:
splits = preprocessing.split_data(df, 2, 6, 18, random_state=330544)

# Report the row count of each split first, then the label balance of each split.
split_names = ("train", "validation")
for split_name in split_names:
    print(len(splits[split_name]))
for split_name in split_names:
    print(splits[split_name]["label"].value_counts())

70130
2306
0    54974
1    15156
Name: label, dtype: int64
0    1817
1     489
Name: label, dtype: int64


In [50]:
# Noise level applied to the training features only.
noise_level = 0.005

# The training call returns the scalers alongside the transformed features.
X_train, scalers = preprocessing.transform_features(
    splits["train"], noise_level=noise_level
)
# The validation features reuse the scalers returned for the training split and are
# transformed with noise_level=0.
X_valid, _ = preprocessing.transform_features(
    splits["validation"], scalers=scalers, noise_level=0
)

# Targets for both splits.
y_train = splits["train"]["label"]
y_valid = splits["validation"]["label"]

In [27]:
# Search space handed to hyperopt's fmin.
# hp.uniform(label, low, high) samples a float between low and high.
# hp.quniform(label, low, high, q) samples values rounded to multiples of q but still
# returns floats, which is why optimization_objective wraps those entries in int().
hyperparameter_space = {
    "gamma": hp.uniform("gamma", 0, 5),
    "scale_pos_weight": hp.uniform("scale_pos_weight", 3, 5),
    "max_depth": hp.quniform("max_depth", 3, 9, 1),
    "min_child_weight": hp.quniform("min_child_weight", 1, 8, 1),
    "max_delta_step": hp.quniform("max_delta_step", 1, 4, 1),
    "n_estimators": hp.quniform("n_estimators", 25, 85, 1),
    # Alternatives that are currently switched off:
    # "n_estimators": hp.choice("n_estimators", np.array([50, 75, 100, 150, 200])),
    # "subsample": hp.uniform("subsample", 0.9, 1),
    # "colsample_bylevel" : hp.uniform("colsample_bylevel", 0.5, 1),
}

In [28]:
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score


def optimization_objective(space):
    """Hyperopt objective: train and score one XGBoost configuration.

    The classifier is fitted on the notebook-level ``X_train`` / ``y_train`` and
    scored on ``X_valid`` / ``y_valid`` with a 0.5 decision threshold.

    Args:
        space: mapping of sampled hyperparameters with the keys ``gamma``,
            ``scale_pos_weight``, ``max_depth``, ``min_child_weight``,
            ``max_delta_step`` and ``n_estimators``.

    Returns:
        dict with the keys ``loss``, ``precision``, ``f1_score``, ``ap``, ``auc``,
        ``pos_preds`` and ``status``. ``loss`` is ``-ap`` with ``STATUS_OK``;
        when F1 or precision is zero it is ``100`` with ``STATUS_FAIL``.
    """
    model_kwargs = dict(
        # Sampled values; the quantised ones are cast to integers.
        gamma=space["gamma"],
        scale_pos_weight=space["scale_pos_weight"],
        max_depth=int(space["max_depth"]),
        min_child_weight=int(space["min_child_weight"]),
        max_delta_step=int(space["max_delta_step"]),
        n_estimators=int(space["n_estimators"]),
        # Held fixed for every trial (subsample / colsample_bylevel are not searched).
        colsample_bylevel=1,
        learning_rate=0.1,
        subsample=1,
        tree_method="hist",
        enable_categorical=True,
        max_cat_to_onehot=1,
        # A fresh random seed is drawn for every trial.
        random_state=np.random.randint(9999999),
    )
    model = xgb.XGBClassifier(**model_kwargs)
    model.fit(X_train, y_train, verbose=False)

    valid_scores = model.predict_proba(X_valid)[:, 1]
    valid_preds = valid_scores > 0.5

    f1 = f1_score(y_valid, valid_preds, zero_division=0)
    precision = precision_score(y_valid, valid_preds, zero_division=0)
    ap = evaluate.average_precision_from_cutoff(y_valid, valid_scores, 0.5)
    roc = roc_auc_score(y_valid, valid_scores)

    pos_preds = int(valid_preds.sum())
    pos_labels = int(y_valid.sum())

    # The AP score only counts when the model predicts at least as many positives
    # as there are positive labels.
    if pos_preds < pos_labels:
        ap = 0

    degenerate = f1 == 0 or precision == 0
    return {
        "loss": 100 if degenerate else -ap,
        "precision": precision,
        "f1_score": f1,
        "ap": ap,
        "auc": roc,
        "pos_preds": pos_preds,
        "status": STATUS_FAIL if degenerate else STATUS_OK,
    }

In [40]:
# Run the TPE search and keep every trial in `trials`.
trials = Trials()

best_hyperparams = fmin(
    fn=optimization_objective,
    space=hyperparameter_space,
    algo=tpe.suggest,
    max_evals=1000,
    trials=trials,
)

# Sampled hyperparameter values per trial, extended with one column per metric
# reported by the objective.
trial_vals = trials.vals
trial_vals.update(
    {
        metric_name: [trial_result[metric_name] for trial_result in trials.results]
        for metric_name in ("f1_score", "precision", "ap", "auc", "pos_preds")
    }
)

df_trials = pd.DataFrame.from_dict(trial_vals)
df_trials.to_csv("data/experiments/data-window-12#2.csv", index=False)

100%|██████████| 1000/1000 [05:11<00:00,  3.21trial/s, best loss: -0.4252182637992771]


In [7]:
# Hand-picked XGBoost configuration. The searched values match row 745 of the
# data-window-8#2.csv trial table displayed further down (scale_pos_weight rounded).
params = {
    # Regularisation: minimum loss reduction needed to split (XGBoost default 0)
    "gamma": 3.740318,
    # L2 / L1 regularisation left at the XGBoost defaults (reg_lambda 1, reg_alpha 0)
    # "reg_lambda" : 1,
    # "reg_alpha" : 0,
    # Class imbalance: weight of the positive class (XGBoost default 1)
    "scale_pos_weight": 4.19,
    # Integer-valued parameters:
    "max_depth": 7,
    # Regularisation (XGBoost default 1)
    "min_child_weight": 7,
    # Class imbalance (XGBoost default 0)
    "max_delta_step": 3,
    # Discrete choices:
    "colsample_bylevel": 1,
    "n_estimators": 32,
    "learning_rate": 0.1,
    "subsample": 1,
    # Fixed settings:
    "tree_method": "hist",
    "enable_categorical": True,
    "max_cat_to_onehot": 1,
    "eval_metric": ["logloss"],
    # A new random seed is drawn each time this cell runs, so fits are not reproducible.
    "random_state": np.random.randint(999929),
}

# Fit on the training split, passing the validation and training sets as eval_set.
# X_train / y_train / X_valid / y_valid come from the feature-transform cell above.
clf = xgb.XGBClassifier(**params)
clf = clf.fit(X_train, y_train, eval_set=[(X_valid, y_valid), (X_train, y_train)])

NameError: name 'X_train' is not defined

In [85]:
print("**Validation**")
# Positive-class probability for every validation row.
y_score = clf.predict_proba(X_valid)[:, 1]
# Decision threshold. The 0.5 passed as auc_cutoff below is a separate literal and
# does not follow `thres` if it is changed.
thres = 0.5
y_preds = y_score > thres

evaluate.performance_summary(y_score, y_preds, y_valid, auc_cutoff=0.5)

# Kept in a variable because the inspection cells below filter this frame.
df_results_valid = evaluate.returns_on_predictions(splits["validation"], y_preds)

# Same evaluation, sliced by the "subindustry" column.
evaluate.performance_on_slice(
    splits["validation"], y_score, y_preds, "subindustry", False
)

**Validation**
Precision: 0.29133858267716534
PR-AUC/AP score: 0.4539650920872376
ROC-AUC score: 0.6011836630415086
Total positive predictions: 508

Totals:
        prediction
result            
FN             341
FP             360
TN            1457
TP             148

Means:
        return_one_month  return_two_month  return_three_month
result                                                        
FN              0.061293          0.109944            0.110419
FP             -0.001033         -0.003419           -0.004561
TN             -0.006916         -0.001380           -0.015544
TP              0.035554          0.069791            0.094027

Stds:
        return_one_month  return_two_month  return_three_month
result                                                        
FN              0.097132          0.067523            0.075366
FP              0.044237          0.050456            0.063621
TN              0.043540          0.055174            0.070105
TP              0.055

In [63]:
# Let pandas display up to 100 rows for the inspection cells below.
pd.set_option("display.max_rows", 100)

In [None]:
# First 100 false positives of the validation results, without the bookkeeping columns.
df_results_valid[df_results_valid["result"] == "FP"].drop(
    columns=["beta", "intercept", "data_window_start", "label", "prediction"]
).head(100)

In [None]:
# Validation rows whose three-month return is below -0.2.
df_results_valid[df_results_valid.return_three_month < -0.2]

In [None]:
# df_results_valid[df_results_valid.subindustry == 'consumer_services'].iloc[0:100]

In [86]:
# Print each input feature of the fitted classifier next to its importance score.
for name, importance in zip(clf.feature_names_in_, clf.feature_importances_):
    print(name, importance)

adf_pass_rate 0.05284967
last_residual 0.061928246
residual_mean_max 0.10744238
vix 0.33485457
betas_rsquared 0.09239178
arima_forecast 0.13933738
industry 0.14954282
residual_inter 0.061653122


In [70]:
# Top 20 trials by `ap` from a saved experiment file.
# NOTE: this reassigns `df_trials`, replacing the frame built by the hyperopt cell.
df_trials = pd.read_csv("data/experiments/data-window-8#2.csv")
df_trials.sort_values("ap", ascending=False).head(20)

Unnamed: 0,gamma,max_delta_step,max_depth,min_child_weight,n_estimators,scale_pos_weight,f1_score,precision,ap,auc,pos_preds
889,4.628714,3.0,7.0,7.0,26.0,4.391147,0.298714,0.289272,0.505644,0.596444,522
533,4.967208,1.0,7.0,8.0,26.0,4.234768,0.304915,0.299213,0.491292,0.58478,508
641,3.561182,3.0,7.0,7.0,28.0,4.408826,0.296512,0.281768,0.477855,0.583115,543
689,3.889369,3.0,7.0,7.0,25.0,4.462526,0.328082,0.3,0.476207,0.597664,590
745,3.740318,3.0,7.0,7.0,32.0,4.191858,0.286877,0.285425,0.474131,0.591046,494
669,3.705566,3.0,7.0,7.0,30.0,4.474944,0.328922,0.3058,0.472589,0.601805,569
838,4.705772,3.0,7.0,7.0,27.0,4.500833,0.315018,0.28524,0.471085,0.597975,603
397,4.002332,3.0,7.0,8.0,29.0,4.46356,0.316505,0.301294,0.470805,0.597833,541
828,4.715484,3.0,7.0,7.0,25.0,4.501516,0.317431,0.287854,0.469181,0.596208,601
826,4.702435,3.0,7.0,7.0,25.0,4.492402,0.318307,0.289298,0.469166,0.596232,598


In [56]:
# Top 20 trials by `ap` from a saved experiment file.
# NOTE: this reassigns `df_trials` again.
df_trials = pd.read_csv("data/experiments/data-window-20#1.csv")
df_trials.sort_values("ap", ascending=False).head(20)

Unnamed: 0,gamma,max_delta_step,max_depth,min_child_weight,n_estimators,scale_pos_weight,f1_score,precision,ap,auc,pos_preds
132,1.929893,1.0,6.0,6.0,30.0,4.539448,0.302304,0.275168,0.459227,0.600292,596
229,0.258271,1.0,6.0,5.0,46.0,4.352045,0.307692,0.300781,0.458494,0.606459,512
372,1.660028,1.0,5.0,8.0,35.0,4.122727,0.298174,0.295775,0.44364,0.591987,497
226,0.506907,1.0,6.0,4.0,47.0,4.339547,0.308458,0.300388,0.442264,0.599763,516
408,1.988844,1.0,6.0,5.0,36.0,4.374556,0.294985,0.284091,0.441424,0.592987,528
70,0.062094,1.0,6.0,1.0,38.0,4.549194,0.325976,0.278665,0.439854,0.602224,689
278,0.72353,1.0,6.0,5.0,34.0,4.549731,0.311858,0.286942,0.439649,0.587353,582
369,0.577124,1.0,6.0,5.0,36.0,4.370624,0.298419,0.288719,0.437355,0.596477,523
168,0.078554,1.0,6.0,3.0,36.0,4.469489,0.321271,0.282609,0.435318,0.606815,644
316,1.924731,2.0,6.0,4.0,25.0,4.590809,0.304225,0.28125,0.435279,0.593134,576


In [None]:
# Top 50 trials by `ap` from a saved experiment file.
# NOTE(review): unlike the other trial files, this path is not under
# "data/experiments/" -- confirm the file location.
df_trials = pd.read_csv("data/data-window-size-2#6.csv")
df_trials.sort_values("ap", ascending=False).head(50)