# Imports

In [None]:
# !pip install yfinance
# !pip install pmdarima
# !pip install hyperopt
# !pip install xgboost
# !pip install numpy -U

In [1]:
import os
from functools import partial
from datetime import datetime

import typing
from typing import Dict
from typing import Any
from typing import Tuple

import numpy as np
import pandas as pd
import sklearn

import utils
import pipelines
import processing
import evaluate
import predict
import preprocessing

# Download stock daily prices & indexes

In [2]:
### Get the ticker names within the market cap range below, removing the listed industries.
### NOTE(review): the `_mm` suffix presumably means millions -- confirm in `utils`.
market_cap_min_mm = 1000
market_cap_max_mm = None

ticker_list = utils.get_ticker_names(
    market_cap_min_mm=market_cap_min_mm,
    market_cap_max_mm=market_cap_max_mm,
    remove_industries=[
        # "diversified_financials",
        "pharmaceuticals_biotechnology_and_life_sciences",
    ],
)

In [9]:
### Specific date, e.g. 18th of January 2021 (Y, M, D)
# date_to = datetime(2021, 1, 18)
### Date of today
date_to = datetime.today()
### How many years of data to download (going backwards from date_to). Can be a floating point number
period_years = 10

In [10]:
# Prices for every selected ticker (default file name prefix).
_, _ = utils.download_stonk_prices(
    ticker_list.index, period_years=period_years, date_to=date_to
)

# Market-wide series, each downloaded on its own and stored under its own
# file name prefix. Dict order is the download order.
index_symbols = {
    "vix": "^VIX",
    "sp500": "^GSPC",
    "oil": "CL=F",
    "usd": "DX=F",
    "yield": "^TNX",
    "copper": "HG=F",
}
for prefix, symbol in index_symbols.items():
    _, _ = utils.download_stonk_prices(
        [symbol], period_years=period_years, date_to=date_to, fname_prefix=prefix
    )

[*********************100%***********************]  2442 of 2442 completed

21 Failed downloads:
- OCDX: No data found, symbol may be delisted
- MIME: No data found, symbol may be delisted
- MGP: No data found, symbol may be delisted
- POST WI: No data found, symbol may be delisted
- BIP.PRB: No data found, symbol may be delisted
- DELL WI: No data found, symbol may be delisted
- RXN WI: No data found, symbol may be delisted
- BIP.PRA: No data found, symbol may be delisted
- BLL: No data found, symbol may be delisted
- FOE: No data found, symbol may be delisted
- APSG: No data found, symbol may be delisted
- SGMS: No data found, symbol may be delisted
- ZNGA: No data found, symbol may be delisted
- EPAY: No data found, symbol may be delisted
- SNX.WI: No data found, symbol may be delisted
- O.WI: No data found, symbol may be delisted
- ANAT: No data found, symbol may be delisted
- JOBS: No data found, symbol may be delisted
- TSC: No data found, symbol may be delisted
- T WD: No data f

# Run data pipeline

In [3]:
# Industries the live pipeline is run for; uncomment an entry to include it.
industries = [
    # # 'health_care_equipment_and_services',
    # 'software_and_services',
    # 'retailing',
    # 'telecommunication_services',
    # "capital_goods",
    # "energy",
    # # 'pharmaceuticals_biotechnology_and_life_sciences',
    # 'consumer_staples',
    # 'banks',
    # 'diversified_financials',
    # 'metals_and_mining',
    # 'technology_hardware_and_equipment',
    # 'utilities',
    # 'chemicals',
    # 'automobiles_and_components',
    "semiconductors_and_semiconductor_equipment",
    # 'media_and_entertainment',
    # 'real_estate',
    # 'consumer_services',
    # 'consumer_durables_and_apparel',
    # 'insurance',
    # 'transportation',
    # 'commercial_and_professional_services',
    # 'paper_and_forest_products',
    # 'containers_and_packaging',
    # 'construction_materials'
]

# Settings forwarded unchanged to `pipelines.process_features_from_price_data`.
l_reg = 3
l_roll = 2
dt = 10
last_residual_cutoff = 2.5
adf_pval_cutoff = 0.1
adf_pass_rate_filter = 0.5
mean_max_residual_dt = 21
arima_forecast_months = 3
arima_eval_models = 5

# Output directory name encodes the market cap range; a missing upper bound
# is spelled "max".
if market_cap_max_mm is None:
    market_cap_max_string = "max"
else:
    market_cap_max_string = str(market_cap_max_mm)
pipeline_dir = f"pipeline_run_{market_cap_min_mm}_to_{market_cap_max_string}_cap"
output_dir = os.path.join("data", pipeline_dir)

stonk_model = predict.XGBStonkModel()

market_indexes = utils.get_market_indexes()

In [5]:
# Run the live pipeline once per selected industry: build features from the
# price data, score the pairs with the model and write every artefact to CSV.

# `DataFrame.to_csv` does not create missing parent directories, so make sure
# the output directory exists before the first write.
os.makedirs(output_dir, exist_ok=True)

datasets = []
total_industries = len(industries)
for i, industry in enumerate(industries, start=1):
    # NOTE(review): `remove_industries=[industry]` combined with
    # `filter_industries=True` presumably keeps only this industry -- confirm in `utils`.
    stonks = utils.get_stonk_data(
        market_cap_min_mm,
        market_cap_max_mm,
        remove_industries=[industry],
        filter_industries=True,
    )
    X, Y = processing.combine_stonk_pairs(stonks)

    print(f"Industry ({i}/{total_industries}): {industry}")

    print("Processing features...")
    features = pipelines.process_features_from_price_data(
        X=X,
        Y=Y,
        market_indexes=market_indexes,
        l_reg=l_reg,
        l_roll=l_roll,
        dt=dt,
        last_residual_cutoff=last_residual_cutoff,
        adf_pval_cutoff=adf_pval_cutoff,
        adf_pass_rate_filter=adf_pass_rate_filter,
        mean_max_residual_dt=mean_max_residual_dt,
        arima_forecast_months=arima_forecast_months,
        arima_eval_models=arima_eval_models,
    )

    residuals_max_mean = features["residuals_max_mean"]
    print(f"Mean max value for {industry}: {residuals_max_mean}")
    print("Preparing data for model...")
    dataset = utils.build_dataset_from_live_data_by_industry(
        std_residuals=features["std_residuals"],
        adfs=features["adfs"],
        subindustry=industry,
        mean_max_residual=residuals_max_mean,
        # VIX value looked up at the last column label of the price table.
        vix_index=market_indexes["vix"].loc[stonks.columns[-1]],
        betas_stability_rsquared=features["beta_stability_rsquared_vals"],
        arima_forecasts=features["arima_forecasts"],
    )

    print("Running model...")
    predictions, df_processed = stonk_model.predict(dataset)
    datasets.append((dataset, df_processed))
    predictions = pd.DataFrame(predictions)
    predictions.index = features["adfs"].index

    # Prepend the dates as the first column of the residual and beta tables.
    features["residuals"].insert(0, "dates", features["dates_index"].values)
    features["betas"].insert(0, "dates", features["dates_index"].values)

    print("Writing results to CSV...")
    # File name suffix -> table; every file is written without a header row
    # and with its index, as `<industry>_<suffix>.csv`.
    csv_outputs = {
        "residuals": features["residuals"],
        "betas": features["betas"],
        "adfs_raw": features["adfs_raw"],
        "predictions": predictions,
        "arima": features["arima_forecasts"],
        "rsquared": features["beta_stability_rsquared_vals"],
    }
    for suffix, table in csv_outputs.items():
        table.to_csv(
            os.path.join(output_dir, f"{industry}_{suffix}.csv"),
            header=False,
            index=True,
        )

print("*** All done ***")

Industry (1/1): semiconductors_and_semiconductor_equipment
Processing features...
Mean max value for semiconductors_and_semiconductor_equipment: 3.5199999809265137
Preparing data for model...
Running model...
Writing results to CSV...
*** All done ***


# Data collection

In [3]:
# Load the stock price table with filtering disabled.
# Note: this rebinds `stonks`, which the live pipeline loop above also assigns.
stonks = utils.get_stonk_data(disable_filter=True)
# Optional cut-off for the column range (column labels look like dates -- TODO confirm).
# stonks = stonks.loc[:, :"2020-05-08"]

In [None]:
# Run the rolling data-collection pipeline over the loaded price table.
# NOTE(review): the numeric settings below (l_reg, l_roll, dt, cut-offs, ARIMA
# settings, market cap bounds) repeat the literal values of the live pipeline
# configuration cell; keep the two in sync or pass the variables instead.
pipelines.data_collection_rolling_pipeline(
    stonk_prices=stonks,
    l_reg=3,
    l_roll=2,
    dt=10,
    market_cap_min_mm=1000,
    market_cap_max_mm=None,
    last_residual_cutoff=2.5,
    mean_max_residual_dt=21,
    adf_pval_cutoff=0.1,
    adf_pass_rate_filter=0.5,
    arima_forecast_months=3,
    arima_eval_models=5,
    trade_length_months=3,
    trading_interval_weeks=2,
    remove_industries=[
        "pharmaceuticals_biotechnology_and_life_sciences",
    ],
    first_n_windows=72,
)

Total data windows: 72


In [6]:
# Collect the outputs of the data-collection pipeline into one table.
trade_data_dir = "data/data_collection_pipeline/1000_to_max/"
dataset = utils.ingest_trade_pipeline_outputs(data_dir=trade_data_dir)

# Attach the VIX value for each trade date: take the first row of the VIX
# price table and look every `trade_date` up in it.
vix_prices = utils.get_stonk_data(fname_prefix="vix", disable_filter=True)
vix = vix_prices.iloc[0]
dataset["vix"] = dataset["trade_date"].apply(lambda trade_date: vix.loc[trade_date])

dataset.to_csv("data/dataset_bigcap.csv", header=True, index=False)

# Model development

In [2]:
import xgboost as xgb
from hyperopt import STATUS_OK, STATUS_FAIL, Trials, fmin, hp, tpe, atpe, rand
import pickle

## Dataset ingest

In [3]:
# Load the collected dataset, keep rows with a positive beta and an absolute
# `last_residual` of at least 2, then attach the labels.
df = pd.read_csv("data/dataset_bigcap.csv")
is_candidate = (df.beta > 0) & (df.last_residual.abs() >= 2)
df = preprocessing.assign_labels(df.loc[is_candidate])

In [4]:
# Keep only pairs where both tickers are still in the current ticker list.
updated_ticker_list = utils.get_ticker_names(
    1000, None, remove_industries=["pharmaceuticals_biotechnology_and_life_sciences"]
)
known_tickers = updated_ticker_list.index
both_tickers_known = df.ticker_x.isin(known_tickers) & df.ticker_y.isin(known_tickers)
df = df[both_tickers_known]

## Production model training

In [212]:
# Production training set: skip the first `drop_dates` trade dates (in sorted
# order) and shuffle the remaining rows. No seed is passed to `sample`, so the
# row order differs between runs.
drop_dates = 18
sorted_trade_dates = np.sort(df["trade_date"].unique())
selected_dates = sorted_trade_dates[drop_dates:]
in_selected_dates = df.trade_date.isin(selected_dates)
df_prod = df[in_selected_dates].sample(frac=1)

# Row count and class balance of the production set.
print(len(df_prod))
print(df_prod["label"].value_counts())

69406
0    56257
1    13149
Name: label, dtype: int64


In [215]:
# Train the production classifier and its feature scalers on `df_prod`.
# NOTE: in this notebook `params` is only defined further down, in the
# "Model training experiments" section (the cell that builds the XGBoost
# parameter dict). Run that cell first; executing the notebook top to bottom
# on a fresh kernel raises NameError here.
clf_prod, scalers_prod = pipelines.train_production_xgb(
    df_prod, params, noise_level=0.005
)

[0]	validation_0-logloss:0.67312
[1]	validation_0-logloss:0.65610
[2]	validation_0-logloss:0.64180
[3]	validation_0-logloss:0.62930
[4]	validation_0-logloss:0.61805
[5]	validation_0-logloss:0.60780
[6]	validation_0-logloss:0.59860
[7]	validation_0-logloss:0.59078
[8]	validation_0-logloss:0.58385
[9]	validation_0-logloss:0.57781
[10]	validation_0-logloss:0.57245
[11]	validation_0-logloss:0.56719
[12]	validation_0-logloss:0.56284
[13]	validation_0-logloss:0.55834
[14]	validation_0-logloss:0.55426
[15]	validation_0-logloss:0.55047
[16]	validation_0-logloss:0.54713
[17]	validation_0-logloss:0.54407
[18]	validation_0-logloss:0.54059
[19]	validation_0-logloss:0.53803
[20]	validation_0-logloss:0.53479
[21]	validation_0-logloss:0.53217
[22]	validation_0-logloss:0.52936
[23]	validation_0-logloss:0.52646
[24]	validation_0-logloss:0.52402
[25]	validation_0-logloss:0.52119
[26]	validation_0-logloss:0.51903
[27]	validation_0-logloss:0.51664
[28]	validation_0-logloss:0.51511
[29]	validation_0-loglos

## Model training experiments

In [210]:
# Split the dataset into train / validation parts.
# NOTE(review): the meaning of the positional arguments 2, 6, 10 is defined in
# `preprocessing.split_data` -- confirm before changing them.
splits = preprocessing.split_data(df, 2, 6, 10, random_state=344)

# Sizes first, then the class balance, for train followed by validation.
split_names = ("train", "validation")
for split_name in split_names:
    print(len(splits[split_name]))
for split_name in split_names:
    print(splits[split_name]["label"].value_counts())

73555
2671
0    56142
1    17413
Name: label, dtype: int64
0    1886
1     785
Name: label, dtype: int64


In [211]:
# Noise level passed to the training-set feature transform only.
noise_level = 0.005

# Transform the training features; this also returns the scalers.
X_train, scalers = preprocessing.transform_features(
    splits["train"], noise_level=noise_level
)
# Transform the validation features with the scalers obtained from the
# training split and with noise switched off (`noise_level=0`).
X_valid, _ = preprocessing.transform_features(
    splits["validation"], scalers=scalers, noise_level=0
)

y_train = splits["train"]["label"]
y_valid = splits["validation"]["label"]

In [98]:
# Hyperopt search space for the XGBoost classifier.
# The `hp.quniform` entries are cast with `int(...)` inside
# `optimization_objective` before they reach the classifier.
hyperparameter_space = {
    "gamma": hp.uniform("gamma", 0, 5),
    "scale_pos_weight": hp.uniform("scale_pos_weight", 3, 5),
    "max_depth": hp.quniform("max_depth", 3, 9, 1),
    "min_child_weight": hp.quniform("min_child_weight", 1, 8, 1),
    "max_delta_step": hp.quniform("max_delta_step", 1, 4, 1),
    "n_estimators": hp.quniform("n_estimators", 25, 85, 1),
    # Alternatives that are currently not searched:
    # "n_estimators": hp.choice("n_estimators", np.array([50, 75, 100, 150, 200])),
    # "subsample": hp.uniform("subsample", 0.9, 1),
    # "colsample_bylevel" : hp.uniform("colsample_bylevel", 0.5, 1),
}

In [192]:
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score


def optimization_objective(space):
    """Hyperopt objective: fit an XGBoost classifier and score it on the validation split.

    The classifier is trained on the notebook-level `X_train` / `y_train` and
    evaluated on `X_valid` / `y_valid` with a 0.5 probability threshold.

    Args:
        space: Sampled hyperparameters with the keys "gamma",
            "scale_pos_weight", "max_depth", "min_child_weight",
            "max_delta_step" and "n_estimators". The last four are cast to int.

    Returns:
        A dict with "loss", "precision", "f1_score", "ap", "auc", "pos_preds"
        and "status". The loss is `-ap` with `STATUS_OK`, or 100 with
        `STATUS_FAIL` when F1 or precision is zero.
    """
    classifier = xgb.XGBClassifier(
        # Sampled values used as-is.
        gamma=space["gamma"],
        scale_pos_weight=space["scale_pos_weight"],
        # Sampled values that must be integers.
        max_depth=int(space["max_depth"]),
        min_child_weight=int(space["min_child_weight"]),
        max_delta_step=int(space["max_delta_step"]),
        n_estimators=int(space["n_estimators"]),
        # Held fixed during the search.
        colsample_bylevel=1,
        learning_rate=0.1,
        subsample=0.99,
        tree_method="hist",
        enable_categorical=True,
        max_cat_to_onehot=1,
        # A fresh seed on every evaluation.
        random_state=np.random.randint(9999999),
    )
    classifier.fit(X_train, y_train, verbose=False)

    positive_proba = classifier.predict_proba(X_valid)[:, 1]
    predicted_positive = positive_proba > 0.5

    f1 = f1_score(y_valid, predicted_positive, zero_division=0)
    precision = precision_score(y_valid, predicted_positive, zero_division=0)
    ap = evaluate.average_precision_from_cutoff(y_valid, positive_proba, 0.5)
    roc = roc_auc_score(y_valid, positive_proba)

    pos_preds = int(predicted_positive.sum())
    pos_labels = int(y_valid.sum())

    # The AP only counts when the model predicts at least as many positives
    # as there are positive labels.
    if pos_preds < pos_labels:
        ap = 0

    degenerate = f1 == 0 or precision == 0
    return {
        "loss": 100 if degenerate else -ap,
        "precision": precision,
        "f1_score": f1,
        "ap": ap,
        "auc": roc,
        "pos_preds": pos_preds,
        "status": STATUS_FAIL if degenerate else STATUS_OK,
    }

In [180]:
# Run the TPE search over the hyperparameter space.
trials = Trials()

best_hyperparams = fmin(
    fn=optimization_objective,
    space=hyperparameter_space,
    algo=tpe.suggest,
    max_evals=500,
    trials=trials,
)

# Put the per-trial metrics next to the sampled hyperparameter values and
# store the whole table as one CSV per experiment.
trial_vals = trials.vals
for metric in ("f1_score", "precision", "ap", "auc", "pos_preds"):
    trial_vals[metric] = [result[metric] for result in trials.results]

df_trials = pd.DataFrame.from_dict(trial_vals)
df_trials.to_csv("data/experiments/bigcap_auc-0.5_window-10#7.csv", index=False)

100%|██████████| 500/500 [03:33<00:00,  2.34trial/s, best loss: -0.4298228298524843] 


In [208]:
# XGBoost parameters for the experiment model. The tuned values (gamma,
# scale_pos_weight, max_depth, min_child_weight, max_delta_step, n_estimators)
# match trial 587 of the "bigcap_auc-0.5_window-10#6" results shown further below.
params = {
    # Regularisation (XGBoost default: 0)
    "gamma": 0.928277,
    # L2 / L1 regularisation, left at the XGBoost defaults (1 and 0)
    # "reg_lambda" : 1,
    # "reg_alpha" : 0,
    # Class imbalance (XGBoost default: 1)
    "scale_pos_weight": 3.284871,
    # Integer-valued parameters:
    "max_depth": 7,
    # Regularisation (XGBoost default: 1)
    "min_child_weight": 1,
    # Class imbalance (XGBoost default: 0)
    "max_delta_step": 2,
    # Sampling / boosting settings.
    # NOTE(review): `subsample` is 1 here but 0.99 in `optimization_objective` -- confirm which is intended.
    "colsample_bylevel": 1,
    "n_estimators": 78,
    "learning_rate": 0.1,
    "subsample": 1,
    # Fixed:
    "tree_method": "hist",
    "enable_categorical": True,
    "max_cat_to_onehot": 1,
    "eval_metric": ["logloss"],
    # A fresh seed is drawn each time this cell runs, so fits are not repeatable.
    "random_state": np.random.randint(999929),
}

clf = xgb.XGBClassifier(**params)
# In the training log, `validation_0` is the validation split and
# `validation_1` is the training split (order of `eval_set`).
clf = clf.fit(X_train, y_train, eval_set=[(X_valid, y_valid), (X_train, y_train)])

[0]	validation_0-logloss:0.68319	validation_1-logloss:0.67599
[1]	validation_0-logloss:0.67585	validation_1-logloss:0.66181
[2]	validation_0-logloss:0.67009	validation_1-logloss:0.64985
[3]	validation_0-logloss:0.66494	validation_1-logloss:0.63971
[4]	validation_0-logloss:0.66001	validation_1-logloss:0.63087
[5]	validation_0-logloss:0.65626	validation_1-logloss:0.62324
[6]	validation_0-logloss:0.65255	validation_1-logloss:0.61688
[7]	validation_0-logloss:0.65075	validation_1-logloss:0.61088
[8]	validation_0-logloss:0.64801	validation_1-logloss:0.60579
[9]	validation_0-logloss:0.64586	validation_1-logloss:0.60086
[10]	validation_0-logloss:0.64513	validation_1-logloss:0.59657
[11]	validation_0-logloss:0.64335	validation_1-logloss:0.59260
[12]	validation_0-logloss:0.64195	validation_1-logloss:0.58935
[13]	validation_0-logloss:0.64140	validation_1-logloss:0.58619
[14]	validation_0-logloss:0.64046	validation_1-logloss:0.58282
[15]	validation_0-logloss:0.63936	validation_1-logloss:0.58011
[1

In [200]:
# Evaluate the experiment classifier `clf` on the validation split.
print("**Validation**")
y_score = clf.predict_proba(X_valid)[:, 1]
# Decision threshold applied to the positive-class probability.
thres = 0.5
y_preds = y_score > thres

# NOTE(review): `auc_cutoff` is a separate literal 0.5 and is not tied to `thres` -- confirm whether the two should move together.
evaluate.performance_summary(y_score, y_preds, y_valid, auc_cutoff=0.5)

# Per-row results for the validation predictions; the cells below filter this
# table on its `result` column.
df_results_valid = evaluate.returns_on_predictions(splits["validation"], y_preds)

# Same evaluation broken down by the `subindustry` column.
evaluate.performance_on_slice(
    splits["validation"], y_score, y_preds, "subindustry", False
)

**Validation**
Precision: 0.384526558891455
PR-AUC/AP score: 0.417500748443032
ROC-AUC score: 0.583334796792997
Total positive predictions: 866

Totals:
        prediction
result            
FN             452
FP             533
TN            1353
TP             333

Means:
        return_one_month  return_two_month  return_three_month
result                                                        
FN              0.062480          0.103938            0.120442
FP             -0.004319         -0.006503           -0.011660
TN             -0.006641         -0.012817            0.000409
TP              0.070766          0.119859            0.128631

Stds:
        return_one_month  return_two_month  return_three_month
result                                                        
FN              0.053346          0.066295            0.066062
FP              0.046502          0.054244            0.061286
TN              0.052589          0.069283            0.064622
TP              0.056238 

  recall = tps / tps[-1]


In [10]:
# Let pandas display up to 100 rows before it truncates a table.
pd.set_option("display.max_rows", 100)

In [None]:
# First 100 false positives, without the columns that are not needed for inspection.
false_positives = df_results_valid[df_results_valid.result == "FP"]
false_positives.drop(
    columns=["beta", "intercept", "data_window_start", "sp500", "label", "prediction"]
).head(100)

In [None]:
# False positives where |last_residual| exceeds |residual_three_month| by at least 0.5.
residual_drop = (
    df_results_valid.last_residual.abs() - df_results_valid.residual_three_month.abs()
)
is_false_positive = df_results_valid.result == "FP"
df_results_valid[(residual_drop >= 0.5) & is_false_positive].drop(
    columns=["beta", "intercept", "data_window_start", "sp500", "label", "prediction"]
)

In [None]:
# df_results_valid[df_results_valid.subindustry == 'consumer_services'].iloc[0:100]

In [209]:
# Print each input feature of `clf` next to its importance.
named_importances = zip(clf.feature_names_in_, clf.feature_importances_)
for feature, weight in named_importances:
    print(feature, weight)

adf_pass_rate 0.05776371
last_residual 0.0828459
residual_mean_max 0.12184093
vix 0.32355833
betas_rsquared 0.07020924
arima_forecast 0.108092435
industry 0.17025457
residual_inter 0.06543488


In [194]:
# Ten best trials of experiment "#7", ranked by the `ap` column.
# Note: this rebinds `df_trials` from the saved CSV.
df_trials = pd.read_csv("data/experiments/bigcap_auc-0.5_window-10#7.csv")
df_trials.sort_values("ap", ascending=False).head(10)

Unnamed: 0,gamma,max_delta_step,max_depth,min_child_weight,n_estimators,scale_pos_weight,f1_score,precision,ap,auc,pos_preds
393,2.878043,3.0,8.0,3.0,70.0,3.129911,0.366645,0.366412,0.429823,0.57577,786
423,2.62542,3.0,9.0,2.0,78.0,3.201474,0.381897,0.371981,0.426153,0.577761,828
99,3.011774,2.0,9.0,2.0,46.0,3.258718,0.377724,0.359862,0.42374,0.58133,867
280,2.190512,2.0,8.0,3.0,64.0,3.439644,0.398175,0.360537,0.418014,0.578617,968
414,2.750131,3.0,9.0,4.0,69.0,3.19522,0.378713,0.368231,0.416806,0.582363,831
410,2.578012,3.0,9.0,4.0,65.0,3.134705,0.372475,0.369212,0.415433,0.572806,799
345,2.491225,3.0,9.0,3.0,67.0,3.292916,0.382953,0.362089,0.413536,0.574807,881
368,2.421524,3.0,9.0,4.0,66.0,3.18509,0.373434,0.367448,0.412881,0.578061,811
456,2.782687,2.0,8.0,1.0,76.0,3.232579,0.399038,0.377702,0.412832,0.579888,879
472,3.419377,4.0,8.0,2.0,75.0,3.420464,0.399323,0.3583,0.411764,0.575774,988


In [101]:
# Twenty best trials of experiment "#6", ranked by the `ap` column.
# Note: this rebinds `df_trials` from the saved CSV.
df_trials = pd.read_csv("data/experiments/bigcap_auc-0.5_window-10#6.csv")
df_trials.sort_values("ap", ascending=False).head(20)

Unnamed: 0,gamma,max_delta_step,max_depth,min_child_weight,n_estimators,scale_pos_weight,f1_score,precision,ap,auc,pos_preds
822,0.402362,4.0,7.0,2.0,82.0,3.310414,0.387569,0.371495,0.445251,0.581368,856
999,0.765877,2.0,9.0,1.0,78.0,3.289813,0.373116,0.36803,0.439491,0.58166,807
933,0.948234,2.0,9.0,1.0,70.0,3.242355,0.3891,0.387137,0.437242,0.584024,793
821,0.881649,4.0,7.0,1.0,75.0,3.34717,0.387755,0.366629,0.431691,0.576774,881
483,0.007246,4.0,8.0,1.0,84.0,3.434106,0.374319,0.356813,0.429603,0.579647,866
587,0.928277,2.0,7.0,1.0,78.0,3.284871,0.385266,0.366246,0.428353,0.574104,871
958,1.511575,2.0,9.0,1.0,73.0,3.326969,0.38815,0.36939,0.428255,0.584125,869
919,1.049328,2.0,9.0,1.0,77.0,3.398301,0.39054,0.372685,0.427636,0.586307,864
622,1.034416,2.0,7.0,1.0,68.0,3.544619,0.395377,0.378347,0.426879,0.580146,859
212,0.788948,4.0,9.0,1.0,69.0,3.666578,0.396345,0.359213,0.42637,0.580343,966


In [None]:
# Fifty best trials stored in "data/data-window-size-2#6.csv", ranked by the `ap` column.
# Note: this rebinds `df_trials` from the saved CSV.
df_trials = pd.read_csv("data/data-window-size-2#6.csv")
df_trials.sort_values("ap", ascending=False).head(50)