# Imports

In [None]:
# !pip install yfinance
# !pip install pmdarima
# !pip install hyperopt
# !pip install xgboost
# !pip install numpy -U

In [1]:
import os
from functools import partial
from datetime import datetime

import typing
from typing import Dict
from typing import Any
from typing import Tuple

import numpy as np
import pandas as pd
import sklearn

import utils
import pipelines
import processing
import evaluate
import predict
import preprocessing

# Download stock daily prices & indexes

In [2]:
### Select ticker names inside a market-cap band, excluding some industries.
# Bounds are presumably in millions (going by the `_mm` suffix) - confirm in utils.get_ticker_names.
market_cap_min_mm = 100
market_cap_max_mm = 1000

# `remove_industries` names the industries handed to the helper for exclusion.
ticker_list = utils.get_ticker_names(
    market_cap_min_mm=market_cap_min_mm,
    market_cap_max_mm=market_cap_max_mm,
    remove_industries = ['diversified_financials', 'pharmaceuticals_biotechnology_and_life_sciences'],
)

In [3]:
### Specific end date, e.g. 18th of January 2021 (Y, M, D)
# date_to = datetime(2021, 1, 18)
### Date of today
date_to = datetime.today()
### How many years of data to download (going backwards from date_to). Can be a floating point number
period_years = 10

In [4]:
# Every download below shares the same look-back window and end date, so bind
# those two keyword arguments once.
download_prices = partial(
    utils.download_stonk_prices, period_years=period_years, date_to=date_to
)

# Stock universe selected above, then the two index series (VIX and S&P 500).
df, df_clean = download_prices(ticker_list.index)
vix, vix_clean = download_prices(["^VIX"], fname_prefix="vix")
sp500, sp500_clean = download_prices(["^GSPC"], fname_prefix="sp500")

[*********************100%***********************]  1210 of 1210 completed

7 Failed downloads:
- XENT: No data found, symbol may be delisted
- GNOG: No data found, symbol may be delisted
- LAWS: No data found, symbol may be delisted
- BKSY.WS: No data found, symbol may be delisted
- SHPW.WS: No data found, symbol may be delisted
- IIN: No data found, symbol may be delisted
- PROG: No data found, symbol may be delisted
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


# Run data pipeline

In [4]:
# Industries to run through the pipeline; uncomment an entry to include it.
industries = [
    # # 'health_care_equipment_and_services',
    # 'software_and_services',
    # 'retailing',
    # 'telecommunication_services',
    # "capital_goods",
    'energy',
    # # 'pharmaceuticals_biotechnology_and_life_sciences',
    # 'consumer_staples',
    # 'banks',
    # 'diversified_financials',
    # 'metals_and_mining',
    # 'technology_hardware_and_equipment',
    # 'utilities',
    # 'chemicals',
    # 'automobiles_and_components',
    # 'semiconductors_and_semiconductor_equipment',
    # 'media_and_entertainment',
    # 'real_estate',
    # 'consumer_services',
    # 'consumer_durables_and_apparel',
    # 'insurance',
    # 'transportation',
    # 'commercial_and_professional_services',
    # 'paper_and_forest_products',
    # 'containers_and_packaging',
    # 'construction_materials'
]

# Settings forwarded to processing.get_rolling_residuals in the loop cell.
l_reg, l_roll, dt = 3, 2, 10

# Output folder name encodes the market-cap band; an open upper bound reads "max".
if market_cap_max_mm is None:
    market_cap_max_string = "max"
else:
    market_cap_max_string = str(market_cap_max_mm)
pipeline_dir = f"pipeline_run_{market_cap_min_mm}_to_{market_cap_max_string}_cap"
output_dir = os.path.join("data", pipeline_dir)

stonk_model = predict.XGBStonkModel()
# First row of the stored VIX table, as a Series.
vix = utils.get_stonk_data(fname_prefix="vix", disable_filter=True).iloc[0]

In [None]:
# Per-industry live pipeline: residuals -> residual filter -> ADF filter ->
# beta stability -> ARIMA forecast -> model prediction -> CSV export.
datasets = []
total_industries = len(industries)

# Create the destination folder up front so the to_csv calls below cannot fail
# on a missing directory.
os.makedirs(output_dir, exist_ok=True)

# enumerate() keeps the progress counter in step with the industry list even
# when an iteration exits early through `continue`.
for i, industry in enumerate(industries, start=1):
    stonks = utils.get_stonk_data(filter_industries=[industry])
    X, Y = processing.combine_stonk_pairs(stonks)

    print("Industry ({0}/{1}): {2}".format(i, total_industries, industry))

    print("Processing residuals...")
    residuals, betas, _, dates_index = utils.measure_time(
        partial(
            processing.get_rolling_residuals,
            X=X,
            Y=Y,
            l_reg=l_reg,
            l_roll=l_roll,
            dt=dt,
        )
    )

    std_residuals, means, stds = processing.get_standardized_residuals(
        residuals
    )

    # Keep only rows whose last-column standardized residual is at least 2.5
    # in absolute value.
    trades_before = len(std_residuals)
    std_residuals = std_residuals[std_residuals.iloc[:, -1].abs() >= 2.5]
    trades_after = len(std_residuals)
    print(
        "{0} trades selected out of {1} by residual values".format(
            trades_after, trades_before
        )
    )
    if trades_after == 0:
        continue

    # Align the companion frames with the surviving rows.
    residuals = residuals.loc[std_residuals.index]
    betas = betas.loc[std_residuals.index]
    dates_index = dates_index.loc[std_residuals.index]

    print("Processing ADFs...")
    adfs, adfs_raw = utils.measure_time(
        partial(
            processing.get_aggregate_adfs,
            residuals,
            betas=betas,
        )
    )

    # Second filter: keep rows whose aggregate ADF value is at least 0.5.
    selected_by_adf = (adfs >= 0.5).values
    adfs = adfs[selected_by_adf]

    trades_before = len(std_residuals)
    std_residuals = std_residuals[selected_by_adf]
    trades_after = len(std_residuals)
    print(
        "{0} trades selected out of {1} by ADF pass rates".format(
            trades_after, trades_before
        )
    )

    if len(std_residuals) == 0:
        continue

    # Re-align everything with the rows that passed the ADF filter.
    betas = betas.loc[adfs.index]

    residuals = residuals.loc[adfs.index]
    adfs_raw = adfs_raw.loc[adfs.index]

    dates_index = dates_index.loc[std_residuals.index]

    means = means.loc[std_residuals.index]
    stds = stds.loc[std_residuals.index]

    residuals_max_mean = processing.get_mean_residual_magnitude(
        std_residuals.to_numpy(), dt=21
    )
    print(
        "Mean max residual value for {0} after filtering is {1}".format(
            industry, residuals_max_mean
        )
    )

    print("Processing beta stability tests...")
    beta_stability_rsquared_vals = utils.measure_time(
        partial(
            processing.calculate_beta_stability_rsquared,
            prices_X=X, prices_Y=Y, betas=betas, dates_index=dates_index
        )
    )
    # Sanity check: the stability results must line up row-for-row with the trades.
    assert np.all(beta_stability_rsquared_vals.index == std_residuals.index)

    print("Processing ARIMA forecasts...")
    arima_forecasts = utils.measure_time(
        partial(
            processing.calculate_arima_forecast,
            std_residuals=std_residuals,
            forecast_months=3,
            eval_models=5,
        )
    )

    print("Preparing data for model...")
    dataset = utils.build_dataset_from_live_data_by_industry(
        std_residuals=std_residuals,
        adfs=adfs,
        subindustry=industry,
        mean_max_residual=residuals_max_mean,
        # VIX value looked up under the label of the last column of `stonks`.
        vix_index=vix.loc[stonks.columns[-1]],
        betas_stability_rsquared=beta_stability_rsquared_vals,
        arima_forecasts=arima_forecasts
    )

    print("Running model...")
    predictions, df_processed = stonk_model.predict(dataset)
    datasets.append((dataset, df_processed))
    predictions = pd.DataFrame(predictions)
    predictions.index = adfs.index

    # Prepend the dates as the first column of the exported frames.
    residuals.insert(0, "dates", dates_index.values)
    betas.insert(0, "dates", dates_index.values)

    print("Writing results to CSV...")
    # Very big industry, exceeds Git file size limit
    if industry == "diversified_financials":
        half = len(residuals) // 2
        residuals_fst = residuals.iloc[:half]
        residuals_snd = residuals.iloc[half:]
        residuals_fst.to_csv(
            os.path.join(output_dir, industry + "_one_residuals.csv"),
            header=False,
            index=True,
        )
        residuals_snd.to_csv(
            os.path.join(output_dir, industry + "_two_residuals.csv"),
            header=False,
            index=True,
        )
        del residuals_fst
        del residuals_snd
    else:
        residuals.to_csv(
            os.path.join(output_dir, industry + "_residuals.csv"),
            header=False,
            index=True,
        )

    # The remaining outputs all use the same naming scheme and CSV options.
    for suffix, frame in (
        ("betas", betas),
        ("adfs_raw", adfs_raw),
        ("predictions", predictions),
        ("arima", arima_forecasts),
        ("rsquared", beta_stability_rsquared_vals),
    ):
        frame.to_csv(
            os.path.join(output_dir, industry + "_" + suffix + ".csv"),
            header=False,
            index=True,
        )

print("*** All done ***")

# Data collection

In [2]:
# Load the price table with the helper's filtering switched off (disable_filter=True).
stonks = utils.get_stonk_data(disable_filter=True)
# Optional: keep only the columns up to and including the "2020-05-08" label.
# stonks = stonks.loc[:, :"2020-05-08"]

In [None]:
# Historical data-collection run over the loaded price table.
# NOTE(review): l_reg / l_roll / dt and the market-cap bounds repeat the literal
# values assigned to the notebook-level variables in earlier cells; keep them in sync.
# NOTE(review): last_residual_cutoff is 2 here, while the live loop and the
# training-data filter in this notebook both use 2.5 - confirm this is intended.
pipelines.data_collection_rolling_pipeline(
    stonks,
    l_reg=3,
    l_roll=2,
    dt=10,
    market_cap_min_mm=100,
    market_cap_max_mm=1000,
    last_residual_cutoff=2,
    mean_max_residual_dt=21,
    adf_pval_cutoff=0.1,
    adf_pass_rate_filter=0.5,
    arima_forecast_months=3,
    arima_eval_models=5,
    trade_length_months=3,
    trading_interval_weeks=2,
    remove_industries = ['diversified_financials', 'pharmaceuticals_biotechnology_and_life_sciences', 'containers_and_packaging'],
    first_n_windows=72,
)

Total data windows: 72


In [2]:
dataset = utils.ingest_trade_pipeline_outputs()

# First row of each stored index table, as a Series.
vix = utils.get_stonk_data(fname_prefix="vix", disable_filter=True).iloc[0]
sp500 = utils.get_stonk_data(fname_prefix="sp500", disable_filter=True).iloc[0]

# Relative change of the S&P 500 against the value 63 rows earlier, labelled
# with the later of the two rows.
sp500_lookback_rows = 63
sp500_chg = pd.Series(
    (sp500.iloc[sp500_lookback_rows:].values / sp500.iloc[:-sp500_lookback_rows].values) - 1,
    index=sp500.iloc[sp500_lookback_rows:].index,
)

# Attach both market features by looking up each row's trade date.
dataset["vix"] = dataset["trade_date"].apply(lambda trade_day: vix.loc[trade_day])
dataset["sp500"] = dataset["trade_date"].apply(lambda trade_day: sp500_chg.loc[trade_day])
dataset.to_csv("data/dataset_smallcap.csv", header=True, index=False)

# Model development

In [2]:
import xgboost as xgb
from hyperopt import STATUS_OK, STATUS_FAIL, Trials, fmin, hp, tpe, atpe, rand
import pickle

In [3]:
def train_production_xgb(
    df: pd.DataFrame,
    params: Dict[str, Any],
    noise_level: float = 0,
    output_dir: str = "data",
) -> Tuple[xgb.XGBClassifier, sklearn.base.TransformerMixin]:
    """Fit an XGBoost classifier on all of ``df`` and persist it with its scalers.

    Args:
        df: Training frame; must contain a ``"label"`` column plus whatever
            columns ``preprocessing.transform_features`` consumes.
        params: Keyword arguments forwarded to ``xgb.XGBClassifier``.
        noise_level: Passed through to ``preprocessing.transform_features``.
        output_dir: Folder that receives ``xgb_classifier.json`` and
            ``scalers.json``. Created if it does not exist. Defaults to
            ``"data"``.

    Returns:
        The fitted classifier and the scalers object returned by
        ``preprocessing.transform_features``.
    """
    X_train, scalers = preprocessing.transform_features(df, noise_level=noise_level)
    y_train = df["label"]

    clf = xgb.XGBClassifier(**params)

    # The training data doubles as the evaluation set; no hold-out is used here.
    clf.fit(X_train, y_train, eval_set=[(X_train, y_train)])

    os.makedirs(output_dir, exist_ok=True)
    clf.save_model(os.path.join(output_dir, "xgb_classifier.json"))

    # NOTE: despite the .json extension this file is a binary pickle. The name
    # is kept unchanged so existing readers of the artefact keep working.
    with open(os.path.join(output_dir, "scalers.json"), "wb") as fp:
        pickle.dump(scalers, fp)

    return clf, scalers

In [3]:
df = pd.read_csv("data/dataset.csv")
# Keep rows with a positive beta whose last residual is at least 2.5 in magnitude.
keep_rows = (df["beta"] > 0) & (df["last_residual"].abs() >= 2.5)
df = preprocessing.assign_labels(df[keep_rows])

In [128]:
# Skip the first `drop_dates` distinct trade dates (in sorted order), then
# shuffle the remaining rows.
drop_dates = 18
trade_dates_sorted = np.sort(df["trade_date"].unique())
selected_dates = trade_dates_sorted[drop_dates:]
in_selected_dates = df["trade_date"].isin(selected_dates)
df_prod = df.loc[in_selected_dates].sample(frac=1)
print(len(df_prod))
print(df_prod["label"].value_counts())

92655
0    79266
1    13389
Name: label, dtype: int64


In [None]:
# NOTE(review): within the cells visible in this notebook, `params` is only
# assigned further down (the hand-set hyperparameter cell), so on a fresh kernel
# this cell raises NameError unless that cell is run first.
clf_prod, scalers_prod = train_production_xgb(df_prod, params, noise_level=0.005)

In [127]:
# Train/validation split with a fixed random_state. The meaning of the
# positional arguments (2, 6, 10) is defined by preprocessing.split_data.
splits = preprocessing.split_data(df, 2, 6, 10, random_state=432233293)
# Split sizes, followed by the label distribution of each split.
print(len(splits["train"]))
print(len(splits["validation"]))
print(splits["train"]["label"].value_counts())
print(splits["validation"]["label"].value_counts())

98187
3605
0    79429
1    18758
Name: label, dtype: int64
0    2792
1     813
Name: label, dtype: int64


In [124]:
# Noise level handed to the feature transform for the training split only.
noise_level = 0.005

# The training call returns the scalers, which are then reused for validation.
X_train, scalers = preprocessing.transform_features(
    splits["train"], noise_level=noise_level
)
# Validation features reuse the training scalers and get noise_level=0.
X_valid, _ = preprocessing.transform_features(
    splits["validation"], scalers=scalers, noise_level=0
)

y_train = splits["train"]["label"]
y_valid = splits["validation"]["label"]

In [32]:
# Hyperopt search space. The quniform entries use a step of 1; the objective
# function casts them with int() before passing them to XGBoost.
hyperparameter_space = {
    "gamma": hp.uniform("gamma", 0, 5),
    "scale_pos_weight": hp.uniform("scale_pos_weight", 3, 7),
    "max_depth": hp.quniform("max_depth", 3, 8, 1),
    "min_child_weight": hp.quniform("min_child_weight", 1, 8, 1),
    "max_delta_step": hp.quniform("max_delta_step", 1, 4, 1),
    "n_estimators": hp.quniform("n_estimators", 25, 80, 1),
    # Alternatives currently not searched (the objective fixes subsample and colsample_bylevel):
    # "n_estimators": hp.choice("n_estimators", np.array([50, 75, 100, 150, 200])),
    # "subsample": hp.uniform("subsample", 0.9, 1),
    # "colsample_bylevel" : hp.uniform("colsample_bylevel", 0.5, 1),
}

In [33]:
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score


def optimization_objective(space):
    """Hyperopt objective: fit one XGBoost model and score it on the validation split.

    ``space`` is one sample from the search space. The model is trained on the
    notebook-level ``X_train`` / ``y_train`` and evaluated on ``X_valid`` /
    ``y_valid``. The returned dict carries the hyperopt ``loss`` and ``status``
    together with the extra metrics recorded for each trial.

    The loss is the negated average precision. Average precision is zeroed
    when the model predicts fewer positives than there are positive labels.
    A trial with zero F1 or zero precision is reported as failed with loss 100.
    """
    # Step-1 quniform samples arrive as floats and must be cast to int.
    integer_params = {
        key: int(space[key])
        for key in ("max_depth", "min_child_weight", "max_delta_step", "n_estimators")
    }

    model = xgb.XGBClassifier(
        gamma=space["gamma"],
        scale_pos_weight=space["scale_pos_weight"],
        # Not searched: held constant for every trial.
        colsample_bylevel=1,
        learning_rate=0.1,
        subsample=0.99,
        tree_method="hist",
        enable_categorical=True,
        max_cat_to_onehot=1,
        # A new seed is drawn for every trial.
        random_state=np.random.randint(9999999),
        **integer_params,
    )
    model.fit(X_train, y_train, verbose=False)

    y_score = model.predict_proba(X_valid)[:, 1]
    y_preds = y_score > 0.5

    f1 = f1_score(y_valid, y_preds, zero_division=0)
    precision = precision_score(y_valid, y_preds, zero_division=0)
    ap = evaluate.average_precision_from_cutoff(y_valid, y_score, 0)
    roc = roc_auc_score(y_valid, y_score)

    pos_preds = int(y_preds.sum())
    pos_labels = int(y_valid.sum())

    # Discard the AP score when too few positives were predicted.
    if pos_preds < pos_labels:
        ap = 0

    trial_failed = f1 == 0 or precision == 0
    return {
        "loss": 100 if trial_failed else -ap,
        "precision": precision,
        "f1_score": f1,
        "ap": ap,
        "auc": roc,
        "pos_preds": pos_preds,
        "status": STATUS_FAIL if trial_failed else STATUS_OK,
    }

In [52]:
trials = Trials()

# 400 TPE evaluations of the objective over the search space.
best_hyperparams = fmin(
    fn=optimization_objective,
    space=hyperparameter_space,
    algo=tpe.suggest,
    max_evals=400,
    trials=trials,
)

# Combine the sampled hyperparameters with the metrics each trial reported.
trial_vals = trials.vals
for metric_name in ("f1_score", "precision", "ap", "auc", "pos_preds"):
    trial_vals[metric_name] = [result[metric_name] for result in trials.results]

df_trials = pd.DataFrame.from_dict(trial_vals)
df_trials.to_csv("data/experiments/data_window_size_24#1.csv", index=False)

100%|██████████| 400/400 [02:16<00:00,  2.94trial/s, best loss: -0.3041476716238842]


In [None]:
# Hand-set hyperparameters for the evaluation model fitted below.
params = {
    # Regularisation: minimum loss reduction required to split (XGBoost default: 0)
    "gamma": 4.159451,
    # L2 / L1 regularisation, left at the XGBoost defaults (1 and 0 respectively)
    # "reg_lambda" : 1,
    # "reg_alpha" : 0,
    # Class imbalance: weight applied to the positive class (XGBoost default: 1)
    "scale_pos_weight": 5.631922,
    # Integers:
    "max_depth": 6,
    # Regularisation (XGBoost default: 1)
    "min_child_weight": 3,
    # Class imbalance (XGBoost default: 0)
    "max_delta_step": 2,
    # Choice:
    "colsample_bylevel": 1,
    "n_estimators": 55,
    "learning_rate": 0.1,
    "subsample": 1,
    # Fixed:
    "tree_method": "hist",
    "enable_categorical": True,
    "max_cat_to_onehot": 1,
    "eval_metric": ["logloss"],
    # Drawn fresh on every execution, so repeated runs of this cell use different seeds.
    "random_state": np.random.randint(999929),
}

clf = xgb.XGBClassifier(**params)

# eval_set lists the validation split first, then the training split.
clf.fit(X_train, y_train, eval_set=[(X_valid, y_valid), (X_train, y_train)])

In [None]:
print("**Validation**")
# Positive-class probability (column 1 of predict_proba), thresholded at `thres`.
y_score = clf.predict_proba(X_valid)[:, 1]
thres = 0.5
y_preds = y_score > thres

evaluate.performance_summary(y_score, y_preds, y_valid, auc_cutoff=0.4)

# Kept in a variable so the commented-out inspection cells below can slice it.
df_results_valid = evaluate.returns_on_predictions(splits["validation"], y_preds)

# Presumably a breakdown by the "subindustry" column; the meaning of the
# trailing False is defined by evaluate.performance_on_slice - TODO confirm.
evaluate.performance_on_slice(
    splits["validation"], y_score, y_preds, "subindustry", False
)

In [14]:
pd.set_option("display.max_rows", 200)

In [98]:
# df_results_valid[df_results_valid.result == "FP"].iloc[:100]

In [None]:
# df_results_valid[df_results_valid.subindustry == 'consumer_services'].iloc[0:100]

In [110]:
# One line per input feature: its name followed by the fitted model's importance.
for feature_name, feature_weight in zip(clf.feature_names_in_, clf.feature_importances_):
    print(feature_name, feature_weight)

adf_pass_rate 0.04439353
last_residual 0.074401274
residual_mean_max 0.13986948
vix 0.34344476
betas_rsquared 0.051621858
arima_forecast 0.08900969
industry 0.2032127
residual_inter 0.054046713


In [31]:
# Reload a stored trial table (presumably a hyperopt export - confirm) and show
# the 10 rows with the highest "ap".
df_trials = pd.read_csv("data/experiments/data_window_size_8#1.csv")
df_trials.sort_values("ap", ascending=False).head(10)

Unnamed: 0,gamma,max_delta_step,max_depth,min_child_weight,n_estimators,scale_pos_weight,f1_score,precision,ap,auc,pos_preds
148,4.402027,2.0,3.0,4.0,46.0,5.036233,0.301226,0.286667,0.307824,0.612917,900
390,2.571218,1.0,3.0,8.0,79.0,5.152819,0.303274,0.284483,0.306634,0.605328,928
397,2.905877,1.0,3.0,7.0,77.0,5.193479,0.307246,0.29057,0.30637,0.60683,912
192,2.268182,1.0,3.0,8.0,72.0,5.801728,0.326882,0.290353,0.306365,0.608112,1047
257,1.807177,1.0,3.0,8.0,63.0,4.940918,0.303207,0.288248,0.30607,0.608726,902
120,0.619195,1.0,3.0,8.0,55.0,4.634449,0.298363,0.294258,0.30605,0.610108,836
109,3.901148,1.0,3.0,8.0,51.0,4.543907,0.298235,0.295181,0.305957,0.610852,830
287,1.661592,2.0,3.0,5.0,59.0,4.682743,0.29777,0.291962,0.305486,0.611061,846
86,1.400583,1.0,3.0,7.0,62.0,4.94317,0.30129,0.287794,0.305466,0.609473,893
135,3.433409,2.0,3.0,4.0,57.0,4.977905,0.300236,0.288965,0.30513,0.610926,879


In [26]:
# Same view for another stored trial table: top 10 rows by "ap".
df_trials = pd.read_csv("data/experiments/data_window_size_20#1.csv")
df_trials.sort_values("ap", ascending=False).head(10)

Unnamed: 0,gamma,max_delta_step,max_depth,min_child_weight,n_estimators,scale_pos_weight,f1_score,precision,ap,auc,pos_preds
134,3.906547,3.0,5.0,6.0,66.0,6.716516,0.3749,0.277679,0.313899,0.600266,1689
112,3.469554,2.0,5.0,3.0,52.0,6.02704,0.379252,0.289799,0.31257,0.60718,1539
314,3.522302,3.0,5.0,5.0,58.0,6.160963,0.38052,0.286335,0.308583,0.600154,1610
116,3.563187,4.0,5.0,3.0,51.0,6.246752,0.379491,0.282452,0.308114,0.598825,1664
177,1.403024,1.0,8.0,5.0,64.0,6.610162,0.375427,0.287394,0.307838,0.607987,1531
214,3.270269,4.0,5.0,4.0,67.0,6.091447,0.376421,0.286172,0.307594,0.603345,1562
221,3.907869,3.0,5.0,1.0,62.0,6.35552,0.373591,0.277678,0.307327,0.59812,1671
48,2.160591,2.0,5.0,3.0,41.0,4.456795,0.353448,0.314477,0.307306,0.606309,1043
38,1.560098,4.0,5.0,3.0,52.0,4.963571,0.368747,0.300933,0.307297,0.604676,1286
282,2.791044,4.0,5.0,3.0,74.0,6.435629,0.381105,0.282493,0.306633,0.603704,1685


In [None]:
# Top 50 rows by "ap" from a trial table stored directly under data/.
# NOTE(review): the other trial tables in this notebook are read from
# data/experiments/ - confirm this path is still valid.
df_trials = pd.read_csv("data/data-window-size-2#6.csv")
df_trials.sort_values("ap", ascending=False).head(50)