# Imports

In [None]:
# !pip install yfinance

In [1]:
import os
from functools import partial
from datetime import datetime

import typing
from typing import Dict
from typing import Any
from typing import Tuple

import numpy as np
import pandas as pd
import sklearn

import utils
import pipelines
import processing
import evaluate
import predict
import preprocessing

# Download stock daily prices & VIX index

In [18]:
### Ticker names filtered by market cap: minimum 1000, no upper bound
### (units presumably $MM, going by the `_mm` suffix - TODO confirm)
ticker_list = utils.get_ticker_names(market_cap_min_mm=1000, market_cap_max_mm=None)

In [19]:
### Specific end date, e.g. 18th of January 2021 (Y, M, D)
# date_to = datetime(2021, 1, 18)
### Date of today
date_to = datetime.today()
### How many years of data to download (going backwards from date_to). Can be a floating point number
period_years = 6

In [20]:
# Bind the shared download window once; the calls below differ only in
# the ticker list and the file-name prefix.
download_prices = partial(utils.download_stonk_prices, period_years=period_years, date_to=date_to)

df, df_clean = download_prices(ticker_list.index)
vix, vix_clean = download_prices(["^VIX"], fname_prefix="vix")
sp500, sp500_clean = download_prices(["^GSPC"], fname_prefix="sp500")

[*********************100%***********************]  2820 of 2820 completed

13 Failed downloads:
- MRK.WI: No data found, symbol may be delisted
- FOE: No data found, symbol may be delisted
- PFE.WI: No data found, symbol may be delisted
- MGP: No data found, symbol may be delisted
- BIP.PRB: No data found, symbol may be delisted
- POST WI: No data found, symbol may be delisted
- RXN WI: No data found, symbol may be delisted
- BIP.PRA: No data found, symbol may be delisted
- SGMS: No data found, symbol may be delisted
- SNX.WI: No data found, symbol may be delisted
- O.WI: No data found, symbol may be delisted
- T WD: No data found, symbol may be delisted
- DELL WI: No data found, symbol may be delisted
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


# Run data pipeline

In [2]:
# Industries the per-industry pipeline iterates over.
# Commented-out entries are excluded from the run.
industries = [
    # 'health_care_equipment_and_services',
    'software_and_services',
    'retailing',
    'telecommunication_services',
    'capital_goods',
    'energy',
    # 'pharmaceuticals_biotechnology_and_life_sciences',
    'consumer_staples',
    'banks',
    'diversified_financials',
    'metals_and_mining',
    'technology_hardware_and_equipment',
    'utilities',
    'chemicals',
    'automobiles_and_components',
    'semiconductors_and_semiconductor_equipment',
    'media_and_entertainment',
    'real_estate',
    'consumer_services',
    'consumer_durables_and_apparel',
    'insurance',
    'transportation',
    'commercial_and_professional_services',
    'paper_and_forest_products',
    'containers_and_packaging',
    'construction_materials'
    ]

# Window parameters forwarded to processing.get_rolling_residuals
# (presumably regression length, rolling length and step size - TODO confirm units).
l_reg = 3
l_roll = 2
dt = 10

# Directory the per-industry CSV outputs are written to.
output_dir = 'data'

# Model used to score the candidate trades of each industry.
stonk_model = predict.XGBStonkModel()
# First row (position 0) of the stored VIX table, loaded with filtering disabled.
vix = utils.get_stonk_data(fname_prefix='vix', disable_filter=True).iloc[0]

In [None]:
# Per-industry live pipeline: rolling residuals -> residual filter -> ADF filter
# -> model predictions -> CSV export.

# Thresholds applied by the filters below.
LAST_RESIDUAL_CUTOFF = 2.5   # minimum |last standardized residual| for a trade to be kept
ADF_PASS_RATE_FILTER = 0.5   # minimum ADF pass rate for a trade to be kept
MEAN_MAX_RESIDUAL_DT = 21    # dt handed to processing.get_mean_residual_magnitude

# Make sure the output directory exists before any CSV is written.
os.makedirs(output_dir, exist_ok=True)

datasets = []
total_industries = len(industries)
# enumerate() keeps the progress counter in step with the industry list even when an
# industry is skipped via `continue`; a manual `i += 1` at the end of the body is
# bypassed on those paths and makes later progress lines report the wrong position.
for i, industry in enumerate(industries, start=1):
    print('Industry ({0}/{1}): {2}'.format(i, total_industries, industry))

    stonks = utils.get_stonk_data(filter_industries=[industry])
    X, Y = processing.combine_stonk_pairs(stonks)

    print('Processing residuals...')
    residuals, betas, _, date_index = utils.measure_time(partial(processing.get_rolling_residuals, X=X, Y=Y, l_reg=l_reg, l_roll=l_roll, dt=dt))
    residuals.insert(0, "dates", date_index)
    betas.insert(0, "dates", date_index)

    std_residuals, _, _ = processing.get_standardized_residuals(residuals.drop(columns="dates"))

    # Filter 1: keep only rows whose last standardized residual is large enough.
    trades_before = len(std_residuals)
    std_residuals = std_residuals[std_residuals.iloc[:, -1].abs() >= LAST_RESIDUAL_CUTOFF]
    trades_after = len(std_residuals)
    print('{0} trades selected out of {1} by residual values'.format(trades_after, trades_before))
    if trades_after == 0:
        print('No trades left after filtering residuals, skipping this industry...')
        continue
    residuals = residuals.loc[std_residuals.index]
    betas = betas.loc[std_residuals.index]

    print('Processing ADFs...')
    adfs, adfs_raw = utils.measure_time(partial(processing.get_aggregate_adfs, residuals.drop(columns="dates"), betas=betas.drop(columns="dates")))

    # Filter 2: keep only rows whose ADF pass rate reaches the threshold.
    selected_by_adf = (adfs >= ADF_PASS_RATE_FILTER).values
    adfs = adfs[selected_by_adf]

    trades_before = len(std_residuals)
    std_residuals = std_residuals[selected_by_adf]
    trades_after = len(std_residuals)
    print('{0} trades selected out of {1} by ADF pass rates'.format(trades_after, trades_before))

    if trades_after == 0:
        print('No trades left after filtering ADF pass rates, skipping this industry...')
        continue

    # Align the remaining tables with the trades that survived both filters.
    betas = betas.loc[adfs.index]
    residuals = residuals.loc[adfs.index]
    adfs_raw = adfs_raw.loc[adfs.index]

    residuals_max_mean = processing.get_mean_residual_magnitude(std_residuals.to_numpy(), dt=MEAN_MAX_RESIDUAL_DT)
    print('Mean max residual value for {0} after filtering is {1}'.format(industry, residuals_max_mean))

    print('Preparing data for model...')
    # The VIX value is looked up at the last column label of this industry's price table.
    dataset = utils.build_dataset_from_live_data_by_industry(std_residuals.to_numpy(), adfs.to_numpy().ravel(), industry, residuals_max_mean, vix.loc[stonks.columns[-1]])

    print('Running model...')
    predictions, df_processed = stonk_model.predict(dataset)
    datasets.append((dataset, df_processed))
    predictions = pd.DataFrame(predictions)
    predictions.index = adfs.index

    print('Writing results to CSV...')
    # Very big industry, exceeds Git file size limit
    if industry == "diversified_financials":
        half = len(residuals) // 2
        residuals_fst = residuals.iloc[:half]
        residuals_snd = residuals.iloc[half:]
        residuals_fst.to_csv(os.path.join(output_dir, industry + '_one_residuals.csv'), header=False, index=True)
        residuals_snd.to_csv(os.path.join(output_dir, industry + '_two_residuals.csv'), header=False, index=True)
        del residuals_fst
        del residuals_snd
    else:
        residuals.to_csv(os.path.join(output_dir, industry + '_residuals.csv'), header=False, index=True)
    betas.to_csv(os.path.join(output_dir, industry + '_betas.csv'), header=False, index=True)
    adfs_raw.to_csv(os.path.join(output_dir, industry + '_adfs_raw.csv'), header=False, index=True)
    predictions.to_csv(os.path.join(output_dir, industry + '_predictions.csv'), header=False, index=True)

print('*** All done ***')

In [55]:
# datasets[-23][0].head(200)

# Data collection

In [3]:
# Load the full price table and keep the columns up to and including the '2022-07-01' label.
stonks = utils.get_stonk_data().loc[:, :'2022-07-01']

In [None]:
# Settings for the rolling data-collection run, gathered in one mapping so they
# can be read (and tweaked) in a single place.
collection_settings = dict(
    l_reg=3,
    l_roll=2,
    dt=10,
    market_cap_min_mm=1000,
    market_cap_max_mm=None,
    last_residual_cutoff=2.5,
    mean_max_residual_dt=21,
    adf_pval_cutoff=0.1,
    adf_pass_rate_filter=0.5,
    trade_length_months=3,
    trading_interval_weeks=2,
    first_n_windows=1,
)

pipelines.data_collection_rolling_pipeline(stonks, **collection_settings)

In [4]:
dataset = utils.ingest_trade_pipeline_outputs()

# VIX and S&P 500 series: first row (position 0) of each stored table.
vix = utils.get_stonk_data(fname_prefix='vix', disable_filter=True).iloc[0]
sp500 = utils.get_stonk_data(fname_prefix='sp500', disable_filter=True).iloc[0]

# Relative S&P 500 change over a 63-observation lookback, labelled by the later observation.
lookback = 63
sp500_later = sp500.iloc[lookback:]
sp500_earlier = sp500.iloc[:-lookback]
sp500_chg = pd.Series((sp500_later.values / sp500_earlier.values) - 1, index=sp500_later.index)

# Attach both market features to every trade via its trade date, then persist.
dataset['vix'] = dataset['trade_date'].apply(lambda trade_date: vix.loc[trade_date])
dataset['sp500'] = dataset['trade_date'].apply(lambda trade_date: sp500_chg.loc[trade_date])
dataset.to_csv('data/dataset.csv', header=True, index=False)

# Model development

In [2]:
import xgboost as xgb
from hyperopt import STATUS_OK, STATUS_FAIL, Trials, fmin, hp, tpe, atpe, rand
import pickle

In [3]:
def train_production_xgb(
    df: pd.DataFrame,
    params: Dict[str, Any],
    noise_level: float = 0,
    output_dir: str = 'data',
) -> Tuple[xgb.XGBClassifier, sklearn.base.TransformerMixin]:
    """Fit the production XGBoost classifier on ``df`` and persist it with its scalers.

    Args:
        df: Training frame; must contain a ``label`` column and whatever columns
            ``preprocessing.transform_features`` expects.
        params: Keyword arguments forwarded verbatim to ``xgb.XGBClassifier``.
        noise_level: Forwarded to ``preprocessing.transform_features``.
        output_dir: Directory the artefacts are written to; created if missing.
            Defaults to ``'data'``, the location used before this parameter existed.

    Returns:
        ``(clf, scalers)``: the fitted classifier and the scalers object returned by
        ``preprocessing.transform_features``.

    Side effects:
        Writes ``xgb_classifier.json`` and ``scalers.json`` into ``output_dir``.
    """
    X_train, scalers = preprocessing.transform_features(df, noise_level=noise_level)
    y_train = df['label']

    clf = xgb.XGBClassifier(**params)

    # The training set doubles as the eval set, so the logged metric is a training
    # metric only (there is no held-out data at this stage).
    clf.fit(X_train, y_train, eval_set=[(X_train, y_train)])

    os.makedirs(output_dir, exist_ok=True)
    clf.save_model(os.path.join(output_dir, 'xgb_classifier.json'))

    # NOTE(review): despite the .json extension this file is a binary pickle. The
    # name is kept unchanged because consumers may look it up by this exact name.
    # Only unpickle it from a trusted location.
    with open(os.path.join(output_dir, 'scalers.json'), 'wb') as fp:
        pickle.dump(scalers, fp)

    return clf, scalers

In [4]:
df = pd.read_csv('data/dataset.csv')
# Keep trades with a positive beta whose last residual has magnitude >= 2.5.
keep_rows = (df.beta > 0) & (df.last_residual.abs() >= 2.5)
df = df[keep_rows]
df = preprocessing.assign_labels(df)

In [5]:
# Number of earliest trade dates to leave out of the production training set.
drop_dates = 2
# Seed for the row shuffle below, so re-running the cell yields the same row order.
shuffle_seed = 3439

selected_dates = np.sort(df['trade_date'].unique())[drop_dates:]
# frac=1 keeps every selected row and only shuffles the order.
df_prod = df[df.trade_date.isin(selected_dates)].sample(frac=1, random_state=shuffle_seed)
print(len(df_prod))
print(df_prod['label'].value_counts())

107667
0    93485
1    14182
Name: label, dtype: int64


In [14]:
# NOTE: `params` is assigned in the fixed-hyperparameter cell further down this
# notebook (execution count 9, run before this cell's 14). On a fresh kernel that
# cell must be executed before this one, otherwise `params` is undefined.
clf_prod, scalers_prod = train_production_xgb(df_prod, params, noise_level=0.005)

[0]	validation_0-logloss:0.67261
[1]	validation_0-logloss:0.65538
[2]	validation_0-logloss:0.64082
[3]	validation_0-logloss:0.62843
[4]	validation_0-logloss:0.61781
[5]	validation_0-logloss:0.60800
[6]	validation_0-logloss:0.59962
[7]	validation_0-logloss:0.59232
[8]	validation_0-logloss:0.58599
[9]	validation_0-logloss:0.58051
[10]	validation_0-logloss:0.57559
[11]	validation_0-logloss:0.57119
[12]	validation_0-logloss:0.56680
[13]	validation_0-logloss:0.56327
[14]	validation_0-logloss:0.55999
[15]	validation_0-logloss:0.55713
[16]	validation_0-logloss:0.55457
[17]	validation_0-logloss:0.55228
[18]	validation_0-logloss:0.54993
[19]	validation_0-logloss:0.54765
[20]	validation_0-logloss:0.54557
[21]	validation_0-logloss:0.54383
[22]	validation_0-logloss:0.54168
[23]	validation_0-logloss:0.53976
[24]	validation_0-logloss:0.53834
[25]	validation_0-logloss:0.53706
[26]	validation_0-logloss:0.53561
[27]	validation_0-logloss:0.53409
[28]	validation_0-logloss:0.53300
[29]	validation_0-loglos

In [7]:
splits = preprocessing.split_data(df, 2, 6, 2, random_state=3439)

# Report the sizes first, then the class balance, for train and validation (in that order).
for split_name in ('train', 'validation'):
    print(len(splits[split_name]))
for split_name in ('train', 'validation'):
    print(splits[split_name]['label'].value_counts())

94725
3817
0    82557
1    12168
Name: label, dtype: int64
0    3203
1     614
Name: label, dtype: int64


In [8]:
noise_level = 0.005
train_split, valid_split = splits['train'], splits['validation']

# The scalers returned by the training call are reused for the validation split;
# noise is only requested for the training features.
X_train, scalers = preprocessing.transform_features(train_split, noise_level=noise_level)
X_valid, _ = preprocessing.transform_features(valid_split, scalers=scalers, noise_level=0)

y_train, y_valid = train_split['label'], valid_split['label']

In [122]:
# Hyperopt search space for the XGBoost classifier.
# hp.uniform entries are continuous ranges; hp.quniform entries use a step of 1
# and are cast with int() inside optimization_objective before use.
hyperparameter_space = {
    "gamma": hp.uniform("gamma", 0, 5),
    "scale_pos_weight" : hp.uniform("scale_pos_weight", 2, 12),
    "max_depth": hp.quniform("max_depth", 3, 10, 1),
    "min_child_weight" : hp.quniform("min_child_weight", 1, 8, 1),
    "max_delta_step" : hp.quniform("max_delta_step", 1, 4, 1),
    "n_estimators": hp.quniform("n_estimators", 50, 200, 1),
    # Alternative dimensions, currently disabled (the objective fixes subsample and
    # colsample_bylevel at 1):
    # "n_estimators": hp.choice("n_estimators", np.array([50, 75, 100, 150, 200])),
    # "subsample": hp.uniform("subsample", 0.5, 1),
    # "colsample_bylevel" : hp.uniform("colsample_bylevel", 0.5, 1),
    }

In [123]:
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

def optimization_objective(space):
    """Hyperopt objective: fit one XGBoost classifier and score it on the validation set.

    Reads ``X_train``, ``y_train``, ``X_valid`` and ``y_valid`` from the notebook
    namespace. ``space`` is one sample drawn from ``hyperparameter_space``.

    Returns a hyperopt result dict. The loss is the negated average precision
    (as computed by ``evaluate.average_precision_from_cutoff`` with cutoff 0.4);
    the average precision is forced to 0 when the model predicts fewer positives
    than there are positive labels. Trials with zero F1 or zero precision are
    reported as failed with a loss of 100.
    """
    model = xgb.XGBClassifier(
        # Sampled, continuous
        gamma=space['gamma'],
        scale_pos_weight=space['scale_pos_weight'],
        # Sampled, cast to int
        max_depth=int(space['max_depth']),
        min_child_weight=int(space['min_child_weight']),
        max_delta_step=int(space['max_delta_step']),
        # Not searched
        colsample_bylevel=1,
        n_estimators=int(space['n_estimators']),
        learning_rate=0.1,
        subsample=1,
        # Fixed settings
        tree_method="hist",
        enable_categorical=True,
        max_cat_to_onehot=1,
        random_state=np.random.randint(9999999),
    )
    model.fit(X_train, y_train, verbose=False)

    scores = model.predict_proba(X_valid)[:, 1]
    hard_preds = scores > 0.5

    f1 = f1_score(y_valid, hard_preds, zero_division=0)
    precision = precision_score(y_valid, hard_preds, zero_division=0)
    ap = evaluate.average_precision_from_cutoff(y_valid, scores, 0.4)
    roc = roc_auc_score(y_valid, scores)

    pos_preds = int(hard_preds.sum())
    pos_labels = int(y_valid.sum())

    # Too few positive predictions: discard the AP of this trial.
    if pos_preds < pos_labels:
        ap = 0

    degenerate = f1 == 0 or precision == 0
    return {
        'loss': 100 if degenerate else -ap,
        'precision': precision,
        'f1_score': f1,
        'ap': ap,
        'auc': roc,
        'pos_preds': pos_preds,
        'status': STATUS_FAIL if degenerate else STATUS_OK,
    }

In [124]:
trials = Trials()

# Search with the TPE algorithm; every evaluation is recorded in `trials`.
best_hyperparams = fmin(
    fn=optimization_objective,
    space=hyperparameter_space,
    algo=tpe.suggest,
    max_evals=1000,
    trials=trials,
)

# One column per sampled hyperparameter, plus one column per metric reported
# by the objective.
trial_vals = trials.vals
for metric in ('f1_score', 'precision', 'ap', 'auc', 'pos_preds'):
    trial_vals[metric] = [result[metric] for result in trials.results]

df_trials = pd.DataFrame.from_dict(trial_vals)
df_trials.to_csv('data/data-window-size-2#7.csv', index=False)

100%|██████████| 1000/1000 [08:34<00:00,  1.94trial/s, best loss: -0.3386527370666844]


In [9]:
# Fixed hyperparameters for the evaluation model (also consumed by the
# production-training cell).
params = {
    # Regularisation: minimum loss reduction required to split (XGBoost default 0)
    "gamma": 3.301387,
    # L2 / L1 regularisation left at the XGBoost defaults (reg_lambda=1, reg_alpha=0)
    # "reg_lambda" : 1,
    # "reg_alpha" : 0,
    # Class imbalance: weight of the positive class (XGBoost default 1)
    "scale_pos_weight": 5.568589,
    # Integers:
    "max_depth": 6,
    # Regularisation (XGBoost default 1)
    "min_child_weight": 7,
    # Class imbalance (XGBoost default 0)
    "max_delta_step": 3,
    # Choice:
    "colsample_bylevel": 1,
    "n_estimators": 63,
    "learning_rate": 0.1,
    "subsample": 1,
    # Fixed:
    "tree_method": "hist",
    "enable_categorical": True,
    "max_cat_to_onehot": 1,
    "eval_metric": ["logloss"],
    # Pinned so the fit is repeatable. 816397 is the value that a random draw
    # produced in the recorded run (it appears in the fitted model's repr).
    "random_state": 816397,
}

clf = xgb.XGBClassifier(**params)

# eval_set order determines the log labels: validation_0 is the validation split,
# validation_1 is the training split.
clf.fit(X_train, y_train, eval_set=[(X_valid, y_valid), (X_train, y_train)])

[0]	validation_0-logloss:0.67189	validation_1-logloss:0.66963
[1]	validation_0-logloss:0.65267	validation_1-logloss:0.64974
[2]	validation_0-logloss:0.63792	validation_1-logloss:0.63305
[3]	validation_0-logloss:0.62608	validation_1-logloss:0.61877
[4]	validation_0-logloss:0.61542	validation_1-logloss:0.60659
[5]	validation_0-logloss:0.60531	validation_1-logloss:0.59574
[6]	validation_0-logloss:0.59563	validation_1-logloss:0.58631
[7]	validation_0-logloss:0.58840	validation_1-logloss:0.57780
[8]	validation_0-logloss:0.58312	validation_1-logloss:0.57060
[9]	validation_0-logloss:0.57737	validation_1-logloss:0.56427
[10]	validation_0-logloss:0.57170	validation_1-logloss:0.55871
[11]	validation_0-logloss:0.56806	validation_1-logloss:0.55358
[12]	validation_0-logloss:0.56478	validation_1-logloss:0.54905
[13]	validation_0-logloss:0.56087	validation_1-logloss:0.54490
[14]	validation_0-logloss:0.55812	validation_1-logloss:0.54101
[15]	validation_0-logloss:0.55599	validation_1-logloss:0.53778
[1

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=True,
              eval_metric=['logloss'], gamma=3.301387, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.1, max_bin=256,
              max_cat_to_onehot=1, max_delta_step=3, max_depth=6, max_leaves=0,
              min_child_weight=7, missing=nan, monotone_constraints='()',
              n_estimators=63, n_jobs=0, num_parallel_tree=1, predictor='auto',
              random_state=816397, reg_alpha=0, reg_lambda=1, ...)

In [13]:
# Decision threshold on the positive-class probability for this report.
thres = 0.7
validation_split = splits['validation']

print("**Validation**")
y_score = clf.predict_proba(X_valid)[:, 1]
y_preds = y_score > thres

evaluate.performance_summary(y_score, y_preds, y_valid, auc_cutoff=0.4)

# Per-trade results table, kept for ad-hoc inspection in later cells.
df_results_valid = evaluate.returns_on_predictions(validation_split, y_preds)

evaluate.performance_on_slice(validation_split, y_score, y_preds, 'subindustry', False)

**Validation**
Precision: 0.5909090909090909
PR-AUC/AP score: 0.31257193127093735
ROC-AUC score: 0.5907709689918144
Total positive predictions: 22

Totals:
        prediction
result            
FN             601
FP               9
TN            3194
TP              13

Means:
        return_one_month  return_two_month  return_three_month
result                                                        
FN              0.044448          0.115033            0.131266
FP             -0.033444         -0.004778            0.004444
TN             -0.011937         -0.008423            0.002332
TP              0.087615          0.151462            0.154692

Stds:
        return_one_month  return_two_month  return_three_month
result                                                        
FN              0.048232          0.071011            0.065843
FP              0.076883          0.067900            0.089370
TN              0.050026          0.065858            0.065416
TP              0.0461

  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]


In [156]:
# Show up to 100 rows when a frame is displayed.
pd.options.display.max_rows = 100

In [232]:
# df_results_valid[df_results_valid.result == 'TP'].iloc[:100]

In [None]:
# df_results_valid[df_results_valid.subindustry == 'consumer_services'].iloc[0:100]

In [31]:
# Feature names recorded on the fitted classifier.
print(clf.feature_names_in_)

['adf_pass_rate' 'last_residual' 'residual_mean_max' 'vix' 'industry'
 'residual_inter']


In [32]:
# Importance score of each feature, in the same order as clf.feature_names_in_.
print(clf.feature_importances_)

[0.07705482 0.17807524 0.07431204 0.37910897 0.23682532 0.05462363]


In [None]:
# Stored search results, best average precision first (top 50 rows).
df_trials = pd.read_csv('data/last-residual-cutoff-check-2.5#13.csv')
df_trials.sort_values(by='ap', ascending=False).head(n=50)

In [125]:
# Stored search results, best average precision first (top 50 rows).
df_trials = pd.read_csv('data/data-window-size-2#7.csv')
df_trials.sort_values(by='ap', ascending=False).head(n=50)

Unnamed: 0,gamma,max_delta_step,max_depth,min_child_weight,n_estimators,scale_pos_weight,f1_score,precision,ap,auc,pos_preds
930,3.301387,3.0,6.0,7.0,63.0,5.568589,0.299602,0.293292,0.338653,0.595601,641
457,2.92591,3.0,7.0,8.0,50.0,6.302919,0.293391,0.264744,0.329611,0.58978,763
15,3.923418,3.0,7.0,7.0,178.0,5.936053,0.295322,0.267905,0.329164,0.599077,754
268,3.248298,3.0,7.0,7.0,96.0,5.449515,0.283228,0.275385,0.329022,0.589463,650
959,3.616223,3.0,6.0,7.0,69.0,5.688692,0.304868,0.298905,0.327613,0.596205,639
594,3.060776,3.0,6.0,7.0,57.0,5.561815,0.282277,0.278041,0.326087,0.592876,633
788,2.950274,3.0,6.0,8.0,59.0,5.693641,0.299921,0.290965,0.32545,0.593496,653
847,4.183113,3.0,8.0,8.0,52.0,5.829544,0.28013,0.28013,0.325219,0.593333,614
990,2.866214,4.0,6.0,8.0,58.0,5.977148,0.297381,0.282164,0.324945,0.600966,684
249,3.547738,3.0,7.0,7.0,54.0,5.624273,0.285028,0.280315,0.322395,0.59271,635


In [126]:
# Stored search results, best average precision first (top 50 rows).
df_trials = pd.read_csv('data/data-window-size-2#6.csv')
df_trials.sort_values(by='ap', ascending=False).head(n=50)

Unnamed: 0,gamma,max_delta_step,max_depth,min_child_weight,n_estimators,scale_pos_weight,f1_score,precision,ap,auc,pos_preds
65,0.125181,4.0,9.0,4.0,0,5.767172,0.278583,0.275478,0.324768,0.589048,628
281,1.579883,2.0,9.0,1.0,1,6.553113,0.283091,0.266955,0.317135,0.586566,693
8,2.296297,1.0,7.0,3.0,0,5.511179,0.290852,0.279699,0.315297,0.598035,665
402,2.20581,4.0,5.0,4.0,0,5.361338,0.29717,0.287234,0.314747,0.593136,658
379,1.565489,3.0,5.0,4.0,0,5.376183,0.317035,0.307339,0.31352,0.597948,654
183,1.218787,2.0,9.0,8.0,1,7.162291,0.282454,0.251269,0.31286,0.589907,788
374,2.277036,4.0,5.0,4.0,0,5.950475,0.305772,0.293413,0.312291,0.59863,668
332,3.146166,4.0,10.0,4.0,2,6.290035,0.269438,0.255474,0.311749,0.573845,685
363,1.362032,1.0,7.0,3.0,2,5.675878,0.288274,0.288274,0.310065,0.595921,614
135,2.572876,4.0,10.0,1.0,1,8.046408,0.290909,0.236493,0.309582,0.592004,981
