# Imports

In [None]:
# !pip install yfinance

In [1]:
import os
from functools import partial
from datetime import datetime

import typing
from typing import Dict
from typing import Any

import numpy as np
import pandas as pd

import utils
import pipelines
import processing
import evaluate
import predict
import preprocessing

# Download stock daily prices

In [5]:
# Fetch the ticker table, filtered by market cap: lower bound 1000, no upper bound
# (units are presumably millions, going by the `_mm` suffix — TODO confirm in utils).
# Later cells use its index as the list of tickers and group by its 'subindustry' column.
ticker_list = utils.get_ticker_names(market_cap_min_mm=1000, market_cap_max_mm=None)

In [6]:
# Specific date - 1st of March 2022 (Y, M, D)
# date_to = datetime(2022, 3, 1)
# Date of today
date_to = datetime.today()
# How many years of data to download (going backwards from date_to). May be a floating point number
period_years = 5

In [7]:
# Download ticker price data for the tickers selected above (saved to .csv automatically)
# NOTE(review): `df` is rebound to the model dataset in the "Model development" section,
# and `date_to` is rebound to a string in the pipeline configuration cell.
df, df_clean = utils.download_stonk_prices(ticker_list.index, period_years=period_years, date_to=date_to)

[*********************100%***********************]  2820 of 2820 completed

10 Failed downloads:
- O.WI: No data found, symbol may be delisted
- POST WI: No data found, symbol may be delisted
- BIP.PRA: No data found, symbol may be delisted
- SNX.WI: No data found, symbol may be delisted
- BIP.PRB: No data found, symbol may be delisted
- MRK.WI: No data found, symbol may be delisted
- DELL WI: No data found, symbol may be delisted
- RXN WI: No data found, symbol may be delisted
- T WD: No data found, symbol may be delisted
- PFE.WI: No data found, symbol may be delisted


In [9]:
# Number of tickers per subindustry (non-null count of each remaining column; only `market_cap` is shown)
ticker_list.groupby('subindustry').count()

Unnamed: 0_level_0,market_cap
subindustry,Unnamed: 1_level_1
automobiles_and_components,42
banks,177
capital_goods,231
chemicals,61
commercial_and_professional_services,72
construction_materials,7
consumer_durables_and_apparel,73
consumer_services,99
consumer_staples,121
containers_and_packaging,20


# Run data pipeline

In [11]:
# All industries:
# industries = ticker_list['subindustry'].unique()
# If you want to run them separately, a few at a time or one by one (uncomment):
industries = ['semiconductors_and_semiconductor_equipment', 'insurance']
# l_reg, l_roll and dt are forwarded unchanged to processing.get_rolling_residuals in the pipeline cell
l_reg = 3
l_roll = 2
dt = 10
# Date range passed to utils.get_stonk_data
date_from = '2017-04-20'
# NOTE(review): this rebinds `date_to` from the datetime set in the download section to a string
date_to = '2022-04-18'
# Directory the per-industry CSV results are written to
output_dir = 'data'

# Model used by the pipeline cell to score the prepared dataset
stonk_model = predict.XGBStonkModel()

In [12]:
# Run the residual -> ADF -> model pipeline once per industry and write the results to CSV.
total_industries = len(industries)

# Create the output directory up front so the to_csv calls below cannot fail on a missing folder.
os.makedirs(output_dir, exist_ok=True)

# enumerate() supplies the progress counter. A counter incremented manually at the end of the
# loop body is skipped by the `continue` statements below and then under-counts.
for i, industry in enumerate(industries, start=1):
    stonks = utils.get_stonk_data(date_from, date_to, filter_industries=[industry])
    X, Y = processing.combine_stonk_pairs(stonks)

    print('Industry ({0}/{1}): {2}'.format(i, total_industries, industry))

    print('Processing residuals...')
    residuals, betas, _, date_index = utils.measure_time(
        partial(processing.get_rolling_residuals, X=X, Y=Y, l_reg=l_reg, l_roll=l_roll, dt=dt)
    )
    # Keep the dates as the first column of both frames; they are dropped again
    # wherever only the numeric values are needed.
    residuals.insert(0, "dates", date_index)
    betas.insert(0, "dates", date_index)

    std_residuals, _, _ = processing.get_standardized_residuals(residuals.drop(columns="dates"))

    # Filter 1: keep rows whose last standardized residual has absolute value >= 2.
    trades_before = len(std_residuals)
    std_residuals = std_residuals[std_residuals.iloc[:, -1].abs() >= 2]
    trades_after = len(std_residuals)
    print('{0} trades selected out of {1} by residual values'.format(trades_after, trades_before))
    if trades_after == 0:
        print('No trades left after filtering residuals, skipping this industry...')
        continue
    residuals = residuals.loc[std_residuals.index]
    betas = betas.loc[std_residuals.index]

    print('Processing ADFs...')
    adfs, adfs_raw = utils.measure_time(
        partial(
            processing.get_aggregate_adfs,
            residuals.drop(columns="dates"),
            betas=betas.drop(columns="dates"),
        )
    )

    # Filter 2: keep rows whose aggregate ADF value is >= 0.5.
    # The boolean mask is positional, so it is applied to `adfs` and `std_residuals` alike.
    selected_by_adf = (adfs >= 0.5).values
    adfs = adfs[selected_by_adf]

    trades_before = len(std_residuals)
    std_residuals = std_residuals[selected_by_adf]
    trades_after = len(std_residuals)
    print('{0} trades selected out of {1} by ADF pass rates'.format(trades_after, trades_before))

    if trades_after == 0:
        print('No trades left after filtering ADF pass rates, skipping this industry...')
        continue

    # Align the remaining frames with the rows that survived both filters.
    betas = betas.loc[adfs.index]
    residuals = residuals.loc[adfs.index]
    adfs_raw = adfs_raw.loc[adfs.index]

    residuals_max_mean = processing.get_mean_residual_magnitude(std_residuals.to_numpy(), dt=21)
    print('Mean max residual value for {0} after filtering is {1}'.format(industry, residuals_max_mean))

    print('Preparing data for model...')
    dataset = utils.build_dataset_from_live_data_by_industry(
        std_residuals.to_numpy(), adfs.to_numpy().ravel(), industry, residuals_max_mean
    )

    print('Running model...')
    predictions = pd.DataFrame(stonk_model.predict(dataset))
    predictions.index = adfs.index

    print('Writing results to CSV...')
    # One file per result, named "<industry>_<suffix>.csv"; written without a header row.
    results_by_suffix = {
        'residuals': residuals,
        'betas': betas,
        'adfs_raw': adfs_raw,
        'adfs': adfs,
        'predictions': predictions,
    }
    for suffix, frame in results_by_suffix.items():
        frame.to_csv(os.path.join(output_dir, industry + '_' + suffix + '.csv'), header=False, index=True)

print('*** All done ***')

Industry (1/2): semiconductors_and_semiconductor_equipment
Processing residuals...
Done after: 10s
258 trades selected out of 1891 by residual values
Processing ADFs...
Done after: 29s
90 trades selected out of 258 by ADF pass rates
Mean max residual value for semiconductors_and_semiconductor_equipment after filtering is 4.340000152587891
Preparing data for model...
Running model...
Writing results to CSV...
Industry (2/2): insurance
Processing residuals...
Done after: 9s
370 trades selected out of 1891 by residual values
Processing ADFs...
Done after: 39s
58 trades selected out of 370 by ADF pass rates
Mean max residual value for insurance after filtering is 4.059999942779541
Preparing data for model...
Running model...
Writing results to CSV...
*** All done ***


# Data collection

In [3]:
# Load price data for all industries (no industry filter) over the same date range
# as the `date_from` / `date_to` strings in the pipeline configuration cell.
stonks = utils.get_stonk_data('2017-04-20', '2022-04-18')

In [4]:
# Re-fetch the ticker table with the same arguments as in the download section
ticker_list = utils.get_ticker_names(market_cap_min_mm=1000, market_cap_max_mm=None)

In [None]:
# Run the rolling data-collection pipeline over every subindustry in the ticker table.
# NOTE(review): dt=20 here, whereas the live pipeline configuration above uses dt=10 — confirm this is intended.
pipelines.data_collection_rolling_pipeline(
    stonks,
    industries=list(ticker_list['subindustry'].unique()),
    l_reg=3,
    l_roll=2,
    dt=20,
    market_cap_min_mm=1000,
    market_cap_max_mm=None,
    adf_pval_cutoff=0.1,
    adf_pass_rate_filter=0.5,
    trade_length_months=3,
    trading_interval_weeks=2,
)

# Model development

In [5]:
import xgboost as xgb
from hyperopt import STATUS_OK, STATUS_FAIL, Trials, fmin, hp, tpe
import pickle

In [43]:
def train_production_xgb(
    df: pd.DataFrame,
    params: Dict[str, Any],
    scaling='minmax',
    add_noise=True,
    output_dir: str = 'data',
) -> typing.Tuple[xgb.XGBClassifier, Any]:
    """Fit an XGBoost classifier on all of ``df`` and persist it together with its scalers.

    Args:
        df: Dataset containing a ``label`` column plus whatever columns
            ``preprocessing.transform_features`` consumes.
        params: Keyword arguments forwarded unchanged to ``xgb.XGBClassifier``.
        scaling: Forwarded to ``preprocessing.transform_features``.
        add_noise: Forwarded to ``preprocessing.transform_features``.
        output_dir: Directory the model and scalers are written to; created if missing.
            Defaults to ``'data'``, the location that was previously hard-coded.

    Returns:
        A ``(clf, scalers)`` tuple: the fitted classifier and the scalers object
        returned by ``preprocessing.transform_features``.

    Side effects:
        Writes ``xgb_classifier.json`` and ``scalers.json`` into ``output_dir``.
    """
    X_train, scalers = preprocessing.transform_features(df, scaling=scaling, add_noise=add_noise)
    y_train = df['label']

    clf = xgb.XGBClassifier(**params)

    # The training data doubles as the eval set, so the metric logged during fitting
    # (reported as "validation_0") is the training loss, not a held-out score.
    clf.fit(X_train, y_train, eval_set=[(X_train, y_train)])

    os.makedirs(output_dir, exist_ok=True)
    clf.save_model(os.path.join(output_dir, 'xgb_classifier.json'))

    # NOTE: despite the .json extension this file is a binary pickle. The name is kept
    # unchanged because other code may load it by this exact path.
    with open(os.path.join(output_dir, 'scalers.json'), 'wb') as fp:
        pickle.dump(scalers, fp)

    return clf, scalers

In [2]:
# Load the collected dataset from CSV
df = pd.read_csv('data/dataset.csv')
# Keep only rows with a positive beta
df = df[df.beta > 0]
# NOTE(review): presumably adds the 'label' column read by later cells — confirm in preprocessing.assign_labels
df = preprocessing.assign_labels(df)

In [11]:
# Train the production model on the full dataset.
# NOTE(review): `params` is only defined in a later cell of this notebook, so this cell
# fails on a fresh kernel when the notebook is run top to bottom.
clf_prod, scalers_prod = train_production_xgb(df, params)

[0]	validation_0-logloss:0.66957
[1]	validation_0-logloss:0.64946
[2]	validation_0-logloss:0.63195
[3]	validation_0-logloss:0.61684
[4]	validation_0-logloss:0.60366
[5]	validation_0-logloss:0.59228
[6]	validation_0-logloss:0.58223
[7]	validation_0-logloss:0.57339
[8]	validation_0-logloss:0.56560
[9]	validation_0-logloss:0.55864
[10]	validation_0-logloss:0.55258
[11]	validation_0-logloss:0.54716
[12]	validation_0-logloss:0.54235
[13]	validation_0-logloss:0.53805
[14]	validation_0-logloss:0.53413
[15]	validation_0-logloss:0.53064
[16]	validation_0-logloss:0.52752
[17]	validation_0-logloss:0.52473
[18]	validation_0-logloss:0.52224
[19]	validation_0-logloss:0.52004
[20]	validation_0-logloss:0.51803
[21]	validation_0-logloss:0.51616
[22]	validation_0-logloss:0.51454
[23]	validation_0-logloss:0.51303
[24]	validation_0-logloss:0.51169


In [5]:
# past_dates = 50
# dates = np.sort(df['trade_date'].unique())
# dates = dates[-past_dates:]
# df = df[df.trade_date.isin(dates)]

In [13]:
# Split the dataset; the result is indexed by 'train' and 'validation' in later cells.
# NOTE(review): 4 and 0.06 are unnamed positional arguments — their meaning is not visible here; see preprocessing.split_data_mixed.
splits = preprocessing.split_data_mixed(df, 4, 0.06)

In [14]:
# Feature-transformation settings shared by the train and validation splits
scaling = 'minmax'
add_noise = True

# The scalers returned for the training split are passed back in for the validation split.
X_train, scalers = preprocessing.transform_features(splits['train'], scaling=scaling, add_noise=add_noise)
# NOTE(review): add_noise is also enabled for the validation split — confirm this is intended.
X_valid, _ = preprocessing.transform_features(splits['validation'], scalers=scalers, scaling=scaling, add_noise=add_noise)
# X_test, _ = preprocessing.transform_features(splits['test'], scalers=scalers, scaling=scaling, add_noise=add_noise)

# Targets come from the 'label' column of each split
y_train = splits['train']['label']
y_valid = splits['validation']['label']
# y_test = splits['test']['label']

In [55]:
# Search space for the hyperopt run. The objective function receives one sampled
# value per key and forwards it to xgb.XGBClassifier.
hyperparameter_space = {
    # Continuous:
    "gamma": hp.uniform("gamma", 0, 5),
    "scale_pos_weight" : hp.uniform("scale_pos_weight", 4, 10),
    # Integers:
    # (sampled with step 1; the objective casts these to int before use)
    "max_depth": hp.quniform("max_depth", 2, 8, 1),
    "min_child_weight" : hp.quniform("min_child_weight", 1, 10, 1),
    "max_delta_step" : hp.quniform("max_delta_step", 0, 5, 1),
    # Choice:
    # (the saved trials tables show these three as the index of the chosen option,
    # not the value itself — e.g. n_estimators 0 is the first option, 25)
    "colsample_bylevel" : hp.choice("colsample_bylevel", np.array([0.5, 0.75, 1])),
    "n_estimators": hp.choice("n_estimators", np.array([25, 50, 75, 100])),
    "subsample": hp.choice("subsample", np.array([0.5, 0.75, 1])),
    }

In [56]:
def optimization_objective(space):
    """Hyperopt objective: fit an XGBoost classifier for one sampled configuration.

    The model is trained on the notebook-level ``X_train`` / ``y_train`` and scored
    on ``X_valid`` / ``y_valid``. The loss is the negated value of
    ``evaluate.average_precision_from_cutoff`` (called with cutoff 0.6); that value
    is forced to 0 when the model predicts fewer positives than there are positive labels.

    Args:
        space: Mapping with one sampled value per key of the hyperparameter space.

    Returns:
        A dict with ``loss``, ``status`` and the recorded metrics. The trial is
        reported as failed (loss 999) when F1 or precision is exactly 0.

    NOTE(review): ``f1_score``, ``precision_score`` and ``roc_auc_score`` are not
    imported anywhere in the visible notebook; they must already be in scope.
    """
    model_kwargs = dict(
        gamma=space['gamma'],
        scale_pos_weight=space['scale_pos_weight'],
        # Sampled as floats, so cast to the integers XGBoost expects.
        max_depth=int(space['max_depth']),
        min_child_weight=int(space['min_child_weight']),
        max_delta_step=int(space['max_delta_step']),
        colsample_bylevel=space['colsample_bylevel'],
        n_estimators=int(space['n_estimators']),
        learning_rate=0.1,
        subsample=space['subsample'],
        # Fixed settings
        seed=420,
        tree_method="gpu_hist",
        enable_categorical=True,
        max_cat_to_onehot=1,
    )
    clf = xgb.XGBClassifier(**model_kwargs)
    clf.fit(X_train, y_train, verbose=False)

    # Positive-class probabilities, turned into hard predictions at 0.5.
    y_score = clf.predict_proba(X_valid)[:, 1]
    y_preds = y_score > 0.5

    f1 = f1_score(y_valid, y_preds, zero_division=0)
    precision = precision_score(y_valid, y_preds, zero_division=0)
    ap = evaluate.average_precision_from_cutoff(y_valid, y_score, 0.6)
    roc = roc_auc_score(y_valid, y_score)

    pos_preds = int(y_preds.sum())
    pos_labels = int(y_valid.sum())

    # Zero out the score for models that predict fewer positives than actually exist.
    if pos_preds < pos_labels:
        ap = 0

    # Start from the "failed" outcome and upgrade it when both metrics are non-zero.
    outcome = {
        'loss': 999,
        'precision': precision,
        'f1_score': f1,
        'ap': ap,
        'auc': roc,
        'pos_preds': pos_preds,
        'pos_labels': pos_labels,
        'status': STATUS_FAIL,
    }
    if f1 != 0 and precision != 0:
        outcome['loss'] = -ap
        outcome['status'] = STATUS_OK
    return outcome

In [57]:
# Run the TPE search and keep every trial so the results can be exported.
# NOTE(review): no random state is passed to fmin, so repeated runs may differ.
trials = Trials()

best_hyperparams = fmin(
    fn=optimization_objective,
    space=hyperparameter_space,
    algo=tpe.suggest,
    max_evals=1000,
    trials=trials,
)

# Sampled hyperparameter values per trial, extended with the metrics each trial reported.
trial_vals = trials.vals
for metric_name in ('f1_score', 'precision', 'ap', 'auc', 'pos_preds', 'pos_labels'):
    trial_vals[metric_name] = [trial_result[metric_name] for trial_result in trials.results]

# One row per trial, saved for later inspection.
df_trials = pd.DataFrame.from_dict(trial_vals)
df_trials.to_csv('data/trials_47_opt-ap0.6_evals-1000_minmax-scaling_noise_hybrid.csv', index=False)

100%|███████████████████████████████████████████| 1000/1000 [09:24<00:00,  1.77trial/s, best loss: -0.5583620695335232]


In [None]:
# Hand-picked configuration for the final model. Most values match trial 586 of the
# trials_47 search shown further down (gamma rounded to two decimals).
# NOTE(review): the production-training cell earlier in the notebook reads `params`, so this cell has to be run before it.
params = { 
    # Regularization: minimum loss reduction required to split (XGBoost default 0)
    "gamma": 4.03,
    # L2 (reg_lambda, default 1) and L1 (reg_alpha, default 0) regularization, left at their defaults
    # "reg_lambda" : 1,
    # "reg_alpha" : 0,
    # Class imbalance: weight of the positive class (XGBoost default 1)
    "scale_pos_weight" : 5.555772,
    # Integers:
    "max_depth": 4,
    # Regularization (XGBoost default 1)
    "min_child_weight" : 10,
    # Class imbalance (XGBoost default 0)
    "max_delta_step" : 2,
    # Choice:
    "colsample_bylevel" : 1,
    "n_estimators": 25,
    "learning_rate": 0.1,
    "subsample": 1,
    # Fixed:
    "seed": 420,
    # NOTE(review): the hyperparameter search uses "gpu_hist"; this uses "hist"
    "tree_method": "hist",
    "enable_categorical": True,
    "max_cat_to_onehot": 1,
    "eval_metric": ["logloss"],
}

clf = xgb.XGBClassifier(
        **params
    )

# Log loss is reported on both the validation split (validation_0) and the training split (validation_1)
clf.fit(X_train, y_train, eval_set=[(X_valid, y_valid), (X_train, y_train)])
clf.save_model(os.path.join('data', 'test_classifier.json'))

In [32]:
print("**Validation**")
# Positive-class probability for each validation row
y_score = clf.predict_proba(X_valid)[:, 1]
# Decision threshold. The hyperparameter search classifies at 0.5 and passes 0.6 only to the AP metric.
threshold = 0.6
y_preds = y_score > threshold

# Overall metrics for the thresholded predictions
evaluate.performance_summary(y_score, y_preds, y_valid)

# Returns grouped by prediction outcome (TP/FP/TN/FN), as shown in the cell output
df_results_valid = evaluate.returns_on_predictions(splits['validation'], y_preds)

# Per-'subindustry' breakdown; the meaning of the final False flag is not visible here
evaluate.performance_on_slice(splits['validation'], y_score, y_preds, 'subindustry', False)

**Validation**
Precision: 0.39274924471299094
PR-AUC/AP score: 0.5747368845982437
ROC-AUC score: 0.6350689549817067
Total positive predictions: 331

Totals:
        prediction
result            
FN            1911
FP             201
TN           14315
TP             130

Means:
        return_one_month  return_two_month  return_three_month
result                                                        
FN              0.067773          0.106705            0.126168
FP             -0.010627         -0.032413           -0.032114
TN             -0.009393         -0.023010           -0.028772
TP              0.083938          0.170546            0.199408

Stds:
        return_one_month  return_two_month  return_three_month
result                                                        
FN              0.107426          0.115443            0.138242
FP              0.076977          0.103190            0.120008
TN              0.056843          0.088226            0.096815
TP              0.088

  recall = tps / tps[-1]
  recall = tps / tps[-1]


In [None]:
# print("**Test**")
# y_score = clf.predict_proba(X_test)[:, 1]
# threshold = 0.6
# y_preds = y_score > threshold

# evaluate.performance_summary(y_score, y_preds, y_test)

# df_results_test = evaluate.returns_on_predictions(splits['test'], y_preds)

# evaluate.performance_on_slice(splits['test'], y_score, y_preds, 'subindustry', True)

In [54]:
# df_results_test[df_results_test.result == 'FN'].head(100)

In [103]:
# Names of the features the classifier was fitted on
clf.feature_names_in_

array(['adf_pass_rate', 'last_residual', 'residual_mean_max', 'industry',
       'residual_inter'], dtype='<U17')

In [104]:
# Importance score per feature (presumably in the same order as clf.feature_names_in_ — confirm)
clf.feature_importances_

array([0.12706983, 0.13587943, 0.14899054, 0.45897767, 0.12908258],
      dtype=float32)

In [55]:
# Show up to 100 rows so the head(50) tables below are not truncated
pd.set_option('display.max_rows', 100)

In [42]:
# Results of search run 45 (a file not written by any cell visible in this notebook), best trials by `ap` first.
# colsample_bylevel, n_estimators and subsample hold option indices, not values.
df_trials = pd.read_csv('data/trials_45_opt-ap0.6_evals-1000_minmax-scaling_noise_hybrid.csv')
df_trials.sort_values('ap', ascending=False).head(50)

Unnamed: 0,colsample_bylevel,gamma,max_delta_step,max_depth,min_child_weight,n_estimators,scale_pos_weight,subsample,f1_score,precision,ap,auc,pos_preds,pos_labels
992,1,0.292132,2.0,6.0,6.0,0,7.053207,2,0.184432,0.292547,0.391674,0.613236,899,1953
939,1,0.148257,2.0,6.0,6.0,0,7.123132,2,0.187065,0.291441,0.391407,0.613434,923,1953
967,1,0.599214,2.0,6.0,6.0,0,7.101042,2,0.184165,0.28884,0.391407,0.613209,914,1953
961,1,0.541002,2.0,6.0,6.0,0,7.102172,2,0.184165,0.28884,0.391407,0.613209,914,1953
974,1,0.688943,2.0,6.0,6.0,0,7.080427,2,0.18468,0.291391,0.391172,0.613215,906,1953
663,2,0.660313,2.0,5.0,6.0,0,7.288243,1,0.182392,0.289532,0.389835,0.614637,898,1953
818,2,0.23178,3.0,5.0,7.0,0,7.267782,1,0.185211,0.296505,0.389553,0.615051,887,1953
930,1,0.005327,3.0,6.0,6.0,0,6.947144,1,0.186375,0.3,0.389455,0.613657,880,1953
826,2,0.257282,3.0,5.0,7.0,0,7.277203,1,0.183803,0.29425,0.389115,0.615023,887,1953
608,2,0.352153,2.0,5.0,6.0,0,7.26483,1,0.186883,0.300113,0.389063,0.614904,883,1953


In [11]:
# Results of search run 46 (a file not written by any cell visible in this notebook), best trials by `ap` first.
# colsample_bylevel, n_estimators and subsample hold option indices, not values.
df_trials = pd.read_csv('data/trials_46_opt-ap0.6_evals-1000_minmax-scaling_noise_hybrid.csv')
df_trials.sort_values('ap', ascending=False).head(50)

Unnamed: 0,colsample_bylevel,gamma,max_delta_step,max_depth,min_child_weight,n_estimators,scale_pos_weight,subsample,f1_score,precision,ap,auc,pos_preds,pos_labels
815,1,1.124667,4.0,4.0,8.0,0,5.454375,0,0.232337,0.216,0.559415,0.616151,2375,2041
996,1,0.793776,5.0,4.0,9.0,0,5.321547,0,0.23274,0.231778,0.55416,0.616015,2058,2041
811,1,0.881646,4.0,4.0,8.0,0,5.495581,0,0.232404,0.21195,0.551823,0.616149,2477,2041
804,1,0.94648,4.0,4.0,8.0,0,5.493598,0,0.232455,0.212036,0.551823,0.616149,2476,2041
797,1,1.322823,4.0,4.0,8.0,0,5.493589,0,0.232352,0.211864,0.551765,0.616225,2478,2041
828,1,1.353525,4.0,4.0,8.0,0,5.481624,0,0.232971,0.212895,0.551753,0.616224,2466,2041
800,1,1.175993,4.0,4.0,8.0,0,5.501682,0,0.23221,0.210969,0.549546,0.616219,2498,2041
868,1,1.159292,4.0,4.0,8.0,0,5.511524,0,0.232732,0.211178,0.549522,0.616217,2505,2041
785,1,1.971187,4.0,4.0,8.0,0,5.510339,0,0.232732,0.211178,0.549322,0.616223,2505,2041
786,1,1.961925,4.0,4.0,8.0,0,5.397012,0,0.233882,0.224989,0.547175,0.617227,2209,2041


In [58]:
# Results of search run 47 — the file written by the hyperopt cell of this notebook — best trials by `ap` first.
# colsample_bylevel, n_estimators and subsample hold option indices, not values.
df_trials = pd.read_csv('data/trials_47_opt-ap0.6_evals-1000_minmax-scaling_noise_hybrid.csv')
df_trials.sort_values('ap', ascending=False).head(50)

Unnamed: 0,colsample_bylevel,gamma,max_delta_step,max_depth,min_child_weight,n_estimators,scale_pos_weight,subsample,f1_score,precision,ap,auc,pos_preds,pos_labels
154,2,2.106686,2.0,4.0,8.0,0,5.338896,2,0.234989,0.226201,0.558362,0.614281,2206,2041
586,2,4.032224,2.0,4.0,10.0,0,5.555772,2,0.235703,0.211884,0.556781,0.614167,2558,2041
801,2,3.316369,2.0,4.0,9.0,0,5.553942,2,0.235703,0.211884,0.556781,0.614166,2558,2041
591,2,4.04879,2.0,4.0,10.0,0,5.368263,2,0.233656,0.22541,0.550762,0.614817,2196,2041
698,2,3.80281,2.0,4.0,9.0,0,5.342241,2,0.235183,0.226983,0.550073,0.61397,2194,2041
583,2,4.57534,2.0,4.0,9.0,0,5.339841,2,0.235239,0.227086,0.550073,0.613971,2193,2041
499,2,4.583719,2.0,4.0,8.0,0,5.393055,2,0.234048,0.225295,0.547458,0.61414,2206,2041
788,2,3.732558,2.0,4.0,9.0,0,5.404041,2,0.233882,0.224989,0.547458,0.614135,2209,2041
405,2,2.858222,2.0,4.0,9.0,0,5.404193,2,0.233882,0.224989,0.547458,0.614135,2209,2041
580,2,3.996881,2.0,4.0,9.0,0,5.376883,2,0.231837,0.219641,0.547363,0.614461,2281,2041


# Other

In [27]:
def predict_baseline_model(X_train, y_train, X_valid, y_valid, baseline='rule-based', residual_cutoff_adj=0, adf_cutoff=0.5):
    """Print precision and F1 of a simple baseline on the train and validation splits.

    Args:
        X_train, X_valid: Feature frames. The rule-based baseline reads the
            ``last_residual``, ``residual_mean_max`` and ``adf_pass_rate`` columns.
        y_train, y_valid: Label Series for the two splits.
        baseline: ``"rule-based"`` predicts positive when
            ``|last_residual| > residual_mean_max + residual_cutoff_adj`` and
            ``adf_pass_rate > adf_cutoff``; ``"random"`` draws 0/1 uniformly at random.
        residual_cutoff_adj: Offset added to ``residual_mean_max`` in the rule.
        adf_cutoff: Strict lower bound on ``adf_pass_rate`` in the rule.

    Returns:
        None. The metrics are printed.

    Raises:
        ValueError: If ``baseline`` is not one of the supported names.

    NOTE(review): ``precision_score`` and ``f1_score`` are not imported anywhere in
    the visible notebook; they must already be in scope.
    """
    # Validate first: an unknown name used to fall through both branches and fail
    # later with an unrelated "referenced before assignment" error.
    if baseline not in ("rule-based", "random"):
        raise ValueError(
            "Unknown baseline {0!r}; expected 'rule-based' or 'random'".format(baseline)
        )

    def rule_based_predictions(features):
        # Column-wise version of the rule: both conditions must hold for a positive prediction.
        residual_exceeds = features['last_residual'].abs() > (features['residual_mean_max'] + residual_cutoff_adj)
        adf_passes = features['adf_pass_rate'] > adf_cutoff
        return (residual_exceeds & adf_passes).astype(int).to_numpy()

    if baseline == "rule-based":
        y_train_preds = rule_based_predictions(X_train)
        y_valid_preds = rule_based_predictions(X_valid)
    else:
        # Random baseline; unseeded, so results vary between calls.
        y_train_preds = np.random.randint(0, 2, len(X_train))
        y_valid_preds = np.random.randint(0, 2, len(X_valid))

    precision = precision_score(y_train.to_numpy(), y_train_preds)
    f1 = f1_score(y_train.to_numpy(), y_train_preds)
    print("Final baseline precision on train:", precision)
    print("Final baseline F1 score on train:", f1)

    precision = precision_score(y_valid.to_numpy(), y_valid_preds)
    f1 = f1_score(y_valid.to_numpy(), y_valid_preds)
    print("Final baseline precision on validation:", precision)
    print("Final baseline F1 score on validation:", f1)

In [28]:
def predict_random_forest(X_train, y_train, X_valid, y_valid):
    """Fit a random forest and print precision, F1 and ROC-AUC on both splits.

    Args:
        X_train, X_valid: Feature matrices for the two splits.
        y_train, y_valid: Label Series for the two splits (assumed binary, since
            the positive-class probability is taken from the second
            ``predict_proba`` column).

    Returns:
        None. The metrics are printed.

    NOTE(review): ``RandomForestClassifier``, ``precision_score``, ``f1_score`` and
    ``roc_auc_score`` are not imported anywhere in the visible notebook; they must
    already be in scope.
    """
    clf = RandomForestClassifier(
        n_estimators=100,
        max_depth=None,
        # "auto" has been removed from scikit-learn; for classifiers it meant "sqrt".
        max_features="sqrt",
        oob_score=False,
        class_weight="balanced_subsample",
    )

    clf.fit(X_train, y_train)

    for split_name, features, labels in (("train", X_train, y_train), ("valid", X_valid, y_valid)):
        y_true = labels.to_numpy()
        y_preds = clf.predict(features)
        # ROC-AUC needs a ranking score; hard 0/1 predictions collapse the curve to a single point.
        y_score = clf.predict_proba(features)[:, 1]

        precision = precision_score(y_true, y_preds)
        f1 = f1_score(y_true, y_preds)
        auc = roc_auc_score(y_true, y_score)

        print("Final RF precision on {0}:".format(split_name), precision)
        print("Final RF F1 score on {0}:".format(split_name), f1)
        print("Final RF AUC score on {0}:".format(split_name), auc)