# Imports

In [None]:
# !pip install yfinance

In [13]:
import os
from functools import partial
from datetime import datetime

import typing
from typing import Dict
from typing import Any
from typing import Tuple

import numpy as np
import pandas as pd
import sklearn

import utils
import pipelines
import processing
import evaluate
import predict
import preprocessing

# Download stock daily prices

In [5]:
# Get ticker names with market_cap_min_mm=1000 and no upper market-cap bound
# (the `_mm` suffix presumably means millions -- confirm in utils).
ticker_list = utils.get_ticker_names(market_cap_min_mm=1000, market_cap_max_mm=None)

In [6]:
# Specific end date, e.g. 1st of March 2022 (Y, M, D):
# date_to = datetime(2022, 3, 1)
# Default: today's date
date_to = datetime.today()
# How many years of data to download (going backwards from date_to). May be a floating point number.
period_years = 5

In [7]:
# Download price data for the tickers selected above, covering `period_years` years up to `date_to`.
# NOTE(review): the results are reportedly saved to .csv automatically; that happens inside utils
# and is not visible here -- confirm.
df, df_clean = utils.download_stonk_prices(ticker_list.index, period_years=period_years, date_to=date_to)

[*********************100%***********************]  2820 of 2820 completed

10 Failed downloads:
- O.WI: No data found, symbol may be delisted
- POST WI: No data found, symbol may be delisted
- BIP.PRA: No data found, symbol may be delisted
- SNX.WI: No data found, symbol may be delisted
- BIP.PRB: No data found, symbol may be delisted
- MRK.WI: No data found, symbol may be delisted
- DELL WI: No data found, symbol may be delisted
- RXN WI: No data found, symbol may be delisted
- T WD: No data found, symbol may be delisted
- PFE.WI: No data found, symbol may be delisted


In [9]:
# Number of tickers per subindustry (non-null count of each remaining column).
ticker_list.groupby('subindustry').count()

Unnamed: 0_level_0,market_cap
subindustry,Unnamed: 1_level_1
automobiles_and_components,42
banks,177
capital_goods,231
chemicals,61
commercial_and_professional_services,72
construction_materials,7
consumer_durables_and_apparel,73
consumer_services,99
consumer_staples,121
containers_and_packaging,20


# Run data pipeline

In [201]:
# All industries:
# industries = ticker_list['subindustry'].unique()
# Or run only a few / a single industry (edit the list below):
industries = ['consumer_services']
# Parameters forwarded to processing.get_rolling_residuals in the pipeline loop.
# NOTE(review): the units of l_reg / l_roll / dt are not visible here -- confirm in processing.
l_reg = 3
l_roll = 2
dt = 10
# Date range passed to utils.get_stonk_data.
# NOTE: this rebinds `date_to`, which the download section set to a datetime; here it is a string.
date_from = '2017-06-01'
date_to = '2022-05-27'
# Directory the per-industry CSV results are written to.
output_dir = 'data'

stonk_model = predict.XGBStonkModel()

In [202]:
# Per-industry pipeline: rolling residuals -> filter by last standardized residual
# -> filter by ADF pass rate -> model predictions -> CSV export.
total_industries = len(industries)

# Make sure the target directory exists so the CSV writes at the end of an
# iteration cannot fail on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)

# enumerate() keeps the progress counter correct even when an industry is skipped
# with `continue`; a manual increment at the end of the body would be bypassed.
for i, industry in enumerate(industries, start=1):
    stonks = utils.get_stonk_data(date_from, date_to, filter_industries=[industry])
    X, Y = processing.combine_stonk_pairs(stonks)

    print('Industry ({0}/{1}): {2}'.format(i, total_industries, industry))

    print('Processing residuals...')
    residuals, betas, _, date_index = utils.measure_time(partial(processing.get_rolling_residuals, X=X, Y=Y, l_reg=l_reg, l_roll=l_roll, dt=dt))
    residuals.insert(0, "dates", date_index)
    betas.insert(0, "dates", date_index)

    std_residuals, _, _ = processing.get_standardized_residuals(residuals.drop(columns="dates"))

    # Keep only rows whose last standardized residual has magnitude >= 2.
    trades_before = len(std_residuals)
    std_residuals = std_residuals[std_residuals.iloc[:, -1].abs() >= 2]
    trades_after = len(std_residuals)
    print('{0} trades selected out of {1} by residual values'.format(trades_after, trades_before))
    if trades_after == 0:
        print('No trades left after filtering residuals, skipping this industry...')
        continue
    residuals = residuals.loc[std_residuals.index]
    betas = betas.loc[std_residuals.index]

    print('Processing ADFs...')
    adfs, adfs_raw = utils.measure_time(partial(processing.get_aggregate_adfs, residuals.drop(columns="dates"), betas=betas.drop(columns="dates")))

    # Keep only rows whose aggregate ADF value is at least 0.5.
    selected_by_adf = (adfs >= 0.5).values
    adfs = adfs[selected_by_adf]

    trades_before = len(std_residuals)
    std_residuals = std_residuals[selected_by_adf]
    trades_after = len(std_residuals)
    print('{0} trades selected out of {1} by ADF pass rates'.format(trades_after, trades_before))

    if len(std_residuals) == 0:
        print('No trades left after filtering ADF pass rates, skipping this industry...')
        continue

    # Align the remaining frames with the rows that survived the ADF filter.
    betas = betas.loc[adfs.index]
    residuals = residuals.loc[adfs.index]
    adfs_raw = adfs_raw.loc[adfs.index]

    residuals_max_mean = processing.get_mean_residual_magnitude(std_residuals.to_numpy(), dt=21)
    print('Mean max residual value for {0} after filtering is {1}'.format(industry, residuals_max_mean))

    print('Preparing data for model...')
    dataset = utils.build_dataset_from_live_data_by_industry(std_residuals.to_numpy(), adfs.to_numpy().ravel(), industry, residuals_max_mean)

    print('Running model...')
    predictions, df_processed = stonk_model.predict(dataset)
    predictions = pd.DataFrame(predictions)
    predictions.index = adfs.index

    print('Writing results to CSV...')
    residuals.to_csv(os.path.join(output_dir, industry + '_residuals.csv'), header=False, index=True)
    betas.to_csv(os.path.join(output_dir, industry + '_betas.csv'), header=False, index=True)
    adfs_raw.to_csv(os.path.join(output_dir, industry + '_adfs_raw.csv'), header=False, index=True)
    adfs.to_csv(os.path.join(output_dir, industry + '_adfs.csv'), header=False, index=True)
    predictions.to_csv(os.path.join(output_dir, industry + '_predictions.csv'), header=False, index=True)

print('*** All done ***')

Industry (1/1): consumer_services
Processing residuals...
Done after: 13s
307 trades selected out of 2485 by residual values
Processing ADFs...
Done after: 32s
77 trades selected out of 307 by ADF pass rates
Mean max residual value for consumer_services after filtering is 3.799999952316284
Preparing data for model...
Running model...
Writing results to CSV...
*** All done ***


# Data collection

In [3]:
# Load stock data for the data-collection window (no industry filter is passed).
# NOTE: rebinds `stonks`, which the pipeline loop above also assigns on every iteration.
stonks = utils.get_stonk_data('2017-04-20', '2022-04-18')

In [4]:
# Same call as in the download section; re-creates `ticker_list` for the collection pipeline below.
ticker_list = utils.get_ticker_names(market_cap_min_mm=1000, market_cap_max_mm=None)

In [None]:
# Run the rolling data-collection pipeline over every subindustry present in `ticker_list`.
# l_reg / l_roll match the live pipeline settings above; dt is 20 here versus 10 there.
pipelines.data_collection_rolling_pipeline(stonks, industries=list(ticker_list['subindustry'].unique()), l_reg=3, l_roll=2, dt=20, market_cap_min_mm=1000, market_cap_max_mm=None, adf_pval_cutoff=0.1, adf_pass_rate_filter=0.5, trade_length_months=3, trading_interval_weeks=2)

# Model development

In [2]:
import xgboost as xgb
from hyperopt import STATUS_OK, STATUS_FAIL, Trials, fmin, hp, tpe
import pickle

In [36]:
def train_production_xgb(df: pd.DataFrame, params: Dict[str, Any], scaling: str = 'minmax', add_noise: bool = True, output_dir: str = 'data') -> Tuple[xgb.XGBClassifier, sklearn.base.TransformerMixin]:
    """Fit an XGBoost classifier on all of ``df`` and persist it together with its scalers.

    Args:
        df: Labelled dataset; must contain a ``label`` column. Features are
            derived from it by ``preprocessing.transform_features``.
        params: Keyword arguments forwarded unchanged to ``xgb.XGBClassifier``.
        scaling: Scaling mode forwarded to ``preprocessing.transform_features``.
        add_noise: Noise flag forwarded to ``preprocessing.transform_features``.
        output_dir: Directory the model and scalers are written to. Defaults to
            ``'data'`` (the previously hard-coded location); created if missing.

    Returns:
        ``(clf, scalers)``: the fitted classifier and the scalers object returned
        by ``preprocessing.transform_features``.
    """
    X_train, scalers = preprocessing.transform_features(df, scaling=scaling, add_noise=add_noise)
    y_train = df['label']

    clf = xgb.XGBClassifier(**params)

    # The eval set is the training data itself, so the metric logged during
    # fitting is training loss only, not a generalization estimate.
    clf.fit(X_train, y_train, eval_set=[(X_train, y_train)])

    os.makedirs(output_dir, exist_ok=True)
    clf.save_model(os.path.join(output_dir, 'xgb_classifier.json'))

    # NOTE: the payload is a binary pickle despite the .json extension. The file
    # name is kept unchanged because loaders elsewhere presumably expect it.
    with open(os.path.join(output_dir, 'scalers.json'), 'wb') as fp:
        pickle.dump(scalers, fp)

    return clf, scalers

In [4]:
# Load the collected dataset, keep only rows with a positive beta, then attach labels.
# NOTE: rebinds `df`, which the download section used for the downloaded price frame.
df = pd.read_csv('data/dataset.csv')
df = df[df.beta > 0]
df = preprocessing.assign_labels(df)

In [37]:
# Train and persist the production model on the full labelled dataset.
# NOTE: in the visible notebook `params` is only defined in a later cell (the one that fits the
# test classifier); on a fresh kernel that cell must be run before this one.
clf_prod, scalers_prod = train_production_xgb(df, params)

[0]	validation_0-logloss:0.66960
[1]	validation_0-logloss:0.64951
[2]	validation_0-logloss:0.63240
[3]	validation_0-logloss:0.61761
[4]	validation_0-logloss:0.60432
[5]	validation_0-logloss:0.59291
[6]	validation_0-logloss:0.58259
[7]	validation_0-logloss:0.57407
[8]	validation_0-logloss:0.56612
[9]	validation_0-logloss:0.55908
[10]	validation_0-logloss:0.55274
[11]	validation_0-logloss:0.54704
[12]	validation_0-logloss:0.54201
[13]	validation_0-logloss:0.53775
[14]	validation_0-logloss:0.53409
[15]	validation_0-logloss:0.53044
[16]	validation_0-logloss:0.52730
[17]	validation_0-logloss:0.52451
[18]	validation_0-logloss:0.52184
[19]	validation_0-logloss:0.51965
[20]	validation_0-logloss:0.51777
[21]	validation_0-logloss:0.51576
[22]	validation_0-logloss:0.51389
[23]	validation_0-logloss:0.51246
[24]	validation_0-logloss:0.51109


In [110]:
# Split the labelled data; the result is indexed by 'train' / 'validation' in the cells below.
# NOTE(review): the meaning of the positional arguments 6 and 0 is not visible here -- confirm
# in preprocessing.split_data_mixed.
splits = preprocessing.split_data_mixed(df, 6, 0)

In [6]:
# Feature-transform settings shared by the train and validation splits.
scaling = 'minmax'
add_noise = True

# Build the scalers on the training split, then reuse them for the validation split.
# NOTE(review): add_noise=True is also passed for the validation split -- confirm this is intended.
X_train, scalers = preprocessing.transform_features(splits['train'], scaling=scaling, add_noise=add_noise)
X_valid, _ = preprocessing.transform_features(splits['validation'], scalers=scalers, scaling=scaling, add_noise=add_noise)

y_train = splits['train']['label']
y_valid = splits['validation']['label']

In [9]:
# Hyperopt search space for the XGBoost classifier built in optimization_objective.
hyperparameter_space = {
    # Continuous:
    "gamma": hp.uniform("gamma", 0, 5),
    "scale_pos_weight" : hp.uniform("scale_pos_weight", 4, 10),
    # Integers (sampled with step 1; the objective casts them with int()):
    "max_depth": hp.quniform("max_depth", 3, 6, 1),
    "min_child_weight" : hp.quniform("min_child_weight", 1, 10, 1),
    "max_delta_step" : hp.quniform("max_delta_step", 0, 5, 1),
    # Choice:
    # NOTE(review): the saved trials tables show these columns as small integers (e.g. n_estimators 0),
    # presumably the index into the option list rather than the value -- decode before reusing them.
    "colsample_bylevel" : hp.choice("colsample_bylevel", np.array([0.5, 0.75, 1])),
    "n_estimators": hp.choice("n_estimators", np.array([25, 50, 75])),
    "subsample": hp.choice("subsample", np.array([0.5, 0.75, 1])),
    }

In [10]:
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

In [11]:
def optimization_objective(space):
    """Hyperopt objective: fit one XGBoost candidate and score it on the validation split.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_valid`` and ``y_valid``.

    Args:
        space: One sample drawn from ``hyperparameter_space``.

    Returns:
        A hyperopt result dict. ``loss`` is the negated cutoff-based average
        precision. That score is zeroed when the model predicts fewer positives
        than there are positive labels. When F1 or precision is zero the trial
        is reported as failed with a loss of 999. The individual metrics are
        included for later inspection.
    """
    model_kwargs = dict(
        gamma=space['gamma'],
        scale_pos_weight=space['scale_pos_weight'],
        # quniform yields floats, so the integer-valued parameters are cast.
        max_depth=int(space['max_depth']),
        min_child_weight=int(space['min_child_weight']),
        max_delta_step=int(space['max_delta_step']),
        colsample_bylevel=space['colsample_bylevel'],
        n_estimators=int(space['n_estimators']),
        learning_rate=0.1,
        subsample=space['subsample'],
        # Fixed settings.
        tree_method="gpu_hist",
        enable_categorical=True,
        max_cat_to_onehot=1,
    )
    clf = xgb.XGBClassifier(**model_kwargs)
    clf.fit(X_train, y_train, verbose=False)

    # Positive-class probability, thresholded at 0.5 for the label-based metrics.
    y_score = clf.predict_proba(X_valid)[:, 1]
    y_preds = y_score > 0.5

    f1 = f1_score(y_valid, y_preds, zero_division=0)
    precision = precision_score(y_valid, y_preds, zero_division=0)
    ap = evaluate.average_precision_from_cutoff(y_valid, y_score, 0.6)
    roc = roc_auc_score(y_valid, y_score)

    pos_preds = int(y_preds.sum())
    pos_labels = int(y_valid.sum())

    # Discard the AP score of models that predict fewer positives than exist.
    if pos_preds < pos_labels:
        ap = 0

    metrics = {
        'precision': precision,
        'f1_score': f1,
        'ap': ap,
        'auc': roc,
        'pos_preds': pos_preds,
        'pos_labels': pos_labels,
    }

    if f1 == 0 or precision == 0:
        return {'loss': 999, **metrics, 'status': STATUS_FAIL}
    return {'loss': -ap, **metrics, 'status': STATUS_OK}

In [12]:
# Run the hyperparameter search and export every trial with its metrics.
trials = Trials()

best_hyperparams = fmin(
    fn=optimization_objective,
    space=hyperparameter_space,
    algo=tpe.suggest,
    max_evals=1000,
    trials=trials,
)

# Start from the sampled parameter values and append one column per metric
# reported by the objective.
trial_vals = trials.vals
for metric_name in ('f1_score', 'precision', 'ap', 'auc', 'pos_preds', 'pos_labels'):
    trial_vals[metric_name] = [result[metric_name] for result in trials.results]

df_trials = pd.DataFrame.from_dict(trial_vals)
df_trials.to_csv('data/trials_52_opt-ap0.6_evals-1000_minmax-scaling_noise_oos.csv', index=False)

100%|███████████████████████████████████████████| 1000/1000 [08:21<00:00,  1.99trial/s, best loss: -0.5077162992992107]


In [19]:
# Hand-picked hyperparameters for the classifier; also consumed by train_production_xgb.
# "default" below refers to the XGBoost default for that parameter.
params = { 
    # Regularization: minimum loss reduction required to split (default 0)
    "gamma": 0.190072,
    # L2 (reg_lambda, default 1) and L1 (reg_alpha, default 0) penalties -- left at their defaults
    # "reg_lambda" : 1,
    # "reg_alpha" : 0,
    # Class imbalance: weight of the positive class (default 1)
    "scale_pos_weight" : 5.530400,
    # Integers:
    "max_depth": 4,
    # Regularization (default 1)
    "min_child_weight" : 8,
    # Class imbalance (default 0)
    "max_delta_step" : 1,
    # Values taken from the choice lists of the search space:
    "colsample_bylevel" : 0.75,
    "n_estimators": 25,
    "learning_rate": 0.1,
    "subsample": 0.75,
    # Fixed (note: "hist" here, whereas the search objective uses "gpu_hist"):
    "tree_method": "hist",
    "enable_categorical": True,
    "max_cat_to_onehot": 1,
    "eval_metric": ["logloss"],
}

clf = xgb.XGBClassifier(
        **params
    )

# Eval sets are logged in order: validation_0 is the validation split, validation_1 the training split.
clf.fit(X_train, y_train, eval_set=[(X_valid, y_valid), (X_train, y_train)])
clf.save_model(os.path.join('data', 'test_classifier.json'))

[0]	validation_0-logloss:0.67063	validation_1-logloss:0.66635
[1]	validation_0-logloss:0.65153	validation_1-logloss:0.64325
[2]	validation_0-logloss:0.63468	validation_1-logloss:0.62327
[3]	validation_0-logloss:0.62089	validation_1-logloss:0.60606
[4]	validation_0-logloss:0.60912	validation_1-logloss:0.59113
[5]	validation_0-logloss:0.59811	validation_1-logloss:0.57777
[6]	validation_0-logloss:0.58917	validation_1-logloss:0.56663
[7]	validation_0-logloss:0.58101	validation_1-logloss:0.55624
[8]	validation_0-logloss:0.57483	validation_1-logloss:0.54730
[9]	validation_0-logloss:0.56847	validation_1-logloss:0.53927
[10]	validation_0-logloss:0.56346	validation_1-logloss:0.53221
[11]	validation_0-logloss:0.55844	validation_1-logloss:0.52574
[12]	validation_0-logloss:0.55375	validation_1-logloss:0.52009
[13]	validation_0-logloss:0.55035	validation_1-logloss:0.51523
[14]	validation_0-logloss:0.54714	validation_1-logloss:0.51059
[15]	validation_0-logloss:0.54439	validation_1-logloss:0.50665
[1

In [30]:
# Evaluate the test classifier on the validation split.
print("**Validation**")
# Positive-class probability for every validation row.
y_score = clf.predict_proba(X_valid)[:, 1]
# Decision threshold for a positive prediction (the hyperparameter search objective uses 0.5).
threshold = 0.65
y_preds = y_score > threshold

evaluate.performance_summary(y_score, y_preds, y_valid)

df_results_valid = evaluate.returns_on_predictions(splits['validation'], y_preds)

evaluate.performance_on_slice(splits['validation'], y_score, y_preds, 'subindustry', False)

**Validation**
Precision: 0.4391891891891892
PR-AUC/AP score: 0.4665762313730002
ROC-AUC score: 0.5900856532541726
Total positive predictions: 148

Totals:
        prediction
result            
FN            2274
FP              83
TN           14700
TP              65

Means:
        return_one_month  return_two_month  return_three_month
result                                                        
FN              0.063081          0.102989            0.120759
FP              0.000169         -0.013229           -0.002337
TN             -0.002381         -0.014484           -0.020771
TP              0.043492          0.111523            0.124738

Stds:
        return_one_month  return_two_month  return_three_month
result                                                        
FN              0.056728          0.069431            0.076642
FP              0.041462          0.048359            0.053820
TN              0.051436          0.071046            0.083955
TP              0.0428

In [35]:
# Inspect a random sample of 50 false positives (unseeded, so the rows differ between runs).
df_results_valid[df_results_valid.result == 'FP'].sample(50)

Unnamed: 0,ticker_x,ticker_y,trade_date,adf_pass_rate,last_residual,beta,intercept,residual_mean_max,return_one_month,residual_one_month,return_two_month,residual_two_month,return_three_month,residual_three_month,data_window_start,subindustry,label,prediction,result,score
50453,AVGO,ENPH,2021-12-22,0.57,-4.21,0.65,-143.05,5.07,0.021,-3.64,-0.007,-4.39,0.044,-3.0,2017-01-06,semiconductors_and_semiconductor_equipment,0,True,FP,0.669451
12365,AER,CW,2022-03-04,0.9,6.19,1.16,61.56,4.78,0.036,4.97,0.047,4.63,0.07,3.85,2017-03-20,capital_goods,0,True,FP,0.724733
49950,UCTT,VECO,2022-03-04,0.61,4.11,0.35,6.16,4.8,0.023,3.6,0.016,3.74,0.084,2.23,2017-03-20,semiconductors_and_semiconductor_equipment,0,True,FP,0.669451
28273,NVDA,TSEM,2022-03-04,0.61,4.13,0.08,15.02,4.8,0.022,3.72,-0.065,5.33,-0.072,5.47,2017-03-20,semiconductors_and_semiconductor_equipment,0,True,FP,0.669451
19161,CRGE,DLB,2022-03-04,0.86,-4.29,10.47,59.32,4.55,-0.01,-4.44,-0.106,-6.01,-0.029,-4.76,2017-03-20,software_and_services,0,True,FP,0.704358
10275,NXPI,TSEM,2022-02-18,0.55,6.7,0.13,6.05,4.16,-0.025,7.38,-0.04,7.78,0.007,6.52,2017-03-07,semiconductors_and_semiconductor_equipment,0,True,FP,0.700135
95181,NXPI,RMBS,2022-03-04,0.67,4.52,0.1,2.03,4.8,-0.051,5.88,0.048,3.23,0.072,2.59,2017-03-20,semiconductors_and_semiconductor_equipment,0,True,FP,0.669451
51936,AVGO,MCHP,2022-03-04,0.9,-4.1,0.13,10.78,4.8,0.007,-3.89,0.01,-3.81,0.05,-2.62,2017-03-20,semiconductors_and_semiconductor_equipment,0,True,FP,0.707467
53951,AWI,NOC,2022-03-04,0.55,5.37,1.77,171.98,4.78,0.035,4.52,0.005,5.25,-0.026,5.99,2017-03-20,capital_goods,0,True,FP,0.670362
98406,ASX,TSEM,2022-03-04,0.75,4.69,3.31,6.01,4.8,-0.023,5.13,-0.04,5.45,-0.019,5.04,2017-03-20,semiconductors_and_semiconductor_equipment,0,True,FP,0.676128


In [62]:
# print("**Test**")
# y_score = clf.predict_proba(X_test)[:, 1]
# threshold = 0.6
# y_preds = y_score > threshold

# evaluate.performance_summary(y_score, y_preds, y_test)

# df_results_test = evaluate.returns_on_predictions(splits['test'], y_preds)

# evaluate.performance_on_slice(splits['test'], y_score, y_preds, 'subindustry', True)

In [63]:
# Feature names the classifier was fitted with.
clf.feature_names_in_

array(['adf_pass_rate', 'last_residual', 'residual_mean_max', 'industry',
       'residual_inter'], dtype='<U17')

In [66]:
# Importance of each feature, in the same order as clf.feature_names_in_.
clf.feature_importances_

array([0.10213113, 0.11947197, 0.11750855, 0.5689818 , 0.0919065 ],
      dtype=float32)

In [55]:
# Allow up to 100 rows when displaying DataFrames (used by the trial tables below).
pd.set_option('display.max_rows', 100)

In [14]:
# Top 50 trials by `ap` from search run 52 (the file written by the search cell above).
# NOTE: rebinds `df_trials`.
df_trials = pd.read_csv('data/trials_52_opt-ap0.6_evals-1000_minmax-scaling_noise_oos.csv')
df_trials.sort_values('ap', ascending=False).head(50)

Unnamed: 0,colsample_bylevel,gamma,max_delta_step,max_depth,min_child_weight,n_estimators,scale_pos_weight,subsample,f1_score,precision,ap,auc,pos_preds,pos_labels,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21
569,1,1.892316,4.0,4.0,10.0,0,5.489414,1,0.234918,0.234269,0.507716,0.590882,2352,2339.0,,,,,,,
848,1,0.481071,2.0,4.0,10.0,0,5.49253,1,0.232218,0.227366,0.507677,0.590882,2441,2339.0,,,,,,,
933,1,0.844103,2.0,4.0,10.0,0,5.500247,1,0.232218,0.227366,0.507677,0.590882,2441,2339.0,,,,,,,
787,2,3.841731,4.0,4.0,5.0,0,5.205978,1,0.190072,2.0,4.0,10.0,0,5.5304,1.0,0.232277,0.226699,0.505819,0.590902,2457.0,2339.0
875,1,0.089786,2.0,4.0,10.0,0,5.51287,1,0.231636,0.226254,0.505819,0.590903,2453,2339.0,,,,,,,
834,1,0.432134,2.0,4.0,10.0,0,5.509008,1,0.231684,0.226346,0.505805,0.590902,2452,2339.0,,,,,,,
764,1,0.607126,2.0,4.0,10.0,0,5.508706,1,0.231684,0.226346,0.505805,0.590902,2452,2339.0,,,,,,,
802,1,0.294976,2.0,4.0,10.0,0,5.510765,1,0.231636,0.226254,0.505805,0.590903,2453,2339.0,,,,,,,
594,1,0.615758,2.0,4.0,10.0,0,5.508325,1,0.231702,0.22677,0.505805,0.590902,2443,2339.0,,,,,,,
442,1,2.91692,3.0,4.0,10.0,0,5.508097,1,0.231702,0.22677,0.505247,0.590864,2443,2339.0,,,,,,,


In [115]:
# Top 50 trials by `ap` from an earlier search run (file 51, "hybrid"). NOTE: rebinds `df_trials`.
df_trials = pd.read_csv('data/trials_51_opt-ap0.6_evals-1000_minmax-scaling_noise_hybrid.csv')
df_trials.sort_values('ap', ascending=False).head(50)

Unnamed: 0,colsample_bylevel,gamma,max_delta_step,max_depth,min_child_weight,n_estimators,scale_pos_weight,subsample,f1_score,precision,ap,auc,pos_preds,pos_labels
700,2,0.295072,4.0,4.0,3.0,0,5.320584,1,0.226729,0.214145,0.405646,0.600825,2036,1810
823,2,4.01618,5.0,4.0,4.0,0,5.174391,1,0.219365,0.217226,0.404146,0.601074,1846,1810
830,2,4.24213,5.0,4.0,5.0,0,5.172915,1,0.219365,0.217226,0.404146,0.601073,1846,1810
868,2,3.523732,5.0,4.0,4.0,0,5.159605,1,0.219846,0.218172,0.404146,0.601084,1838,1810
754,2,4.011583,5.0,4.0,5.0,0,5.192931,1,0.220044,0.216971,0.40412,0.601082,1862,1810
735,2,0.205411,4.0,4.0,5.0,0,5.184834,1,0.218827,0.216173,0.40412,0.601136,1855,1810
516,2,0.027429,4.0,4.0,3.0,0,5.214,1,0.221691,0.213627,0.403653,0.600429,1952,1810
959,2,3.934003,5.0,4.0,3.0,0,5.21554,1,0.222104,0.21392,0.403653,0.600429,1954,1810
787,2,3.841731,4.0,4.0,5.0,0,5.205978,1,0.220839,0.216446,0.403053,0.601203,1885,1810
884,2,3.706742,5.0,4.0,4.0,0,5.207621,1,0.220779,0.216331,0.403053,0.601203,1886,1810


In [135]:
# Top 50 trials by `ap` from an earlier search run (file 47, "hybrid"). NOTE: rebinds `df_trials`.
df_trials = pd.read_csv('data/trials_47_opt-ap0.6_evals-1000_minmax-scaling_noise_hybrid.csv')
df_trials.sort_values('ap', ascending=False).head(50)

Unnamed: 0,colsample_bylevel,gamma,max_delta_step,max_depth,min_child_weight,n_estimators,scale_pos_weight,subsample,f1_score,precision,ap,auc,pos_preds,pos_labels
154,2,2.106686,2.0,4.0,8.0,0,5.338896,2,0.234989,0.226201,0.558362,0.614281,2206,2041
586,2,4.032224,2.0,4.0,10.0,0,5.555772,2,0.235703,0.211884,0.556781,0.614167,2558,2041
801,2,3.316369,2.0,4.0,9.0,0,5.553942,2,0.235703,0.211884,0.556781,0.614166,2558,2041
591,2,4.04879,2.0,4.0,10.0,0,5.368263,2,0.233656,0.22541,0.550762,0.614817,2196,2041
698,2,3.80281,2.0,4.0,9.0,0,5.342241,2,0.235183,0.226983,0.550073,0.61397,2194,2041
583,2,4.57534,2.0,4.0,9.0,0,5.339841,2,0.235239,0.227086,0.550073,0.613971,2193,2041
499,2,4.583719,2.0,4.0,8.0,0,5.393055,2,0.234048,0.225295,0.547458,0.61414,2206,2041
788,2,3.732558,2.0,4.0,9.0,0,5.404041,2,0.233882,0.224989,0.547458,0.614135,2209,2041
405,2,2.858222,2.0,4.0,9.0,0,5.404193,2,0.233882,0.224989,0.547458,0.614135,2209,2041
580,2,3.996881,2.0,4.0,9.0,0,5.376883,2,0.231837,0.219641,0.547363,0.614461,2281,2041


# Other

In [27]:
def predict_baseline_model(X_train, y_train, X_valid, y_valid, baseline='rule-based', residual_cutoff_adj=0, adf_cutoff=0.5):
    """Print precision and F1 of a simple baseline on the train and validation splits.

    Args:
        X_train, X_valid: Feature frames. The rule-based baseline reads the
            ``last_residual``, ``residual_mean_max`` and ``adf_pass_rate`` columns.
        y_train, y_valid: Label series for the corresponding splits.
        baseline: ``'rule-based'`` predicts positive when
            ``|last_residual| > residual_mean_max + residual_cutoff_adj`` and
            ``adf_pass_rate > adf_cutoff``. ``'random'`` draws 0/1 labels
            uniformly at random (unseeded).
        residual_cutoff_adj: Additive adjustment to the residual cutoff.
        adf_cutoff: Exclusive lower bound on ``adf_pass_rate``.

    Raises:
        ValueError: If ``baseline`` is not one of the supported names.
    """
    def rule_based_preds(features):
        # Vectorized form of the per-row rule; also well-defined for an empty frame.
        large_residual = features['last_residual'].abs() > (features['residual_mean_max'] + residual_cutoff_adj)
        passes_adf = features['adf_pass_rate'] > adf_cutoff
        return (large_residual & passes_adf).astype(int).to_numpy()

    if baseline == "rule-based":
        y_train_preds = rule_based_preds(X_train)
        y_valid_preds = rule_based_preds(X_valid)
    elif baseline == "random":
        y_train_preds = np.random.randint(0, 2, len(X_train))
        y_valid_preds = np.random.randint(0, 2, len(X_valid))
    else:
        # Without this guard an unknown name failed later with an unbound-variable error.
        raise ValueError("Unknown baseline {0!r}; expected 'rule-based' or 'random'".format(baseline))

    for split_name, y_true, y_pred in (("train", y_train, y_train_preds), ("validation", y_valid, y_valid_preds)):
        precision = precision_score(y_true.to_numpy(), y_pred)
        f1 = f1_score(y_true.to_numpy(), y_pred)
        print("Final baseline precision on {0}:".format(split_name), precision)
        print("Final baseline F1 score on {0}:".format(split_name), f1)

In [28]:
def predict_random_forest(X_train, y_train, X_valid, y_valid):
    """Fit a random forest and print precision, F1 and ROC-AUC on train and validation.

    Args:
        X_train, X_valid: Feature frames for fitting and evaluation.
        y_train, y_valid: Label series for the corresponding splits (binary labels).
    """
    # Imported locally: the visible notebook only does `import sklearn`, which
    # does not bring RandomForestClassifier into scope.
    from sklearn.ensemble import RandomForestClassifier

    clf = RandomForestClassifier(
        n_estimators=100,
        max_depth=None,
        # "sqrt" is what "auto" meant for classifiers; "auto" was deprecated in
        # scikit-learn 1.1 and removed in 1.3.
        max_features="sqrt",
        oob_score=False,
        class_weight="balanced_subsample",
    )

    clf.fit(X_train, y_train)

    for split_name, features, labels in (("train", X_train, y_train), ("valid", X_valid, y_valid)):
        y_true = labels.to_numpy()
        y_pred = clf.predict(features)
        # ROC-AUC is a ranking metric, so it is computed from positive-class
        # probabilities; hard 0/1 predictions collapse the curve to one point.
        y_score = clf.predict_proba(features)[:, 1]

        precision = precision_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred)
        auc = roc_auc_score(y_true, y_score)

        print("Final RF precision on {0}:".format(split_name), precision)
        print("Final RF F1 score on {0}:".format(split_name), f1)
        print("Final RF AUC score on {0}:".format(split_name), auc)