# Imports

In [None]:
# !pip install yfinance

In [1]:
import os
from functools import partial
from datetime import datetime

import typing
from typing import Dict
from typing import Any

import numpy as np
import pandas as pd

import utils
import pipelines
import processing
import evaluate
import predict
import preprocessing

# Download stock daily prices

In [5]:
# Get the ticker names to work with, filtered by market cap: lower bound 1000, no upper bound
# (the `_mm` suffix presumably means millions - TODO confirm against utils.get_ticker_names).
ticker_list = utils.get_ticker_names(market_cap_min_mm=1000, market_cap_max_mm=None)

In [6]:
# Specific date - 1st of March 2022 (Y, M, D)
# date_to = datetime(2022, 3, 1)
# Date of today
date_to = datetime.today()
# How many years of data to download (going backwards from date_to). Can be a floating point number
period_years = 5

In [7]:
# Download price data for the tickers selected above (the index of ticker_list),
# covering `period_years` years up to `date_to`.
# Per the original note the data is also saved to .csv automatically - presumably inside
# utils.download_stonk_prices; two frames come back (raw and cleaned, judging by the names - TODO confirm).
df, df_clean = utils.download_stonk_prices(ticker_list.index, period_years=period_years, date_to=date_to)

[*********************100%***********************]  2820 of 2820 completed

10 Failed downloads:
- O.WI: No data found, symbol may be delisted
- POST WI: No data found, symbol may be delisted
- BIP.PRA: No data found, symbol may be delisted
- SNX.WI: No data found, symbol may be delisted
- BIP.PRB: No data found, symbol may be delisted
- MRK.WI: No data found, symbol may be delisted
- DELL WI: No data found, symbol may be delisted
- RXN WI: No data found, symbol may be delisted
- T WD: No data found, symbol may be delisted
- PFE.WI: No data found, symbol may be delisted


In [9]:
# Non-null row count of every remaining column per subindustry (i.e. how many tickers each one has)
ticker_list.groupby('subindustry').count()

Unnamed: 0_level_0,market_cap
subindustry,Unnamed: 1_level_1
automobiles_and_components,42
banks,177
capital_goods,231
chemicals,61
commercial_and_professional_services,72
construction_materials,7
consumer_durables_and_apparel,73
consumer_services,99
consumer_staples,121
containers_and_packaging,20


# Run data pipeline

In [3]:
# All industries:
# industries = ticker_list['subindustry'].unique()
# To run only a few industries, or a single one, list them here instead:
industries = ['semiconductors_and_semiconductor_equipment']
# Parameters forwarded to processing.get_rolling_residuals by the pipeline loop
l_reg = 3
l_roll = 2
dt = 10
# Date range passed to utils.get_stonk_data by the pipeline loop.
# NOTE(review): this rebinds `date_to`, which the download section sets to a datetime object.
date_from = '2017-04-20'
date_to = '2022-04-18'
# Directory the pipeline loop writes its per-industry CSV files into
output_dir = 'data'

# Model the pipeline loop calls .predict() on for the filtered trades
stonk_model = predict.XGBStonkModel()

In [4]:
# Live pipeline, run once per selected industry:
#   rolling residuals -> filter on the latest standardized residual -> filter on ADF pass rate
#   -> model predictions -> one set of CSV files per industry.

# The to_csv calls at the end fail if the output directory does not exist yet.
os.makedirs(output_dir, exist_ok=True)

total_industries = len(industries)
# enumerate() keeps the progress counter correct even when an industry is skipped with `continue`
# (a counter incremented at the end of the body would go stale after the first skip).
for i, industry in enumerate(industries, start=1):
    stonks = utils.get_stonk_data(date_from, date_to, filter_industries=[industry])
    X, Y = processing.combine_stonk_pairs(stonks)

    print('Industry ({0}/{1}): {2}'.format(i, total_industries, industry))

    print('Processing residuals...')
    residuals, betas, _, date_index = utils.measure_time(partial(processing.get_rolling_residuals, X=X, Y=Y, l_reg=l_reg, l_roll=l_roll, dt=dt))
    # Keep the dates as the first column so they end up in the exported CSVs.
    residuals.insert(0, "dates", date_index)
    betas.insert(0, "dates", date_index)

    std_residuals, _, _ = processing.get_standardized_residuals(residuals.drop(columns="dates"))

    # Residual filter: keep only rows whose last standardized residual is at least 2 in magnitude.
    trades_before = len(std_residuals)
    std_residuals = std_residuals[std_residuals.iloc[:, -1].abs() >= 2]
    trades_after = len(std_residuals)
    print('{0} trades selected out of {1} by residual values'.format(trades_after, trades_before))
    if trades_after == 0:
        print('No trades left after filtering residuals, skipping this industry...')
        continue
    residuals = residuals.loc[std_residuals.index]
    betas = betas.loc[std_residuals.index]

    print('Processing ADFs...')
    adfs, adfs_raw = utils.measure_time(partial(processing.get_aggregate_adfs, residuals.drop(columns="dates"), betas=betas.drop(columns="dates")))

    # ADF filter: keep only rows whose aggregate ADF value is at least 0.5.
    selected_by_adf = (adfs >= 0.5).values
    adfs = adfs[selected_by_adf]

    trades_before = len(std_residuals)
    std_residuals = std_residuals[selected_by_adf]
    trades_after = len(std_residuals)
    print('{0} trades selected out of {1} by ADF pass rates'.format(trades_after, trades_before))

    if trades_after == 0:
        print('No trades left after filtering ADF pass rates, skipping this industry...')
        continue

    # Align every other frame with the rows that survived both filters.
    betas = betas.loc[adfs.index]
    residuals = residuals.loc[adfs.index]
    adfs_raw = adfs_raw.loc[adfs.index]

    residuals_max_mean = processing.get_mean_residual_magnitude(std_residuals.to_numpy(), dt=21)
    print('Mean max residual value for {0} after filtering is {1}'.format(industry, residuals_max_mean))

    print('Preparing data for model...')
    dataset = utils.build_dataset_from_live_data_by_industry(std_residuals.to_numpy(), adfs.to_numpy().ravel(), industry, residuals_max_mean)

    print('Running model...')
    predictions, df_processed = stonk_model.predict(dataset)
    predictions = pd.DataFrame(predictions)
    # Re-attach the index of the surviving rows so predictions line up with the other exports.
    predictions.index = adfs.index

    print('Writing results to CSV...')
    residuals.to_csv(os.path.join(output_dir, industry + '_residuals.csv'), header=False, index=True)
    betas.to_csv(os.path.join(output_dir, industry + '_betas.csv'), header=False, index=True)
    adfs_raw.to_csv(os.path.join(output_dir, industry + '_adfs_raw.csv'), header=False, index=True)
    adfs.to_csv(os.path.join(output_dir, industry + '_adfs.csv'), header=False, index=True)
    predictions.to_csv(os.path.join(output_dir, industry + '_predictions.csv'), header=False, index=True)

print('*** All done ***')

Industry (1/1): semiconductors_and_semiconductor_equipment
Processing residuals...
Done after: 9s
258 trades selected out of 1891 by residual values
Processing ADFs...
Done after: 26s
90 trades selected out of 258 by ADF pass rates
Mean max residual value for semiconductors_and_semiconductor_equipment after filtering is 4.340000152587891
Preparing data for model...
Running model...
Writing results to CSV...
*** All done ***


# Data collection

In [3]:
# Load the stock data for the whole date range; no industry filter is passed here.
stonks = utils.get_stonk_data('2017-04-20', '2022-04-18')

In [4]:
# Ticker names filtered by market cap (lower bound 1000, no upper bound); the next cell reads
# the 'subindustry' column of this frame.
ticker_list = utils.get_ticker_names(market_cap_min_mm=1000, market_cap_max_mm=None)

In [None]:
# Run the rolling data-collection pipeline over every subindustry present in ticker_list.
# NOTE(review): dt=20 here, whereas the live pipeline section of this notebook sets dt = 10 -
# confirm the difference is intended.
pipelines.data_collection_rolling_pipeline(
    stonks,
    industries=list(ticker_list['subindustry'].unique()),
    l_reg=3,
    l_roll=2,
    dt=20,
    market_cap_min_mm=1000,
    market_cap_max_mm=None,
    adf_pval_cutoff=0.1,
    adf_pass_rate_filter=0.5,
    trade_length_months=3,
    trading_interval_weeks=2,
)

# Model development

In [2]:
import xgboost as xgb
from hyperopt import STATUS_OK, STATUS_FAIL, Trials, fmin, hp, tpe
import pickle

In [199]:
def train_production_xgb(df: pd.DataFrame, params: Dict[str, Any], scaling='minmax', add_noise=True) -> typing.Tuple[xgb.XGBClassifier, Any]:
    """Fit an XGBoost classifier on all of ``df`` and persist it together with its scalers.

    Args:
        df: Labelled dataset; must contain a ``label`` column plus whatever columns
            ``preprocessing.transform_features`` reads.
        params: Keyword arguments forwarded unchanged to ``xgb.XGBClassifier``.
        scaling: Forwarded to ``preprocessing.transform_features``.
        add_noise: Forwarded to ``preprocessing.transform_features``.

    Returns:
        A ``(classifier, scalers)`` tuple: the fitted classifier and the scalers object
        returned by ``preprocessing.transform_features``.

    Side effects:
        Writes ``data/xgb_classifier.json`` (XGBoost model) and ``data/scalers.json``.
    """
    X_train, scalers = preprocessing.transform_features(df, scaling=scaling, add_noise=add_noise)
    y_train = df['label']

    clf = xgb.XGBClassifier(**params)

    # The eval set is the training data itself, so the logged metric tracks the fit only,
    # not generalisation.
    clf.fit(X_train, y_train, eval_set=[(X_train, y_train)])

    # Make sure the target directory exists before writing either artefact.
    output_dir = 'data'
    os.makedirs(output_dir, exist_ok=True)
    clf.save_model(os.path.join(output_dir, 'xgb_classifier.json'))

    # NOTE: despite its .json extension this file is a binary pickle. The name is kept so that
    # existing readers of the file keep working; only unpickle it from a trusted source.
    with open(os.path.join(output_dir, 'scalers.json'), 'wb') as fp:
        pickle.dump(scalers, fp)

    return clf, scalers

In [3]:
# Load the collected dataset, keep only rows with a positive beta, then attach the labels.
df = (
    pd.read_csv('data/dataset.csv')
    .loc[lambda frame: frame.beta > 0]
    .pipe(preprocessing.assign_labels)
)

In [200]:
# Train the production model on the full labelled dataset (this also writes the model and scalers to disk).
# NOTE(review): in the visible notebook `params` is only defined in a cell further down, so this
# cell fails on a fresh Restart & Run All unless that cell has been executed first.
clf_prod, scalers_prod = train_production_xgb(df, params)

[0]	validation_0-logloss:0.66953
[1]	validation_0-logloss:0.64808
[2]	validation_0-logloss:0.62965
[3]	validation_0-logloss:0.61373
[4]	validation_0-logloss:0.59965
[5]	validation_0-logloss:0.58753
[6]	validation_0-logloss:0.57676
[7]	validation_0-logloss:0.56741
[8]	validation_0-logloss:0.55912
[9]	validation_0-logloss:0.55182
[10]	validation_0-logloss:0.54529
[11]	validation_0-logloss:0.53951
[12]	validation_0-logloss:0.53439
[13]	validation_0-logloss:0.52980
[14]	validation_0-logloss:0.52570
[15]	validation_0-logloss:0.52198
[16]	validation_0-logloss:0.51868
[17]	validation_0-logloss:0.51576
[18]	validation_0-logloss:0.51309
[19]	validation_0-logloss:0.51072
[20]	validation_0-logloss:0.50857
[21]	validation_0-logloss:0.50662
[22]	validation_0-logloss:0.50490
[23]	validation_0-logloss:0.50335
[24]	validation_0-logloss:0.50177


In [110]:
# Split the labelled dataset; the result is indexed by split name ('train' and 'validation' are used below).
# NOTE(review): the meaning of the positional arguments 4 and 0.05 is defined by
# preprocessing.split_data_mixed, which is not visible here - consider passing them by keyword.
splits = preprocessing.split_data_mixed(df, 4, 0.05, seed=421)

In [111]:
# Feature transform settings shared by the train and validation splits
scaling = 'minmax'
add_noise = True

# The first call returns the scalers, which are then passed into the validation transform
# so both splits go through the same scalers.
# NOTE(review): add_noise=True is passed for the validation split as well - confirm that is intended.
X_train, scalers = preprocessing.transform_features(splits['train'], scaling=scaling, add_noise=add_noise)
X_valid, _ = preprocessing.transform_features(splits['validation'], scalers=scalers, scaling=scaling, add_noise=add_noise)

# Targets: the 'label' column of each split
y_train = splits['train']['label']
y_valid = splits['validation']['label']

In [112]:
# Hyperopt search space for the XGBoost classifier built in optimization_objective
hyperparameter_space = {
    # Continuous:
    "gamma": hp.uniform("gamma", 0, 5),
    "scale_pos_weight" : hp.uniform("scale_pos_weight", 4, 10),
    # Integers (hp.quniform yields floats, hence the int() casts in optimization_objective):
    "max_depth": hp.quniform("max_depth", 3, 6, 1),
    "min_child_weight" : hp.quniform("min_child_weight", 1, 10, 1),
    "max_delta_step" : hp.quniform("max_delta_step", 0, 5, 1),
    # Choice (the objective receives the option value, but Trials.vals and fmin's result
    # record the index of the chosen option - see the 0/1/2 values in the saved trial CSVs):
    "colsample_bylevel" : hp.choice("colsample_bylevel", np.array([0.5, 0.75, 1])),
    "n_estimators": hp.choice("n_estimators", np.array([25, 50, 75])),
    "subsample": hp.choice("subsample", np.array([0.5, 0.75, 1])),
    }

In [27]:
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

In [113]:
def optimization_objective(space):
    """Hyperopt objective: fit an XGBoost classifier on the train split and score it on validation.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_valid`` and ``y_valid``.

    Args:
        space: One sampled point of ``hyperparameter_space`` (a mapping of parameter name to value).

    Returns:
        A result dict for hyperopt. ``loss`` is ``-ap`` with ``STATUS_OK``, or 999 with
        ``STATUS_FAIL`` when F1 or precision is zero. ``ap`` is forced to 0 when the model
        predicts fewer positives than there are positive labels. The remaining keys
        (precision, f1_score, ap, auc, pos_preds, pos_labels) are carried along for later inspection.
    """
    model_kwargs = dict(
        # Sampled, continuous
        gamma=space['gamma'],
        scale_pos_weight=space['scale_pos_weight'],
        # Sampled as floats, cast to integers
        max_depth=int(space['max_depth']),
        min_child_weight=int(space['min_child_weight']),
        max_delta_step=int(space['max_delta_step']),
        # Sampled from fixed options
        colsample_bylevel=space['colsample_bylevel'],
        n_estimators=int(space['n_estimators']),
        subsample=space['subsample'],
        # Fixed for every trial
        learning_rate=0.1,
        seed=420,
        tree_method="gpu_hist",
        enable_categorical=True,
        max_cat_to_onehot=1,
    )
    clf = xgb.XGBClassifier(**model_kwargs)
    clf.fit(X_train, y_train, verbose=False)

    # Score of the second class column, thresholded at 0.5 for the label-based metrics.
    y_score = clf.predict_proba(X_valid)[:, 1]
    y_preds = y_score > 0.5

    f1 = f1_score(y_valid, y_preds, zero_division=0)
    precision = precision_score(y_valid, y_preds, zero_division=0)
    ap = evaluate.average_precision_from_cutoff(y_valid, y_score, 0.6)
    roc = roc_auc_score(y_valid, y_score)

    pos_preds = int(y_preds.sum())
    pos_labels = int(y_valid.sum())

    # A trial that predicts fewer positives than there are positive labels gets no credit.
    if pos_preds < pos_labels:
        ap = 0

    degenerate = f1 == 0 or precision == 0
    return {
        'loss': 999 if degenerate else -ap,
        'precision': precision,
        'f1_score': f1,
        'ap': ap,
        'auc': roc,
        'pos_preds': pos_preds,
        'pos_labels': pos_labels,
        'status': STATUS_FAIL if degenerate else STATUS_OK,
    }

In [114]:
# Run the TPE search and export every trial (sampled parameters + metrics) to CSV.
trials = Trials()

best_hyperparams = fmin(
    fn=optimization_objective,
    space=hyperparameter_space,
    algo=tpe.suggest,
    max_evals=1000,
    trials=trials,
)

# Sampled parameter values per trial; hp.choice parameters appear here as the index of the
# chosen option, not the option value.
trial_vals = trials.vals
# Append the metrics each trial reported, one column per metric, in this order.
trial_vals.update({
    metric_name: [trial_result[metric_name] for trial_result in trials.results]
    for metric_name in ('f1_score', 'precision', 'ap', 'auc', 'pos_preds', 'pos_labels')
})

df_trials = pd.DataFrame.from_dict(trial_vals)
df_trials.to_csv('data/trials_51_opt-ap0.6_evals-1000_minmax-scaling_noise_hybrid.csv', index=False)

100%|███████████████████████████████████████████| 1000/1000 [09:05<00:00,  1.83trial/s, best loss: -0.4056458743466741]


In [196]:
# Hand-picked parameter set used for the test classifier below and for train_production_xgb.
params = { 
    # Regularisation (XGBoost default: 0)
    "gamma": 4.03,
    # L2 / L1 regularisation left at their defaults (reg_lambda 1, reg_alpha 0)
    # "reg_lambda" : 1,
    # "reg_alpha" : 0,
    # Class imbalance handling (default 1)
    "scale_pos_weight" : 5.320584,
    # Integers:
    "max_depth": 4,
    # Regularisation (default 1)
    "min_child_weight" : 8,
    # Class imbalance handling (default 0)
    "max_delta_step" : 1,
    # Choice:
    # NOTE(review): the saved trial CSVs store hp.choice parameters as option indices; double-check
    # that "subsample": 1 is the intended value and not index 1 of [0.5, 0.75, 1] (= 0.75).
    "colsample_bylevel" : 1,
    "n_estimators": 25,
    "learning_rate": 0.1,
    "subsample": 1,
    # Fixed:
    # NOTE(review): the hyperparameter search used tree_method "gpu_hist"; "hist" is used here.
    "seed": 420,
    "tree_method": "hist",
    "enable_categorical": True,
    "max_cat_to_onehot": 1,
    "eval_metric": ["logloss"],
}

clf = xgb.XGBClassifier(
        **params
    )

# eval_set order determines the log columns: validation_0 = validation split, validation_1 = train split
clf.fit(X_train, y_train, eval_set=[(X_valid, y_valid), (X_train, y_train)])
clf.save_model(os.path.join('data', 'test_classifier.json'))

[0]	validation_0-logloss:0.67114	validation_1-logloss:0.66936
[1]	validation_0-logloss:0.65106	validation_1-logloss:0.64786
[2]	validation_0-logloss:0.63349	validation_1-logloss:0.62810
[3]	validation_0-logloss:0.61793	validation_1-logloss:0.61103
[4]	validation_0-logloss:0.60444	validation_1-logloss:0.59623
[5]	validation_0-logloss:0.59318	validation_1-logloss:0.58335
[6]	validation_0-logloss:0.58341	validation_1-logloss:0.57200
[7]	validation_0-logloss:0.57522	validation_1-logloss:0.56204
[8]	validation_0-logloss:0.56744	validation_1-logloss:0.55321
[9]	validation_0-logloss:0.56093	validation_1-logloss:0.54552
[10]	validation_0-logloss:0.55522	validation_1-logloss:0.53860
[11]	validation_0-logloss:0.55027	validation_1-logloss:0.53251
[12]	validation_0-logloss:0.54570	validation_1-logloss:0.52713
[13]	validation_0-logloss:0.54181	validation_1-logloss:0.52227
[14]	validation_0-logloss:0.53834	validation_1-logloss:0.51802
[15]	validation_0-logloss:0.53523	validation_1-logloss:0.51419
[1

In [197]:
# Evaluate the test classifier on the validation split at a fixed decision threshold
print("**Validation**")
y_score = clf.predict_proba(X_valid)[:, 1]  # second probability column (label 1 for 0/1 labels)
threshold = 0.65  # NOTE(review): differs from the 0.5 cutoff used inside optimization_objective
y_preds = y_score > threshold

evaluate.performance_summary(y_score, y_preds, y_valid)

# Kept in a variable so individual result groups (e.g. false positives) can be inspected afterwards
df_results_valid = evaluate.returns_on_predictions(splits['validation'], y_preds)

evaluate.performance_on_slice(splits['validation'], y_score, y_preds, 'subindustry', False)

**Validation**
Precision: 0.34591194968553457
PR-AUC/AP score: 0.3815663785919986
ROC-AUC score: 0.6016294047216361
Total positive predictions: 159

Totals:
        prediction
result            
FN            1755
FP             104
TN           14302
TP              55

Means:
        return_one_month  return_two_month  return_three_month
result                                                        
FN              0.061313          0.100575            0.120737
FP             -0.037577         -0.033788           -0.022962
TN             -0.010155         -0.024158           -0.028218
TP              0.078564          0.157091            0.198073

Stds:
        return_one_month  return_two_month  return_three_month
result                                                        
FN              0.060731          0.099246            0.106429
FP              0.202847          0.145381            0.175739
TN              0.055906          0.077815            0.092741
TP              0.259

In [109]:
# Inspect the validation rows classified as false positives (at most the first 100)
df_results_valid[df_results_valid.result == 'FP'].head(100)

Unnamed: 0,ticker_x,ticker_y,trade_date,adf_pass_rate,last_residual,beta,intercept,residual_mean_max,return_one_month,residual_one_month,return_two_month,residual_two_month,return_three_month,residual_three_month,data_window_start,subindustry,label,prediction,result,score
49363,KLAC,TSEM,2022-03-04,0.53,5.82,0.07,9.16,4.8,0.005,5.7,-0.015,6.2,0.008,5.62,2017-03-20,semiconductors_and_semiconductor_equipment,0,True,FP,0.836991
97970,RAMP,VRNT,2021-02-25,0.61,5.0,0.32,11.04,6.08,0.005,4.92,-0.063,6.21,-0.024,5.47,2016-03-10,software_and_services,0,True,FP,0.837215
64397,AMAT,RMBS,2022-03-04,0.92,3.89,0.12,6.69,4.8,-0.058,5.57,0.009,3.64,0.032,2.97,2017-03-20,semiconductors_and_semiconductor_equipment,0,True,FP,0.733874
32731,BAK,LXU,2021-10-12,0.75,5.16,0.28,-0.43,4.24,0.003,5.11,0.048,4.42,-0.05,5.91,2016-10-25,chemicals,0,True,FP,0.722986
102606,FIS,GPN,2020-12-28,0.92,3.63,1.63,-45.7,5.58,0.006,3.3,0.013,2.93,0.02,2.58,2016-01-12,software_and_services,0,True,FP,0.763628
27428,ATEN,MSTR,2021-01-27,0.75,3.93,54.89,-220.93,6.03,-0.243,9.41,-0.139,7.08,-0.208,8.63,2016-02-10,software_and_services,0,True,FP,0.727854
20992,ROST,SNBR,2021-02-10,0.73,5.06,0.82,-33.9,4.4,-0.064,6.23,0.036,4.4,0.102,3.22,2016-02-25,retailing,0,True,FP,0.75693
72094,DIOD,TSEM,2022-03-04,0.55,5.83,0.28,6.82,4.8,-0.024,6.44,-0.045,6.95,-0.053,7.15,2017-03-20,semiconductors_and_semiconductor_equipment,0,True,FP,0.80858
44727,ATI,NUE,2021-06-04,0.55,4.95,0.99,33.0,4.39,0.093,3.76,-0.015,5.14,-0.097,6.19,2016-06-17,metals_and_mining,0,True,FP,0.73064
59715,ADSK,GLOB,2021-08-30,0.67,5.21,1.01,-72.6,4.63,0.024,4.22,0.019,4.42,-0.003,5.34,2016-09-13,software_and_services,0,True,FP,0.780946


In [62]:
# print("**Test**")
# y_score = clf.predict_proba(X_test)[:, 1]
# threshold = 0.6
# y_preds = y_score > threshold

# evaluate.performance_summary(y_score, y_preds, y_test)

# df_results_test = evaluate.returns_on_predictions(splits['test'], y_preds)

# evaluate.performance_on_slice(splits['test'], y_score, y_preds, 'subindustry', True)

In [63]:
# Feature names the classifier was fitted with; same order as feature_importances_
clf.feature_names_in_

array(['adf_pass_rate', 'last_residual', 'residual_mean_max', 'industry',
       'residual_inter'], dtype='<U17')

In [66]:
# Importance of each feature, in the order given by feature_names_in_
clf.feature_importances_

array([0.10213113, 0.11947197, 0.11750855, 0.5689818 , 0.0919065 ],
      dtype=float32)

In [55]:
# Let pandas display up to 100 rows, so the .head(50) tables below are shown in full
pd.set_option('display.max_rows', 100)

In [88]:
# Top 50 trials by 'ap' from a saved search (file tagged trials_50).
# NOTE(review): no visible cell writes this file - presumably an earlier run; rebinds `df_trials`.
df_trials = pd.read_csv('data/trials_50_opt-ap0.6_evals-1000_minmax-scaling_noise_hybrid.csv')
df_trials.sort_values('ap', ascending=False).head(50)

Unnamed: 0,colsample_bylevel,gamma,max_delta_step,max_depth,min_child_weight,n_estimators,scale_pos_weight,subsample,f1_score,precision,ap,auc,pos_preds,pos_labels
207,2,2.660469,3.0,4.0,10.0,1,5.039147,2,0.228247,0.223232,0.462638,0.612032,1980,1893
69,2,0.503474,3.0,4.0,6.0,0,5.23114,2,0.217939,0.212212,0.447414,0.609078,1998,1893
411,2,3.988498,3.0,4.0,7.0,0,5.198843,2,0.216412,0.212183,0.445567,0.609041,1970,1893
487,2,3.281887,4.0,4.0,8.0,0,5.180133,2,0.215553,0.212996,0.445461,0.609039,1939,1893
418,2,3.241147,2.0,4.0,7.0,0,5.17733,2,0.215666,0.213216,0.445408,0.609039,1937,1893
58,2,4.115222,3.0,4.0,7.0,0,5.209897,2,0.218163,0.212638,0.444523,0.608997,1994,1893
258,2,4.94296,4.0,4.0,7.0,0,5.223463,2,0.218051,0.212425,0.444264,0.608999,1996,1893
292,2,3.93379,2.0,4.0,5.0,0,5.201617,2,0.216468,0.212291,0.443631,0.609085,1969,1893
266,2,4.21478,3.0,4.0,8.0,0,5.222747,2,0.216701,0.210343,0.440898,0.609714,2011,1893
468,2,4.104593,4.0,4.0,7.0,1,5.220172,2,0.237846,0.216674,0.440404,0.61327,2303,1893


In [115]:
# Top 50 trials by 'ap' from the search exported by the fmin cell above (file tagged trials_51)
df_trials = pd.read_csv('data/trials_51_opt-ap0.6_evals-1000_minmax-scaling_noise_hybrid.csv')
df_trials.sort_values('ap', ascending=False).head(50)

Unnamed: 0,colsample_bylevel,gamma,max_delta_step,max_depth,min_child_weight,n_estimators,scale_pos_weight,subsample,f1_score,precision,ap,auc,pos_preds,pos_labels
700,2,0.295072,4.0,4.0,3.0,0,5.320584,1,0.226729,0.214145,0.405646,0.600825,2036,1810
823,2,4.01618,5.0,4.0,4.0,0,5.174391,1,0.219365,0.217226,0.404146,0.601074,1846,1810
830,2,4.24213,5.0,4.0,5.0,0,5.172915,1,0.219365,0.217226,0.404146,0.601073,1846,1810
868,2,3.523732,5.0,4.0,4.0,0,5.159605,1,0.219846,0.218172,0.404146,0.601084,1838,1810
754,2,4.011583,5.0,4.0,5.0,0,5.192931,1,0.220044,0.216971,0.40412,0.601082,1862,1810
735,2,0.205411,4.0,4.0,5.0,0,5.184834,1,0.218827,0.216173,0.40412,0.601136,1855,1810
516,2,0.027429,4.0,4.0,3.0,0,5.214,1,0.221691,0.213627,0.403653,0.600429,1952,1810
959,2,3.934003,5.0,4.0,3.0,0,5.21554,1,0.222104,0.21392,0.403653,0.600429,1954,1810
787,2,3.841731,4.0,4.0,5.0,0,5.205978,1,0.220839,0.216446,0.403053,0.601203,1885,1810
884,2,3.706742,5.0,4.0,4.0,0,5.207621,1,0.220779,0.216331,0.403053,0.601203,1886,1810


In [135]:
# Top 50 trials by 'ap' from a saved search (file tagged trials_47).
# NOTE(review): no visible cell writes this file - presumably an earlier run; rebinds `df_trials`.
df_trials = pd.read_csv('data/trials_47_opt-ap0.6_evals-1000_minmax-scaling_noise_hybrid.csv')
df_trials.sort_values('ap', ascending=False).head(50)

Unnamed: 0,colsample_bylevel,gamma,max_delta_step,max_depth,min_child_weight,n_estimators,scale_pos_weight,subsample,f1_score,precision,ap,auc,pos_preds,pos_labels
154,2,2.106686,2.0,4.0,8.0,0,5.338896,2,0.234989,0.226201,0.558362,0.614281,2206,2041
586,2,4.032224,2.0,4.0,10.0,0,5.555772,2,0.235703,0.211884,0.556781,0.614167,2558,2041
801,2,3.316369,2.0,4.0,9.0,0,5.553942,2,0.235703,0.211884,0.556781,0.614166,2558,2041
591,2,4.04879,2.0,4.0,10.0,0,5.368263,2,0.233656,0.22541,0.550762,0.614817,2196,2041
698,2,3.80281,2.0,4.0,9.0,0,5.342241,2,0.235183,0.226983,0.550073,0.61397,2194,2041
583,2,4.57534,2.0,4.0,9.0,0,5.339841,2,0.235239,0.227086,0.550073,0.613971,2193,2041
499,2,4.583719,2.0,4.0,8.0,0,5.393055,2,0.234048,0.225295,0.547458,0.61414,2206,2041
788,2,3.732558,2.0,4.0,9.0,0,5.404041,2,0.233882,0.224989,0.547458,0.614135,2209,2041
405,2,2.858222,2.0,4.0,9.0,0,5.404193,2,0.233882,0.224989,0.547458,0.614135,2209,2041
580,2,3.996881,2.0,4.0,9.0,0,5.376883,2,0.231837,0.219641,0.547363,0.614461,2281,2041


# Other

In [27]:
def _rule_based_predictions(X, residual_cutoff_adj, adf_cutoff):
    """Return a 0/1 array: 1 where |last_residual| exceeds residual_mean_max + adjustment AND adf_pass_rate exceeds the cutoff."""
    residual_hit = X['last_residual'].abs() > (X['residual_mean_max'] + residual_cutoff_adj)
    adf_hit = X['adf_pass_rate'] > adf_cutoff
    return (residual_hit & adf_hit).astype(int).to_numpy()


def predict_baseline_model(X_train, y_train, X_valid, y_valid, baseline='rule-based', residual_cutoff_adj=0, adf_cutoff=0.5, seed=None):
    """Print precision and F1 of a simple baseline on the train and validation splits.

    Args:
        X_train, X_valid: Feature frames. The rule-based baseline reads the columns
            ``last_residual``, ``residual_mean_max`` and ``adf_pass_rate``.
        y_train, y_valid: Label series (``.to_numpy()`` is called on them).
        baseline: ``"rule-based"`` (threshold rule on the columns above) or ``"random"``
            (uniform random 0/1 predictions).
        residual_cutoff_adj: Added to ``residual_mean_max`` before comparing with ``|last_residual|``.
        adf_cutoff: ``adf_pass_rate`` must be strictly greater than this value.
        seed: Optional seed for the random baseline. ``None`` keeps the previous behaviour
            of drawing from NumPy's global random state.

    Raises:
        ValueError: If ``baseline`` is not one of the supported names.
    """
    if baseline == "rule-based":
        y_train_preds = _rule_based_predictions(X_train, residual_cutoff_adj, adf_cutoff)
        y_valid_preds = _rule_based_predictions(X_valid, residual_cutoff_adj, adf_cutoff)
    elif baseline == "random":
        # A dedicated RandomState makes the random baseline reproducible when a seed is given.
        rng = np.random if seed is None else np.random.RandomState(seed)
        y_train_preds = rng.randint(0, 2, len(X_train))
        y_valid_preds = rng.randint(0, 2, len(X_valid))
    else:
        # Without this branch an unknown name would only surface later as an unbound-variable error.
        raise ValueError(
            "Unknown baseline {0!r}; expected 'rule-based' or 'random'".format(baseline)
        )

    for split_name, y_true, y_pred in (
        ('train', y_train, y_train_preds),
        ('validation', y_valid, y_valid_preds),
    ):
        # zero_division=0 matches the hyperparameter-search objective and avoids warnings
        # when a baseline predicts no positives at all.
        precision = precision_score(y_true.to_numpy(), y_pred, zero_division=0)
        f1 = f1_score(y_true.to_numpy(), y_pred, zero_division=0)
        print("Final baseline precision on {0}:".format(split_name), precision)
        print("Final baseline F1 score on {0}:".format(split_name), f1)

In [28]:
def predict_random_forest(X_train, y_train, X_valid, y_valid, random_state=None):
    """Fit a random forest on the train split and print precision, F1 and ROC-AUC for both splits.

    Args:
        X_train, X_valid: Feature matrices accepted by scikit-learn.
        y_train, y_valid: Label series (``.to_numpy()`` is called on them); binary labels are
            assumed, because the score of the second class column is used for ROC-AUC.
        random_state: Optional seed forwarded to ``RandomForestClassifier`` so that a run can
            be reproduced. ``None`` keeps the previous unseeded behaviour.
    """
    # Imported locally because no visible cell of this notebook imports RandomForestClassifier.
    from sklearn.ensemble import RandomForestClassifier

    clf = RandomForestClassifier(
        n_estimators=100,
        max_depth=None,
        # "auto" meant "sqrt" for classifiers and has been removed from recent scikit-learn releases.
        max_features="sqrt",
        oob_score=False,
        class_weight="balanced_subsample",
        random_state=random_state,
    )

    clf.fit(X_train, y_train)

    for split_name, X, y in (("train", X_train, y_train), ("valid", X_valid, y_valid)):
        y_true = y.to_numpy()
        y_preds = clf.predict(X)
        # ROC-AUC is a ranking metric: feed it the class-1 probability rather than hard labels.
        y_score = clf.predict_proba(X)[:, 1]

        precision = precision_score(y_true, y_preds, zero_division=0)
        f1 = f1_score(y_true, y_preds, zero_division=0)
        auc = roc_auc_score(y_true, y_score)

        print("Final RF precision on {0}:".format(split_name), precision)
        print("Final RF F1 score on {0}:".format(split_name), f1)
        print("Final RF AUC score on {0}:".format(split_name), auc)