# Imports

In [None]:
# !pip install yfinance

In [2]:
import os
from functools import partial
from datetime import datetime

import typing
from typing import Dict
from typing import Any
from typing import Tuple

import numpy as np
import pandas as pd
import sklearn

import utils
import pipelines
import processing
import evaluate
import predict
import preprocessing

# Download stock daily prices

In [42]:
# Fetch ticker names with a market-cap floor of 1000 and no upper bound
# (units are presumably millions, judging by the `_mm` suffix — TODO confirm in utils).
ticker_list = utils.get_ticker_names(market_cap_min_mm=1000, market_cap_max_mm=None)

In [44]:
# Specific end date - 1st of March 2022 (Y, M, D)
# date_to = datetime(2022, 3, 1)
# Today's date
date_to = datetime.today()
# How many years of data to download (going backwards from date_to). May be a floating point number
period_years = 5.5

In [45]:
# Download ticker price data for the tickers selected above (per the original note, saved to .csv automatically)
# NOTE: the name `df` is rebound later in this notebook, when data/dataset.csv is loaded for modelling.
df, df_clean = utils.download_stonk_prices(ticker_list.index, period_years=period_years, date_to=date_to)

[*********************100%***********************]  2820 of 2820 completed

117 Failed downloads:
- BAP: Error occurred while retrieving timeseries from Redis, keys: [RedisKey [key=BAP, cluster=finance]]
- WWE: Error occurred while retrieving timeseries from Redis, keys: [RedisKey [key=WWE, cluster=finance]]
- NEM: Error occurred while retrieving timeseries from Redis, keys: [RedisKey [key=NEM, cluster=finance]]
- PCOR: CircuitBreaker 'redis' is OPEN and does not permit further calls
- MAN: CircuitBreaker 'redis' is OPEN and does not permit further calls
- HTLF: CircuitBreaker 'redis' is OPEN and does not permit further calls
- MS: CircuitBreaker 'redis' is OPEN and does not permit further calls
- RIG: CircuitBreaker 'redis' is OPEN and does not permit further calls
- CRH: CircuitBreaker 'redis' is OPEN and does not permit further calls
- GDDY: CircuitBreaker 'redis' is OPEN and does not permit further calls
- LFC: CircuitBreaker 'redis' is OPEN and does not permit further calls
- FYBR

In [9]:
# Count rows per subindustry (groupby().count() reports the non-null count of each remaining column)
ticker_list.groupby('subindustry').count()

Unnamed: 0_level_0,market_cap
subindustry,Unnamed: 1_level_1
automobiles_and_components,42
banks,177
capital_goods,231
chemicals,61
commercial_and_professional_services,72
construction_materials,7
consumer_durables_and_apparel,73
consumer_services,99
consumer_staples,121
containers_and_packaging,20


# Run data pipeline

In [67]:
# All industries:
# industries = ticker_list['subindustry'].unique()
# To run only a few industries, or a single one, use the explicit list below
# (comment/uncomment entries as needed):
industries = [
    'health_care_equipment_and_services',
    'software_and_services',
    'retailing',
    'telecommunication_services',
    'capital_goods',
    'pharmaceuticals_biotechnology_and_life_sciences',
    'consumer_staples',
    'metals_and_mining',
    'technology_hardware_and_equipment',
    'chemicals',
    'automobiles_and_components',
    'semiconductors_and_semiconductor_equipment',
    'consumer_services',
    'consumer_durables_and_apparel',
    'transportation',
    'commercial_and_professional_services',
    'paper_and_forest_products',
    'containers_and_packaging',
    'construction_materials'
    ]
# l_reg, l_roll and dt are forwarded to processing.get_rolling_residuals by the pipeline loop
# (their units are not visible here — TODO confirm in processing).
l_reg = 3
l_roll = 2
dt = 10
# Date range passed to utils.get_stonk_data by the pipeline loop.
# NOTE: this rebinds `date_to`, which the download section set to a datetime, to a string.
date_from = '2016-12-12'
date_to = '2022-06-09'
# Directory the pipeline loop writes its per-industry CSV files into
output_dir = 'data'

stonk_model = predict.XGBStonkModel()

In [68]:
# Per-industry pipeline: rolling residuals -> residual filter -> ADF filter -> model -> CSV output.
total_industries = len(industries)

# Create the output directory up front so the CSV writes at the end of an iteration
# cannot fail (after minutes of computation) just because the directory is missing.
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# enumerate() drives the progress counter. The previous manual `i += 1` sat at the very end of the
# loop body, so it was skipped by every `continue` and the "Industry (i/N)" progress went stale.
for i, industry in enumerate(industries, start=1):
    stonks = utils.get_stonk_data(date_from, date_to, filter_industries=[industry])
    X, Y = processing.combine_stonk_pairs(stonks)

    print('Industry ({0}/{1}): {2}'.format(i, total_industries, industry))

    print('Processing residuals...')
    residuals, betas, _, date_index = utils.measure_time(partial(processing.get_rolling_residuals, X=X, Y=Y, l_reg=l_reg, l_roll=l_roll, dt=dt))
    # Keep the dates as the first column so they end up in the CSV output;
    # they are dropped again wherever only the numeric values are needed.
    residuals.insert(0, "dates", date_index)
    betas.insert(0, "dates", date_index)

    std_residuals, _, _ = processing.get_standardized_residuals(residuals.drop(columns="dates"))

    # Filter 1: keep rows whose last standardized residual has magnitude >= 2.
    trades_before = len(std_residuals)
    std_residuals = std_residuals[std_residuals.iloc[:, -1].abs() >= 2]
    trades_after = len(std_residuals)
    print('{0} trades selected out of {1} by residual values'.format(trades_after, trades_before))
    if trades_after == 0:
        print('No trades left after filtering residuals, skipping this industry...')
        continue
    residuals = residuals.loc[std_residuals.index]
    betas = betas.loc[std_residuals.index]

    print('Processing ADFs...')
    adfs, adfs_raw = utils.measure_time(partial(processing.get_aggregate_adfs, residuals.drop(columns="dates"), betas=betas.drop(columns="dates")))

    # Filter 2: keep rows whose aggregate ADF value is at least 0.5.
    selected_by_adf = (adfs >= 0.5).values
    adfs = adfs[selected_by_adf]

    trades_before = len(std_residuals)
    std_residuals = std_residuals[selected_by_adf]
    trades_after = len(std_residuals)
    print('{0} trades selected out of {1} by ADF pass rates'.format(trades_after, trades_before))

    if len(std_residuals) == 0:
        print('No trades left after filtering ADF pass rates, skipping this industry...')
        continue

    # Align every remaining frame with the rows that survived both filters.
    betas = betas.loc[adfs.index]
    residuals = residuals.loc[adfs.index]
    adfs_raw = adfs_raw.loc[adfs.index]

    residuals_max_mean = processing.get_mean_residual_magnitude(std_residuals.to_numpy(), dt=21)
    print('Mean max residual value for {0} after filtering is {1}'.format(industry, residuals_max_mean))

    print('Preparing data for model...')
    dataset = utils.build_dataset_from_live_data_by_industry(std_residuals.to_numpy(), adfs.to_numpy().ravel(), industry, residuals_max_mean)

    print('Running model...')
    predictions, df_processed = stonk_model.predict(dataset)
    predictions = pd.DataFrame(predictions)
    predictions.index = adfs.index

    print('Writing results to CSV...')
    residuals.to_csv(os.path.join(output_dir, industry + '_residuals.csv'), header=False, index=True)
    betas.to_csv(os.path.join(output_dir, industry + '_betas.csv'), header=False, index=True)
    adfs_raw.to_csv(os.path.join(output_dir, industry + '_adfs_raw.csv'), header=False, index=True)
    adfs.to_csv(os.path.join(output_dir, industry + '_adfs.csv'), header=False, index=True)
    predictions.to_csv(os.path.join(output_dir, industry + '_predictions.csv'), header=False, index=True)

print('*** All done ***')

Industry (1/19): health_care_equipment_and_services
Processing residuals...
Done after: 28s
1029 trades selected out of 5253 by residual values
Processing ADFs...
Done after: 107s
203 trades selected out of 1029 by ADF pass rates
Mean max residual value for health_care_equipment_and_services after filtering is 3.680000066757202
Preparing data for model...
Running model...
Writing results to CSV...
Industry (2/19): software_and_services
Processing residuals...
Done after: 40s
1250 trades selected out of 7626 by residual values
Processing ADFs...
Done after: 131s
270 trades selected out of 1250 by ADF pass rates
Mean max residual value for software_and_services after filtering is 3.930000066757202
Preparing data for model...
Running model...
Writing results to CSV...
Industry (3/19): retailing
Processing residuals...
Done after: 11s
463 trades selected out of 2145 by residual values
Processing ADFs...
Done after: 49s
50 trades selected out of 463 by ADF pass rates
Mean max residual value

In [48]:
# Processed feature frame returned by the most recent stonk_model.predict call
df_processed

Unnamed: 0,adf_pass_rate,last_residual,residual_mean_max,industry,residual_inter
0,0.138783,0.025855,0.663074,information_technology,0.500459
1,0.197283,0.048652,0.663074,information_technology,0.554587
2,0.873782,0.137812,0.663074,information_technology,0.766284
3,0.227283,0.010206,0.663074,information_technology,0.463303
4,0.726782,0.083717,0.663074,information_technology,0.637844
...,...,...,...,...,...
112,0.197283,0.080433,0.663074,information_technology,0.630046
113,0.138783,0.140999,0.663074,information_technology,0.773853
114,0.374282,0.106417,0.663074,information_technology,0.691743
115,0.285782,0.089899,0.663074,information_technology,0.652523


# Data collection

In [3]:
# Load stock data for the fixed window 2017-04-20 .. 2022-04-18 (no industry filter argument is passed).
# NOTE: rebinds `stonks`, which the per-industry pipeline loop also assigns on every iteration.
stonks = utils.get_stonk_data('2017-04-20', '2022-04-18')

In [4]:
# Same call as in the download section; `ticker_list` supplies the subindustry list for the pipeline call in the next cell.
ticker_list = utils.get_ticker_names(market_cap_min_mm=1000, market_cap_max_mm=None)

In [None]:
# Run the rolling data-collection pipeline over every subindustry present in ticker_list.
pipelines.data_collection_rolling_pipeline(
    stonks,
    industries=list(ticker_list['subindustry'].unique()),
    l_reg=3,
    l_roll=2,
    dt=20,
    market_cap_min_mm=1000,
    market_cap_max_mm=None,
    adf_pval_cutoff=0.1,
    adf_pass_rate_filter=0.5,
    trade_length_months=3,
    trading_interval_weeks=2,
)

# Model development

In [3]:
import xgboost as xgb
from hyperopt import STATUS_OK, STATUS_FAIL, Trials, fmin, hp, tpe
import pickle

In [3]:
def train_production_xgb(df: pd.DataFrame, params: Dict[str, Any], scaling: str = 'minmax', add_noise: bool = True, output_dir: str = 'data') -> Tuple[xgb.XGBClassifier, sklearn.base.TransformerMixin]:
    """Fit an XGBoost classifier on all rows of ``df`` and persist it with its scalers.

    Args:
        df: Labelled dataset; must contain a ``label`` column and whatever columns
            ``preprocessing.transform_features`` expects.
        params: Keyword arguments forwarded unchanged to ``xgb.XGBClassifier``.
        scaling: Scaling mode forwarded to ``preprocessing.transform_features``.
        add_noise: Noise flag forwarded to ``preprocessing.transform_features``.
        output_dir: Directory the model and scalers are written to. Defaults to
            ``'data'`` (the previously hard-coded location) and is created if missing.

    Returns:
        The fitted classifier and the scalers returned by ``transform_features``.
    """
    X_train, scalers = preprocessing.transform_features(df, scaling=scaling, add_noise=add_noise)
    y_train = df['label']

    clf = xgb.XGBClassifier(**params)

    # The eval_set is the training data itself, so the metric logged during fitting
    # is a training metric, not a held-out one.
    clf.fit(X_train, y_train, eval_set=[(X_train, y_train)])

    # Ensure the target directory exists so the artefacts of a finished fit are not lost.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    clf.save_model(os.path.join(output_dir, 'xgb_classifier.json'))

    # NOTE: despite the .json extension this file is a binary pickle. The name is kept
    # unchanged because code outside this notebook may load it by this path.
    with open(os.path.join(output_dir, 'scalers.json'), 'wb') as fp:
        pickle.dump(scalers, fp)

    return clf, scalers

In [4]:
# Load the modelling dataset, keep only rows with a positive beta, then attach labels.
df = preprocessing.assign_labels(
    pd.read_csv('data/dataset.csv').loc[lambda frame: frame.beta > 0]
)

In [64]:
# Production training subset: skip the earliest `drop_dates` distinct trade dates.
drop_dates = 12
sorted_trade_dates = np.sort(df['trade_date'].unique())
selected_dates = sorted_trade_dates[drop_dates:]
keep_row = df['trade_date'].isin(selected_dates)
df_prod = df[keep_row]
print(len(df_prod))

In [65]:
# NOTE: `params` is only defined further down, in the cell that builds the validation classifier;
# on a fresh kernel that cell must be run before this one.
clf_prod, scalers_prod = train_production_xgb(df_prod, params)

[0]	validation_0-logloss:0.67820
[1]	validation_0-logloss:0.66533
[2]	validation_0-logloss:0.65437
[3]	validation_0-logloss:0.64535
[4]	validation_0-logloss:0.63715
[5]	validation_0-logloss:0.62918
[6]	validation_0-logloss:0.62313
[7]	validation_0-logloss:0.61781
[8]	validation_0-logloss:0.61282
[9]	validation_0-logloss:0.60882
[10]	validation_0-logloss:0.60493
[11]	validation_0-logloss:0.60195
[12]	validation_0-logloss:0.59915
[13]	validation_0-logloss:0.59668
[14]	validation_0-logloss:0.59462
[15]	validation_0-logloss:0.59253
[16]	validation_0-logloss:0.59044
[17]	validation_0-logloss:0.58861
[18]	validation_0-logloss:0.58732
[19]	validation_0-logloss:0.58549
[20]	validation_0-logloss:0.58398
[21]	validation_0-logloss:0.58228
[22]	validation_0-logloss:0.58089
[23]	validation_0-logloss:0.57966
[24]	validation_0-logloss:0.57876
[25]	validation_0-logloss:0.57787
[26]	validation_0-logloss:0.57735
[27]	validation_0-logloss:0.57680
[28]	validation_0-logloss:0.57615
[29]	validation_0-loglos

In [21]:
# Split the labelled dataset into 'train' and 'validation' parts (random_state=420).
# NOTE(review): the positional arguments 2, 6, 20 are not explained here; the trial CSV written by the
# hyperopt cell is named "..._2_6_20.csv", so they appear to be the window settings under study —
# confirm against preprocessing.split_data.
splits = preprocessing.split_data(df, 2, 6, 20, random_state=420)
print(len(splits['train']))
print(len(splits['validation']))

19627
6228


In [22]:
# Feature-transform settings shared by the train and validation transforms below
scaling = 'minmax'
add_noise = True

# The training call returns the scalers it used; they are passed back in for the validation split.
X_train, scalers = preprocessing.transform_features(splits['train'], scaling=scaling, add_noise=add_noise)
# NOTE(review): add_noise=True is also passed for the validation split — confirm that
# transform_features does not perturb evaluation data when scalers are supplied.
X_valid, _ = preprocessing.transform_features(splits['validation'], scalers=scalers, scaling=scaling, add_noise=add_noise)

y_train = splits['train']['label']
y_valid = splits['validation']['label']

In [7]:
# Hyperopt search space; passed to fmin and received by optimization_objective as `space`.
# NOTE: for hp.choice parameters hyperopt records the *index* of the chosen option (not its value)
# in trials.vals and in fmin's return value — map indices back through the arrays below when
# reading saved trial tables.
hyperparameter_space = {
    # Continuous:
    "gamma": hp.uniform("gamma", 0, 3),
    "scale_pos_weight" : hp.uniform("scale_pos_weight", 4, 7),
    # Integer-valued (hp.quniform yields floats; optimization_objective casts them with int()):
    "max_depth": hp.quniform("max_depth", 2, 4, 1),
    "min_child_weight" : hp.quniform("min_child_weight", 1, 8, 1),
    "max_delta_step" : hp.quniform("max_delta_step", 1, 4, 1),
    # Choice:
    "colsample_bylevel" : hp.choice("colsample_bylevel", np.array([0.5, 0.75, 1])),
    "n_estimators": hp.choice("n_estimators", np.array([25, 50, 75, 100])),
    "subsample": hp.choice("subsample", np.array([0.5, 0.75, 1])),
    }

In [8]:
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

In [9]:
def optimization_objective(space):
    """Hyperopt objective: fit an XGBoost classifier for one sampled configuration.

    Trains on the notebook-level ``X_train``/``y_train`` and scores on
    ``X_valid``/``y_valid``. The returned loss is the negated ``ap`` value
    (forced to 0 when fewer positives are predicted than there are positive
    labels). A trial with zero F1 or zero precision is reported as failed
    with a loss of 999.
    """
    model_kwargs = dict(
        gamma=space['gamma'],
        scale_pos_weight=space['scale_pos_weight'],
        # quniform samples arrive as floats, hence the int() casts
        max_depth=int(space['max_depth']),
        min_child_weight=int(space['min_child_weight']),
        max_delta_step=int(space['max_delta_step']),
        colsample_bylevel=space['colsample_bylevel'],
        n_estimators=int(space['n_estimators']),
        learning_rate=0.1,
        subsample=space['subsample'],
        tree_method="hist",
        enable_categorical=True,
        max_cat_to_onehot=1,
    )
    clf = xgb.XGBClassifier(**model_kwargs)
    clf.fit(X_train, y_train, verbose=False)

    # Positive-class probability and the hard predictions at a 0.5 threshold
    y_score = clf.predict_proba(X_valid)[:, 1]
    y_preds = y_score > 0.5

    f1 = f1_score(y_valid, y_preds, zero_division=0)
    precision = precision_score(y_valid, y_preds, zero_division=0)
    ap = evaluate.average_precision_from_cutoff(y_valid, y_score, 0.55)
    roc = roc_auc_score(y_valid, y_score)

    pos_preds = int(y_preds.sum())
    pos_labels = int(y_valid.sum())

    # Configurations that predict fewer positives than actually exist get no credit
    if pos_preds < pos_labels:
        ap = 0

    degenerate = f1 == 0 or precision == 0
    return {
        'loss': 999 if degenerate else -ap,
        'precision': precision,
        'f1_score': f1,
        'ap': ap,
        'auc': roc,
        'pos_preds': pos_preds,
        'pos_labels': pos_labels,
        'status': STATUS_FAIL if degenerate else STATUS_OK,
    }

In [None]:
# Run the TPE search and save one row per trial (sampled values plus recorded metrics).
trials = Trials()

best_hyperparams = fmin(
    fn=optimization_objective,
    space=hyperparameter_space,
    algo=tpe.suggest,
    max_evals=400,
    trials=trials,
)

# Start from the sampled parameter values, then append one column per metric
# that optimization_objective stored in each trial's result dict.
trial_vals = trials.vals
for metric in ('f1_score', 'precision', 'ap', 'auc', 'pos_preds', 'pos_labels'):
    trial_vals[metric] = [result[metric] for result in trials.results]

df_trials = pd.DataFrame.from_dict(trial_vals)
df_trials.to_csv('data/optimise-data-window-size_2_6_20.csv', index=False)

In [314]:
# Fixed parameter set for the classifier fitted below; the train_production_xgb call
# earlier in the notebook also reads this `params` dict.
params = { 
    # Regularisation (XGBoost default: 0)
    "gamma": 1.718,
    # L2 / L1 regularisation, left at the XGBoost defaults (reg_lambda=1, reg_alpha=0)
    # "reg_lambda" : 1,
    # "reg_alpha" : 0,
    # Class imbalance (XGBoost default: 1)
    "scale_pos_weight" : 5.865,
    # Integers:
    "max_depth": 2,
    # Regularisation (XGBoost default: 1)
    "min_child_weight" : 8,
    # Class imbalance (XGBoost default: 0). NOTE: 0 lies outside the 1-4 range searched in hyperparameter_space
    "max_delta_step" : 0,
    # Choice:
    "colsample_bylevel" : 1,
    "n_estimators": 25,
    "learning_rate": 0.1,
    "subsample": 1,
    # Fixed:
    "tree_method": "hist",
    "enable_categorical": True,
    "max_cat_to_onehot": 1,
    "eval_metric": ["logloss"],
}

clf = xgb.XGBClassifier(
        **params
    )

# eval_set order determines the log labels: validation_0 is the validation split, validation_1 the training split.
clf.fit(X_train, y_train, eval_set=[(X_valid, y_valid), (X_train, y_train)])
clf.save_model(os.path.join('data', 'test_classifier.json'))

[0]	validation_0-logloss:0.66978	validation_1-logloss:0.66656
[1]	validation_0-logloss:0.65018	validation_1-logloss:0.64385
[2]	validation_0-logloss:0.63324	validation_1-logloss:0.62432
[3]	validation_0-logloss:0.61846	validation_1-logloss:0.60739
[4]	validation_0-logloss:0.60578	validation_1-logloss:0.59267
[5]	validation_0-logloss:0.59536	validation_1-logloss:0.57979
[6]	validation_0-logloss:0.58573	validation_1-logloss:0.56850
[7]	validation_0-logloss:0.57756	validation_1-logloss:0.55853
[8]	validation_0-logloss:0.57066	validation_1-logloss:0.54976
[9]	validation_0-logloss:0.56439	validation_1-logloss:0.54189
[10]	validation_0-logloss:0.55913	validation_1-logloss:0.53499
[11]	validation_0-logloss:0.55404	validation_1-logloss:0.52884
[12]	validation_0-logloss:0.54975	validation_1-logloss:0.52332
[13]	validation_0-logloss:0.54392	validation_1-logloss:0.51836
[14]	validation_0-logloss:0.54085	validation_1-logloss:0.51397
[15]	validation_0-logloss:0.53744	validation_1-logloss:0.50997
[1

In [319]:
print("**Validation**")
# Probability of the positive class (column 1 of predict_proba)
y_score = clf.predict_proba(X_valid)[:, 1]
# Decision threshold for a positive prediction.
# NOTE: stricter than the 0.5 threshold used inside optimization_objective.
threshold = 0.6
y_preds = y_score > threshold

evaluate.performance_summary(y_score, y_preds, y_valid, auc_cutoff=0.55)

# Kept in a variable so individual result groups can be inspected afterwards
df_results_valid = evaluate.returns_on_predictions(splits['validation'], y_preds)

evaluate.performance_on_slice(splits['validation'], y_score, y_preds, 'subindustry', False)

**Validation**
Precision: 0.3870967741935484
PR-AUC/AP score: 0.42089559449890823
ROC-AUC score: 0.5723785054490308
Total positive predictions: 31

Totals:
        prediction
result            
FN             650
FP              19
TN            5547
TP              12

Means:
        return_one_month  return_two_month  return_three_month
result                                                        
FN              0.052262          0.089032            0.118209
FP             -0.012368         -0.007158            0.004316
TN             -0.008274         -0.019442           -0.018361
TP              0.016833          0.103917            0.106417

Stds:
        return_one_month  return_two_month  return_three_month
result                                                        
FN              0.057549          0.064486            0.068636
FP              0.037334          0.060984            0.042265
TN              0.049739          0.064634            0.076047
TP              0.0332

  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]


In [10]:
# df_results_valid[df_results_valid.result == 'FP'].sample(50)

In [40]:
# Names of the features the classifier was fitted on
clf.feature_names_in_

array(['adf_pass_rate', 'last_residual', 'residual_mean_max', 'industry',
       'residual_inter'], dtype='<U17')

In [41]:
# Importance score per feature, in the same order as clf.feature_names_in_
clf.feature_importances_

array([0.10517278, 0.1543806 , 0.1008487 , 0.5313897 , 0.10820813],
      dtype=float32)

In [55]:
# Show up to 100 rows when displaying DataFrames
pd.set_option('display.max_rows', 100)

In [186]:
# Ten best trials by `ap` from the saved search results with file suffix _2_6_16.
# NOTE: the hp.choice columns (colsample_bylevel, n_estimators, subsample) hold option indices, not values.
df_trials = pd.read_csv('data/optimise-data-window-size_2_6_16.csv')
df_trials.sort_values('ap', ascending=False).head(10)

Unnamed: 0,colsample_bylevel,gamma,max_delta_step,max_depth,min_child_weight,n_estimators,scale_pos_weight,subsample,f1_score,precision,ap,auc,pos_preds,pos_labels
392,2,1.847776,2.0,2.0,2.0,0,4.224905,2,0.174107,0.171554,0.599412,0.573621,682,662
370,2,1.350857,2.0,2.0,8.0,0,4.250404,2,0.172235,0.169343,0.599412,0.573051,685,662
389,2,1.837637,2.0,2.0,8.0,0,4.198275,2,0.173913,0.172619,0.599412,0.573625,672,662
356,2,1.660727,2.0,2.0,8.0,0,4.249715,2,0.172235,0.169343,0.599412,0.573051,685,662
357,2,1.700209,2.0,2.0,8.0,0,4.201487,2,0.173913,0.172619,0.599412,0.573625,672,662
353,1,1.644978,2.0,2.0,8.0,0,4.317579,2,0.17483,0.174436,0.579709,0.571782,665,662
343,1,1.661411,2.0,2.0,7.0,0,4.353007,2,0.175808,0.174888,0.559682,0.572634,669,662
322,1,0.18434,2.0,2.0,7.0,0,4.346687,2,0.175808,0.174888,0.557453,0.571777,669,662
331,1,1.556349,2.0,2.0,7.0,0,4.342613,2,0.175808,0.174888,0.557453,0.571777,669,662
266,1,0.915253,2.0,2.0,7.0,0,4.411387,0,0.178261,0.171309,0.555925,0.572475,718,662


In [12]:
# Ten best trials by `ap` from the saved search results with file suffix _2_6_18.
# NOTE: the hp.choice columns (colsample_bylevel, n_estimators, subsample) hold option indices, not values.
df_trials = pd.read_csv('data/optimise-data-window-size_2_6_18.csv')
df_trials.sort_values('ap', ascending=False).head(10)

Unnamed: 0,colsample_bylevel,gamma,max_delta_step,max_depth,min_child_weight,n_estimators,scale_pos_weight,subsample,f1_score,precision,ap,auc,pos_preds,pos_labels
86,1,0.279044,2.0,2.0,3.0,0,4.122908,1,0.178417,0.17033,0.567434,0.571829,728,662
70,1,1.008222,5.0,2.0,5.0,0,4.166201,1,0.18118,0.169291,0.561059,0.571817,762,662
72,1,0.695833,5.0,2.0,6.0,0,4.210162,1,0.181185,0.168176,0.548746,0.571814,773,662
80,1,0.778058,2.0,2.0,3.0,0,4.371601,1,0.194872,0.169265,0.531343,0.57225,898,662
39,1,1.050924,3.0,2.0,6.0,0,4.461146,1,0.2,0.169831,0.502135,0.57223,948,662
26,1,1.314995,5.0,2.0,6.0,0,4.697516,1,0.210303,0.162162,0.492976,0.57344,1221,662
67,1,1.210492,5.0,2.0,5.0,0,4.71746,1,0.210805,0.162316,0.492976,0.573389,1226,662
68,1,1.204356,5.0,2.0,3.0,0,4.547773,1,0.19942,0.161806,0.486498,0.572165,1063,662
27,1,1.402153,4.0,2.0,5.0,0,4.761078,1,0.205882,0.154962,0.481096,0.57336,1310,662
65,1,1.385656,4.0,2.0,5.0,0,4.762536,1,0.205882,0.154962,0.481096,0.57336,1310,662


In [251]:
# Ten best trials by `ap` from the saved search results with file suffix _2_6_12.
# NOTE: the hp.choice columns (colsample_bylevel, n_estimators, subsample) hold option indices, not values.
df_trials = pd.read_csv('data/optimise-data-window-size_2_6_12.csv')
df_trials.sort_values('ap', ascending=False).head(10)

Unnamed: 0,colsample_bylevel,gamma,max_delta_step,max_depth,min_child_weight,n_estimators,scale_pos_weight,subsample,f1_score,precision,ap,auc,pos_preds,pos_labels
310,0,1.999058,3.0,2.0,3.0,0,5.259557,1,0.176737,0.176737,0.721749,0.575624,662,662
289,0,1.385695,4.0,2.0,1.0,0,5.37828,1,0.184211,0.17847,0.696837,0.575618,706,662
269,0,1.595987,3.0,2.0,1.0,0,5.389881,1,0.183942,0.177966,0.696837,0.575618,708,662
320,0,1.900575,3.0,2.0,4.0,0,5.420382,1,0.182476,0.175243,0.696837,0.575617,719,662
220,0,1.147838,2.0,2.0,1.0,0,5.362522,1,0.184911,0.181159,0.696837,0.575614,690,662
348,0,1.896508,3.0,2.0,5.0,0,5.418106,1,0.182476,0.175243,0.696837,0.575617,719,662
230,0,1.616328,2.0,2.0,2.0,0,5.291729,1,0.177577,0.176912,0.696837,0.575609,667,662
232,0,1.547256,3.0,2.0,2.0,0,5.298677,1,0.177444,0.176647,0.696837,0.575608,668,662
374,0,1.890927,4.0,2.0,4.0,0,5.439248,1,0.182476,0.175243,0.696837,0.575621,719,662
327,0,1.572581,3.0,2.0,3.0,0,5.282941,1,0.177577,0.176912,0.696837,0.575607,667,662


# Other

In [27]:
def predict_baseline_model(X_train, y_train, X_valid, y_valid, baseline='rule-based', residual_cutoff_adj=0, adf_cutoff=0.5):
    """Print precision and F1 of a simple baseline on the train and validation sets.

    Args:
        X_train, X_valid: Feature frames. The rule-based baseline reads the columns
            ``last_residual``, ``residual_mean_max`` and ``adf_pass_rate``.
        y_train, y_valid: Binary label Series aligned with the feature frames.
        baseline: ``'rule-based'`` predicts positive when
            ``|last_residual| > residual_mean_max + residual_cutoff_adj`` and
            ``adf_pass_rate > adf_cutoff``; ``'random'`` draws 0/1 uniformly at random.
        residual_cutoff_adj: Offset added to ``residual_mean_max`` in the rule.
        adf_cutoff: Threshold that ``adf_pass_rate`` must exceed in the rule.

    Raises:
        ValueError: If ``baseline`` is not one of the supported names.
    """
    # Validate first: an unknown name used to fall through both branches and crash later
    # with an UnboundLocalError on y_train_preds.
    if baseline not in ("rule-based", "random"):
        raise ValueError(
            "Unknown baseline {0!r}; expected 'rule-based' or 'random'".format(baseline)
        )

    def rule_based_predictions(features):
        # Vectorised form of the per-row rule; yields the same 0/1 labels as a row-wise apply.
        large_residual = features['last_residual'].abs() > (features['residual_mean_max'] + residual_cutoff_adj)
        adf_ok = features['adf_pass_rate'] > adf_cutoff
        return (large_residual & adf_ok).astype(int).to_numpy()

    if baseline == "rule-based":
        y_train_preds = rule_based_predictions(X_train)
        y_valid_preds = rule_based_predictions(X_valid)
    else:
        # Unseeded draws: results differ between runs.
        y_train_preds = np.random.randint(0, 2, len(X_train))
        y_valid_preds = np.random.randint(0, 2, len(X_valid))

    precision = precision_score(y_train.to_numpy(), y_train_preds)
    f1 = f1_score(y_train.to_numpy(), y_train_preds)
    print("Final baseline precision on train:", precision)
    print("Final baseline F1 score on train:", f1)

    precision = precision_score(y_valid.to_numpy(), y_valid_preds)
    f1 = f1_score(y_valid.to_numpy(), y_valid_preds)
    print("Final baseline precision on validation:", precision)
    print("Final baseline F1 score on validation:", f1)

In [28]:
def predict_random_forest(X_train, y_train, X_valid, y_valid):
    """Fit a random forest and print precision, F1 and ROC-AUC on train and validation.

    Args:
        X_train, X_valid: Feature matrices accepted by scikit-learn.
        y_train, y_valid: Binary label Series aligned with the feature matrices.
    """
    # Imported locally because no cell in this notebook imports RandomForestClassifier,
    # so the function previously failed with a NameError.
    from sklearn.ensemble import RandomForestClassifier

    clf = RandomForestClassifier(
        n_estimators=100,
        max_depth=None,
        # "auto" meant sqrt(n_features) for classifiers and was removed in newer
        # scikit-learn releases; "sqrt" is the equivalent spelling that still works.
        max_features="sqrt",
        oob_score=False,
        class_weight="balanced_subsample",
    )

    clf.fit(X_train, y_train)

    for split_name, features, labels in (("train", X_train, y_train), ("valid", X_valid, y_valid)):
        y_true = labels.to_numpy()
        y_preds = clf.predict(features)
        # ROC-AUC is a ranking metric and needs scores, not hard labels. Column 1 is the
        # positive class, assuming binary labels ordered as in clf.classes_ (e.g. 0/1).
        y_score = clf.predict_proba(features)[:, 1]

        precision = precision_score(y_true, y_preds)
        f1 = f1_score(y_true, y_preds)
        auc = roc_auc_score(y_true, y_score)

        print("Final RF precision on {0}:".format(split_name), precision)
        print("Final RF F1 score on {0}:".format(split_name), f1)
        print("Final RF AUC score on {0}:".format(split_name), auc)