# Imports

In [None]:
# Uncomment to install yfinance into the kernel's environment (only needed once)
# %pip install yfinance

In [1]:
import os
from functools import partial
from datetime import datetime

import typing
from typing import Dict
from typing import Any
from typing import Tuple

import numpy as np
import pandas as pd
import sklearn
from sklearn.ensemble import RandomForestClassifier

import utils
import pipelines
import processing
import evaluate
import predict
import preprocessing

# Download stock daily prices

In [42]:
# Fetch ticker names with a minimum market cap of 1000 and no upper bound
# NOTE(review): the `_mm` suffix presumably means millions — confirm in utils.get_ticker_names
ticker_list = utils.get_ticker_names(market_cap_min_mm=1000, market_cap_max_mm=None)

In [44]:
# Specific end date, e.g. 1st of March 2022 (Y, M, D)
# date_to = datetime(2022, 3, 1)
# Default: today's date
date_to = datetime.today()
# How many years of data to download (going backwards from date_to). May be a floating point number
period_years = 5.5

In [None]:
# Download ticker price data for the tickers selected above (saved to .csv automatically)
# NOTE: `df` is rebound to the model dataset in the "Model development" section further down
df, df_clean = utils.download_stonk_prices(ticker_list.index, period_years=period_years, date_to=date_to)

In [None]:
# Per-subindustry count of non-null values in every other column of the ticker list
ticker_list.groupby('subindustry').count()

# Run data pipeline

In [67]:
# All industries:
# industries = ticker_list['subindustry'].unique()
# To run only a few industries, or a single one, edit this list instead:
industries = [
    'health_care_equipment_and_services',
    'software_and_services',
    'retailing',
    'telecommunication_services',
    'capital_goods',
    'pharmaceuticals_biotechnology_and_life_sciences',
    'consumer_staples',
    'metals_and_mining',
    'technology_hardware_and_equipment',
    'chemicals',
    'automobiles_and_components',
    'semiconductors_and_semiconductor_equipment',
    'consumer_services',
    'consumer_durables_and_apparel',
    'transportation',
    'commercial_and_professional_services',
    'paper_and_forest_products',
    'containers_and_packaging',
    'construction_materials'
    ]
# Parameters forwarded to processing.get_rolling_residuals by the pipeline loop
# NOTE(review): the meaning/units of l_reg, l_roll and dt are defined in processing — confirm there
l_reg = 3
l_roll = 2
dt = 10
# Date range passed to utils.get_stonk_data
# NOTE: this rebinds `date_to` (a datetime in the download section above) to a string
date_from = '2016-12-12'
date_to = '2022-06-09'
# Directory the per-industry CSV results are written to
output_dir = 'data'

# Model wrapper used by the pipeline loop to score the prepared dataset
stonk_model = predict.XGBStonkModel()

In [68]:
# Run the live-data pipeline for every selected industry:
# residuals -> residual filter -> ADF filter -> model predictions -> CSV files in `output_dir`.

# Thresholds used to filter candidate trades
residual_cutoff = 2         # minimum absolute value in the last column of the standardized residuals
adf_pass_rate_cutoff = 0.5  # minimum aggregate ADF value for a trade to be kept
residual_magnitude_dt = 21  # `dt` passed to processing.get_mean_residual_magnitude

total_industries = len(industries)

# Make sure the output directory exists before any CSV is written
os.makedirs(output_dir, exist_ok=True)

# enumerate() keeps the progress counter correct even when an industry is skipped via `continue`
# (a manual `i += 1` at the end of the body is never reached on those iterations).
for i, industry in enumerate(industries, start=1):
    print('Industry ({0}/{1}): {2}'.format(i, total_industries, industry))

    stonks = utils.get_stonk_data(date_from, date_to, filter_industries=[industry])
    X, Y = processing.combine_stonk_pairs(stonks)

    print('Processing residuals...')
    residuals, betas, _, date_index = utils.measure_time(partial(processing.get_rolling_residuals, X=X, Y=Y, l_reg=l_reg, l_roll=l_roll, dt=dt))
    residuals.insert(0, "dates", date_index)
    betas.insert(0, "dates", date_index)

    std_residuals, _, _ = processing.get_standardized_residuals(residuals.drop(columns="dates"))

    # Filter 1: keep rows whose last standardized residual is large enough in magnitude
    trades_before = len(std_residuals)
    std_residuals = std_residuals[std_residuals.iloc[:, -1].abs() >= residual_cutoff]
    trades_after = len(std_residuals)
    print('{0} trades selected out of {1} by residual values'.format(trades_after, trades_before))
    if trades_after == 0:
        print('No trades left after filtering residuals, skipping this industry...')
        continue
    residuals = residuals.loc[std_residuals.index]
    betas = betas.loc[std_residuals.index]

    print('Processing ADFs...')
    adfs, adfs_raw = utils.measure_time(partial(processing.get_aggregate_adfs, residuals.drop(columns="dates"), betas=betas.drop(columns="dates")))

    # Filter 2: keep rows whose aggregate ADF value reaches the cutoff
    selected_by_adf = (adfs >= adf_pass_rate_cutoff).values
    adfs = adfs[selected_by_adf]

    trades_before = len(std_residuals)
    std_residuals = std_residuals[selected_by_adf]
    trades_after = len(std_residuals)
    print('{0} trades selected out of {1} by ADF pass rates'.format(trades_after, trades_before))

    if trades_after == 0:
        print('No trades left after filtering ADF pass rates, skipping this industry...')
        continue

    # Align every intermediate frame with the trades that survived both filters
    betas = betas.loc[adfs.index]
    residuals = residuals.loc[adfs.index]
    adfs_raw = adfs_raw.loc[adfs.index]

    residuals_max_mean = processing.get_mean_residual_magnitude(std_residuals.to_numpy(), dt=residual_magnitude_dt)
    print('Mean max residual value for {0} after filtering is {1}'.format(industry, residuals_max_mean))

    print('Preparing data for model...')
    dataset = utils.build_dataset_from_live_data_by_industry(std_residuals.to_numpy(), adfs.to_numpy().ravel(), industry, residuals_max_mean)

    print('Running model...')
    predictions, df_processed = stonk_model.predict(dataset)
    predictions = pd.DataFrame(predictions)
    predictions.index = adfs.index

    print('Writing results to CSV...')
    # One headerless CSV per artefact, named '<industry>_<artefact>.csv'
    outputs = {
        'residuals': residuals,
        'betas': betas,
        'adfs_raw': adfs_raw,
        'adfs': adfs,
        'predictions': predictions,
    }
    for artefact, frame in outputs.items():
        frame.to_csv(os.path.join(output_dir, '{0}_{1}.csv'.format(industry, artefact)), header=False, index=True)

print('*** All done ***')

Industry (1/19): health_care_equipment_and_services
Processing residuals...
Done after: 28s
1029 trades selected out of 5253 by residual values
Processing ADFs...
Done after: 107s
203 trades selected out of 1029 by ADF pass rates
Mean max residual value for health_care_equipment_and_services after filtering is 3.680000066757202
Preparing data for model...
Running model...
Writing results to CSV...
Industry (2/19): software_and_services
Processing residuals...
Done after: 40s
1250 trades selected out of 7626 by residual values
Processing ADFs...
Done after: 131s
270 trades selected out of 1250 by ADF pass rates
Mean max residual value for software_and_services after filtering is 3.930000066757202
Preparing data for model...
Running model...
Writing results to CSV...
Industry (3/19): retailing
Processing residuals...
Done after: 11s
463 trades selected out of 2145 by residual values
Processing ADFs...
Done after: 49s
50 trades selected out of 463 by ADF pass rates
Mean max residual value

# Data collection

In [3]:
# Load stock data for the data-collection date range (no industry filter is passed here)
stonks = utils.get_stonk_data('2017-04-20', '2022-04-18')

In [4]:
# Re-fetch the ticker list so this section can run independently of the download section
ticker_list = utils.get_ticker_names(market_cap_min_mm=1000, market_cap_max_mm=None)

In [None]:
# Run the rolling data-collection pipeline over every subindustry present in the ticker list
pipelines.data_collection_rolling_pipeline(
    stonks,
    industries=list(ticker_list['subindustry'].unique()),
    l_reg=3,
    l_roll=2,
    dt=20,
    market_cap_min_mm=1000,
    market_cap_max_mm=None,
    adf_pval_cutoff=0.1,
    adf_pass_rate_filter=0.5,
    trade_length_months=3,
    trading_interval_weeks=2,
)

# Model development

In [2]:
import xgboost as xgb
from hyperopt import STATUS_OK, STATUS_FAIL, Trials, fmin, hp, tpe
import pickle

In [733]:
def train_production_xgb(df: pd.DataFrame, params: Dict[str, Any], scaling: str = 'minmax', add_noise: bool = False, output_dir: str = 'data') -> Tuple[xgb.XGBClassifier, sklearn.base.TransformerMixin]:
    """Fit an XGBoost classifier on the whole of `df` and persist it together with its scalers.

    Args:
        df: Training data; must contain a 'label' column plus whatever columns
            preprocessing.transform_features expects.
        params: Keyword arguments forwarded unchanged to xgb.XGBClassifier.
        scaling: Scaling mode forwarded to preprocessing.transform_features.
        add_noise: Forwarded to preprocessing.transform_features.
        output_dir: Directory the model and scalers are written to (created if missing).

    Returns:
        The fitted classifier and the scalers returned by preprocessing.transform_features.

    Side effects:
        Writes '<output_dir>/xgb_classifier.json' and '<output_dir>/scalers.json'.
    """
    X_train, scalers = preprocessing.transform_features(df, scaling=scaling, add_noise=add_noise)
    y_train = df['label']

    clf = xgb.XGBClassifier(**params)

    # The eval set is the training data itself, so the logged metric is training loss only
    clf.fit(X_train, y_train, eval_set=[(X_train, y_train)])

    os.makedirs(output_dir, exist_ok=True)
    clf.save_model(os.path.join(output_dir, 'xgb_classifier.json'))

    # NOTE: despite the .json extension this file is a binary pickle; the name is kept so
    # existing loaders keep working. Only unpickle it from a trusted location.
    with open(os.path.join(output_dir, 'scalers.json'), 'wb') as fp:
        pickle.dump(scalers, fp)

    return clf, scalers

In [3]:
# Load the collected dataset, keep only rows with a positive beta,
# then pass it through preprocessing.assign_labels (the 'label' column is used below)
df = pd.read_csv('data/dataset.csv')
df = df[df.beta > 0]
df = preprocessing.assign_labels(df)

In [720]:
# Build the production training set: drop the `drop_dates` earliest distinct trade dates
drop_dates = 16
# Sorted unique trade dates with the first `drop_dates` entries removed
selected_dates = np.sort(df['trade_date'].unique())[drop_dates:]
df_prod = df[df.trade_date.isin(selected_dates)]
# Number of rows left for production training
print(len(df_prod))

52691


In [806]:
# Train and persist the production model.
# NOTE: `params` is defined in a cell further down ("params = {...}") — run that cell first on a fresh kernel
clf_prod, scalers_prod = train_production_xgb(df_prod, params)

[0]	validation_0-logloss:0.67999
[1]	validation_0-logloss:0.66883
[2]	validation_0-logloss:0.65929
[3]	validation_0-logloss:0.65111
[4]	validation_0-logloss:0.64403
[5]	validation_0-logloss:0.63783
[6]	validation_0-logloss:0.63248
[7]	validation_0-logloss:0.62774
[8]	validation_0-logloss:0.62355
[9]	validation_0-logloss:0.61988
[10]	validation_0-logloss:0.61662
[11]	validation_0-logloss:0.61372
[12]	validation_0-logloss:0.61116
[13]	validation_0-logloss:0.60887
[14]	validation_0-logloss:0.60682
[15]	validation_0-logloss:0.60489
[16]	validation_0-logloss:0.60323
[17]	validation_0-logloss:0.60155
[18]	validation_0-logloss:0.60014
[19]	validation_0-logloss:0.59891
[20]	validation_0-logloss:0.59765
[21]	validation_0-logloss:0.59661
[22]	validation_0-logloss:0.59559
[23]	validation_0-logloss:0.59472
[24]	validation_0-logloss:0.59396


In [746]:
# Split the dataset into 'train' and 'validation' parts with a fixed random_state
# NOTE(review): the meaning of the positional arguments 2, 6, 10 is defined in preprocessing.split_data — confirm there
splits = preprocessing.split_data(df, 2, 6, 10, random_state=69696969)
# Split sizes and the class balance of the training labels
print(len(splits['train']))
print(len(splits['validation']))
print(splits['train']['label'].value_counts())

50721
6228
0    46195
1     4526
Name: label, dtype: int64


In [280]:
# Feature preprocessing settings shared by the train and validation transforms
scaling = 'minmax'
add_noise = False

# The scalers returned for the training split are passed back in for the validation split;
# noise is never added to the validation features.
X_train, scalers = preprocessing.transform_features(splits['train'], scaling=scaling, add_noise=add_noise)
X_valid, _ = preprocessing.transform_features(splits['validation'], scalers=scalers, scaling=scaling, add_noise=False)

y_train = splits['train']['label']
y_valid = splits['validation']['label']

In [598]:
# Hyperopt search space consumed by optimization_objective
hyperparameter_space = {
    # Continuous:
    "gamma": hp.uniform("gamma", 0, 2),
    "scale_pos_weight" : hp.uniform("scale_pos_weight", 4, 7),
    # Integers (hp.quniform yields floats, so the objective casts these with int()):
    "max_depth": hp.quniform("max_depth", 2, 4, 1),
    "min_child_weight" : hp.quniform("min_child_weight", 1, 8, 1),
    "max_delta_step" : hp.quniform("max_delta_step", 1, 4, 1),
    # Choice (one of a fixed set of options):
    "colsample_bylevel" : hp.choice("colsample_bylevel", np.array([0.75, 1])),
    # "colsample_bylevel" : hp.uniform("colsample_bylevel", 0.5, 1),
    "n_estimators": hp.choice("n_estimators", np.array([25, 30, 35])),
    "subsample": hp.choice("subsample", np.array([0.75, 1])),
    # "subsample": hp.uniform("subsample", 0.5, 1),
    }

In [599]:
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

In [600]:
def optimization_objective(space, score_cutoff=0.55, decision_threshold=0.5):
    """Hyperopt objective: fit an XGBoost classifier and score it on the validation split.

    Reads the notebook-level `X_train`, `y_train`, `X_valid` and `y_valid`.

    Args:
        space: One sample from `hyperparameter_space`.
        score_cutoff: Cutoff passed to evaluate.average_precision_from_cutoff and used to
            select the scores that are averaged into `mean_proba`.
        decision_threshold: Score above which an example is predicted positive.

    Returns:
        A hyperopt result dict. 'loss' is the negated `ap`, or 999 with STATUS_FAIL when
        F1 or precision is zero. The other keys carry the metrics for later inspection.
    """
    clf = xgb.XGBClassifier(
        gamma = space['gamma'],
        scale_pos_weight = space['scale_pos_weight'],
        # hp.quniform yields floats; XGBoost expects integers here
        max_depth = int(space['max_depth']),
        min_child_weight = int(space['min_child_weight']),
        max_delta_step = int(space['max_delta_step']),
        #
        colsample_bylevel = space['colsample_bylevel'],
        n_estimators = int(space['n_estimators']),
        learning_rate = 0.1,
        subsample = space['subsample'],
        #
        tree_method = "hist",
        enable_categorical = True,
        max_cat_to_onehot = 1,
        # A fresh, unseeded random state per trial: individual trials are not reproducible
        random_state = np.random.randint(9999999),
    )

    clf.fit(
        X_train, y_train,
        verbose=False,
    )

    y_score = clf.predict_proba(X_valid)[:, 1]
    y_preds = y_score > decision_threshold

    f1 = f1_score(y_valid, y_preds, zero_division=0)
    precision = precision_score(y_valid, y_preds, zero_division=0)
    ap = evaluate.average_precision_from_cutoff(y_valid, y_score, score_cutoff)
    roc = roc_auc_score(y_valid, y_score)

    pos_preds = int(y_preds.sum())
    pos_labels = int(y_valid.sum())

    # Mean score of the predictions at or above the cutoff; NaN when there are none
    # (taking .mean() of an empty slice would also give NaN, but with a RuntimeWarning).
    confident_scores = y_score[y_score >= score_cutoff]
    mean_proba = confident_scores.mean() if confident_scores.size else np.nan

    # Trials that predict fewer positives than there are positive labels get no credit
    ap = ap if pos_preds >= pos_labels else 0

    failed = f1 == 0 or precision == 0
    return {
        'loss': 999 if failed else -ap,
        'precision': precision,
        'f1_score': f1,
        'ap': ap,
        'auc': roc,
        'pos_preds': pos_preds,
        'pos_labels': pos_labels,
        'mean_proba': mean_proba,
        'status': STATUS_FAIL if failed else STATUS_OK,
    }

In [619]:
# Run the TPE hyperparameter search and save every trial, with its metrics, to CSV
trials = Trials()

best_hyperparams = fmin(
    fn=optimization_objective,
    space=hyperparameter_space,
    algo=tpe.suggest,
    max_evals=500,
    trials=trials,
)

# Sampled hyperparameters per trial, keyed by label
# NOTE(review): for hp.choice parameters the saved CSVs show small integers (e.g. n_estimators = 0),
# which looks like option indices rather than the option values — confirm before reading them as values
trial_vals = trials.vals

# Add one column per metric reported by the objective
for metric in ('f1_score', 'precision', 'ap', 'auc', 'pos_preds', 'pos_labels', 'mean_proba'):
    trial_vals[metric] = [result[metric] for result in trials.results]

df_trials = pd.DataFrame.from_dict(trial_vals)
df_trials.to_csv('data/optimise-data-window-size_2_6_8#9.csv', index=False)

100%|██████████| 500/500 [01:05<00:00,  7.63trial/s, best loss: -0.4503196691246591]


In [805]:
# Hand-picked hyperparameters for the test model (also passed to train_production_xgb)
params = { 
    # Regularisation (XGBoost default: 0)
    "gamma": 0.4,
    # L2 / L1 regularisation (defaults: 1 / 0) — left at their defaults
    # "reg_lambda" : 1,
    # "reg_alpha" : 0,
    # Class-imbalance weight for the positive class (default: 1)
    "scale_pos_weight" : 5.84,
    # Integers:
    "max_depth": 2,
    # Regularisation (default: 1)
    "min_child_weight" : 2,
    # Class-imbalance control (default: 0)
    "max_delta_step" : 3,
    # Choice:
    "colsample_bylevel" : 1,
    "n_estimators": 25,
    "learning_rate": 0.1,
    "subsample": 1,
    # Fixed:
    "tree_method": "hist",
    "enable_categorical": True,
    "max_cat_to_onehot": 1,
    "eval_metric": ["logloss"],
    # Unseeded random state: every execution of this cell trains a slightly different model
    "random_state": np.random.randint(999999)
}

clf = xgb.XGBClassifier(
        **params
    )

# eval_set order: validation_0 in the log is the validation split, validation_1 the training split
clf.fit(X_train, y_train, eval_set=[(X_valid, y_valid), (X_train, y_train)])
clf.save_model(os.path.join('data', 'test_classifier.json'))

[0]	validation_0-logloss:0.66962	validation_1-logloss:0.66642
[1]	validation_0-logloss:0.64992	validation_1-logloss:0.64361
[2]	validation_0-logloss:0.63289	validation_1-logloss:0.62397
[3]	validation_0-logloss:0.61808	validation_1-logloss:0.60694
[4]	validation_0-logloss:0.60531	validation_1-logloss:0.59215
[5]	validation_0-logloss:0.59500	validation_1-logloss:0.57919
[6]	validation_0-logloss:0.58529	validation_1-logloss:0.56785
[7]	validation_0-logloss:0.57755	validation_1-logloss:0.55783
[8]	validation_0-logloss:0.57027	validation_1-logloss:0.54897
[9]	validation_0-logloss:0.56393	validation_1-logloss:0.54108
[10]	validation_0-logloss:0.55879	validation_1-logloss:0.53413
[11]	validation_0-logloss:0.55360	validation_1-logloss:0.52796
[12]	validation_0-logloss:0.54925	validation_1-logloss:0.52242
[13]	validation_0-logloss:0.54334	validation_1-logloss:0.51742
[14]	validation_0-logloss:0.54008	validation_1-logloss:0.51300
[15]	validation_0-logloss:0.53662	validation_1-logloss:0.50899
[1

In [804]:
# Evaluate the test classifier on the validation split
print("**Validation**")
y_score = clf.predict_proba(X_valid)[:, 1]
# Scores strictly above this threshold count as positive predictions
threshold = 0.67
y_preds = y_score > threshold

# NOTE: auc_cutoff (0.55) is separate from the decision threshold above
evaluate.performance_summary(y_score, y_preds, y_valid, auc_cutoff=0.55)

# Kept in a variable so individual results can be inspected in later cells
df_results_valid = evaluate.returns_on_predictions(splits['validation'], y_preds)

evaluate.performance_on_slice(splits['validation'], y_score, y_preds, 'subindustry', False)

**Validation**
Precision: 0.5128205128205128
PR-AUC/AP score: 0.24356630587005057
ROC-AUC score: 0.5682735490510469
Total positive predictions: 39

Totals:
        prediction
result            
FN             642
FP              19
TN            5547
TP              20

Means:
        return_one_month  return_two_month  return_three_month
result                                                        
FN              0.052335          0.088484            0.117481
FP             -0.015579          0.002526           -0.001368
TN             -0.008263         -0.019475           -0.018342
TP              0.028650          0.115550            0.134500

Stds:
        return_one_month  return_two_month  return_three_month
result                                                        
FN              0.057727          0.064269            0.068042
FP              0.034908          0.046587            0.047065
TN              0.049743          0.064664            0.076043
TP              0.0393

  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]


In [155]:
# Inspect the validation rows whose result is a false positive
df_results_valid[df_results_valid.result == 'FP']

Unnamed: 0,ticker_x,ticker_y,trade_date,adf_pass_rate,last_residual,beta,intercept,residual_mean_max,return_one_month,residual_one_month,return_two_month,residual_two_month,return_three_month,residual_three_month,data_window_start,subindustry,label,prediction,result,score
61693,AMBA,TSEM,2022-02-18,0.59,6.09,0.13,13.17,4.16,-0.105,8.72,-0.121,9.13,-0.101,8.62,2017-03-07,semiconductors_and_semiconductor_equipment,0,True,FP,0.560808
13511,ARCO,NCLH,2022-03-04,0.78,-3.89,10.47,-27.74,3.86,-0.026,-4.18,0.029,-3.57,-0.048,-4.44,2017-03-20,consumer_services,0,True,FP,0.570894
74371,APOG,GD,2022-03-04,0.90,4.05,2.63,79.06,4.78,0.041,2.78,0.017,3.51,0.020,3.44,2017-03-20,capital_goods,0,True,FP,0.554897
72526,AMKR,TSEM,2022-02-18,0.57,5.11,0.89,10.67,4.16,-0.030,5.79,-0.067,6.63,-0.037,5.94,2017-03-07,semiconductors_and_semiconductor_equipment,0,True,FP,0.560808
56190,UCTT,VECO,2022-03-04,0.61,4.11,0.35,6.16,4.80,0.023,3.60,0.016,3.74,0.084,2.23,2017-03-20,semiconductors_and_semiconductor_equipment,0,True,FP,0.567285
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72861,MCHP,TSEM,2022-02-18,0.51,6.81,0.37,2.46,4.16,-0.003,6.90,-0.040,7.93,-0.009,7.06,2017-03-07,semiconductors_and_semiconductor_equipment,0,True,FP,0.560808
59419,OXY,RIG,2022-03-04,0.94,-4.77,0.14,-0.48,3.94,0.018,-4.45,-0.059,-5.83,-0.168,-7.81,2017-03-20,energy,0,True,FP,0.563827
67451,FTGC,KBWB,2022-03-04,0.88,-6.08,3.50,-16.25,4.81,0.002,-5.99,-0.036,-7.75,-0.045,-8.19,2017-03-20,diversified_financials,0,True,FP,0.553596
96080,GLOB,SHOP,2022-02-18,0.59,-4.20,5.70,-132.22,3.89,-0.011,-4.34,0.019,-3.95,0.069,-3.31,2017-03-07,software_and_services,0,True,FP,0.560808


In [40]:
# Feature names the classifier was fitted with
clf.feature_names_in_

array(['adf_pass_rate', 'last_residual', 'residual_mean_max', 'industry',
       'residual_inter'], dtype='<U17')

In [41]:
# Feature importances, in the same order as clf.feature_names_in_
clf.feature_importances_

array([0.10517278, 0.1543806 , 0.1008487 , 0.5313897 , 0.10820813],
      dtype=float32)

In [55]:
# Show up to 100 rows when displaying DataFrames
pd.set_option('display.max_rows', 100)

In [685]:
# Top 10 trials by `ap` from a previously saved search (file '2_6_10#3')
# NOTE: rebinds `df_trials`, replacing the frame produced by the search cell
df_trials = pd.read_csv('data/optimise-data-window-size_2_6_10#3.csv')
df_trials.sort_values('ap', ascending=False).head(10)

Unnamed: 0,colsample_bylevel,gamma,max_delta_step,max_depth,min_child_weight,n_estimators,scale_pos_weight,subsample,f1_score,precision,ap,auc,pos_preds,pos_labels
279,1,1.245229,3.0,2.0,7.0,0,5.809424,2,0.202312,0.193906,0.373535,0.58407,722,662
280,1,1.143486,3.0,2.0,7.0,0,5.817047,2,0.202166,0.193638,0.373535,0.58407,723,662
379,1,1.91973,2.0,2.0,7.0,0,5.722965,2,0.20059,0.195965,0.373498,0.584072,694,662
271,1,1.405949,3.0,2.0,8.0,0,5.75387,2,0.200439,0.194326,0.373498,0.58407,705,662
266,1,1.471607,3.0,2.0,8.0,0,5.775054,2,0.200292,0.194051,0.373498,0.584071,706,662
265,1,1.467206,3.0,2.0,8.0,0,5.755698,2,0.200439,0.194326,0.373498,0.58407,705,662
260,1,1.312365,3.0,2.0,7.0,0,5.787308,2,0.199708,0.192958,0.373498,0.58407,710,662
258,1,1.272882,3.0,2.0,7.0,0,5.75788,2,0.200439,0.194326,0.373498,0.58407,705,662
257,1,1.266402,3.0,2.0,7.0,0,5.790751,2,0.199708,0.192958,0.373498,0.584069,710,662
342,1,1.551048,2.0,2.0,8.0,0,5.716279,2,0.20059,0.195965,0.373498,0.584072,694,662


In [282]:
# Top 10 trials by `ap` from a previously saved search (file '2_6_8#7')
df_trials = pd.read_csv('data/optimise-data-window-size_2_6_8#7.csv')
df_trials.sort_values('ap', ascending=False).head(10)

Unnamed: 0,gamma,max_delta_step,max_depth,min_child_weight,n_estimators,scale_pos_weight,f1_score,precision,ap,auc,pos_preds,pos_labels
423,1.82166,3.0,3.0,3.0,2,5.235265,0.191698,0.191554,0.464594,0.570336,663,662
544,1.720931,3.0,3.0,7.0,1,5.209525,0.195636,0.194903,0.441682,0.575264,667,662
68,0.902362,2.0,3.0,7.0,2,5.201881,0.188623,0.186944,0.439401,0.572935,674,662
452,1.452829,4.0,3.0,7.0,1,5.574485,0.198946,0.176402,0.435425,0.570312,856,662
926,1.82204,4.0,3.0,6.0,2,5.580976,0.197564,0.178922,0.431681,0.570896,816,662
934,1.524745,4.0,3.0,4.0,2,5.731257,0.203693,0.168142,0.431549,0.568409,1017,662
83,0.078826,3.0,3.0,6.0,3,5.495547,0.186969,0.176,0.43125,0.571642,750,662
274,2.982872,1.0,3.0,4.0,3,5.288843,0.195015,0.189459,0.43061,0.57093,702,662
306,1.932819,3.0,3.0,7.0,2,5.157385,0.206691,0.203514,0.429431,0.571964,683,662
600,1.978521,3.0,3.0,6.0,0,5.361042,0.203147,0.192935,0.429316,0.572164,736,662


In [621]:
# Top 50 trials by `ap` from the search saved by the hyperopt cell above (file '2_6_8#9')
df_trials = pd.read_csv('data/optimise-data-window-size_2_6_8#9.csv')
df_trials.sort_values('ap', ascending=False).head(50)

Unnamed: 0,colsample_bylevel,gamma,max_delta_step,max_depth,min_child_weight,n_estimators,scale_pos_weight,subsample,f1_score,precision,ap,auc,pos_preds,pos_labels,mean_proba
69,1,1.996141,3.0,2.0,3.0,0,5.628031,0,0.199856,0.190672,0.45032,0.572873,729,662,0.57913
422,1,0.438901,3.0,2.0,2.0,0,5.687869,1,0.184132,0.182493,0.440691,0.571582,674,662,0.576571
462,1,0.232207,3.0,2.0,4.0,0,5.692098,1,0.184132,0.182493,0.440691,0.571582,674,662,0.576738
105,1,0.426103,2.0,2.0,2.0,0,5.689655,1,0.184132,0.182493,0.440691,0.571582,674,662,0.576642
426,1,0.308946,3.0,2.0,2.0,0,5.691637,1,0.184132,0.182493,0.440691,0.571582,674,662,0.57672
402,1,0.330941,3.0,2.0,2.0,0,5.692377,1,0.184132,0.182493,0.440691,0.571582,674,662,0.576749
431,1,0.309122,3.0,2.0,3.0,0,5.677718,1,0.184132,0.182493,0.439729,0.571579,674,662,0.576171
146,1,0.475022,2.0,2.0,5.0,0,5.677048,1,0.184132,0.182493,0.439729,0.571579,674,662,0.576145
399,1,0.404932,3.0,2.0,2.0,0,5.843868,1,0.181422,0.174581,0.437705,0.57051,716,662,0.580437
410,1,0.20162,3.0,2.0,2.0,0,5.840338,1,0.181422,0.174581,0.437705,0.57051,716,662,0.580302


# Other

In [27]:
def predict_baseline_model(X_train, y_train, X_valid, y_valid, baseline='rule-based', residual_cutoff_adj=0, adf_cutoff=0.5):
    """Print precision and F1 of a simple baseline on the train and validation splits.

    Args:
        X_train, X_valid: Feature frames. The rule-based baseline reads the columns
            'last_residual', 'residual_mean_max' and 'adf_pass_rate'.
        y_train, y_valid: Label series for the two splits.
        baseline: 'rule-based' predicts positive when |last_residual| exceeds
            residual_mean_max + residual_cutoff_adj and adf_pass_rate exceeds adf_cutoff;
            'random' predicts 0 or 1 uniformly at random.
        residual_cutoff_adj: Offset added to 'residual_mean_max' in the rule.
        adf_cutoff: Minimum (exclusive) 'adf_pass_rate' in the rule.

    Raises:
        ValueError: If `baseline` is not one of the supported names.
    """
    # Fail early with a clear message instead of hitting unbound prediction variables later
    if baseline not in ("rule-based", "random"):
        raise ValueError(
            "Unknown baseline {0!r}; expected 'rule-based' or 'random'".format(baseline)
        )

    def rule_based_predictions(X):
        # Vectorised form of the per-row rule; yields an int array of 0/1
        is_positive = (
            (X['last_residual'].abs() > X['residual_mean_max'] + residual_cutoff_adj)
            & (X['adf_pass_rate'] > adf_cutoff)
        )
        return is_positive.astype(int).to_numpy()

    if baseline == "rule-based":
        y_train_preds = rule_based_predictions(X_train)
        y_valid_preds = rule_based_predictions(X_valid)
    else:
        y_train_preds = np.random.randint(0, 2, len(X_train))
        y_valid_preds = np.random.randint(0, 2, len(X_valid))

    for split_name, y_true, y_pred in (
        ("train", y_train, y_train_preds),
        ("validation", y_valid, y_valid_preds),
    ):
        precision = precision_score(y_true.to_numpy(), y_pred)
        f1 = f1_score(y_true.to_numpy(), y_pred)
        print("Final baseline precision on {0}:".format(split_name), precision)
        print("Final baseline F1 score on {0}:".format(split_name), f1)

In [28]:
def predict_random_forest(X_train, y_train, X_valid, y_valid):
    """Fit a random forest and print precision, F1 and ROC-AUC on train and validation.

    Args:
        X_train, X_valid: Feature matrices accepted by RandomForestClassifier.
        y_train, y_valid: Binary label series for the two splits.
    """
    clf = RandomForestClassifier(
        n_estimators = 100,
        max_depth = None,
        # "auto" meant sqrt(n_features) for classifiers and is no longer accepted by recent scikit-learn
        max_features = "sqrt",
        oob_score = False,
        class_weight = "balanced_subsample",
    )

    clf.fit(X_train, y_train)

    for split_name, X, y in (("train", X_train, y_train), ("valid", X_valid, y_valid)):
        y_true = y.to_numpy()
        y_preds = clf.predict(X)
        # ROC-AUC needs ranking scores: use the positive-class probability, not the hard labels
        y_score = clf.predict_proba(X)[:, 1]

        precision = precision_score(y_true, y_preds)
        f1 = f1_score(y_true, y_preds)
        auc = roc_auc_score(y_true, y_score)

        print("Final RF precision on {0}:".format(split_name), precision)
        print("Final RF F1 score on {0}:".format(split_name), f1)
        print("Final RF AUC score on {0}:".format(split_name), auc)