# Imports

In [None]:
# !pip install yfinance
# !pip install pmdarima
# !pip install hyperopt
# !pip install xgboost

In [6]:
import os
from functools import partial
from datetime import datetime

import typing
from typing import Dict
from typing import Any
from typing import Tuple

import numpy as np
import pandas as pd
import sklearn

import utils
import pipelines
import processing
import evaluate
import predict
import preprocessing

# Download stock daily prices & indexes

In [None]:
### Fetch ticker names, passing a minimum market cap of 1000 and no maximum
### (presumably in millions, going by the `_mm` suffix - confirm in utils)
ticker_list = utils.get_ticker_names(market_cap_min_mm=1000, market_cap_max_mm=None)

In [None]:
### Alternative: a specific end date given as (Y, M, D), e.g. 18th of January 2021
# date_to = datetime(2021, 1, 18)
### Default: use today's date as the end of the download window
date_to = datetime.today()
### How many years of data to download (going backwards from date_to). Can be a floating point number
period_years = 6

In [None]:
# Download prices for every selected ticker, then for the ^VIX and ^GSPC index tickers,
# all over the same window (period_years back from date_to). The index downloads pass a
# distinct fname_prefix ("vix" / "sp500"), which later cells hand to utils.get_stonk_data.
# NOTE: `vix` and `df` assigned here are re-assigned by later cells in this notebook.
df, df_clean = utils.download_stonk_prices(ticker_list.index, period_years=period_years, date_to=date_to)
vix, vix_clean = utils.download_stonk_prices(["^VIX"], period_years=period_years, date_to=date_to, fname_prefix="vix")
sp500, sp500_clean = utils.download_stonk_prices(["^GSPC"], period_years=period_years, date_to=date_to, fname_prefix="sp500")

# Run data pipeline

In [7]:
# Industries to process in the loop below; uncomment an entry to include it.
industries = [
    # 'health_care_equipment_and_services',
    # 'software_and_services',
    # 'retailing',
    # 'telecommunication_services',
    'capital_goods',
    # 'energy',
    # 'pharmaceuticals_biotechnology_and_life_sciences',
    # 'consumer_staples',
    # 'banks',
    # 'diversified_financials',
    # 'metals_and_mining',
    # 'technology_hardware_and_equipment',
    # 'utilities',
    # 'chemicals',
    # 'automobiles_and_components',
    # 'semiconductors_and_semiconductor_equipment',
    # 'media_and_entertainment',
    # 'real_estate',
    # 'consumer_services',
    # 'consumer_durables_and_apparel',
    # 'insurance',
    # 'transportation',
    # 'commercial_and_professional_services',
    # 'paper_and_forest_products',
    # 'containers_and_packaging',
    # 'construction_materials'
    ]

# Arguments forwarded to processing.get_rolling_residuals; the same values are passed to
# pipelines.data_collection_rolling_pipeline in the "Data collection" section.
# NOTE(review): exact units/meaning of each are defined in `processing` - confirm there.
l_reg = 3
l_roll = 2
dt = 10

# Directory the per-industry CSV outputs are written to
output_dir = 'data'

stonk_model = predict.XGBStonkModel()
# First row of the stored VIX data, looked up by date label in the loop below
vix = utils.get_stonk_data(fname_prefix='vix', disable_filter=True).iloc[0]

In [13]:
# Filtering thresholds. These match the values passed to
# pipelines.data_collection_rolling_pipeline in the "Data collection" section.
LAST_RESIDUAL_CUTOFF = 2.5   # minimum |last standardized residual| for a trade to be kept
ADF_PASS_RATE_FILTER = 0.5   # minimum ADF pass rate for a trade to be kept
MEAN_MAX_RESIDUAL_DT = 21    # `dt` handed to processing.get_mean_residual_magnitude

# One (model input, processed frame) tuple per industry that reached the model
datasets = []
total_industries = len(industries)

# enumerate() keeps the progress counter correct even when an industry is skipped with
# `continue`; a manual `i += 1` at the end of the body is bypassed by those early exits.
for i, industry in enumerate(industries, start=1):
    stonks = utils.get_stonk_data(filter_industries=[industry])
    X, Y = processing.combine_stonk_pairs(stonks)

    print('Industry ({0}/{1}): {2}'.format(i, total_industries, industry))

    print('Processing residuals...')
    residuals, betas, _, date_index = utils.measure_time(partial(processing.get_rolling_residuals, X=X, Y=Y, l_reg=l_reg, l_roll=l_roll, dt=dt))
    residuals.insert(0, "dates", date_index)
    betas.insert(0, "dates", date_index)

    std_residuals, means, stds = processing.get_standardized_residuals(residuals.drop(columns="dates"))

    # Filter 1: keep only rows whose last standardized residual is large enough in magnitude
    trades_before = len(std_residuals)
    std_residuals = std_residuals[std_residuals.iloc[:, -1].abs() >= LAST_RESIDUAL_CUTOFF]
    trades_after = len(std_residuals)
    print('{0} trades selected out of {1} by residual values'.format(trades_after, trades_before))
    if trades_after == 0:
        print('No trades left after filtering residuals, skipping this industry...')
        continue
    residuals = residuals.loc[std_residuals.index]
    betas = betas.loc[std_residuals.index]

    print('Processing ADFs...')
    adfs, adfs_raw = utils.measure_time(partial(processing.get_aggregate_adfs, residuals.drop(columns="dates"), betas=betas.drop(columns="dates")))

    # Filter 2: keep only rows whose ADF pass rate reaches the threshold
    selected_by_adf = (adfs >= ADF_PASS_RATE_FILTER).values
    adfs = adfs[selected_by_adf]

    trades_before = len(std_residuals)
    std_residuals = std_residuals[selected_by_adf]
    trades_after = len(std_residuals)
    print('{0} trades selected out of {1} by ADF pass rates'.format(trades_after, trades_before))

    if trades_after == 0:
        print('No trades left after filtering ADF pass rates, skipping this industry...')
        continue

    # Align every per-trade frame with the trades that survived both filters
    betas = betas.loc[adfs.index]
    residuals = residuals.loc[adfs.index]
    adfs_raw = adfs_raw.loc[adfs.index]
    means = means.loc[adfs.index]
    stds = stds.loc[adfs.index]

    residuals_max_mean = processing.get_mean_residual_magnitude(std_residuals.to_numpy(), dt=MEAN_MAX_RESIDUAL_DT)
    print('Mean max residual value for {0} after filtering is {1}'.format(industry, residuals_max_mean))

    print('Preparing data for model...')
    # The VIX value is looked up at the last column label of the price frame
    dataset = utils.build_dataset_from_live_data_by_industry(std_residuals.to_numpy(), adfs.to_numpy().ravel(), industry, residuals_max_mean, vix.loc[stonks.columns[-1]])

    print('Running model...')
    predictions, df_processed = stonk_model.predict(dataset)
    datasets.append((dataset, df_processed))
    predictions = pd.DataFrame(predictions)
    predictions.index = adfs.index

    print('Writing results to CSV...')
    # Very big industry, exceeds Git file size limit, so its residuals are split in two files
    if industry == "diversified_financials":
        half = len(residuals) // 2
        residuals_fst = residuals.iloc[:half]
        residuals_snd = residuals.iloc[half:]
        residuals_fst.to_csv(os.path.join(output_dir, industry + '_one_residuals.csv'), header=False, index=True)
        residuals_snd.to_csv(os.path.join(output_dir, industry + '_two_residuals.csv'), header=False, index=True)
        del residuals_fst
        del residuals_snd
    else:
        residuals.to_csv(os.path.join(output_dir, industry + '_residuals.csv'), header=False, index=True)
    betas.to_csv(os.path.join(output_dir, industry + '_betas.csv'), header=False, index=True)
    adfs_raw.to_csv(os.path.join(output_dir, industry + '_adfs_raw.csv'), header=False, index=True)
    predictions.to_csv(os.path.join(output_dir, industry + '_predictions.csv'), header=False, index=True)

print('*** All done ***')

Industry (1/1): capital_goods
Processing residuals...
Done after: 43s
807 trades selected out of 16836 by residual values
Processing ADFs...
Done after: 79s
166 trades selected out of 807 by ADF pass rates
Mean max residual value for capital_goods after filtering is 3.869999885559082
Preparing data for model...
Running model...
Writing results to CSV...
*** All done ***


In [14]:
from sklearn.linear_model import LinearRegression
from numpy.typing import ArrayLike

def calculate_beta_stability_rsquared(prices_X: pd.DataFrame, prices_Y: pd.DataFrame, betas: pd.DataFrame) -> pd.DataFrame:
    """Score, per pair, how closely the spread built from the last beta tracks the spread
    built from the rolling betas.

    For every pair two spreads are computed over the regression date range:
    ``Y - beta_t * X`` (rolling betas, forward-filled onto each price date) and
    ``Y - beta_last * X`` (last beta only). The second is linearly regressed on the
    first and the R^2 of that fit is reported.

    Args:
        prices_X: prices of the X leg, one row per ticker (rows may repeat), one column per date.
        prices_Y: prices of the Y leg, same layout as ``prices_X``.
        betas: frame indexed by pair name (``"<y>_<x>"``, as enforced by the assertion below),
            with a ``"dates"`` column of ``"<start>_<end>"`` strings and a column labelled ``0``
            holding the beta of each window. The input is not modified.

    Returns:
        Single-column float32 DataFrame of R^2 values, indexed by the unique pair names in
        their original order.
    """
    betas = betas.copy()

    # Explicit positional access: the frame is indexed by pair name, so `series[0]` /
    # `series[-1]` would depend on pandas' deprecated positional fallback.
    first_reg_date = betas["dates"].iloc[0].split('_')[0]
    last_reg_date = betas["dates"].iloc[-1].split('_')[-1]

    # Keep only the end date of every regression window
    betas['dates_end'] = betas['dates'].map(lambda x: x.split('_')[-1])
    betas = betas.drop(columns='dates')
    betas_original_order = betas.index.unique()

    selected_tickers = utils.separate_pair_index(betas_original_order)
    prices_X_selected = prices_X.reset_index().drop_duplicates(subset="index").set_index("index").copy().loc[selected_tickers["x"]].loc[:, first_reg_date:last_reg_date]
    prices_Y_selected = prices_Y.reset_index().drop_duplicates(subset="index").set_index("index").copy().loc[selected_tickers["y"]].loc[:, first_reg_date:last_reg_date]

    # One row per pair, one column per window end date
    betas = betas.reset_index().pivot(index='index', columns='dates_end', values=0).loc[betas_original_order]

    assert np.all((prices_Y_selected.index + "_" + prices_X_selected.index) == betas.index)

    # For every price date pick the column of the closest earlier-or-equal window end date
    # (ffill). get_indexer() yields -1 where no earlier date exists; those fall back to the
    # first window, which is what the former `get_loc(..., method='ffill')` + KeyError
    # handling did. The `method` argument of Index.get_loc no longer exists in pandas >= 2.0.
    column_locs = betas.columns.get_indexer(prices_X_selected.columns, method='ffill')
    column_locs = np.clip(column_locs, 0, None)
    betas = betas.iloc[:, column_locs].values

    assert betas.shape == prices_X_selected.shape

    prices_X_selected = prices_X_selected.values
    prices_Y_selected = prices_Y_selected.values
    last_betas = betas[:, -1].copy().reshape(-1, 1)

    assert all([
        prices_X_selected.shape == prices_Y_selected.shape,
        len(last_betas) == len(prices_X_selected)
    ])

    spreads_a = prices_Y_selected - (betas * prices_X_selected)        # rolling betas
    spreads_b = prices_Y_selected - (last_betas * prices_X_selected)   # last beta only

    lr = LinearRegression(n_jobs=-1)
    scores = []
    for spread_rolling, spread_last in zip(spreads_a, spreads_b):
        features = spread_rolling.reshape(-1, 1)
        target = spread_last.reshape(-1, 1)
        scores.append(lr.fit(features, target).score(features, target))

    rsquared = pd.DataFrame(scores, index=betas_original_order, dtype=np.float32)
    return rsquared

In [127]:
# NOTE: X, Y, betas and std_residuals are whatever the last iteration of the industry
# loop left in the namespace, so this cell only makes sense right after that loop.
rsquared = calculate_beta_stability_rsquared(X, Y, betas)
# Sanity check: the R^2 rows line up with the filtered standardized residuals
np.all(rsquared.index == std_residuals.index)

In [5]:
from pmdarima.arima import auto_arima
# Number of trading days assumed per month. 21 is consistent with the rest of this
# notebook: the ARIMA experiment forecasts n_periods=63 for a 3-month horizon.
DAYS_IN_TRADING_MONTH = 21


def calculate_arima_forecast_diff(std_residuals: pd.DataFrame, means: pd.DataFrame, stds: pd.DataFrame, forecast_months: int = 3) -> np.array:
    """Unfinished stub: currently only derives the forecast horizon and returns None.

    Args:
        std_residuals: standardized residuals, one row per trade (not used yet).
        means: per-trade means used for the standardization (not used yet).
        stds: per-trade standard deviations used for the standardization (not used yet).
        forecast_months: forecast horizon in months.

    Returns:
        None. TODO: implement the forecast difference calculation.
    """
    # Horizon in trading days; not consumed yet because the body is not implemented.
    forecast_length = forecast_months * DAYS_IN_TRADING_MONTH
    return None

In [24]:
# Undo the standardization by scaling with stds and shifting by means
# (assumes get_standardized_residuals computed (x - mean) / std - confirm).
# std_residuals / stds / means come from the last iteration of the industry loop.
unstandardized_residuals = (std_residuals.values * stds.values) + means.values

In [77]:
# For every row: run an auto_arima order search (non-seasonal, d fixed to 0, at most 5
# optimizer iterations per candidate), then call fit_predict for 63 steps ahead.
# NOTE(review): later cells call .get_params() on the elements of `result`, i.e. they
# expect model objects rather than forecasts - those cells were run against an earlier
# definition of `result` (see the execution counts).
result = np.apply_along_axis(
        lambda x: auto_arima(y=x, seasonal=False, stationary=True, information_criterion='aic', with_intercept=False, maxiter=5, d=0).fit_predict(y=x, n_periods=63),
        axis=1,
        arr=unstandardized_residuals,
    )

In [79]:
# One row per trade, one column per forecast step
result.shape

(166, 63)

In [82]:
import time

In [89]:
# Time the auto_arima order search over all rows with maxiter=5 (models only, no forecast)
t1 = time.time()

result_5 = np.apply_along_axis(
        lambda x: auto_arima(y=x, seasonal=False, stationary=True, information_criterion='aic', with_intercept=False, maxiter=5, d=0),
        axis=1,
        arr=unstandardized_residuals,
    )

t2 = time.time()
print("Done after: " + str(int(t2 - t1)) + "s")

Done after: 50s


In [90]:
# Same timing experiment as the maxiter=5 cell, with maxiter=10
t1 = time.time()

result_10 = np.apply_along_axis(
        lambda x: auto_arima(y=x, seasonal=False, stationary=True, information_criterion='aic', with_intercept=False, maxiter=10, d=0),
        axis=1,
        arr=unstandardized_residuals,
    )

t2 = time.time()
print("Done after: " + str(int(t2 - t1)) + "s")

Done after: 78s


In [91]:
# Same timing experiment as the maxiter=5 cell, with maxiter=25
t1 = time.time()

result_25 = np.apply_along_axis(
        lambda x: auto_arima(y=x, seasonal=False, stationary=True, information_criterion='aic', with_intercept=False, maxiter=25, d=0),
        axis=1,
        arr=unstandardized_residuals,
    )

t2 = time.time()
print("Done after: " + str(int(t2 - t1)) + "s")

Done after: 138s


In [None]:
# Reference run used by the comparison cells below: maxiter=50.
# NOTE: unlike the result_5 / result_10 / result_25 runs, `d=0` is not passed here.
result_full = np.apply_along_axis(
        lambda x: auto_arima(y=x, seasonal=False, stationary=True, information_criterion='aic', with_intercept=False, maxiter=50),
        axis=1,
        arr=unstandardized_residuals,
    )

In [92]:
# Share of rows where the maxiter=5 search selected the same order as the reference run
# (`len(result)` is only used as the row count)
sum([str(r1.get_params()['order']) == str(r2.get_params()['order']) for r1, r2 in zip(result_5, result_full)]) / len(result)

0.7168674698795181

In [93]:
# Share of rows where the maxiter=10 search selected the same order as the reference run
sum([str(r1.get_params()['order']) == str(r2.get_params()['order']) for r1, r2 in zip(result_10, result_full)]) / len(result)

0.7289156626506024

In [94]:
# Share of rows where the maxiter=25 search selected the same order as the reference run
sum([str(r1.get_params()['order']) == str(r2.get_params()['order']) for r1, r2 in zip(result_25, result_full)]) / len(result)

0.8674698795180723

In [62]:
# Print the selected (p, d, q) order of `result` next to the reference order, row by row.
# NOTE(review): requires `result` to hold fitted model objects; the cell that currently
# defines `result` stores forecasts instead, so this fails on a fresh top-to-bottom run.
for r1, r2 in zip(result, result_full):
    print(str(r1.get_params()['order']) + ' ' + str(r2.get_params()['order']))

(2, 0, 3) (2, 0, 3)
(1, 0, 1) (1, 0, 1)
(1, 0, 1) (1, 0, 1)
(1, 0, 1) (4, 0, 4)
(1, 0, 0) (1, 0, 0)
(2, 0, 0) (2, 0, 0)
(1, 0, 0) (1, 0, 0)
(1, 0, 1) (1, 0, 1)
(1, 0, 0) (1, 0, 0)
(1, 0, 0) (1, 0, 0)
(1, 0, 0) (1, 0, 0)
(2, 0, 1) (2, 0, 1)
(2, 0, 0) (2, 0, 0)
(2, 0, 0) (2, 0, 1)
(1, 0, 0) (3, 0, 2)
(1, 0, 1) (1, 0, 1)
(1, 0, 0) (1, 0, 0)
(1, 0, 0) (1, 0, 0)
(2, 0, 2) (3, 0, 1)
(1, 0, 0) (1, 0, 0)
(1, 0, 0) (1, 0, 0)
(1, 0, 0) (1, 0, 0)
(3, 0, 2) (1, 0, 1)
(1, 0, 1) (1, 0, 1)
(1, 0, 0) (1, 0, 0)
(1, 0, 0) (1, 0, 0)
(1, 0, 0) (1, 0, 0)
(1, 0, 1) (1, 0, 1)
(1, 0, 1) (1, 0, 1)
(1, 0, 1) (1, 0, 1)
(1, 0, 1) (1, 0, 1)
(1, 0, 0) (1, 0, 0)
(1, 0, 0) (1, 0, 0)
(1, 0, 0) (1, 0, 0)
(1, 0, 1) (1, 0, 1)
(1, 0, 0) (1, 0, 0)
(2, 0, 1) (2, 0, 2)
(1, 0, 2) (3, 0, 2)
(1, 0, 0) (1, 0, 0)
(1, 0, 0) (1, 0, 0)
(2, 0, 0) (2, 0, 0)
(1, 0, 0) (1, 0, 0)
(1, 0, 0) (1, 0, 0)
(1, 0, 1) (1, 0, 1)
(1, 0, 0) (2, 0, 1)
(1, 0, 1) (1, 0, 1)
(1, 0, 0) (1, 0, 0)
(1, 0, 1) (1, 0, 1)
(1, 0, 1) (1, 0, 1)
(3, 0, 0) (3, 0, 0)


In [57]:
# Compare the full parameter dicts of two of the models (expects `result` to hold models)
result[0].get_params() == result[2].get_params()

False

In [56]:
# Inspect the parameters of a single model (expects `result` to hold models)
result[1].get_params()

{'maxiter': 5,
 'method': 'lbfgs',
 'order': (1, 0, 1),
 'out_of_sample_size': 0,
 'scoring': 'mse',
 'scoring_args': {},
 'seasonal_order': (0, 0, 0, 0),
 'start_params': None,
 'trend': None,
 'with_intercept': False}

In [49]:
# Print the textual summary of every element of `result` (expects model objects)
for r in result:
    print(r)

 ARIMA(2,0,3)(0,0,0)[0]          
 ARIMA(1,0,1)(0,0,0)[0]          
 ARIMA(1,0,1)(0,0,0)[0]          
 ARIMA(1,0,1)(0,0,0)[0]          
 ARIMA(1,0,0)(0,0,0)[0]          
 ARIMA(2,0,0)(0,0,0)[0]          
 ARIMA(1,0,0)(0,0,0)[0]          
 ARIMA(1,0,1)(0,0,0)[0]          
 ARIMA(1,0,0)(0,0,0)[0]          
 ARIMA(1,0,0)(0,0,0)[0]          
 ARIMA(1,0,0)(0,0,0)[0]          
 ARIMA(2,0,1)(0,0,0)[0]          
 ARIMA(2,0,0)(0,0,0)[0]          
 ARIMA(2,0,0)(0,0,0)[0]          
 ARIMA(1,0,0)(0,0,0)[0]          
 ARIMA(1,0,1)(0,0,0)[0]          
 ARIMA(1,0,0)(0,0,0)[0]          
 ARIMA(1,0,0)(0,0,0)[0]          
 ARIMA(2,0,2)(0,0,0)[0]          
 ARIMA(1,0,0)(0,0,0)[0]          
 ARIMA(1,0,1)(0,0,0)[0]          
 ARIMA(1,0,0)(0,0,0)[0]          
 ARIMA(1,0,1)(0,0,0)[0]          
 ARIMA(1,0,1)(0,0,0)[0]          
 ARIMA(1,0,0)(0,0,0)[0]          
 ARIMA(1,0,0)(0,0,0)[0]          
 ARIMA(1,0,0)(0,0,0)[0]          
 ARIMA(1,0,1)(0,0,0)[0]          
 ARIMA(1,0,1)(0,0,0)[0]          
 ARIMA(1,0,1)(

In [38]:
# Same printout as the previous cell, captured from an earlier execution (In [38]) in
# which `result` held a different set of models - compare the two outputs.
for r in result:
    print(r)

 ARIMA(2,0,3)(0,0,0)[0]          
 ARIMA(1,0,1)(0,0,0)[0]          
 ARIMA(1,0,1)(0,0,0)[0]          
 ARIMA(4,0,4)(0,0,0)[0]          
 ARIMA(1,0,0)(0,0,0)[0]          
 ARIMA(2,0,0)(0,0,0)[0]          
 ARIMA(1,0,0)(0,0,0)[0]          
 ARIMA(1,0,1)(0,0,0)[0]          
 ARIMA(1,0,0)(0,0,0)[0]          
 ARIMA(1,0,0)(0,0,0)[0]          
 ARIMA(1,0,0)(0,0,0)[0]          
 ARIMA(2,0,1)(0,0,0)[0]          
 ARIMA(2,0,0)(0,0,0)[0]          
 ARIMA(2,0,1)(0,0,0)[0]          
 ARIMA(3,0,2)(0,0,0)[0]          
 ARIMA(1,0,1)(0,0,0)[0]          
 ARIMA(1,0,0)(0,0,0)[0]          
 ARIMA(1,0,0)(0,0,0)[0]          
 ARIMA(3,0,1)(0,0,0)[0]          
 ARIMA(1,0,0)(0,0,0)[0]          
 ARIMA(1,0,0)(0,0,0)[0]          
 ARIMA(1,0,0)(0,0,0)[0]          
 ARIMA(1,0,1)(0,0,0)[0]          
 ARIMA(1,0,1)(0,0,0)[0]          
 ARIMA(1,0,0)(0,0,0)[0]          
 ARIMA(1,0,0)(0,0,0)[0]          
 ARIMA(1,0,0)(0,0,0)[0]          
 ARIMA(1,0,1)(0,0,0)[0]          
 ARIMA(1,0,1)(0,0,0)[0]          
 ARIMA(1,0,1)(

In [39]:
# Timing of a single auto_arima search on the first row, without passing maxiter
%timeit arima = auto_arima(y=unstandardized_residuals[0], seasonal=False, stationary=True, information_criterion='aic', with_intercept=False)

2.41 s ± 566 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [45]:
# Same single-row timing with maxiter=5
%timeit arima = auto_arima(y=unstandardized_residuals[0], seasonal=False, stationary=True, information_criterion='aic', with_intercept=False, maxiter=5)

630 ms ± 1.42 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
# Same single-row timing with the search capped at max_p=4 and max_q=4
%timeit arima = auto_arima(y=unstandardized_residuals[0], seasonal=False, stationary=True, information_criterion='aic', with_intercept=False, max_q=4, max_p=4)

In [47]:
# NOTE(review): no plain assignment to `arima` is visible in this notebook, only the
# `%timeit arima = ...` cells; this presumably relied on a since-removed cell - confirm.
arima

      with_intercept=False)

In [11]:
# datasets[-23][0].head(200)
# Leftover interactive lookup of the auto_arima documentation
help(auto_arima)

Help on function auto_arima in module pmdarima.arima.auto:

    Automatically discover the optimal order for an ARIMA model.
    
    The auto-ARIMA process seeks to identify the most optimal
    parameters for an ``ARIMA`` model, settling on a single fitted ARIMA model.
    This process is based on the commonly-used R function,
    ``forecast::auto.arima`` [3].
    
    Auto-ARIMA works by conducting differencing tests (i.e.,
    Kwiatkowski–Phillips–Schmidt–Shin, Augmented Dickey-Fuller or
    Phillips–Perron) to determine the order of differencing, ``d``, and then
    fitting models within ranges of defined ``start_p``, ``max_p``,
    ``start_q``, ``max_q`` ranges. If the ``seasonal`` optional is enabled,
    auto-ARIMA also seeks to identify the optimal ``P`` and ``Q`` hyper-
    parameters after conducting the Canova-Hansen to determine the optimal
    order of seasonal differencing, ``D``.
    
    In order to find the best model, auto-ARIMA optimizes for a given
    ``informatio

# Data collection

In [None]:
# Load the stored price data and keep only the columns up to the '2022-07-01' label
stonks = utils.get_stonk_data()
stonks = stonks.loc[:, :'2022-07-01']

In [None]:
# Run the rolling data-collection pipeline over the truncated price history.
# The regression and filter settings mirror the live "Run data pipeline" loop above
# (l_reg=3, l_roll=2, dt=10, residual cutoff 2.5, ADF pass rate 0.5, residual dt 21).
pipelines.data_collection_rolling_pipeline(
    stonks,
    l_reg=3,
    l_roll=2,
    dt=10,
    market_cap_min_mm=1000,
    market_cap_max_mm=None,
    last_residual_cutoff=2.5,
    mean_max_residual_dt=21,
    adf_pval_cutoff=0.1,
    adf_pass_rate_filter=0.5,
    trade_length_months=3,
    trading_interval_weeks=2,
    first_n_windows=1,
)

In [None]:
# Collect the pipeline outputs into a single frame and add two market-level features
dataset = utils.ingest_trade_pipeline_outputs()

# First row of each stored index series, looked up by date label below
vix = utils.get_stonk_data(fname_prefix='vix', disable_filter=True).iloc[0]
sp500 = utils.get_stonk_data(fname_prefix='sp500', disable_filter=True).iloc[0]

# Relative S&P 500 change over the preceding 63 observations, labelled with the later date
# (63 = 3 * 21; presumably three trading months, matching trade_length_months=3 - confirm)
sp500_chg = pd.Series((sp500.iloc[63:].values / sp500.iloc[:-63].values) - 1)
sp500_chg.index = sp500.iloc[63:].index

# Per-row lookups by trade date; a trade date missing from either series raises KeyError
dataset['vix'] = dataset['trade_date'].apply(lambda x: vix.loc[x])
dataset['sp500'] = dataset['trade_date'].apply(lambda x: sp500_chg.loc[x])
dataset.to_csv('data/dataset.csv', header=True, index=False)

# Model development

In [None]:
import xgboost as xgb
from hyperopt import STATUS_OK, STATUS_FAIL, Trials, fmin, hp, tpe, atpe, rand
import pickle

In [None]:
def train_production_xgb(df: pd.DataFrame, params: Dict[str, Any], noise_level: float = 0) -> Tuple[xgb.XGBClassifier, sklearn.base.TransformerMixin]:
    """Fit an XGBoost classifier on the whole of ``df`` and persist it with its scalers.

    Args:
        df: training frame; must contain a ``'label'`` column plus whatever
            ``preprocessing.transform_features`` expects.
        params: keyword arguments forwarded unchanged to ``xgb.XGBClassifier``.
        noise_level: forwarded to ``preprocessing.transform_features``.

    Returns:
        ``(classifier, scalers)``: the fitted classifier and the scalers returned by
        ``preprocessing.transform_features``.

    Side effects:
        Writes ``data/xgb_classifier.json`` (via ``save_model``) and ``data/scalers.json``.
        Despite its extension, the latter is a pickle file.
    """
    features, fitted_scalers = preprocessing.transform_features(df, noise_level=noise_level)
    labels = df['label']

    model = xgb.XGBClassifier(**params)
    # The training set doubles as the evaluation set
    model.fit(features, labels, eval_set=[(features, labels)])

    model_path = os.path.join('data', 'xgb_classifier.json')
    model.save_model(model_path)

    scalers_path = os.path.join('data', 'scalers.json')
    with open(scalers_path, 'wb') as handle:
        pickle.dump(fitted_scalers, handle)

    return model, fitted_scalers

In [None]:
# Load the collected dataset, keep positive-beta rows whose last residual magnitude is at
# least 2.5 (the same cutoff used during collection), then attach labels.
# NOTE: this re-binds `df`, which the download section also assigns.
df = pd.read_csv('data/dataset.csv')
df = df[df.beta > 0]
df = df[df.last_residual.abs() >= 2.5]
df = preprocessing.assign_labels(df)

In [None]:
# Production training set: drop the `drop_dates` earliest trade dates and shuffle the rest.
# NOTE: sample(frac=1) is called without random_state, so the row order differs per run.
drop_dates = 2
selected_dates = np.sort(df['trade_date'].unique())[drop_dates:]
df_prod = df[df.trade_date.isin(selected_dates)].sample(frac=1)
print(len(df_prod))
print(df_prod['label'].value_counts())

In [None]:
# NOTE(review): `params` is only defined in a cell further down this notebook, so this
# cell fails on a fresh top-to-bottom run unless that cell is executed first.
clf_prod, scalers_prod = train_production_xgb(df_prod, params, noise_level=0.005)

In [None]:
# Split into train / validation sets with a fixed random_state and report sizes and
# class balance. NOTE(review): the meaning of the positional 2, 6, 2 arguments is
# defined by preprocessing.split_data - confirm there.
splits = preprocessing.split_data(df, 2, 6, 2, random_state=3439)
print(len(splits['train']))
print(len(splits['validation']))
print(splits['train']['label'].value_counts())
print(splits['validation']['label'].value_counts())

In [None]:
# noise_level is passed only for the training features; validation uses 0
noise_level = 0.005

# The scalers returned for the training split are re-used for the validation split
X_train, scalers = preprocessing.transform_features(splits['train'], noise_level=noise_level)
X_valid, _ = preprocessing.transform_features(splits['validation'], scalers=scalers, noise_level=0)

y_train = splits['train']['label']
y_valid = splits['validation']['label']

In [None]:
# Hyperopt search space consumed by optimization_objective. The hp.quniform entries are
# cast to int by the objective before being passed to the classifier.
hyperparameter_space = {
    "gamma": hp.uniform("gamma", 0, 5),
    "scale_pos_weight" : hp.uniform("scale_pos_weight", 2, 12),
    "max_depth": hp.quniform("max_depth", 3, 10, 1),
    "min_child_weight" : hp.quniform("min_child_weight", 1, 8, 1),
    "max_delta_step" : hp.quniform("max_delta_step", 1, 4, 1),
    "n_estimators": hp.quniform("n_estimators", 50, 200, 1),
    # "n_estimators": hp.choice("n_estimators", np.array([50, 75, 100, 150, 200])),
    # "subsample": hp.uniform("subsample", 0.5, 1),
    # "colsample_bylevel" : hp.uniform("colsample_bylevel", 0.5, 1),
    }

In [None]:
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

def optimization_objective(space):
    """Hyperopt objective: fit one XGBoost classifier and score it on the validation split.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_valid`` and ``y_valid``.

    Args:
        space: one sample of the search space (mapping of hyperparameter name to value).

    Returns:
        Result dict for hyperopt. ``loss`` is the negated cutoff average precision, or 100
        with ``STATUS_FAIL`` when F1 or precision is zero. The remaining keys carry the
        metrics for later inspection.
    """
    model = xgb.XGBClassifier(
        # Sampled as floats
        gamma=space['gamma'],
        scale_pos_weight=space['scale_pos_weight'],
        # Sampled on an integer grid but delivered as floats
        max_depth=int(space['max_depth']),
        min_child_weight=int(space['min_child_weight']),
        max_delta_step=int(space['max_delta_step']),
        n_estimators=int(space['n_estimators']),
        # Held fixed during the search
        colsample_bylevel=1,
        learning_rate=0.1,
        subsample=1,
        tree_method="hist",
        enable_categorical=True,
        max_cat_to_onehot=1,
        # A fresh seed for every trial
        random_state=np.random.randint(9999999),
    )
    model.fit(X_train, y_train, verbose=False)

    positive_scores = model.predict_proba(X_valid)[:, 1]
    positive_calls = positive_scores > 0.5

    f1 = f1_score(y_valid, positive_calls, zero_division=0)
    precision = precision_score(y_valid, positive_calls, zero_division=0)
    ap = evaluate.average_precision_from_cutoff(y_valid, positive_scores, 0.4)
    roc = roc_auc_score(y_valid, positive_scores)

    pos_preds = int(positive_calls.sum())
    pos_labels = int(y_valid.sum())

    # Zero the score of models that predict fewer positives than there are positive labels
    if pos_preds < pos_labels:
        ap = 0

    degenerate = f1 == 0 or precision == 0
    return {
        'loss': 100 if degenerate else -ap,
        'precision': precision,
        'f1_score': f1,
        'ap': ap,
        'auc': roc,
        'pos_preds': pos_preds,
        'status': STATUS_FAIL if degenerate else STATUS_OK,
    }

In [None]:
# Run the TPE search over hyperparameter_space and store one CSV row per trial.
trials = Trials()

best_hyperparams = fmin(
    fn=optimization_objective,
    space=hyperparameter_space,
    algo=tpe.suggest,
    max_evals=1000,
    trials=trials,
)

# Sampled values per hyperparameter, extended with the metrics each trial reported
trial_vals = trials.vals
trial_vals.update({
    metric: [outcome[metric] for outcome in trials.results]
    for metric in ('f1_score', 'precision', 'ap', 'auc', 'pos_preds')
})

df_trials = pd.DataFrame.from_dict(trial_vals)
df_trials.to_csv('data/data-window-size-2#7.csv', index=False)

In [None]:
# Hand-picked classifier settings (presumably copied from one of the best hyperopt trials -
# confirm). Also consumed by the train_production_xgb call in an earlier cell.
# NOTE: random_state is drawn at random, so fits are not reproducible between runs.
params = {
    # reg def 0
    "gamma": 3.301387,
    # L2 def 1
    # "reg_lambda" : 1,
    # "reg_alpha" : 0,
    # Class imbalance def 1
    "scale_pos_weight" : 5.568589,
    # Integers:
    "max_depth": 6,
    # Reg def 1
    "min_child_weight" : 7,
    # Class imbalance def 0
    "max_delta_step" : 3,
    # Choice:
    "colsample_bylevel" : 1,
    "n_estimators": 63,
    "learning_rate": 0.1,
    "subsample": 1,
    # Fixed:
    "tree_method": "hist",
    "enable_categorical": True,
    "max_cat_to_onehot": 1,
    "eval_metric": ["logloss"],
    "random_state": np.random.randint(999929)
}

clf = xgb.XGBClassifier(
        **params
    )

# Both the validation and the training split are passed as evaluation sets
clf.fit(X_train, y_train, eval_set=[(X_valid, y_valid), (X_train, y_train)])

In [None]:
# Evaluate the fitted classifier on the validation split
print("**Validation**")
y_score = clf.predict_proba(X_valid)[:, 1]
# Decision threshold used here (the hyperopt objective thresholds at 0.5 instead)
thres = 0.7
y_preds = y_score > thres

evaluate.performance_summary(y_score, y_preds, y_valid, auc_cutoff=0.4)

df_results_valid = evaluate.returns_on_predictions(splits['validation'], y_preds)

evaluate.performance_on_slice(splits['validation'], y_score, y_preds, 'subindustry', False)

In [None]:
# Show up to 100 rows when displaying frames in the cells below
pd.set_option('display.max_rows', 100)

In [None]:
# df_results_valid[df_results_valid.result == 'TP'].iloc[:100]

In [None]:
# df_results_valid[df_results_valid.subindustry == 'consumer_services'].iloc[0:100]

In [None]:
# Names of the features the classifier was fitted on
print(clf.feature_names_in_)

In [None]:
# Importances of the fitted features (compare with the names printed by the cell above)
print(clf.feature_importances_)

In [None]:
# Top 50 trials by `ap` from a stored hyperopt run (re-binds df_trials)
df_trials = pd.read_csv('data/last-residual-cutoff-check-2.5#13.csv')
df_trials.sort_values('ap', ascending=False).head(50)

In [None]:
# Top 50 trials by `ap` from the run written by the hyperopt cell above (re-binds df_trials)
df_trials = pd.read_csv('data/data-window-size-2#7.csv')
df_trials.sort_values('ap', ascending=False).head(50)

In [None]:
# Top 50 trials by `ap` from another stored run, file suffix #6 (re-binds df_trials)
df_trials = pd.read_csv('data/data-window-size-2#6.csv')
df_trials.sort_values('ap', ascending=False).head(50)