# Imports

In [None]:
# Optional one-off environment setup. Uncomment to install the third-party
# packages used by this notebook and, presumably, by its helper modules.
# When enabling these, prefer `%pip install ...` over `!pip install ...` so the
# install targets the running kernel's environment, and pin versions.
# !pip install yfinance
# !pip install pmdarima
# !pip install hyperopt
# !pip install xgboost
# !pip install numpy -U

In [2]:
import os
from functools import partial
from datetime import datetime

import typing
from typing import Dict
from typing import Any
from typing import Tuple

import numpy as np
import pandas as pd
import sklearn

import utils
import pipelines
import processing
import evaluate
import predict
import train
import preprocessing

# Download stock daily prices & indexes

In [3]:
# Select the ticker universe: bounded by market cap and with the two
# health-care industry groups removed.
# (`_mm` presumably means "millions" - confirm against utils.get_ticker_names.)
market_cap_min_mm, market_cap_max_mm = 1000, None  # None = no upper bound used here

excluded_industries = [
    "pharmaceuticals_biotechnology_and_life_sciences",
    "health_care_equipment_and_services",
]

ticker_list = utils.get_ticker_names(
    market_cap_min_mm=market_cap_min_mm,
    market_cap_max_mm=market_cap_max_mm,
    remove_industries=excluded_industries,
)

In [48]:
# Number of years of history to download, counted backwards from `date_to`.
# May be a floating point number.
period_years = 11

# End of the download window. Defaults to the current local date/time.
# To pin a fixed snapshot instead, pass (year, month, day), e.g. 18 Jan 2021:
# date_to = datetime(2021, 1, 18)
date_to = datetime.today()

In [49]:
# Daily prices for every ticker in the selected universe (default file prefix).
_, _ = utils.download_stonk_prices(
    ticker_list.index, period_years=period_years, date_to=date_to
)

# Market-wide reference series, each stored under its own file prefix.
# Maps file prefix -> download symbol; dict order fixes the download order.
index_symbols = {
    "vix": "^VIX",
    "sp500": "^GSPC",
    "oil": "CL=F",
    "usd": "DX=F",
    "yield": "^TNX",
    "copper": "HG=F",
}
for prefix, symbol in index_symbols.items():
    # The two-element unpack is kept so an unexpected return shape still fails loudly.
    _, _ = utils.download_stonk_prices(
        [symbol], period_years=period_years, date_to=date_to, fname_prefix=prefix
    )

[*********************100%***********************]  2261 of 2261 completed

38 Failed downloads:
- EPAY: No data found, symbol may be delisted
- BIP.PRB: No data found, symbol may be delisted
- POST WI: No data found, symbol may be delisted
- CNR: No data found, symbol may be delisted
- SAFM: No data found, symbol may be delisted
- DELL WI: No data found, symbol may be delisted
- O.WI: No data found, symbol may be delisted
- FB: No data found, symbol may be delisted
- JOBS: No data found, symbol may be delisted
- DIDI: No data found, symbol may be delisted
- OAS: No data found, symbol may be delisted
- CDK: No data found, symbol may be delisted
- SGMS: No data found, symbol may be delisted
- BLL: No data found, symbol may be delisted
- HNP: No data found, symbol may be delisted
- MSP: No data found, symbol may be delisted
- SNX.WI: No data found, symbol may be delisted
- T WD: No data found, symbol may be delisted
- WLL: No data found, symbol may be delisted
- ZNGA: No data found, symb

# Run data pipeline

In [4]:
# Industries processed by the live pipeline, in run order. The two health-care
# groups are commented out, matching the `remove_industries` exclusions used
# when the ticker list is built.
industries = [
    # 'health_care_equipment_and_services',
    "software_and_services",
    "retailing",
    "telecommunication_services",
    "capital_goods",
    "energy",
    # 'pharmaceuticals_biotechnology_and_life_sciences',
    "consumer_staples",
    "banks",
    "diversified_financials",
    "metals_and_mining",
    "technology_hardware_and_equipment",
    "utilities",
    "chemicals",
    "automobiles_and_components",
    "semiconductors_and_semiconductor_equipment",
    "media_and_entertainment",
    "real_estate",
    "consumer_services",
    "consumer_durables_and_apparel",
    "insurance",
    "transportation",
    "commercial_and_professional_services",
    "paper_and_forest_products",
    "containers_and_packaging",
    "construction_materials",
]

# Feature-pipeline settings; each is forwarded unchanged, under the same
# keyword name, to pipelines.process_features_from_price_data.
l_reg = 3
l_roll = 2
dt = 10

last_residual_cutoff = 0
mean_max_residual_dt = 21

adf_pval_cutoff = 0.1
adf_pass_rate_filter = 0.5

arima_forecast_months = 3
arima_eval_models = 5

# Output location: data/pipeline_run_<min>_to_<max|"max">_cap
if market_cap_max_mm is None:
    market_cap_max_string = "max"
else:
    market_cap_max_string = str(market_cap_max_mm)
pipeline_dir = "pipeline_run_{}_to_{}_cap".format(
    market_cap_min_mm, market_cap_max_string
)
output_dir = os.path.join("data", pipeline_dir)

# Model and market index data shared by every industry iteration.
stonk_model = predict.XGBStonkModel()

market_indexes = utils.get_market_indexes()

In [None]:
# Live pipeline: for each industry, build pair features from price data, score
# them with the model and write every intermediate table to `output_dir` as CSV.

# DataFrame.to_csv does not create missing parent directories, so make sure the
# run directory exists before the first write.
os.makedirs(output_dir, exist_ok=True)

# One (model input dataset, model-processed frame) tuple per industry that
# produced at least one trade.
datasets = []
total_industries = len(industries)
for i, industry in enumerate(industries, start=1):
    # NOTE(review): with filter_industries=True, `remove_industries` presumably
    # acts as the list of industries to keep - confirm in utils.get_stonk_data.
    stonks = utils.get_stonk_data(
        market_cap_min_mm,
        market_cap_max_mm,
        remove_industries=[industry],
        filter_industries=True,
    )
    X, Y = processing.combine_stonk_pairs(stonks)

    print("Industry ({0}/{1}): {2}".format(i, total_industries, industry))

    features = pipelines.process_features_from_price_data(
        X=X,
        Y=Y,
        market_indexes=market_indexes,
        l_reg=l_reg,
        l_roll=l_roll,
        dt=dt,
        last_residual_cutoff=last_residual_cutoff,
        adf_pval_cutoff=adf_pval_cutoff,
        adf_pass_rate_filter=adf_pass_rate_filter,
        mean_max_residual_dt=mean_max_residual_dt,
        arima_forecast_months=arima_forecast_months,
        arima_eval_models=arima_eval_models,
    )

    # An empty result means nothing passed the filters for this industry.
    if len(features) == 0:
        print("No trades")
        continue

    print(
        "Mean max value for {0}: {1}".format(industry, features["residuals_max_mean"])
    )
    dataset = utils.build_dataset_from_live_data_by_industry(
        std_residuals=features["std_residuals"],
        adfs=features["adfs"],
        subindustry=industry,
        mean_max_residual=features["residuals_max_mean"],
        residual_quantile=features["residuals_quantile"],
        # VIX value at the last column label of the price table.
        vix_index=market_indexes["vix"].loc[stonks.columns[-1]],
        betas_stability_rsquared=features["beta_stability_rsquared_vals"],
        arima_forecasts=features["arima_forecasts"],
    )

    predictions, df_processed = stonk_model.predict(dataset)
    datasets.append((dataset, df_processed))
    # Re-key the predictions with the same index as the ADF table.
    predictions = pd.DataFrame(predictions)
    predictions.index = features["adfs"].index

    # Prepend the dates as the first column so they are saved with the values.
    features["residuals"].insert(0, "dates", features["dates_index"].values)
    features["betas"].insert(0, "dates", features["dates_index"].values)

    # (table, file suffix, write header row?) - only the correlations file
    # carries a header row.
    csv_outputs = [
        (features["residuals"], "_residuals.csv", False),
        (features["betas"], "_betas.csv", False),
        (features["adfs_raw"], "_adfs_raw.csv", False),
        (predictions, "_predictions.csv", False),
        (features["arima_forecasts"], "_arima.csv", False),
        (features["beta_stability_rsquared_vals"], "_rsquared.csv", False),
        (features["market_correlations"], "_correlations.csv", True),
    ]
    for table, suffix, write_header in csv_outputs:
        table.to_csv(
            os.path.join(output_dir, industry + suffix),
            header=write_header,
            index=True,
        )

print("*** All done ***")

Industry (1/24): software_and_services
Mean max value for software_and_services: 3.864000082015991
Industry (2/24): retailing
Mean max value for retailing: 3.9140000343322754
Industry (3/24): telecommunication_services
Mean max value for telecommunication_services: 3.4760000705718994
Industry (4/24): capital_goods
Mean max value for capital_goods: 3.86899995803833
Industry (5/24): energy
Mean max value for energy: 3.697999954223633
Industry (6/24): consumer_staples


# Data collection

In [3]:
# Load the unfiltered price table and keep only the columns up to and
# including the "2022-09-16" label (label slices with .loc are inclusive).
stonks = utils.get_stonk_data(disable_filter=True).loc[:, :"2022-09-16"]

In [None]:
# Rolling data-collection run over the historical price table.
# The settings repeat the live-pipeline values, except that
# last_residual_cutoff is 2.5 here (the live config cell uses 0).
pipelines.data_collection_rolling_pipeline(
    stonk_prices=stonks,
    # Universe selection
    market_cap_min_mm=1000,
    market_cap_max_mm=None,
    remove_industries=[
        "pharmaceuticals_biotechnology_and_life_sciences",
        "health_care_equipment_and_services",
    ],
    # Feature settings
    l_reg=3,
    l_roll=2,
    dt=10,
    last_residual_cutoff=2.5,
    mean_max_residual_dt=21,
    adf_pval_cutoff=0.1,
    adf_pass_rate_filter=0.5,
    arima_forecast_months=3,
    arima_eval_models=5,
    # Trade schedule and number of windows
    trade_length_months=3,
    trading_interval_weeks=2,
    first_n_windows=140,
)

Total data windows: 140


In [15]:
from utils import map_subindustries_to_industries

# Assemble the training dataset from the data-collection outputs, enrich it
# with VIX, industry and normalized ARIMA columns, and save it as one CSV.
trade_outputs_dir = "data/data_collection_pipeline/1000_to_max/"
dataset = utils.ingest_trade_pipeline_outputs(data_dir=trade_outputs_dir)

# First row of the stored VIX table; looked up by each row's trade_date.
vix_frame = utils.get_stonk_data(fname_prefix="vix", disable_filter=True)
vix = vix_frame.iloc[0]
dataset["vix"] = dataset["trade_date"].apply(lambda trade_date: vix.loc[trade_date])

# Row-wise derived columns.
dataset.loc[:, "industry"] = dataset.apply(map_subindustries_to_industries, axis=1)
dataset.loc[:, "arima_forecast_normalized"] = dataset.apply(
    utils.normalize_arima_forecast, axis=1
)

# Disabled experiment: per (trade_date, subindustry) 0.9-quantile of |last_residual|.
# new_dataset = []
# for date, date_group in dataset.groupby(by="trade_date"):
#     for industry, industry_group in date_group.groupby(by="subindustry"):
#         residual_quantile = industry_group["last_residual"].abs().quantile(q=0.9)
#         industry_group.loc[:, "residual_quantile"] = np.full(len(industry_group), residual_quantile)
#         new_dataset.append(industry_group)

# new_dataset = pd.concat(new_dataset)

dataset.to_csv("data/dataset_bigcap.csv", header=True, index=False)

# Model development

In [61]:
import xgboost as xgb
from hyperopt import STATUS_OK, STATUS_FAIL, Trials, fmin, hp, tpe, atpe, rand
import pickle

In [63]:
# Development aid: re-import the local `train` module so edits made to train.py
# take effect without restarting the kernel.
import importlib
importlib.reload(train)

<module 'train' from '/home/jupyter/stonk-rank/train.py'>

In [None]:
# Walk-forward model validation.
# NOTE: `df` is not defined above this cell. It is built further down, in the
# cells that read data/dataset_bigcap.csv and filter it, so run those first.
validation_results = pipelines.model_validation_pipeline(
    dataset=df,
    filename_prefix="final_one",
    verbose=False,
    # Data windows
    fixed_train_window_size=True,
    data_window_max_train_size=60,
    data_window_test_size=2,
    data_window_gap_size=6,
    # Hyper-parameter search
    hp_model_evals=1000,
    hp_nth_best_model=1,
    random_noise=0.005,
    # Trade selection
    top_n_best_trades=5,
    min_industry_confidence=0.4,
)

Total data windows: 33
Period 2019-06-26 to 2022-06-16
  1%|          | 8/1000 [00:01<02:24,  6.87trial/s, best loss: -0.24873356851930287]

  recall = tps / tps[-1]



  5%|▌         | 54/1000 [00:09<03:09,  4.99trial/s, best loss: -0.27386190433735064]

  recall = tps / tps[-1]



 17%|█▋        | 173/1000 [00:30<02:09,  6.41trial/s, best loss: -0.30093178532531817]

  recall = tps / tps[-1]



 19%|█▊        | 186/1000 [00:32<02:17,  5.91trial/s, best loss: -0.30093178532531817]

  recall = tps / tps[-1]



 25%|██▍       | 249/1000 [00:45<04:37,  2.70trial/s, best loss: -0.30093178532531817]

  recall = tps / tps[-1]



 28%|██▊       | 281/1000 [00:51<02:24,  4.96trial/s, best loss: -0.30093178532531817]

  recall = tps / tps[-1]



 31%|███       | 311/1000 [00:57<02:12,  5.20trial/s, best loss: -0.30093178532531817]

  recall = tps / tps[-1]



 32%|███▏      | 322/1000 [00:59<02:15,  5.01trial/s, best loss: -0.30093178532531817]

  recall = tps / tps[-1]



 45%|████▌     | 450/1000 [01:26<01:51,  4.94trial/s, best loss: -0.30093178532531817]

  recall = tps / tps[-1]



 64%|██████▎   | 637/1000 [02:14<01:29,  4.06trial/s, best loss: -0.30093178532531817]

  recall = tps / tps[-1]



 64%|██████▍   | 638/1000 [02:14<01:28,  4.09trial/s, best loss: -0.30093178532531817]

  recall = tps / tps[-1]



 71%|███████▏  | 714/1000 [02:33<01:09,  4.14trial/s, best loss: -0.30093178532531817]

  recall = tps / tps[-1]



 88%|████████▊ | 877/1000 [03:17<00:34,  3.58trial/s, best loss: -0.30093178532531817]

  recall = tps / tps[-1]



 89%|████████▉ | 889/1000 [03:20<00:28,  3.94trial/s, best loss: -0.30093178532531817]

  recall = tps / tps[-1]



 92%|█████████▏| 917/1000 [03:27<00:22,  3.76trial/s, best loss: -0.30093178532531817]

  recall = tps / tps[-1]



 95%|█████████▌| 950/1000 [03:37<00:12,  3.91trial/s, best loss: -0.30093178532531817]

In [64]:
# Load the saved dataset and keep only rows with a positive beta and an
# absolute last residual of at least 2.5, then attach the training labels.
df = pd.read_csv("data/dataset_bigcap.csv")
keep_rows = (df.beta > 0) & (df.last_residual.abs() >= 2.5)
df = preprocessing.assign_labels(df[keep_rows])

In [65]:
# Keep only pairs whose two legs are both in the current ticker universe.
updated_ticker_list = utils.get_ticker_names(
    1000,
    None,
    remove_industries=[
        "health_care_equipment_and_services",
        "pharmaceuticals_biotechnology_and_life_sciences",
    ],
)
known_tickers = updated_ticker_list.index
both_legs_known = df.ticker_x.isin(known_tickers) & df.ticker_y.isin(known_tickers)
df = df[both_legs_known]

## Production model training

In [55]:
# Production training set: rows from the `train_size` most recent trade dates,
# in shuffled row order.
train_size = 60
all_trade_dates = np.sort(df["trade_date"].unique())
selected_dates = all_trade_dates[-train_size:]
in_training_window = df.trade_date.isin(selected_dates)
# NOTE(review): the shuffle is unseeded, so row order differs between runs.
df_prod = df[in_training_window].sample(frac=1)
print(len(df_prod))
print(df_prod["label"].value_counts())

32795
0    26803
1     5992
Name: label, dtype: int64


In [56]:
# Fit the production classifier on the shuffled production training set.
# NOTE: `params` is not defined above this cell. It is assigned further down,
# in the "Model training experiments" section, so run that cell first.
clf_prod, scalers_prod = train.train_production_xgb(df_prod, params, noise_level=0.005)

[0]	validation_0-logloss:0.68395
[1]	validation_0-logloss:0.67694
[2]	validation_0-logloss:0.67018
[3]	validation_0-logloss:0.66405
[4]	validation_0-logloss:0.65914
[5]	validation_0-logloss:0.65330
[6]	validation_0-logloss:0.64846
[7]	validation_0-logloss:0.64427
[8]	validation_0-logloss:0.64039
[9]	validation_0-logloss:0.63648
[10]	validation_0-logloss:0.63311
[11]	validation_0-logloss:0.63030
[12]	validation_0-logloss:0.62709
[13]	validation_0-logloss:0.62282
[14]	validation_0-logloss:0.61889
[15]	validation_0-logloss:0.61645
[16]	validation_0-logloss:0.61446
[17]	validation_0-logloss:0.61144
[18]	validation_0-logloss:0.60957
[19]	validation_0-logloss:0.60610
[20]	validation_0-logloss:0.60447
[21]	validation_0-logloss:0.60213
[22]	validation_0-logloss:0.60041
[23]	validation_0-logloss:0.59746
[24]	validation_0-logloss:0.59531
[25]	validation_0-logloss:0.59361
[26]	validation_0-logloss:0.59198
[27]	validation_0-logloss:0.58907
[28]	validation_0-logloss:0.58756
[29]	validation_0-loglos

## Model training experiments

In [50]:
# Development aid: re-import the local `preprocessing` module so edits made to
# preprocessing.py take effect without restarting the kernel.
import importlib
importlib.reload(preprocessing)

<module 'preprocessing' from '/home/jupyter/stonk-rank/preprocessing.py'>

In [51]:
# Split `df` by trade date into train / validation sets and report their
# sizes followed by their label balance.
splits = preprocessing.split_data(
    df,
    date_count_train=60,
    date_count_valid=2,
    date_count_gap=6,
    random_state=3303544,
)
split_names = ("train", "validation")
for split_name in split_names:
    print(len(splits[split_name]))
for split_name in split_names:
    print(splits[split_name]["label"].value_counts())

30621
964
0    24749
1     5872
Name: label, dtype: int64
0    781
1    183
Name: label, dtype: int64


In [52]:
# Transform the training split with noise_level=0.005 and keep the scalers it
# returns, then transform the validation split with those same scalers and
# noise_level=0.
train_split = splits["train"]
valid_split = splits["validation"]

X_train, scalers = preprocessing.transform_features(train_split, noise_level=0.005)
X_valid, _ = preprocessing.transform_features(
    valid_split, scalers=scalers, noise_level=0
)

y_train = train_split["label"]
y_valid = valid_split["label"]

In [44]:
# Hyper-parameter search; results are written as CSV (write_csv=True) under
# the trial name "new_pipeline#3".
df_trial_results = train.model_hp_search(
    df,
    trial_name="new_pipeline#3",
    n_evals=1000,
    random_state=420,
    additive_random_noise=0.005,
    # Training-window settings
    fixed_train_window_size=False,
    max_train_window_size=68,
    train_window_min_size=60,
    train_window_stride=2,
    # Output
    write_csv=True,
    data_dir="data",
    output_dir="experiments",
)

 72%|███████▎  | 725/1000 [03:32<02:28,  1.85trial/s, best loss: -0.48678320681977716]

  recall = tps / tps[-1]



100%|██████████| 1000/1000 [05:14<00:00,  3.18trial/s, best loss: -0.48678320681977716]


In [46]:
# Ten best trials of a stored search, ranked by the "ap" column.
# NOTE(review): this reads trial "#1" while the search cell above writes
# "new_pipeline#3" - confirm this is the intended file.
trials_csv = "data/experiments/new_pipeline#1.csv"
df_trials = pd.read_csv(trials_csv)
df_trials = df_trials.sort_values(by="ap", ascending=False)
df_trials.head(10)

Unnamed: 0,gamma,max_delta_step,max_depth,min_child_weight,n_estimators,scale_pos_weight,f1_score,precision,ap,auc,pos_preds
0,4.098422,1.0,7.0,5.0,48.0,4.578455,0.285714,0.267943,0.536598,0.590195,209
1,2.408549,1.0,7.0,4.0,61.0,4.484318,0.285714,0.281915,0.530729,0.613488,188
2,2.7778,1.0,7.0,4.0,54.0,4.441888,0.276215,0.259615,0.529038,0.611931,208
3,2.124206,1.0,7.0,3.0,50.0,4.366102,0.335106,0.326425,0.527842,0.623108,193
4,3.066145,1.0,7.0,4.0,43.0,4.489866,0.318627,0.288889,0.510161,0.607079,225
5,2.818312,1.0,7.0,4.0,53.0,4.281667,0.293478,0.291892,0.500951,0.620694,185
6,3.90568,1.0,7.0,5.0,48.0,4.451125,0.303665,0.291457,0.49893,0.616279,199
7,2.498228,1.0,7.0,3.0,43.0,4.363691,0.277333,0.270833,0.491842,0.598182,192
8,2.400389,1.0,7.0,3.0,59.0,4.811645,0.304545,0.2607,0.48917,0.609905,257
9,4.990467,1.0,7.0,4.0,59.0,4.448196,0.29765,0.285,0.486946,0.619634,200


In [30]:
# Ten best trials of the "adf_upgrade_yes-adf-recent#1" search, ranked by the
# "ap" column.
trials_csv = "data/experiments/adf_upgrade_yes-adf-recent#1.csv"
df_trials = pd.read_csv(trials_csv)
df_trials = df_trials.sort_values(by="ap", ascending=False)
df_trials.head(10)

Unnamed: 0,gamma,max_delta_step,max_depth,min_child_weight,n_estimators,scale_pos_weight,train_window_size,f1_score,precision,ap,auc,pos_preds
0,1.802354,3.0,4.0,7.0,64.0,4.104953,64.0,0.276423,0.274194,0.466065,0.615832,186
1,1.708711,3.0,4.0,8.0,63.0,4.365494,64.0,0.298201,0.281553,0.454368,0.628979,206
2,2.033617,3.0,4.0,8.0,62.0,4.208674,64.0,0.293333,0.286458,0.453274,0.628314,192
3,1.160542,3.0,4.0,8.0,69.0,4.151177,56.0,0.275676,0.272727,0.44579,0.615212,187
4,1.291625,3.0,4.0,8.0,65.0,4.340644,64.0,0.305085,0.273913,0.443988,0.637553,230
5,2.506153,3.0,5.0,8.0,70.0,4.382457,64.0,0.323529,0.293333,0.439193,0.639323,225
6,0.846906,3.0,5.0,8.0,30.0,4.048634,64.0,0.295337,0.280788,0.432918,0.624504,203
7,2.397897,3.0,4.0,8.0,41.0,4.015005,64.0,0.317073,0.286344,0.429513,0.620754,227
8,2.164158,3.0,5.0,8.0,46.0,3.958084,64.0,0.293963,0.282828,0.428214,0.637969,198
9,1.959557,3.0,4.0,8.0,74.0,4.835569,64.0,0.298901,0.25,0.425112,0.618994,272


In [53]:
# Classifier settings. gamma, scale_pos_weight, max_depth, min_child_weight,
# max_delta_step and n_estimators equal the top row of the "new_pipeline#1"
# trial table shown above.
params = dict(
    gamma=4.098422,
    scale_pos_weight=4.578455,
    max_depth=7,
    min_child_weight=5,
    max_delta_step=1,
    colsample_bylevel=1,
    n_estimators=48,
    learning_rate=0.1,
    subsample=1,
    tree_method="hist",
    enable_categorical=True,
    max_cat_to_onehot=1,
    eval_metric=["logloss"],
    # NOTE(review): a fresh seed is drawn on every run of this cell, so the
    # fitted model is not reproducible unless the drawn value is recorded.
    random_state=np.random.randint(999929),
)

# Evaluation sets are reported in this order: validation_0 is the validation
# split, validation_1 is the training split.
evaluation_sets = [(X_valid, y_valid), (X_train, y_train)]

clf = xgb.XGBClassifier(**params)
clf = clf.fit(X_train, y_train, eval_set=evaluation_sets)

[0]	validation_0-logloss:0.68759	validation_1-logloss:0.68290
[1]	validation_0-logloss:0.68288	validation_1-logloss:0.67338
[2]	validation_0-logloss:0.67600	validation_1-logloss:0.66452
[3]	validation_0-logloss:0.67208	validation_1-logloss:0.65670
[4]	validation_0-logloss:0.66720	validation_1-logloss:0.65007
[5]	validation_0-logloss:0.66452	validation_1-logloss:0.64404
[6]	validation_0-logloss:0.65591	validation_1-logloss:0.63785
[7]	validation_0-logloss:0.65363	validation_1-logloss:0.63217
[8]	validation_0-logloss:0.65215	validation_1-logloss:0.62744
[9]	validation_0-logloss:0.64749	validation_1-logloss:0.62293
[10]	validation_0-logloss:0.64431	validation_1-logloss:0.61923
[11]	validation_0-logloss:0.64278	validation_1-logloss:0.61585
[12]	validation_0-logloss:0.64060	validation_1-logloss:0.61290
[13]	validation_0-logloss:0.63685	validation_1-logloss:0.60929
[14]	validation_0-logloss:0.63028	validation_1-logloss:0.60535
[15]	validation_0-logloss:0.62329	validation_1-logloss:0.60211
[1

In [54]:
# Score the validation split, threshold the positive-class probability and
# print the classification and return summaries.
print("**Validation**")
thres = 0.5
y_score = clf.predict_proba(X_valid)[:, 1]  # column 1 of predict_proba
y_preds = y_score > thres

# Copy of the validation rows with the score and the thresholded prediction.
df_results_valid = splits["validation"].assign(score=y_score, prediction=y_preds)

_ = evaluate.performance_summary(
    y_score=y_score,
    y_preds=y_preds,
    y_true=y_valid,
    auc_cutoff=0.5,
    verbose=True,
)

_ = evaluate.returns_on_predictions(df_results_valid, verbose=True)

# _ = evaluate.performance_on_slice(df_results_valid, "subindustry")

**Validation**
Precision: 0.245136186770428
PR-AUC/AP score: 0.38282887033811264
ROC-AUC score: 0.5980283089495743
Total positive predictions: 257
Total positive labels: 183

Totals:
        prediction
result            
FN             120
FP             194
TN             587
TP              63

Means:
        return_one_month  return_two_month  return_three_month
result                                                        
FN              0.075925          0.093075            0.080333
FP              0.006701         -0.009912           -0.009582
TN              0.004467         -0.019247           -0.029538
TP              0.066444          0.117079            0.109556

Std:
        return_one_month  return_two_month  return_three_month
result                                                        
FN              0.070681          0.062062            0.076462
FP              0.039699          0.060385            0.071674
TN              0.048008          0.065300            0.069

In [46]:
# pd.set_option("display.max_rows", 200)
# evaluate.performance_on_trading_use_case(
#         df_results_valid, top_n_trades=5, min_industry_score=0.4
#     )

In [31]:
# Print each input feature name next to its importance in the fitted classifier.
for feature_and_importance in zip(clf.feature_names_in_, clf.feature_importances_):
    print(*feature_and_importance)

adf_pass_rate 0.029732222
last_residual 0.08531821
residual_mean_max 0.0812026
industry 0.118948996
vix 0.26275504
betas_rsquared 0.14753708
arima_forecast_normalized 0.21244633
residual_inter 0.062059514


In [44]:
# Show up to 200 rows when a DataFrame is displayed.
pd.options.display.max_rows = 200

In [None]:
# First 100 false-positive validation rows, without the bookkeeping columns.
# NOTE(review): the `result` column is not created in this notebook; presumably
# evaluate.returns_on_predictions adds it to df_results_valid - confirm.
hidden_columns = ["beta", "intercept", "data_window_start", "label", "prediction"]
false_positives = df_results_valid[df_results_valid.result == "FP"]
false_positives.drop(columns=hidden_columns).iloc[:100]

In [None]:
# df_results_valid[df_results_valid.return_three_month < -0.2]

In [None]:
# df_results_valid[df_results_valid.subindustry == 'consumer_services'].iloc[0:100]

In [1]:
# Summary statistics (mean, min, quartiles, max) of the "final_one" validation run.
# NOTE: the saved output below is a NameError because this cell was executed
# as In [1], before the imports cell; run the imports first.
validation_csv = "data/experiments/validation/final_one_validation_pipeline_60_True_2_6_1000_5_0.4_0.005_1.csv"
summary_columns = [
    "_pos_pred_ret_3mo",
    "_fp_ret_3mo",
    "_ap",
    "banksbanks_top5_ret_3mo",
    "capitgoods_top5_ret_3mo",
    "chemiicals_top5_ret_3mo",
    "divercials_top5_ret_3mo",
    "energnergy_top5_ret_3mo",
    "semicpment_top5_ret_3mo",
    "softwvices_top5_ret_3mo",
    "technpment_top5_ret_3mo",
    "transation_top5_ret_3mo",
    "utiliities_top5_ret_3mo",
]
vld = pd.read_csv(validation_csv)
vld[summary_columns].describe().drop(index=["std", "count"])

NameError: name 'pd' is not defined

In [37]:
# Summary statistics (mean, min, quartiles, max) of the
# "new_adf_recent_adf_included" validation run.
validation_csv = "data/experiments/validation/new_adf_recent_adf_included_validation_pipeline_64_False_2_6_1000_5_0.4_0.005_1.csv"
summary_columns = [
    "_pos_pred_ret_3mo",
    "_fp_ret_3mo",
    "_ap",
    "banksbanks_top5_ret_3mo",
    "capitgoods_top5_ret_3mo",
    "chemiicals_top5_ret_3mo",
    "divercials_top5_ret_3mo",
    "energnergy_top5_ret_3mo",
    "semicpment_top5_ret_3mo",
    "softwvices_top5_ret_3mo",
    "technpment_top5_ret_3mo",
    "transation_top5_ret_3mo",
    "utiliities_top5_ret_3mo",
]
vld = pd.read_csv(validation_csv)
vld[summary_columns].describe().drop(index=["std", "count"])

Unnamed: 0,_pos_pred_ret_3mo,_fp_ret_3mo,_ap,banksbanks_top5_ret_3mo,capitgoods_top5_ret_3mo,chemiicals_top5_ret_3mo,divercials_top5_ret_3mo,energnergy_top5_ret_3mo,semicpment_top5_ret_3mo,softwvices_top5_ret_3mo,technpment_top5_ret_3mo,transation_top5_ret_3mo,utiliities_top5_ret_3mo
mean,-0.009548,-0.046613,0.319315,-0.002054,-0.01575,0.017282,-0.030595,0.031155,0.004512,-0.010848,0.031333,-0.012696,0.010837
min,-0.485,-0.597,0.062,-0.095,-0.213,-0.166,-0.273,-0.217,-0.305,-0.112,-0.181,-0.188,-0.093
25%,-0.03,-0.052,0.209,-0.024,-0.04975,-0.03,-0.057,-0.02475,-0.02425,-0.044,-0.0245,-0.0485,-0.01675
50%,0.0,-0.036,0.292,-0.004,-0.002,0.023,-0.023,0.0265,0.01,-0.011,0.01,-0.014,-0.0005
75%,0.016,-0.02,0.391,0.021,0.03,0.055,0.01025,0.07125,0.0545,0.0165,0.0485,0.047,0.03825
max,0.112,0.02,0.773,0.102,0.176,0.194,0.065,0.33,0.139,0.177,1.178,0.114,0.151


In [36]:
# Summary statistics (mean, min, quartiles, max) of the
# "dynamic-train-window-sparse" validation run.
validation_csv = "data/experiments/validation/dynamic-train-window-sparse_validation_pipeline_62_False_2_6_1000_5_0.4_0.005_1.csv"
summary_columns = [
    "_pos_pred_ret_3mo",
    "_fp_ret_3mo",
    "_ap",
    "banksbanks_top5_ret_3mo",
    "capitgoods_top5_ret_3mo",
    "chemiicals_top5_ret_3mo",
    "divercials_top5_ret_3mo",
    "energnergy_top5_ret_3mo",
    "semicpment_top5_ret_3mo",
    "softwvices_top5_ret_3mo",
    "technpment_top5_ret_3mo",
    "transation_top5_ret_3mo",
    "utiliities_top5_ret_3mo",
]
vld = pd.read_csv(validation_csv)
vld[summary_columns].describe().drop(index=["std", "count"])

Unnamed: 0,_pos_pred_ret_3mo,_fp_ret_3mo,_ap,banksbanks_top5_ret_3mo,capitgoods_top5_ret_3mo,chemiicals_top5_ret_3mo,divercials_top5_ret_3mo,energnergy_top5_ret_3mo,semicpment_top5_ret_3mo,softwvices_top5_ret_3mo,technpment_top5_ret_3mo,transation_top5_ret_3mo,utiliities_top5_ret_3mo
mean,0.047659,-0.027708,0.35029,0.004711,-0.002697,0.010904,-0.004976,0.02812,-0.002802,0.000348,0.019851,0.006367,0.006594
min,-0.094,-0.101,0.0,-0.104,-0.19,-0.106,-0.146,-0.19,-0.226,-0.271,-0.675,-0.218,-0.064
25%,-0.02025,-0.04625,0.167,-0.02675,-0.048,-0.028,-0.04325,-0.042,-0.03,-0.032,-0.0275,-0.034,-0.02
50%,0.0155,-0.021,0.321,-0.0025,-0.002,0.001,-0.0025,0.003,0.0,0.003,0.032,0.0115,0.005
75%,0.04425,-0.003,0.473,0.0305,0.035,0.047,0.033,0.0675,0.048,0.053,0.077,0.0405,0.02
max,1.309,0.036,1.0,0.153,0.261,0.162,0.136,0.558,0.138,0.164,0.824,0.228,0.088
