# Imports

In [None]:
# Optional one-off environment setup: uncomment the lines below to install the
# third-party packages used by this notebook and, presumably, by its helper
# modules, then restart the kernel.
# NOTE(review): versions are unpinned; consider `%pip install pkg==x.y.z` so the
# install targets the kernel's environment and stays reproducible.
# !pip install yfinance
# !pip install pmdarima
# !pip install hyperopt
# !pip install xgboost
# !pip install numpy -U

In [2]:
import os
from functools import partial
from datetime import datetime

import typing
from typing import Dict
from typing import Any
from typing import Tuple

import numpy as np
import pandas as pd
import sklearn

import utils
import pipelines
import processing
import evaluate
import predict
import train
import preprocessing

# Download stock daily prices & indexes

In [9]:
### Market-cap window used to select tickers (the `_mm` suffix suggests millions — TODO confirm units).
### None appears to mean "no upper bound": the run directory is labelled "max" in that case.
market_cap_min_mm = 5000
market_cap_max_mm = None

### Ticker names inside the market-cap window, with two health-care industries excluded
ticker_list = utils.get_ticker_names(
    remove_industries=[
        "pharmaceuticals_biotechnology_and_life_sciences",
        "health_care_equipment_and_services",
    ],
    market_cap_max_mm=market_cap_max_mm,
    market_cap_min_mm=market_cap_min_mm,
)

In [10]:
### How many years of data to download, counted backwards from `date_to`.
### The value may be a floating point number.
period_years = 8

### End of the download window. To pin a specific date use datetime(Y, M, D), e.g.
# date_to = datetime(2021, 1, 18)  # 18th of January 2021
### Default: today's date
date_to = datetime.today()

In [11]:
# Every download uses the same period and end date; only the ticker list and the
# optional file-name prefix differ, so the calls are driven from one table
# instead of seven copy-pasted statements.
# Each job is (tickers, extra keyword arguments). The first job fetches the whole
# stock universe and passes no `fname_prefix` (the downloader's default applies);
# every other job fetches a single index/future symbol under an explicit prefix.
download_jobs = [
    (ticker_list.index, {}),
    (["^VIX"], {"fname_prefix": "vix"}),
    (["^GSPC"], {"fname_prefix": "sp500"}),
    (["CL=F"], {"fname_prefix": "oil"}),
    (["DX=F"], {"fname_prefix": "usd"}),
    (["^TNX"], {"fname_prefix": "yield"}),
    (["HG=F"], {"fname_prefix": "copper"}),
]

for tickers, extra_kwargs in download_jobs:
    # Both return values are discarded: the call is made for its side effects
    # (presumably writing the downloaded prices to disk — confirm in utils).
    _, _ = utils.download_stonk_prices(
        tickers, period_years=period_years, date_to=date_to, **extra_kwargs
    )

[*********************100%***********************]  1063 of 1063 completed

41 Failed downloads:
- CTXS: No timezone found, symbol may be delisted
- NLSN: No timezone found, symbol may be delisted
- BIP.PRA: No timezone found, symbol may be delisted
- DRE: No timezone found, symbol may be delisted
- LFC: No timezone found, symbol may be delisted
- Y: No timezone found, symbol may be delisted
- HNP: No timezone found, symbol may be delisted
- MIME: No timezone found, symbol may be delisted
- SNP: No timezone found, symbol may be delisted
- FB: No timezone found, symbol may be delisted
- VG: No timezone found, symbol may be delisted
- DIDI: No timezone found, symbol may be delisted
- SAIL: No timezone found, symbol may be delisted
- TMX: No timezone found, symbol may be delisted
- ZEN: No timezone found, symbol may be delisted
- PTR: No timezone found, symbol may be delisted
- CLR: No timezone found, symbol may be delisted
- SNX.WI: No timezone found, symbol may be delisted
- PLAN: No ti

# Run data pipeline

In [12]:
# Industries processed by the per-industry pipeline loop. Commented-out entries
# are deliberately skipped.
industries = [
    # 'health_care_equipment_and_services',
    "software_and_services",
    "retailing",
    "telecommunication_services",
    "capital_goods",
    "energy",
    # 'pharmaceuticals_biotechnology_and_life_sciences',
    "consumer_staples",
    "banks",
    "diversified_financials",
    "metals_and_mining",
    "technology_hardware_and_equipment",
    "utilities",
    "chemicals",
    "automobiles_and_components",
    "semiconductors_and_semiconductor_equipment",
    "media_and_entertainment",
    "real_estate",
    "consumer_services",
    "consumer_durables_and_apparel",
    "insurance",
    "transportation",
    "commercial_and_professional_services",
    # "paper_and_forest_products",
    "containers_and_packaging",
    # "construction_materials",
]

# Settings forwarded by keyword, under the same names, to
# pipelines.process_features_from_price_data. Their exact meaning and units are
# defined by that function and are not visible in this notebook.
l_reg = 4
l_roll = 3
dt = 10
last_residual_cutoff = 2.5
adf_pval_cutoff = 0.1
adf_pass_rate_filter = 0.5
mean_max_residual_dt = 21
arima_forecast_months = 3
arima_eval_models = 5

# Output directory name encodes the market-cap window, e.g.
# "pipeline_run_5000_to_max_cap" when no upper bound is set.
if market_cap_max_mm is None:
    market_cap_max_string = "max"
else:
    market_cap_max_string = str(market_cap_max_mm)
pipeline_dir = f"pipeline_run_{market_cap_min_mm}_to_{market_cap_max_string}_cap"
output_dir = os.path.join("data", pipeline_dir)

# stonk_model = predict.XGBStonkModel()

market_indexes = utils.get_market_indexes()

In [13]:
# Run the feature pipeline once per industry and write its outputs as CSV files
# named "<industry>_<suffix>" inside `output_dir`.
datasets = []
total_industries = len(industries)

# DataFrame.to_csv does not create missing directories, so make sure the run
# directory exists before the first write (no-op when it is already there).
os.makedirs(output_dir, exist_ok=True)

# (features key, file-name suffix, write header row?) for every CSV written per
# industry, listed in write order. Only the correlations file keeps its header.
csv_outputs = [
    ("residuals", "_residuals.csv", False),
    ("betas", "_betas.csv", False),
    ("adfs_raw", "_adfs_raw.csv", False),
    # ("predictions", "_predictions.csv", False),  # disabled together with the model scoring step
    ("arima_forecasts", "_arima.csv", False),
    ("beta_stability_rsquared_vals", "_rsquared.csv", False),
    ("market_correlations", "_correlations.csv", True),
]

for i, industry in enumerate(industries, start=1):
    # NOTE(review): despite the name `remove_industries`, together with
    # filter_industries=True this presumably *selects* the single industry —
    # confirm in utils.get_stonk_data.
    stonks = utils.get_stonk_data(
        market_cap_min_mm,
        market_cap_max_mm,
        remove_industries=[industry],
        filter_industries=True,
    )
    X, Y = processing.combine_stonk_pairs(stonks)

    print("Industry ({0}/{1}): {2}".format(i, total_industries, industry))

    features = pipelines.process_features_from_price_data(
        X=X,
        Y=Y,
        market_indexes=market_indexes,
        l_reg=l_reg,
        l_roll=l_roll,
        dt=dt,
        last_residual_cutoff=last_residual_cutoff,
        adf_pval_cutoff=adf_pval_cutoff,
        adf_pass_rate_filter=adf_pass_rate_filter,
        mean_max_residual_dt=mean_max_residual_dt,
        arima_forecast_months=arima_forecast_months,
        arima_eval_models=arima_eval_models,
    )

    # An empty result means the pipeline produced nothing for this industry;
    # skip it without writing any files.
    if len(features) == 0:
        print("No trades")
        continue

    print(
        "Mean max value for {0}: {1}".format(industry, features["residuals_max_mean"])
    )

    # Disabled model scoring step, kept for reference:
#     dataset = utils.build_dataset_from_live_data_by_industry(
#         std_residuals=features["std_residuals"],
#         adfs=features["adfs"],
#         subindustry=industry,
#         mean_max_residual=features["residuals_max_mean"],
#         residual_quantile=features["residuals_quantile"],
#         vix_index=market_indexes["vix"].loc[stonks.columns[-1]],
#         betas_stability_rsquared=features["beta_stability_rsquared_vals"],
#         arima_forecasts=features["arima_forecasts"],
#     )

#     predictions, df_processed = stonk_model.predict(dataset)
#     datasets.append((dataset, df_processed))
#     predictions = pd.DataFrame(predictions)
#     predictions.index = features["adfs"].index

    # Put the dates in the first column so they survive the header-less CSV export.
    features["residuals"].insert(0, "dates", features["dates_index"].values)
    features["betas"].insert(0, "dates", features["dates_index"].values)

    for feature_key, file_suffix, write_header in csv_outputs:
        features[feature_key].to_csv(
            os.path.join(output_dir, industry + file_suffix),
            header=write_header,
            index=True,
        )

print("*** All done ***")

Industry (1/22): software_and_services
Mean max value for software_and_services: 3.5220000743865967
Industry (2/22): retailing
Mean max value for retailing: 3.053999900817871
Industry (3/22): telecommunication_services
No trades
Industry (4/22): capital_goods
Mean max value for capital_goods: 3.8440001010894775
Industry (5/22): energy
Mean max value for energy: 3.7639999389648438
Industry (6/22): consumer_staples
Mean max value for consumer_staples: 2.630000114440918
Industry (7/22): banks
Mean max value for banks: 3.9010000228881836
Industry (8/22): diversified_financials
Mean max value for diversified_financials: 3.384000062942505
Industry (9/22): metals_and_mining
Mean max value for metals_and_mining: 3.0
Industry (10/22): technology_hardware_and_equipment
Mean max value for technology_hardware_and_equipment: 4.349999904632568
Industry (11/22): utilities
Mean max value for utilities: 3.0810000896453857
Industry (12/22): chemicals
Mean max value for chemicals: 1.7510000467300415
Indu

# Data collection

In [3]:
# Load the price table with filtering disabled and keep only the columns labelled
# up to "2022-09-16" (label-based slicing includes the end label).
stonks = utils.get_stonk_data(disable_filter=True).loc[:, :"2022-09-16"]

In [None]:
# Run the rolling data-collection pipeline over the loaded price table.
# Arguments are grouped by parameter name only; their exact semantics live in
# pipelines.data_collection_rolling_pipeline.
# NOTE(review): several values here (l_reg, l_roll, market_cap_min_mm) differ
# from the ones used for the live per-industry run — confirm this is intended.
pipelines.data_collection_rolling_pipeline(
    stonk_prices=stonks,
    # Ticker universe
    market_cap_min_mm=1000,
    market_cap_max_mm=None,
    remove_industries=[
        "pharmaceuticals_biotechnology_and_life_sciences",
        "health_care_equipment_and_services",
    ],
    # Window settings
    l_reg=3,
    l_roll=2,
    dt=10,
    first_n_windows=140,
    # Residual and ADF thresholds
    last_residual_cutoff=2.5,
    mean_max_residual_dt=21,
    adf_pval_cutoff=0.1,
    adf_pass_rate_filter=0.5,
    # ARIMA settings
    arima_forecast_months=3,
    arima_eval_models=5,
    # Trade schedule
    trade_length_months=3,
    trading_interval_weeks=2,
)

Total data windows: 140


In [9]:
from utils import map_subindustries_to_industries

# Gather the outputs of the data-collection pipeline into one table.
dataset = utils.ingest_trade_pipeline_outputs(
    data_dir="data/data_collection_pipeline/1000_to_max"
)

# Attach the VIX value for each trade date: take the first row of the stored VIX
# data (presumably a Series keyed by date — TODO confirm) and look up every
# `trade_date` in it. A date missing from that row raises KeyError.
vix = utils.get_stonk_data(fname_prefix="vix", disable_filter=True).iloc[0]
dataset["vix"] = dataset["trade_date"].apply(lambda trade_date: vix.loc[trade_date])

# Row-wise derived columns (each helper receives one row at a time).
dataset.loc[:, "industry"] = dataset.apply(map_subindustries_to_industries, axis=1)
dataset.loc[:, "arima_forecast_normalized"] = dataset.apply(
    utils.normalize_arima_forecast, axis=1
)

# new_dataset = []
# for date, date_group in dataset.groupby(by="trade_date"):
#     for industry, industry_group in date_group.groupby(by="subindustry"):
#         residual_quantile = industry_group["last_residual"].abs().quantile(q=0.9)
#         industry_group.loc[:, "residual_quantile"] = np.full(len(industry_group), residual_quantile)
#         new_dataset.append(industry_group)

# new_dataset = pd.concat(new_dataset)

# Persist the assembled dataset (with header, without the index column).
dataset.to_csv("data/dataset_bigcap.csv", header=True, index=False)

# Model development

In [2]:
import xgboost as xgb
from hyperopt import STATUS_OK, STATUS_FAIL, Trials, fmin, hp, tpe, atpe, rand
import pickle

In [13]:
# Development aid: re-import the local `train` module so that edits made to it
# after the kernel started are picked up without restarting the kernel.
import importlib
importlib.reload(train)

<module 'train' from '/home/jupyter/stonk-rank/train.py'>

In [None]:
# Run the model validation pipeline.
# NOTE(review): `df` is only assigned in a cell that appears later in this
# notebook (the one reading data/dataset_bigcap_old.csv). On a fresh
# Restart & Run All this cell raises NameError unless that cell runs first.
# Arguments are grouped by parameter name only; see
# pipelines.model_validation_pipeline for their exact meaning.
validation_results = pipelines.model_validation_pipeline(
    dataset=df,
    filename_prefix="old_dataset_new_ticker_list_fixed_window",
    # Data-window layout
    fixed_train_window_size=True,
    data_window_max_train_size=60,
    data_window_gap_size=6,
    data_window_test_size=2,
    # Hyperparameter search
    hp_model_evals=1000,
    hp_nth_best_model=1,
    random_noise=0.005,
    # Trade selection
    top_n_best_trades=5,
    min_industry_confidence=0.4,
    verbose=False,
)

Total data windows: 33
Period 2019-06-17 to 2022-06-03
  3%|▎         | 32/1000 [00:07<03:55,  4.11trial/s, best loss: -0.3682821436212146]

  recall = tps / tps[-1]



100%|██████████| 1000/1000 [06:08<00:00,  2.71trial/s, best loss: -0.38095051451406037]
Period 2019-05-17 to 2022-05-06
 34%|███▎      | 336/1000 [01:31<02:31,  4.39trial/s, best loss: -0.6129285963057904]

  recall = tps / tps[-1]



 39%|███▉      | 393/1000 [01:46<02:17,  4.41trial/s, best loss: -0.6129285963057904]

  recall = tps / tps[-1]



 95%|█████████▌| 954/1000 [04:39<00:13,  3.42trial/s, best loss: -0.6186288922320173]

  recall = tps / tps[-1]



100%|██████████| 1000/1000 [04:53<00:00,  3.40trial/s, best loss: -0.6255996706616136]
Period 2019-04-18 to 2022-04-07
  6%|▌         | 55/1000 [00:15<03:44,  4.20trial/s, best loss: -0.45602472144089506]

  recall = tps / tps[-1]



 22%|██▏       | 217/1000 [00:48<02:40,  4.89trial/s, best loss: -0.48507146458158784]

  recall = tps / tps[-1]



 51%|█████     | 508/1000 [01:55<01:57,  4.18trial/s, best loss: -0.48507146458158784]

  recall = tps / tps[-1]



 84%|████████▍ | 844/1000 [03:20<00:49,  3.18trial/s, best loss: -0.48507146458158784]

  recall = tps / tps[-1]



100%|██████████| 1000/1000 [04:09<00:00,  4.02trial/s, best loss: -0.48507146458158784]
Period 2019-03-21 to 2022-03-10
  1%|          | 8/1000 [00:01<03:23,  4.88trial/s, best loss: -0.3803110005248789] 

  recall = tps / tps[-1]



  6%|▌         | 58/1000 [00:15<04:00,  3.92trial/s, best loss: -0.4038174046727976]

  recall = tps / tps[-1]



  8%|▊         | 85/1000 [00:20<03:26,  4.42trial/s, best loss: -0.4038174046727976]

  recall = tps / tps[-1]



 26%|██▌       | 256/1000 [01:10<03:49,  3.24trial/s, best loss: -0.4068154886309657]

  recall = tps / tps[-1]



 63%|██████▎   | 630/1000 [03:11<02:04,  2.97trial/s, best loss: -0.41870714500138306]

  recall = tps / tps[-1]



100%|██████████| 1000/1000 [05:24<00:00,  3.08trial/s, best loss: -0.419620746470948] 
Period 2019-02-21 to 2022-02-09
 57%|█████▋    | 574/1000 [02:49<01:55,  3.69trial/s, best loss: -0.3391111237750869] 

  recall = tps / tps[-1]



100%|██████████| 1000/1000 [05:18<00:00,  3.14trial/s, best loss: -0.34964394130110366]
Period 2019-01-23 to 2022-01-11
  9%|▉         | 89/1000 [00:22<03:11,  4.77trial/s, best loss: -0.41648340072831663]

  recall = tps / tps[-1]



 19%|█▉        | 192/1000 [00:45<03:29,  3.86trial/s, best loss: -0.4436657623928054] 

  recall = tps / tps[-1]



 26%|██▌       | 255/1000 [01:01<03:42,  3.35trial/s, best loss: -0.4436657623928054]

  recall = tps / tps[-1]



 57%|█████▋    | 572/1000 [02:29<01:50,  3.87trial/s, best loss: -0.4692008970362271] 

  recall = tps / tps[-1]



 94%|█████████▍| 944/1000 [04:11<00:16,  3.37trial/s, best loss: -0.47233039188066667]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



 67%|██████▋   | 667/1000 [03:07<02:51,  1.94trial/s, best loss: -0.2790087814067491]

  recall = tps / tps[-1]



 68%|██████▊   | 678/1000 [03:10<01:42,  3.15trial/s, best loss: -0.2790087814067491]

  recall = tps / tps[-1]



 69%|██████▉   | 693/1000 [03:16<01:41,  3.04trial/s, best loss: -0.2790087814067491]

  recall = tps / tps[-1]



 72%|███████▏  | 720/1000 [03:24<01:19,  3.50trial/s, best loss: -0.294284557983736] 

  recall = tps / tps[-1]



 79%|███████▉  | 794/1000 [03:50<01:22,  2.51trial/s, best loss: -0.294284557983736]

  recall = tps / tps[-1]



 81%|████████▏ | 813/1000 [03:56<01:02,  2.99trial/s, best loss: -0.294284557983736]

  recall = tps / tps[-1]



 82%|████████▏ | 817/1000 [03:57<01:00,  3.01trial/s, best loss: -0.294284557983736]

  recall = tps / tps[-1]



 84%|████████▍ | 841/1000 [04:05<00:52,  3.01trial/s, best loss: -0.294284557983736]

  recall = tps / tps[-1]



 85%|████████▌ | 851/1000 [04:08<00:47,  3.12trial/s, best loss: -0.294284557983736]

  recall = tps / tps[-1]



 86%|████████▌ | 862/1000 [04:12<00:45,  3.06trial/s, best loss: -0.294284557983736]

  recall = tps / tps[-1]



 87%|████████▋ | 874/1000 [04:16<00:44,  2.86trial/s, best loss: -0.294284557983736]

  recall = tps / tps[-1]



 94%|█████████▍| 939/1000 [04:40<00:41,  1.46trial/s, best loss: -0.294284557983736]

  recall = tps / tps[-1]



 94%|█████████▍| 942/1000 [04:42<00:30,  1.91trial/s, best loss: -0.294284557983736]

  recall = tps / tps[-1]



 97%|█████████▋| 973/1000 [04:52<00:09,  2.95trial/s, best loss: -0.294284557983736]

  recall = tps / tps[-1]



 98%|█████████▊| 981/1000 [04:55<00:06,  2.93trial/s, best loss: -0.294284557983736]

  recall = tps / tps[-1]



 99%|█████████▉| 989/1000 [04:58<00:03,  2.92trial/s, best loss: -0.294284557983736]

  recall = tps / tps[-1]



100%|██████████| 1000/1000 [05:02<00:00,  3.31trial/s, best loss: -0.294284557983736]


  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]


Period 2018-10-29 to 2021-10-15
 31%|███       | 312/1000 [01:22<02:54,  3.95trial/s, best loss: -0.32084164708088264]

  recall = tps / tps[-1]



 32%|███▏      | 317/1000 [01:23<03:06,  3.67trial/s, best loss: -0.32084164708088264]

  recall = tps / tps[-1]



 47%|████▋     | 471/1000 [02:03<02:00,  4.37trial/s, best loss: -0.32084164708088264]

  recall = tps / tps[-1]



 56%|█████▌    | 559/1000 [02:26<01:48,  4.07trial/s, best loss: -0.32084164708088264]

  recall = tps / tps[-1]



 76%|███████▋  | 763/1000 [03:21<01:09,  3.40trial/s, best loss: -0.3304684046457087] 

  recall = tps / tps[-1]



 87%|████████▋ | 869/1000 [03:53<00:38,  3.37trial/s, best loss: -0.3304684046457087]

  recall = tps / tps[-1]



 89%|████████▉ | 889/1000 [04:00<00:35,  3.10trial/s, best loss: -0.3304684046457087]

  recall = tps / tps[-1]



 96%|█████████▋| 963/1000 [04:24<00:11,  3.26trial/s, best loss: -0.3304684046457087]

  recall = tps / tps[-1]



 98%|█████████▊| 977/1000 [04:29<00:07,  3.23trial/s, best loss: -0.3304684046457087]

  recall = tps / tps[-1]



100%|██████████| 1000/1000 [04:39<00:00,  3.58trial/s, best loss: -0.3304684046457087]
Period 2018-10-01 to 2021-09-17
  3%|▎         | 26/1000 [00:06<04:29,  3.61trial/s, best loss: -0.25734291876692134]

  recall = tps / tps[-1]



 14%|█▍        | 139/1000 [00:34<03:29,  4.12trial/s, best loss: -0.3116002801524065]

  recall = tps / tps[-1]



 18%|█▊        | 181/1000 [00:45<03:56,  3.46trial/s, best loss: -0.34201038311664944]

  recall = tps / tps[-1]



 32%|███▏      | 318/1000 [01:21<03:38,  3.12trial/s, best loss: -0.34201038311664944]

  recall = tps / tps[-1]



 33%|███▎      | 332/1000 [01:25<03:12,  3.47trial/s, best loss: -0.34201038311664944]

  recall = tps / tps[-1]



 37%|███▋      | 366/1000 [01:35<02:54,  3.64trial/s, best loss: -0.34201038311664944]

  recall = tps / tps[-1]



 56%|█████▌    | 555/1000 [02:29<01:53,  3.92trial/s, best loss: -0.34821350545823054]

  recall = tps / tps[-1]



 69%|██████▉   | 694/1000 [03:11<01:28,  3.44trial/s, best loss: -0.35766618127707706]

  recall = tps / tps[-1]



 78%|███████▊  | 777/1000 [03:36<00:57,  3.89trial/s, best loss: -0.35766618127707706]

  recall = tps / tps[-1]



 92%|█████████▏| 917/1000 [04:22<00:27,  3.07trial/s, best loss: -0.35839712897359366]

  recall = tps / tps[-1]



 98%|█████████▊| 979/1000 [04:42<00:06,  3.19trial/s, best loss: -0.35839712897359366]

  recall = tps / tps[-1]



100%|██████████| 1000/1000 [04:49<00:00,  3.45trial/s, best loss: -0.35839712897359366]
Period 2018-08-31 to 2021-08-19
 13%|█▎        | 134/1000 [00:30<02:55,  4.92trial/s, best loss: -0.19935697053296073]

  recall = tps / tps[-1]



 15%|█▌        | 152/1000 [00:33<02:48,  5.02trial/s, best loss: -0.23417367923305016]

  recall = tps / tps[-1]



 19%|█▉        | 189/1000 [00:43<03:26,  3.92trial/s, best loss: -0.23417367923305016]

  recall = tps / tps[-1]



 30%|██▉       | 299/1000 [01:11<02:42,  4.32trial/s, best loss: -0.2346985676454938] 

  recall = tps / tps[-1]



 37%|███▋      | 371/1000 [01:30<02:20,  4.49trial/s, best loss: -0.2346985676454938]

  recall = tps / tps[-1]



 38%|███▊      | 378/1000 [01:31<02:17,  4.53trial/s, best loss: -0.2346985676454938]

  recall = tps / tps[-1]



 43%|████▎     | 433/1000 [01:44<02:15,  4.17trial/s, best loss: -0.2346985676454938]

  recall = tps / tps[-1]



 44%|████▎     | 435/1000 [01:44<02:13,  4.24trial/s, best loss: -0.2346985676454938]

  recall = tps / tps[-1]



 46%|████▌     | 459/1000 [01:50<02:03,  4.40trial/s, best loss: -0.2346985676454938]

  recall = tps / tps[-1]



 47%|████▋     | 469/1000 [01:52<02:03,  4.29trial/s, best loss: -0.2346985676454938]

  recall = tps / tps[-1]



 52%|█████▏    | 522/1000 [02:06<02:49,  2.82trial/s, best loss: -0.26726035780828555]

  recall = tps / tps[-1]



 54%|█████▍    | 544/1000 [02:11<01:42,  4.46trial/s, best loss: -0.26726035780828555]

  recall = tps / tps[-1]



 55%|█████▌    | 551/1000 [02:13<01:54,  3.91trial/s, best loss: -0.26726035780828555]

  recall = tps / tps[-1]



 58%|█████▊    | 580/1000 [02:21<02:09,  3.24trial/s, best loss: -0.26726035780828555]

  recall = tps / tps[-1]



 58%|█████▊    | 585/1000 [02:22<02:03,  3.36trial/s, best loss: -0.26726035780828555]

  recall = tps / tps[-1]



 60%|█████▉    | 595/1000 [02:25<02:03,  3.29trial/s, best loss: -0.26726035780828555]

  recall = tps / tps[-1]



 61%|██████    | 609/1000 [02:30<01:53,  3.44trial/s, best loss: -0.26726035780828555]

  recall = tps / tps[-1]



 62%|██████▏   | 618/1000 [02:32<01:53,  3.36trial/s, best loss: -0.26726035780828555]

  recall = tps / tps[-1]



 63%|██████▎   | 628/1000 [02:35<01:34,  3.93trial/s, best loss: -0.26726035780828555]

  recall = tps / tps[-1]



 63%|██████▎   | 634/1000 [02:37<01:42,  3.56trial/s, best loss: -0.26726035780828555]

  recall = tps / tps[-1]



 66%|██████▌   | 659/1000 [02:46<01:57,  2.89trial/s, best loss: -0.26726035780828555]

  recall = tps / tps[-1]



 68%|██████▊   | 676/1000 [02:51<01:41,  3.19trial/s, best loss: -0.26726035780828555]

  recall = tps / tps[-1]



 71%|███████▏  | 714/1000 [03:01<01:14,  3.82trial/s, best loss: -0.26726035780828555]

  recall = tps / tps[-1]



 77%|███████▋  | 774/1000 [03:19<00:59,  3.79trial/s, best loss: -0.26726035780828555]

  recall = tps / tps[-1]



 78%|███████▊  | 780/1000 [03:21<00:57,  3.86trial/s, best loss: -0.26726035780828555]

  recall = tps / tps[-1]



 80%|███████▉  | 797/1000 [03:25<00:57,  3.56trial/s, best loss: -0.26726035780828555]

  recall = tps / tps[-1]



 82%|████████▏ | 822/1000 [03:32<00:48,  3.71trial/s, best loss: -0.26726035780828555]

  recall = tps / tps[-1]



 86%|████████▌ | 859/1000 [03:43<00:39,  3.56trial/s, best loss: -0.26726035780828555]

  recall = tps / tps[-1]



 87%|████████▋ | 870/1000 [03:46<00:34,  3.72trial/s, best loss: -0.26726035780828555]

  recall = tps / tps[-1]



 87%|████████▋ | 872/1000 [03:47<00:37,  3.44trial/s, best loss: -0.26726035780828555]

  recall = tps / tps[-1]



 88%|████████▊ | 882/1000 [03:50<00:37,  3.12trial/s, best loss: -0.26726035780828555]

  recall = tps / tps[-1]



 90%|█████████ | 904/1000 [03:57<00:28,  3.36trial/s, best loss: -0.26726035780828555]

  recall = tps / tps[-1]



 91%|█████████ | 908/1000 [03:58<00:25,  3.65trial/s, best loss: -0.26726035780828555]

  recall = tps / tps[-1]



 92%|█████████▏| 915/1000 [04:00<00:23,  3.67trial/s, best loss: -0.26726035780828555]

  recall = tps / tps[-1]



 92%|█████████▏| 922/1000 [04:02<00:21,  3.63trial/s, best loss: -0.26726035780828555]

  recall = tps / tps[-1]



 94%|█████████▍| 943/1000 [04:11<00:20,  2.73trial/s, best loss: -0.26726035780828555]

  recall = tps / tps[-1]



 94%|█████████▍| 944/1000 [04:11<00:20,  2.69trial/s, best loss: -0.26726035780828555]

  recall = tps / tps[-1]



 99%|█████████▉| 989/1000 [04:24<00:02,  3.75trial/s, best loss: -0.26726035780828555]

  recall = tps / tps[-1]



 99%|█████████▉| 993/1000 [04:25<00:01,  3.82trial/s, best loss: -0.26726035780828555]

  recall = tps / tps[-1]



100%|██████████| 1000/1000 [04:27<00:00,  3.74trial/s, best loss: -0.26726035780828555]
Period 2018-08-03 to 2021-07-22
 56%|█████▌    | 555/1000 [03:32<02:34,  2.88trial/s, best loss: -0.3197648490776086]

  recall = tps / tps[-1]



100%|██████████| 1000/1000 [06:36<00:00,  2.52trial/s, best loss: -0.3197648490776086]
Period 2018-07-06 to 2021-06-23
100%|██████████| 1000/1000 [05:16<00:00,  3.16trial/s, best loss: -0.38044688890447864]

  recall = tps / tps[-1]




Period 2018-06-07 to 2021-05-25
  1%|▏         | 14/1000 [00:03<03:42,  4.42trial/s, best loss: -0.22417317244033588]

  recall = tps / tps[-1]



  7%|▋         | 72/1000 [00:18<04:11,  3.69trial/s, best loss: -0.22417317244033588]

  recall = tps / tps[-1]



 12%|█▏        | 123/1000 [00:32<03:22,  4.32trial/s, best loss: -0.23384913060198834]

  recall = tps / tps[-1]



 25%|██▍       | 247/1000 [01:11<03:51,  3.25trial/s, best loss: -0.23384913060198834]

  recall = tps / tps[-1]



 25%|██▌       | 251/1000 [01:12<03:36,  3.45trial/s, best loss: -0.24430225501594194]

  recall = tps / tps[-1]



 25%|██▌       | 253/1000 [01:13<03:27,  3.60trial/s, best loss: -0.24430225501594194]

  recall = tps / tps[-1]



 29%|██▉       | 292/1000 [01:24<03:31,  3.35trial/s, best loss: -0.24430225501594194]

  recall = tps / tps[-1]



 34%|███▍      | 341/1000 [01:41<07:59,  1.37trial/s, best loss: -0.24430225501594194]

  recall = tps / tps[-1]



 42%|████▏     | 423/1000 [02:08<03:14,  2.97trial/s, best loss: -0.24430225501594194]

  recall = tps / tps[-1]



 49%|████▉     | 494/1000 [02:32<03:17,  2.57trial/s, best loss: -0.24430225501594194]

  recall = tps / tps[-1]



 54%|█████▍    | 541/1000 [02:49<02:38,  2.89trial/s, best loss: -0.24430225501594194]

  recall = tps / tps[-1]



 54%|█████▍    | 544/1000 [02:50<02:08,  3.56trial/s, best loss: -0.24430225501594194]

  recall = tps / tps[-1]



 55%|█████▍    | 546/1000 [02:51<02:14,  3.38trial/s, best loss: -0.24430225501594194]

  recall = tps / tps[-1]



 57%|█████▊    | 575/1000 [03:00<02:11,  3.23trial/s, best loss: -0.24430225501594194]

  recall = tps / tps[-1]



 61%|██████▏   | 613/1000 [03:11<01:33,  4.12trial/s, best loss: -0.24430225501594194]

  recall = tps / tps[-1]



 66%|██████▋   | 664/1000 [03:27<01:36,  3.47trial/s, best loss: -0.24430225501594194]

  recall = tps / tps[-1]



 71%|███████   | 710/1000 [03:44<01:45,  2.75trial/s, best loss: -0.24430225501594194]

  recall = tps / tps[-1]



 75%|███████▌  | 752/1000 [03:59<01:30,  2.75trial/s, best loss: -0.24430225501594194]

  recall = tps / tps[-1]



 86%|████████▌ | 860/1000 [04:37<01:01,  2.28trial/s, best loss: -0.24430225501594194]

  recall = tps / tps[-1]



100%|██████████| 1000/1000 [05:30<00:00,  3.03trial/s, best loss: -0.24430225501594194]
Period 2018-05-09 to 2021-04-27
  5%|▍         | 49/1000 [00:14<03:57,  4.01trial/s, best loss: -0.2281541340319459] 

  recall = tps / tps[-1]



  6%|▌         | 59/1000 [00:16<03:13,  4.86trial/s, best loss: -0.24216501041663313]

  recall = tps / tps[-1]



  9%|▉         | 91/1000 [00:24<03:32,  4.27trial/s, best loss: -0.30217282165747555]

  recall = tps / tps[-1]



 10%|▉         | 96/1000 [00:25<03:25,  4.40trial/s, best loss: -0.30217282165747555]

  recall = tps / tps[-1]



 10%|█         | 102/1000 [00:27<03:58,  3.77trial/s, best loss: -0.30217282165747555]

  recall = tps / tps[-1]



 12%|█▏        | 119/1000 [00:32<03:45,  3.90trial/s, best loss: -0.30217282165747555]

  recall = tps / tps[-1]



 13%|█▎        | 128/1000 [00:34<03:29,  4.16trial/s, best loss: -0.30217282165747555]

  recall = tps / tps[-1]



 14%|█▍        | 140/1000 [00:37<03:59,  3.59trial/s, best loss: -0.30217282165747555]

  recall = tps / tps[-1]



 18%|█▊        | 178/1000 [00:48<04:21,  3.14trial/s, best loss: -0.30217282165747555]

  recall = tps / tps[-1]



 18%|█▊        | 182/1000 [00:49<04:20,  3.15trial/s, best loss: -0.30217282165747555]

  recall = tps / tps[-1]



 28%|██▊       | 275/1000 [01:19<02:47,  4.33trial/s, best loss: -0.30217282165747555]

  recall = tps / tps[-1]



 30%|██▉       | 295/1000 [01:24<02:52,  4.10trial/s, best loss: -0.30217282165747555]

  recall = tps / tps[-1]



 33%|███▎      | 328/1000 [01:32<02:41,  4.17trial/s, best loss: -0.30217282165747555]

  recall = tps / tps[-1]



 34%|███▍      | 338/1000 [01:35<02:45,  4.00trial/s, best loss: -0.30217282165747555]

  recall = tps / tps[-1]



 37%|███▋      | 374/1000 [01:44<02:32,  4.10trial/s, best loss: -0.30217282165747555]

  recall = tps / tps[-1]



 43%|████▎     | 432/1000 [02:01<02:44,  3.45trial/s, best loss: -0.30217282165747555]

  recall = tps / tps[-1]



 44%|████▍     | 445/1000 [02:05<02:39,  3.48trial/s, best loss: -0.30217282165747555]

  recall = tps / tps[-1]



 46%|████▌     | 461/1000 [02:11<03:20,  2.69trial/s, best loss: -0.30217282165747555]

  recall = tps / tps[-1]



 53%|█████▎    | 527/1000 [02:32<02:22,  3.31trial/s, best loss: -0.30217282165747555]

  recall = tps / tps[-1]



 54%|█████▎    | 535/1000 [02:35<02:20,  3.32trial/s, best loss: -0.30217282165747555]

  recall = tps / tps[-1]



 60%|██████    | 600/1000 [02:54<02:06,  3.17trial/s, best loss: -0.30217282165747555]

In [4]:
# Load the stored dataset, keep rows with a positive beta whose absolute last
# residual is at least 2.5, then attach labels.
# NOTE(review): this reads "dataset_bigcap_old.csv", whereas the data-collection
# section writes "dataset_bigcap.csv" — confirm which file is intended.
df = pd.read_csv("data/dataset_bigcap_old.csv")
df = df[(df["beta"] > 0) & (df["last_residual"].abs() >= 2.5)]
df = preprocessing.assign_labels(df)

In [5]:
# Current ticker list, with the two health-care industries excluded.
updated_ticker_list = utils.get_ticker_names(
    1000,
    None,
    remove_industries=[
        "health_care_equipment_and_services",
        "pharmaceuticals_biotechnology_and_life_sciences",
    ],
)

# Keep only pairs whose two legs are both still in the ticker list.
df = df[
    df["ticker_x"].isin(updated_ticker_list.index)
    & df["ticker_y"].isin(updated_ticker_list.index)
]

## Production model training

In [55]:
# Number of most recent distinct trade dates to train the production model on.
train_size = 60
# Fixed seed for the row shuffle below so that the production training set, and
# anything downstream that depends on row order, is reproducible across runs.
# The value is arbitrary; it matches the random_state passed to
# train.model_hp_search elsewhere in this notebook.
prod_shuffle_seed = 420

# Keep the rows belonging to the last `train_size` trade dates, then shuffle them
# (sample(frac=1) returns every row in random order).
selected_dates = np.sort(df["trade_date"].unique())[-train_size:]
df_prod = df[df.trade_date.isin(selected_dates)].sample(
    frac=1, random_state=prod_shuffle_seed
)
print(len(df_prod))
print(df_prod["label"].value_counts())

32795
0    26803
1     5992
Name: label, dtype: int64


In [56]:
# Train the production model on the shuffled production frame; the call returns
# the classifier together with the scalers.
# NOTE(review): `params` is not assigned anywhere in the portion of the notebook
# reviewed, so this cell appears to rely on kernel state left by another cell —
# define it explicitly before this call.
clf_prod, scalers_prod = train.train_production_xgb(df_prod, params, noise_level=0.005)

[0]	validation_0-logloss:0.68395
[1]	validation_0-logloss:0.67694
[2]	validation_0-logloss:0.67018
[3]	validation_0-logloss:0.66405
[4]	validation_0-logloss:0.65914
[5]	validation_0-logloss:0.65330
[6]	validation_0-logloss:0.64846
[7]	validation_0-logloss:0.64427
[8]	validation_0-logloss:0.64039
[9]	validation_0-logloss:0.63648
[10]	validation_0-logloss:0.63311
[11]	validation_0-logloss:0.63030
[12]	validation_0-logloss:0.62709
[13]	validation_0-logloss:0.62282
[14]	validation_0-logloss:0.61889
[15]	validation_0-logloss:0.61645
[16]	validation_0-logloss:0.61446
[17]	validation_0-logloss:0.61144
[18]	validation_0-logloss:0.60957
[19]	validation_0-logloss:0.60610
[20]	validation_0-logloss:0.60447
[21]	validation_0-logloss:0.60213
[22]	validation_0-logloss:0.60041
[23]	validation_0-logloss:0.59746
[24]	validation_0-logloss:0.59531
[25]	validation_0-logloss:0.59361
[26]	validation_0-logloss:0.59198
[27]	validation_0-logloss:0.58907
[28]	validation_0-logloss:0.58756
[29]	validation_0-loglos

## Model training experiments

In [50]:
# Re-import the local `preprocessing` module so that edits made to
# preprocessing.py are picked up without restarting the kernel.
# The reloaded module object is the cell's last expression and is displayed.
import importlib
importlib.reload(preprocessing)

<module 'preprocessing' from '/home/jupyter/stonk-rank/preprocessing.py'>

In [51]:
# Split `df` into train / validation parts with a fixed seed.
# NOTE(review): the `date_count_*` arguments presumably count distinct trade
# dates per part and the gap between them -- confirm in preprocessing.py.
splits = preprocessing.split_data(
    df,
    date_count_train=60,
    date_count_gap=6,
    date_count_valid=2,
    random_state=3303544,
)

# Report row counts first, then the label balance, for train and validation.
print(*(len(splits[part]) for part in ("train", "validation")), sep="\n")
print(
    *(splits[part]["label"].value_counts() for part in ("train", "validation")),
    sep="\n",
)

30621
964
0    24749
1     5872
Name: label, dtype: int64
0    781
1    183
Name: label, dtype: int64


In [52]:
# Transform the training features; the scalers returned here are reused below.
X_train, scalers = preprocessing.transform_features(
    splits["train"], noise_level=0.005
)

# Transform the validation features with the training scalers and zero noise.
X_valid, _ = preprocessing.transform_features(
    splits["validation"], noise_level=0, scalers=scalers
)

# Targets for both parts, in (train, validation) order.
y_train, y_valid = (splits[part]["label"] for part in ("train", "validation"))

In [44]:
# Hyper-parameter search over `df`; the per-trial results are returned as
# `df_trial_results`. Argument semantics are defined by train.model_hp_search.
df_trial_results = train.model_hp_search(
    df,
    # Run identity, search budget and seed.
    trial_name="new_pipeline#3",
    n_evals=1000,
    random_state=420,
    # Training-window settings.
    fixed_train_window_size=False,
    train_window_min_size=60,
    max_train_window_size=68,
    train_window_stride=2,
    # Feature noise.
    additive_random_noise=0.005,
    # Output handling.
    write_csv=True,
    data_dir="data",
    output_dir="experiments",
)

 72%|███████▎  | 725/1000 [03:32<02:28,  1.85trial/s, best loss: -0.48678320681977716]

  recall = tps / tps[-1]



100%|██████████| 1000/1000 [05:14<00:00,  3.18trial/s, best loss: -0.48678320681977716]


In [46]:
# Load the stored trial results of the "new_pipeline#1" search and rank them
# by the "ap" column, best first.
df_trials = pd.read_csv("data/experiments/new_pipeline#1.csv")
df_trials = df_trials.sort_values("ap", ascending=False)

# Show the ten best trials.
df_trials.iloc[:10]

Unnamed: 0,gamma,max_delta_step,max_depth,min_child_weight,n_estimators,scale_pos_weight,f1_score,precision,ap,auc,pos_preds
0,4.098422,1.0,7.0,5.0,48.0,4.578455,0.285714,0.267943,0.536598,0.590195,209
1,2.408549,1.0,7.0,4.0,61.0,4.484318,0.285714,0.281915,0.530729,0.613488,188
2,2.7778,1.0,7.0,4.0,54.0,4.441888,0.276215,0.259615,0.529038,0.611931,208
3,2.124206,1.0,7.0,3.0,50.0,4.366102,0.335106,0.326425,0.527842,0.623108,193
4,3.066145,1.0,7.0,4.0,43.0,4.489866,0.318627,0.288889,0.510161,0.607079,225
5,2.818312,1.0,7.0,4.0,53.0,4.281667,0.293478,0.291892,0.500951,0.620694,185
6,3.90568,1.0,7.0,5.0,48.0,4.451125,0.303665,0.291457,0.49893,0.616279,199
7,2.498228,1.0,7.0,3.0,43.0,4.363691,0.277333,0.270833,0.491842,0.598182,192
8,2.400389,1.0,7.0,3.0,59.0,4.811645,0.304545,0.2607,0.48917,0.609905,257
9,4.990467,1.0,7.0,4.0,59.0,4.448196,0.29765,0.285,0.486946,0.619634,200


In [30]:
# Load the stored trial results of the "adf_upgrade_yes-adf-recent#1" search
# and rank them by the "ap" column, best first.
df_trials = pd.read_csv("data/experiments/adf_upgrade_yes-adf-recent#1.csv")
df_trials = df_trials.sort_values("ap", ascending=False)

# Show the ten best trials.
df_trials.iloc[:10]

Unnamed: 0,gamma,max_delta_step,max_depth,min_child_weight,n_estimators,scale_pos_weight,train_window_size,f1_score,precision,ap,auc,pos_preds
0,1.802354,3.0,4.0,7.0,64.0,4.104953,64.0,0.276423,0.274194,0.466065,0.615832,186
1,1.708711,3.0,4.0,8.0,63.0,4.365494,64.0,0.298201,0.281553,0.454368,0.628979,206
2,2.033617,3.0,4.0,8.0,62.0,4.208674,64.0,0.293333,0.286458,0.453274,0.628314,192
3,1.160542,3.0,4.0,8.0,69.0,4.151177,56.0,0.275676,0.272727,0.44579,0.615212,187
4,1.291625,3.0,4.0,8.0,65.0,4.340644,64.0,0.305085,0.273913,0.443988,0.637553,230
5,2.506153,3.0,5.0,8.0,70.0,4.382457,64.0,0.323529,0.293333,0.439193,0.639323,225
6,0.846906,3.0,5.0,8.0,30.0,4.048634,64.0,0.295337,0.280788,0.432918,0.624504,203
7,2.397897,3.0,4.0,8.0,41.0,4.015005,64.0,0.317073,0.286344,0.429513,0.620754,227
8,2.164158,3.0,5.0,8.0,46.0,3.958084,64.0,0.293963,0.282828,0.428214,0.637969,198
9,1.959557,3.0,4.0,8.0,74.0,4.835569,64.0,0.298901,0.25,0.425112,0.618994,272


In [53]:
# XGBoost settings for the experiment fit. The tuned values (gamma,
# scale_pos_weight, max_depth, min_child_weight, max_delta_step, n_estimators)
# match row 0 of the "new_pipeline#1" trial table displayed earlier.
params = {
    "gamma": 4.098422,
    "scale_pos_weight": 4.578455,
    "max_depth": 7,
    "min_child_weight": 5,
    "max_delta_step": 1,
    "colsample_bylevel": 1,
    "n_estimators": 48,
    "learning_rate": 0.1,
    "subsample": 1,
    "tree_method": "hist",
    "enable_categorical": True,
    "max_cat_to_onehot": 1,
    "eval_metric": ["logloss"],
    # Fixed seed. The previous `np.random.randint(999929)` drew a new,
    # unrecorded seed on every execution, so a run could never be reproduced.
    "random_state": 420,
}

# Fit on the training split. `eval_set` order determines the log labels:
# validation_0 is the validation split, validation_1 is the training split.
clf = xgb.XGBClassifier(**params)
clf = clf.fit(X_train, y_train, eval_set=[(X_valid, y_valid), (X_train, y_train)])

[0]	validation_0-logloss:0.68759	validation_1-logloss:0.68290
[1]	validation_0-logloss:0.68288	validation_1-logloss:0.67338
[2]	validation_0-logloss:0.67600	validation_1-logloss:0.66452
[3]	validation_0-logloss:0.67208	validation_1-logloss:0.65670
[4]	validation_0-logloss:0.66720	validation_1-logloss:0.65007
[5]	validation_0-logloss:0.66452	validation_1-logloss:0.64404
[6]	validation_0-logloss:0.65591	validation_1-logloss:0.63785
[7]	validation_0-logloss:0.65363	validation_1-logloss:0.63217
[8]	validation_0-logloss:0.65215	validation_1-logloss:0.62744
[9]	validation_0-logloss:0.64749	validation_1-logloss:0.62293
[10]	validation_0-logloss:0.64431	validation_1-logloss:0.61923
[11]	validation_0-logloss:0.64278	validation_1-logloss:0.61585
[12]	validation_0-logloss:0.64060	validation_1-logloss:0.61290
[13]	validation_0-logloss:0.63685	validation_1-logloss:0.60929
[14]	validation_0-logloss:0.63028	validation_1-logloss:0.60535
[15]	validation_0-logloss:0.62329	validation_1-logloss:0.60211
[1

In [54]:
print("**Validation**")

# Decision threshold applied to the positive-class probability.
thres = 0.5
y_score = clf.predict_proba(X_valid)[:, 1]
y_preds = y_score > thres

# Copy of the validation split with the model score and the thresholded
# prediction attached as extra columns.
df_results_valid = splits["validation"].assign(score=y_score, prediction=y_preds)

# Classification metrics for the validation split.
_ = evaluate.performance_summary(
    y_true=y_valid, y_score=y_score, y_preds=y_preds, auc_cutoff=0.5, verbose=True
)

# Return statistics for the validation predictions.
_ = evaluate.returns_on_predictions(df_results_valid, verbose=True)

# _ = evaluate.performance_on_slice(df_results_valid, "subindustry")

**Validation**
Precision: 0.245136186770428
PR-AUC/AP score: 0.38282887033811264
ROC-AUC score: 0.5980283089495743
Total positive predictions: 257
Total positive labels: 183

Totals:
        prediction
result            
FN             120
FP             194
TN             587
TP              63

Means:
        return_one_month  return_two_month  return_three_month
result                                                        
FN              0.075925          0.093075            0.080333
FP              0.006701         -0.009912           -0.009582
TN              0.004467         -0.019247           -0.029538
TP              0.066444          0.117079            0.109556

Std:
        return_one_month  return_two_month  return_three_month
result                                                        
FN              0.070681          0.062062            0.076462
FP              0.039699          0.060385            0.071674
TN              0.048008          0.065300            0.069

In [46]:
# pd.set_option("display.max_rows", 200)
# evaluate.performance_on_trading_use_case(
#         df_results_valid, top_n_trades=5, min_industry_score=0.4
#     )

In [31]:
# Print one "<feature> <importance>" line per input feature of `clf`.
for feature, weight in zip(clf.feature_names_in_, clf.feature_importances_):
    print(f"{feature!s} {weight!s}")

adf_pass_rate 0.029732222
last_residual 0.08531821
residual_mean_max 0.0812026
industry 0.118948996
vix 0.26275504
betas_rsquared 0.14753708
arima_forecast_normalized 0.21244633
residual_inter 0.062059514


In [44]:
# Let pandas render up to 200 rows before truncating displayed frames.
pd.options.display.max_rows = 200

In [None]:
# Show up to 100 validation rows whose `result` is "FP", without the
# bookkeeping columns.
(
    df_results_valid.loc[df_results_valid["result"] == "FP"]
    .drop(columns=["beta", "intercept", "data_window_start", "label", "prediction"])
    .head(100)
)

In [None]:
# df_results_valid[df_results_valid.return_three_month < -0.2]

In [None]:
# df_results_valid[df_results_valid.subindustry == 'consumer_services'].iloc[0:100]

In [3]:
# Summary statistics of the "final_one" validation run: overall return columns,
# "_ap", and the per-industry top-5 three-month return columns.
vld = pd.read_csv(
    "data/experiments/validation/final_one_validation_pipeline_60_True_2_6_1000_5_0.4_0.005_1.csv"
)
(
    vld.loc[
        :,
        [
            "_pos_pred_ret_3mo",
            "_fp_ret_3mo",
            "_ap",
            "banksbanks_top5_ret_3mo",
            "capitgoods_top5_ret_3mo",
            "chemiicals_top5_ret_3mo",
            "divercials_top5_ret_3mo",
            "energnergy_top5_ret_3mo",
            "semicpment_top5_ret_3mo",
            "softwvices_top5_ret_3mo",
            "technpment_top5_ret_3mo",
            "transation_top5_ret_3mo",
            "utiliities_top5_ret_3mo",
        ],
    ]
    .describe()
    # Only mean, min, quartiles and max are shown.
    .drop(["std", "count"], axis="index")
)

Unnamed: 0,_pos_pred_ret_3mo,_fp_ret_3mo,_ap,banksbanks_top5_ret_3mo,capitgoods_top5_ret_3mo,chemiicals_top5_ret_3mo,divercials_top5_ret_3mo,energnergy_top5_ret_3mo,semicpment_top5_ret_3mo,softwvices_top5_ret_3mo,technpment_top5_ret_3mo,transation_top5_ret_3mo,utiliities_top5_ret_3mo
mean,-0.000545,-0.041131,0.324907,0.010408,-0.012968,0.021529,-0.026084,0.014053,0.000707,-0.006556,0.021094,-0.004161,0.009587
min,-0.112,-0.134,0.044,-0.132,-0.228,-0.262,-0.308,-0.255,-0.293,-0.136,-0.181,-0.201,-0.089
25%,-0.0245,-0.056,0.205,-0.01275,-0.056,-0.04,-0.0635,-0.027,-0.0495,-0.034,-0.0235,-0.0425,-0.01925
50%,0.002,-0.032,0.274,0.006,-0.004,0.02,-0.017,0.012,0.015,-0.009,0.01,0.002,0.005
75%,0.016,-0.0215,0.397,0.0295,0.02475,0.054,0.017,0.042,0.0455,0.0195,0.03625,0.0415,0.042
max,0.164,0.038,1.0,0.176,0.203,0.376,0.146,0.229,0.18,0.138,0.359,0.126,0.104


In [37]:
# Summary statistics of the "new_adf_recent_adf_included" validation run:
# overall return columns, "_ap", and the per-industry top-5 three-month return
# columns.
vld = pd.read_csv(
    "data/experiments/validation/new_adf_recent_adf_included_validation_pipeline_64_False_2_6_1000_5_0.4_0.005_1.csv"
)
(
    vld.loc[
        :,
        [
            "_pos_pred_ret_3mo",
            "_fp_ret_3mo",
            "_ap",
            "banksbanks_top5_ret_3mo",
            "capitgoods_top5_ret_3mo",
            "chemiicals_top5_ret_3mo",
            "divercials_top5_ret_3mo",
            "energnergy_top5_ret_3mo",
            "semicpment_top5_ret_3mo",
            "softwvices_top5_ret_3mo",
            "technpment_top5_ret_3mo",
            "transation_top5_ret_3mo",
            "utiliities_top5_ret_3mo",
        ],
    ]
    .describe()
    # Only mean, min, quartiles and max are shown.
    .drop(["std", "count"], axis="index")
)

Unnamed: 0,_pos_pred_ret_3mo,_fp_ret_3mo,_ap,banksbanks_top5_ret_3mo,capitgoods_top5_ret_3mo,chemiicals_top5_ret_3mo,divercials_top5_ret_3mo,energnergy_top5_ret_3mo,semicpment_top5_ret_3mo,softwvices_top5_ret_3mo,technpment_top5_ret_3mo,transation_top5_ret_3mo,utiliities_top5_ret_3mo
mean,-0.009548,-0.046613,0.319315,-0.002054,-0.01575,0.017282,-0.030595,0.031155,0.004512,-0.010848,0.031333,-0.012696,0.010837
min,-0.485,-0.597,0.062,-0.095,-0.213,-0.166,-0.273,-0.217,-0.305,-0.112,-0.181,-0.188,-0.093
25%,-0.03,-0.052,0.209,-0.024,-0.04975,-0.03,-0.057,-0.02475,-0.02425,-0.044,-0.0245,-0.0485,-0.01675
50%,0.0,-0.036,0.292,-0.004,-0.002,0.023,-0.023,0.0265,0.01,-0.011,0.01,-0.014,-0.0005
75%,0.016,-0.02,0.391,0.021,0.03,0.055,0.01025,0.07125,0.0545,0.0165,0.0485,0.047,0.03825
max,0.112,0.02,0.773,0.102,0.176,0.194,0.065,0.33,0.139,0.177,1.178,0.114,0.151


In [36]:
# Summary statistics of the "dynamic-train-window-sparse" validation run:
# overall return columns, "_ap", and the per-industry top-5 three-month return
# columns.
vld = pd.read_csv(
    "data/experiments/validation/dynamic-train-window-sparse_validation_pipeline_62_False_2_6_1000_5_0.4_0.005_1.csv"
)
(
    vld.loc[
        :,
        [
            "_pos_pred_ret_3mo",
            "_fp_ret_3mo",
            "_ap",
            "banksbanks_top5_ret_3mo",
            "capitgoods_top5_ret_3mo",
            "chemiicals_top5_ret_3mo",
            "divercials_top5_ret_3mo",
            "energnergy_top5_ret_3mo",
            "semicpment_top5_ret_3mo",
            "softwvices_top5_ret_3mo",
            "technpment_top5_ret_3mo",
            "transation_top5_ret_3mo",
            "utiliities_top5_ret_3mo",
        ],
    ]
    .describe()
    # Only mean, min, quartiles and max are shown.
    .drop(["std", "count"], axis="index")
)

Unnamed: 0,_pos_pred_ret_3mo,_fp_ret_3mo,_ap,banksbanks_top5_ret_3mo,capitgoods_top5_ret_3mo,chemiicals_top5_ret_3mo,divercials_top5_ret_3mo,energnergy_top5_ret_3mo,semicpment_top5_ret_3mo,softwvices_top5_ret_3mo,technpment_top5_ret_3mo,transation_top5_ret_3mo,utiliities_top5_ret_3mo
mean,0.047659,-0.027708,0.35029,0.004711,-0.002697,0.010904,-0.004976,0.02812,-0.002802,0.000348,0.019851,0.006367,0.006594
min,-0.094,-0.101,0.0,-0.104,-0.19,-0.106,-0.146,-0.19,-0.226,-0.271,-0.675,-0.218,-0.064
25%,-0.02025,-0.04625,0.167,-0.02675,-0.048,-0.028,-0.04325,-0.042,-0.03,-0.032,-0.0275,-0.034,-0.02
50%,0.0155,-0.021,0.321,-0.0025,-0.002,0.001,-0.0025,0.003,0.0,0.003,0.032,0.0115,0.005
75%,0.04425,-0.003,0.473,0.0305,0.035,0.047,0.033,0.0675,0.048,0.053,0.077,0.0405,0.02
max,1.309,0.036,1.0,0.153,0.261,0.162,0.136,0.558,0.138,0.164,0.824,0.228,0.088
