# Imports

In [None]:
# One-off environment setup, kept commented out so "Run All" does not reinstall packages.
# NOTE(review): if these are re-enabled, prefer `%pip` over `!pip` (it targets the
# kernel's environment) and pin versions so the run stays reproducible.
# !pip install yfinance
# !pip install pmdarima
# !pip install hyperopt
# !pip install xgboost
# !pip install numpy -U

In [2]:
import os
from functools import partial
from datetime import datetime

import typing
from typing import Dict
from typing import Any
from typing import Tuple

import numpy as np
import pandas as pd
import sklearn

import utils
import pipelines
import processing
import evaluate
import predict
import train
import preprocessing

# Download stock daily prices & indexes

In [3]:
### Build the ticker universe, filtered by market cap and with the two
### health-care industries below removed.
# `market_cap_max_mm = None` is presumably "no upper bound" (the output directory
# name built later uses "max" for it) - TODO confirm against utils.get_ticker_names.
market_cap_min_mm = 1000
market_cap_max_mm = None

excluded_industries = [
    "pharmaceuticals_biotechnology_and_life_sciences",
    "health_care_equipment_and_services",
]

ticker_list = utils.get_ticker_names(
    market_cap_min_mm=market_cap_min_mm,
    market_cap_max_mm=market_cap_max_mm,
    remove_industries=excluded_industries,
)

In [4]:
### Option A: pin the end date explicitly, as datetime(year, month, day)
# date_to = datetime(2021, 1, 18)
### Option B: use today's date (NOTE: the downloaded window then depends on when the notebook is run)
date_to = datetime.today()
### Number of years of data to download, counting backwards from `date_to`. May be a floating point number.
period_years = 6

In [5]:
# Daily prices for every ticker in the universe (no explicit file-name prefix).
_, _ = utils.download_stonk_prices(
    ticker_list.index, period_years=period_years, date_to=date_to
)

# Market index / macro series: each symbol is downloaded on its own and its
# key is passed through as `fname_prefix`. Dict order fixes the download order.
index_symbols = {
    "vix": "^VIX",
    "sp500": "^GSPC",
    "oil": "CL=F",
    "usd": "DX=F",
    "yield": "^TNX",
    "copper": "HG=F",
}
for prefix, symbol in index_symbols.items():
    _, _ = utils.download_stonk_prices(
        [symbol], period_years=period_years, date_to=date_to, fname_prefix=prefix
    )

[*********************100%***********************]  2289 of 2289 completed

31 Failed downloads:
- RLGY: No data found, symbol may be delisted
- MIME: No data found, symbol may be delisted
- NCBS: No data found, symbol may be delisted
- BIP.PRA: No data found, symbol may be delisted
- O.WI: No data found, symbol may be delisted
- MSP: No data found, symbol may be delisted
- PLAN: No data found, symbol may be delisted
- DELL WI: No data found, symbol may be delisted
- FB: No data found, symbol may be delisted
- APSG: No data found, symbol may be delisted
- BIP.PRB: No data found, symbol may be delisted
- OAS: No data found, symbol may be delisted
- ZNGA: No data found, symbol may be delisted
- CDK: No data found, symbol may be delisted
- ANAT: No data found, symbol may be delisted
- SGMS: No data found, symbol may be delisted
- FOE: No data found, symbol may be delisted
- DIDI: No data found, symbol may be delisted
- TSC: No data found, symbol may be delisted
- EPAY: No data found, symb

# Run data pipeline

In [25]:
# Industries to run the pipeline for, one pass per entry. The two health-care
# industries stay commented out; they are also removed from the ticker list.
industries = [
    # 'health_care_equipment_and_services',
    "software_and_services",
    "retailing",
    "telecommunication_services",
    "capital_goods",
    "energy",
    # 'pharmaceuticals_biotechnology_and_life_sciences',
    "consumer_staples",
    "banks",
    "diversified_financials",
    "metals_and_mining",
    "technology_hardware_and_equipment",
    "utilities",
    "chemicals",
    "automobiles_and_components",
    "semiconductors_and_semiconductor_equipment",
    "media_and_entertainment",
    "real_estate",
    "consumer_services",
    "consumer_durables_and_apparel",
    "insurance",
    "transportation",
    "commercial_and_professional_services",
    "paper_and_forest_products",
    "containers_and_packaging",
    "construction_materials",
]

# Settings forwarded by keyword, under the same names, to
# pipelines.process_features_from_price_data in the per-industry loop.
l_reg = 3
l_roll = 2
dt = 10
last_residual_cutoff = 2.5
adf_pval_cutoff = 0.1
adf_pass_rate_filter = 0.5
mean_max_residual_dt = 21
arima_forecast_months = 3
arima_eval_models = 5

# Output location: data/pipeline_run_<min>_to_<max>_cap, where an unset upper
# bound is spelled "max".
market_cap_max_string = str(market_cap_max_mm) if market_cap_max_mm is not None else "max"
pipeline_dir = f"pipeline_run_{market_cap_min_mm}_to_{market_cap_max_string}_cap"
output_dir = os.path.join("data", pipeline_dir)

stonk_model = predict.XGBStonkModel()

market_indexes = utils.get_market_indexes()

In [26]:
# Per-industry run: load prices, build pair features, score them with the
# model and write every intermediate table to `output_dir` as CSV.
datasets = []
total_industries = len(industries)

# DataFrame.to_csv does not create missing parent directories, so make sure
# the run's output directory exists before the first write.
os.makedirs(output_dir, exist_ok=True)

for i, industry in enumerate(industries, start=1):
    stonks = utils.get_stonk_data(
        market_cap_min_mm,
        market_cap_max_mm,
        remove_industries=[industry],
        filter_industries=True,
    )
    X, Y = processing.combine_stonk_pairs(stonks)

    print("Industry ({0}/{1}): {2}".format(i, total_industries, industry))

    features = pipelines.process_features_from_price_data(
        X=X,
        Y=Y,
        market_indexes=market_indexes,
        l_reg=l_reg,
        l_roll=l_roll,
        dt=dt,
        last_residual_cutoff=last_residual_cutoff,
        adf_pval_cutoff=adf_pval_cutoff,
        adf_pass_rate_filter=adf_pass_rate_filter,
        mean_max_residual_dt=mean_max_residual_dt,
        arima_forecast_months=arima_forecast_months,
        arima_eval_models=arima_eval_models,
    )

    # An empty result means the pipeline produced nothing to score for this industry.
    if len(features) == 0:
        print("No trades")
        continue

    print(
        "Mean max value for {0}: {1}".format(industry, features["residuals_max_mean"])
    )
    dataset = utils.build_dataset_from_live_data_by_industry(
        std_residuals=features["std_residuals"],
        adfs=features["adfs"],
        subindustry=industry,
        mean_max_residual=features["residuals_max_mean"],
        residual_quantile=features["residuals_quantile"],
        # VIX value at the last column label of the price table
        vix_index=market_indexes["vix"].loc[stonks.columns[-1]],
        betas_stability_rsquared=features["beta_stability_rsquared_vals"],
        arima_forecasts=features["arima_forecasts"],
    )

    predictions, df_processed = stonk_model.predict(dataset)
    datasets.append((dataset, df_processed))
    predictions = pd.DataFrame(predictions)
    predictions.index = features["adfs"].index

    # Prepend the dates as the first column of the two tables (mutates them in place).
    features["residuals"].insert(0, "dates", features["dates_index"].values)
    features["betas"].insert(0, "dates", features["dates_index"].values)

    # (table, file-name suffix, write header row?) - only the correlations
    # table is written with its header.
    csv_outputs = [
        (features["residuals"], "_residuals.csv", False),
        (features["betas"], "_betas.csv", False),
        (features["adfs_raw"], "_adfs_raw.csv", False),
        (predictions, "_predictions.csv", False),
        (features["arima_forecasts"], "_arima.csv", False),
        (features["beta_stability_rsquared_vals"], "_rsquared.csv", False),
        (features["market_correlations"], "_correlations.csv", True),
    ]
    for table, suffix, write_header in csv_outputs:
        table.to_csv(
            os.path.join(output_dir, industry + suffix),
            header=write_header,
            index=True,
        )

print("*** All done ***")

Industry (1/24): software_and_services
Mean max value for software_and_services: 4.085000038146973
Industry (2/24): retailing
Mean max value for capital_goods: 4.089000225067139
Industry (5/24): energy
Mean max value for energy: 3.9860000610351562
Industry (6/24): consumer_staples
Mean max value for consumer_staples: 3.615999937057495
Industry (7/24): banks
Mean max value for banks: 4.0279998779296875
Industry (8/24): diversified_financials
Mean max value for diversified_financials: 3.4040000438690186
Industry (9/24): metals_and_mining
Mean max value for metals_and_mining: 3.1080000400543213
Industry (10/24): technology_hardware_and_equipment
Mean max value for technology_hardware_and_equipment: 4.026000022888184
Industry (11/24): utilities
Mean max value for utilities: 3.318000078201294
Industry (12/24): chemicals
Mean max value for chemicals: 3.25
Industry (13/24): automobiles_and_components
Mean max value for automobiles_and_components: 2.7339999675750732
Industry (14/24): semicondu

# Data collection

In [6]:
# Load the unfiltered price table and keep only the columns up to and including
# the "2022-09-05" label (label-based .loc slices are end-inclusive).
stonks = utils.get_stonk_data(disable_filter=True).loc[:, :"2022-09-05"]

In [None]:
# Run the rolling data-collection pipeline over the price table loaded above.
# NOTE(review): the literal settings below repeat the values assigned in the
# "Run data pipeline" configuration cell (l_reg, l_roll, dt, the cutoffs and
# ARIMA settings) as well as the market-cap bounds and excluded industries
# used for the ticker list; keep them in sync if either place changes.
pipelines.data_collection_rolling_pipeline(
    stonk_prices=stonks,
    l_reg=3,
    l_roll=2,
    dt=10,
    market_cap_min_mm=1000,
    market_cap_max_mm=None,
    last_residual_cutoff=2.5,
    mean_max_residual_dt=21,
    adf_pval_cutoff=0.1,
    adf_pass_rate_filter=0.5,
    arima_forecast_months=3,
    arima_eval_models=5,
    trade_length_months=3,
    trading_interval_weeks=2,
    remove_industries=[
        "pharmaceuticals_biotechnology_and_life_sciences",
        "health_care_equipment_and_services"
    ],
    first_n_windows=1,
)

In [8]:
from utils import map_subindustries_to_industries

# Read the trade records written by the data-collection pipeline.
pipeline_output_dir = "data/data_collection_pipeline/1000_to_max/"
dataset = utils.ingest_trade_pipeline_outputs(data_dir=pipeline_output_dir)

# First row of the saved VIX table, looked up by each row's trade date.
vix = utils.get_stonk_data(fname_prefix="vix", disable_filter=True).iloc[0]
dataset["vix"] = dataset["trade_date"].apply(lambda trade_date: vix.loc[trade_date])

# Row-wise derived columns.
dataset.loc[:, "industry"] = dataset.apply(
    map_subindustries_to_industries, axis=1
)
dataset.loc[:, "arima_forecast_normalized"] = dataset.apply(
    utils.normalize_arima_forecast, axis=1
)

# Attach to every row the 0.9 quantile of |last_residual| within its
# (trade_date, subindustry) group, then persist the combined table.
groups_with_quantile = []
for date, date_group in dataset.groupby(by="trade_date"):
    for industry, industry_group in date_group.groupby(by="subindustry"):
        residual_quantile = industry_group["last_residual"].abs().quantile(q=0.9)
        # .assign returns a new frame with the scalar broadcast to every row,
        # so nothing is written into a groupby slice (writing through .loc on
        # such a slice can trigger SettingWithCopyWarning).
        groups_with_quantile.append(
            industry_group.assign(residual_quantile=residual_quantile)
        )

new_dataset = pd.concat(groups_with_quantile)

new_dataset.to_csv("data/dataset_bigcap.csv", header=True, index=False)

# Model development

In [7]:
import xgboost as xgb
from hyperopt import STATUS_OK, STATUS_FAIL, Trials, fmin, hp, tpe, atpe, rand
import pickle

In [8]:
# Reload the already-imported `preprocessing` module so that edits made to
# preprocessing.py are picked up without restarting the kernel.
import importlib
importlib.reload(preprocessing)

<module 'preprocessing' from '/home/jupyter/stonk-rank/preprocessing.py'>

In [None]:
# Run the model validation pipeline on the labelled trade dataset.
# NOTE(review): in the visible notebook `df` is only assigned in the cell that
# follows this one (read from "data/dataset_bigcap.csv" and filtered), so on a
# fresh kernel that cell has to be run first or this call raises NameError.
validation_results = pipelines.model_validation_pipeline(
    dataset=df,
    filename_prefix="quantile-test-meanmax-no_scaling",
    fixed_train_window_size=False,
    data_window_max_train_size=62,
    data_window_test_size=2,
    data_window_gap_size=6,
    hp_model_evals=1000,
    top_n_best_trades=5,
    min_industry_confidence=0.4,
    random_noise=0.005,
    hp_nth_best_model=1,
    verbose=False
)

In [9]:
# Load the collected trades, keep rows with a positive beta and a latest
# residual of at least 2.5 in absolute value, then pass the result through
# preprocessing.assign_labels (later cells read a `label` column from `df`).
trades_raw = pd.read_csv("data/dataset_bigcap.csv")
keep_rows = (trades_raw.beta > 0) & (trades_raw.last_residual.abs() >= 2.5)
df = preprocessing.assign_labels(trades_raw[keep_rows])

In [10]:
# Keep only pairs where both legs are still in the current ticker list.
updated_ticker_list = utils.get_ticker_names(
    1000,
    None,
    remove_industries=[
        "health_care_equipment_and_services",
        "pharmaceuticals_biotechnology_and_life_sciences",
    ],
)
valid_tickers = updated_ticker_list.index
df = df[df.ticker_x.isin(valid_tickers) & df.ticker_y.isin(valid_tickers)]

## Production model training

In [23]:
# Select the most recent `train_size` trade dates and shuffle their rows.
train_size = 60
# Fixed seed so the shuffle (and anything downstream that depends on row
# order) is reproducible, matching the explicit random_state values used for
# the data split and the hyper-parameter search.
shuffle_seed = 42
selected_dates = np.sort(df["trade_date"].unique())[-train_size:]
df_prod = df[df.trade_date.isin(selected_dates)].sample(
    frac=1, random_state=shuffle_seed
)
print(len(df_prod))
print(df_prod["label"].value_counts())

67646
0    53775
1    13871
Name: label, dtype: int64


In [24]:
# Train the production model on the shuffled `df_prod` selection.
# NOTE(review): `params` is assigned further down, in the "Model training
# experiments" section; that cell must have been run before this one.
clf_prod, scalers_prod = train.train_production_xgb(df_prod, params, noise_level=0.005)

[0]	validation_0-logloss:0.68224
[1]	validation_0-logloss:0.67311
[2]	validation_0-logloss:0.66524
[3]	validation_0-logloss:0.65843
[4]	validation_0-logloss:0.65282
[5]	validation_0-logloss:0.64780
[6]	validation_0-logloss:0.64332
[7]	validation_0-logloss:0.63927
[8]	validation_0-logloss:0.63581
[9]	validation_0-logloss:0.63276
[10]	validation_0-logloss:0.62995
[11]	validation_0-logloss:0.62543
[12]	validation_0-logloss:0.62210
[13]	validation_0-logloss:0.61925
[14]	validation_0-logloss:0.61694
[15]	validation_0-logloss:0.61448
[16]	validation_0-logloss:0.61255
[17]	validation_0-logloss:0.61102
[18]	validation_0-logloss:0.60915
[19]	validation_0-logloss:0.60756
[20]	validation_0-logloss:0.60630
[21]	validation_0-logloss:0.60496
[22]	validation_0-logloss:0.60349
[23]	validation_0-logloss:0.60197
[24]	validation_0-logloss:0.60088
[25]	validation_0-logloss:0.59941
[26]	validation_0-logloss:0.59861
[27]	validation_0-logloss:0.59721
[28]	validation_0-logloss:0.59630
[29]	validation_0-loglos

## Model training experiments

In [9]:
# Reload the already-imported `preprocessing` module so that edits made to
# preprocessing.py are picked up without restarting the kernel.
import importlib
importlib.reload(preprocessing)

<module 'preprocessing' from '/home/jupyter/stonk-rank/preprocessing.py'>

In [16]:
# Split `df` into train / validation sets with a fixed random_state, then
# report the row count and label balance of each (sizes first, then balances).
splits = preprocessing.split_data(
    df,
    date_count_train=60,
    date_count_valid=2,
    date_count_gap=6,
    random_state=3303544,
)
split_names = ("train", "validation")
for split_name in split_names:
    print(len(splits[split_name]))
for split_name in split_names:
    print(splits[split_name]["label"].value_counts())

65644
1834
0    51651
1    13993
Name: label, dtype: int64
0    1459
1     375
Name: label, dtype: int64


In [17]:
train_split = splits["train"]
valid_split = splits["validation"]

# Training features get noise_level=0.005; the scalers returned here are
# passed on to the validation transform, which uses noise_level=0.
X_train, scalers = preprocessing.transform_features(train_split, noise_level=0.005)
X_valid, _ = preprocessing.transform_features(
    valid_split, scalers=scalers, noise_level=0
)

y_train = train_split["label"]
y_valid = valid_split["label"]

In [13]:
# Hyper-parameter search over `df`: 1000 evaluations, trial name "new-model#2".
df_trial_results = train.model_hp_search(
    df,
    n_evals=1000,
    fixed_train_window_size=True,
    max_train_window_size=60,
    trial_name="new-model#2",
    additive_random_noise=0.005,
    train_window_min_size=58,
    train_window_stride=1,
    write_csv=True,
    random_state=420,
    data_dir="data",
    output_dir="experiments",
)

 33%|███▎      | 334/1000 [01:34<02:52,  3.87trial/s, best loss: -0.4780931966324698]

  recall = tps / tps[-1]



 46%|████▌     | 455/1000 [02:12<02:39,  3.43trial/s, best loss: -0.4780931966324698]

  recall = tps / tps[-1]



100%|██████████| 1000/1000 [05:17<00:00,  3.15trial/s, best loss: -0.4780931966324698]


In [12]:
# Ten best trials of the "new-model#1" search, ranked by `ap` (highest first).
trials_path = "data/experiments/new-model#1.csv"
df_trials = pd.read_csv(trials_path)
df_trials = df_trials.sort_values(by="ap", ascending=False)
df_trials.head(10)

Unnamed: 0,gamma,max_delta_step,max_depth,min_child_weight,n_estimators,scale_pos_weight,train_window_size,f1_score,precision,ap,auc,pos_preds
0,2.555426,2.0,5.0,2.0,51.0,3.633763,60.0,0.344737,0.34026,0.506792,0.637368,385
1,1.905858,2.0,5.0,1.0,47.0,3.558655,60.0,0.353093,0.341646,0.490446,0.637844,401
2,2.06549,2.0,5.0,2.0,62.0,3.363152,60.0,0.357702,0.350384,0.481367,0.624459,391
3,2.299361,2.0,4.0,6.0,64.0,3.569238,60.0,0.333333,0.314421,0.468814,0.627862,423
4,2.828815,2.0,5.0,2.0,58.0,3.492441,60.0,0.338667,0.338667,0.461864,0.626416,375
5,2.43719,2.0,5.0,2.0,44.0,3.698415,60.0,0.346701,0.336683,0.461708,0.637959,398
6,0.623768,1.0,5.0,3.0,78.0,3.659642,60.0,0.349323,0.324201,0.460276,0.621932,438
7,3.514314,1.0,5.0,1.0,36.0,3.58109,60.0,0.343381,0.337629,0.459496,0.632612,388
8,2.444613,2.0,5.0,2.0,57.0,3.690582,60.0,0.37123,0.328542,0.459381,0.632417,487
9,2.811403,2.0,4.0,2.0,66.0,3.42425,58.0,0.349563,0.328638,0.458112,0.62563,426


In [14]:
# Ten best trials of the "new-model#2" search, ranked by `ap` (highest first).
trials_path = "data/experiments/new-model#2.csv"
df_trials = pd.read_csv(trials_path)
df_trials = df_trials.sort_values(by="ap", ascending=False)
df_trials.head(10)

Unnamed: 0,gamma,max_delta_step,max_depth,min_child_weight,n_estimators,scale_pos_weight,f1_score,precision,ap,auc,pos_preds
0,1.764003,3.0,5.0,6.0,66.0,3.524526,0.35101,0.333333,0.478093,0.613252,417
1,2.145694,2.0,4.0,6.0,58.0,3.702857,0.349741,0.34005,0.475609,0.626574,397
2,0.864657,4.0,5.0,1.0,55.0,3.552967,0.346106,0.322581,0.472435,0.622281,434
3,1.094405,4.0,5.0,1.0,58.0,3.55415,0.343826,0.314856,0.472211,0.617714,451
4,1.429453,4.0,5.0,1.0,61.0,3.517229,0.33463,0.325758,0.467992,0.62326,396
5,1.550489,4.0,5.0,6.0,65.0,3.46369,0.333333,0.330709,0.467825,0.617097,381
6,1.754838,3.0,4.0,6.0,69.0,3.480013,0.32732,0.316708,0.467468,0.616065,401
7,1.163131,1.0,3.0,1.0,61.0,3.806049,0.333333,0.330709,0.462433,0.627749,381
8,0.442671,2.0,5.0,1.0,57.0,3.480881,0.3329,0.324873,0.462165,0.624346,394
9,1.199104,4.0,5.0,5.0,62.0,3.451061,0.349081,0.343669,0.461025,0.62695,387


In [22]:
# The seed is drawn fresh on every run and is not recorded anywhere, so
# re-running this cell can train a different model.
model_seed = np.random.randint(999929)

# gamma, scale_pos_weight, max_depth, min_child_weight, max_delta_step and
# n_estimators match the top-ranked row of the "new-model#1" trials table.
params = {
    "gamma": 2.555426,
    "scale_pos_weight": 3.633763,
    "max_depth": 5,
    "min_child_weight": 2,
    "max_delta_step": 2,
    "colsample_bylevel": 1,
    "n_estimators": 51,
    "learning_rate": 0.1,
    "subsample": 1,
    "tree_method": "hist",
    "enable_categorical": True,
    "max_cat_to_onehot": 1,
    "eval_metric": ["logloss"],
    "random_state": model_seed,
}

# eval_set order: validation data first (logged as validation_0), then the
# training data (logged as validation_1).
clf = xgb.XGBClassifier(**params).fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid), (X_train, y_train)],
)

[0]	validation_0-logloss:0.68372	validation_1-logloss:0.68259
[1]	validation_0-logloss:0.67546	validation_1-logloss:0.67390
[2]	validation_0-logloss:0.66861	validation_1-logloss:0.66618
[3]	validation_0-logloss:0.66099	validation_1-logloss:0.65798
[4]	validation_0-logloss:0.65331	validation_1-logloss:0.65110
[5]	validation_0-logloss:0.64371	validation_1-logloss:0.64609
[6]	validation_0-logloss:0.63901	validation_1-logloss:0.64075
[7]	validation_0-logloss:0.63271	validation_1-logloss:0.63619
[8]	validation_0-logloss:0.62778	validation_1-logloss:0.63274
[9]	validation_0-logloss:0.62115	validation_1-logloss:0.62849
[10]	validation_0-logloss:0.61398	validation_1-logloss:0.62518
[11]	validation_0-logloss:0.61012	validation_1-logloss:0.62254
[12]	validation_0-logloss:0.60751	validation_1-logloss:0.61999
[13]	validation_0-logloss:0.60464	validation_1-logloss:0.61751
[14]	validation_0-logloss:0.60275	validation_1-logloss:0.61581
[15]	validation_0-logloss:0.59832	validation_1-logloss:0.61319
[1

In [19]:
print("**Validation**")

# Positive-class probability, turned into a hard prediction at `thres`.
thres = 0.5
y_score = clf.predict_proba(X_valid)[:, 1]
y_preds = y_score > thres

# New frame: the validation split plus the score and prediction columns.
df_results_valid = splits["validation"].assign(score=y_score, prediction=y_preds)

_ = evaluate.performance_summary(
    y_score=y_score,
    y_preds=y_preds,
    y_true=y_valid,
    auc_cutoff=0.5,
    verbose=True,
)

_ = evaluate.returns_on_predictions(df_results_valid, verbose=True)

# _ = evaluate.performance_on_slice(df_results_valid, "subindustry")

**Validation**
Precision: 0.344559585492228
PR-AUC/AP score: 0.4195465726625667
ROC-AUC score: 0.6278537811286269
Total positive predictions: 386
Total positive labels: 375

Totals:
        prediction
result            
FN             242
FP             253
TN            1206
TP             133

Means:
        return_one_month  return_two_month  return_three_month
result                                                        
FN              0.079471          0.100996            0.096223
FP              0.014866          0.005739           -0.009040
TN              0.005054         -0.007694           -0.029577
TP              0.037887          0.072977            0.098218

Std:
        return_one_month  return_two_month  return_three_month
result                                                        
FN              0.063946          0.054098            0.064299
FP              0.043778          0.055409            0.074640
TN              0.049565          0.066301            0.0742

In [46]:
# pd.set_option("display.max_rows", 200)
# evaluate.performance_on_trading_use_case(
#         df_results_valid, top_n_trades=5, min_industry_score=0.4
#     )

In [31]:
for name, importance in zip(clf.feature_names_in_, clf.feature_importances_):
    print(name, importance)

adf_pass_rate 0.029732222
last_residual 0.08531821
residual_mean_max 0.0812026
industry 0.118948996
vix 0.26275504
betas_rsquared 0.14753708
arima_forecast_normalized 0.21244633
residual_inter 0.062059514


In [44]:
# Let pandas display up to 200 rows so the row listings below are not truncated.
pd.set_option("display.max_rows", 200)

In [None]:
# First 100 false positives, without the columns that are not needed for inspection.
false_positives = df_results_valid[df_results_valid.result == "FP"]
false_positives.drop(
    columns=["beta", "intercept", "data_window_start", "label", "prediction"]
).head(100)

In [None]:
# df_results_valid[df_results_valid.return_three_month < -0.2]

In [None]:
# df_results_valid[df_results_valid.subindustry == 'consumer_services'].iloc[0:100]

In [7]:
# Summary statistics (without std / count) of selected columns of the
# "quantile-test-quantile-no_scaling" validation run.
summary_cols = [
    "_pos_pred_ret_3mo",
    "_fp_ret_3mo",
    "_ap",
    "banksbanks_top5_ret_3mo",
    "capitgoods_top5_ret_3mo",
    "chemiicals_top5_ret_3mo",
    "divercials_top5_ret_3mo",
    "energnergy_top5_ret_3mo",
    "semicpment_top5_ret_3mo",
    "softwvices_top5_ret_3mo",
    "technpment_top5_ret_3mo",
    "transation_top5_ret_3mo",
    "utiliities_top5_ret_3mo",
]
vld = pd.read_csv("data/experiments/validation/quantile-test-quantile-no_scaling_validation_pipeline_62_False_2_6_1000_5_0.4_0.005_1.csv")
vld[summary_cols].describe().drop(index=["std", "count"])

Unnamed: 0,_pos_pred_ret_3mo,_fp_ret_3mo,_ap,banksbanks_top5_ret_3mo,capitgoods_top5_ret_3mo,chemiicals_top5_ret_3mo,divercials_top5_ret_3mo,energnergy_top5_ret_3mo,semicpment_top5_ret_3mo,softwvices_top5_ret_3mo,technpment_top5_ret_3mo,transation_top5_ret_3mo,utiliities_top5_ret_3mo
mean,0.006988,-0.032604,0.25776,0.006302,-0.009065,0.015985,-0.013571,0.015233,-0.000581,-0.012946,-0.001393,0.014352,0.006
min,-0.147,-0.176,0.0,-0.103,-0.212,-0.143,-0.229,-0.202,-0.15,-0.467,-0.667,-0.271,-0.084
25%,-0.01925,-0.05525,0.1175,-0.011,-0.0495,-0.02575,-0.03025,-0.047,-0.029,-0.056,-0.02,-0.01825,-0.01825
50%,0.006,-0.022,0.2525,0.004,0.0,0.0185,-0.014,0.007,0.01,-0.004,0.021,0.018,0.006
75%,0.03275,-0.0055,0.372,0.025,0.044,0.05825,0.0205,0.045,0.042,0.046,0.064,0.05125,0.027
max,0.168,0.025,1.0,0.157,0.14,0.188,0.103,0.343,0.157,0.149,0.22,0.182,0.077


In [8]:
# Summary statistics (without std / count) of selected columns of the
# "quantile-test-quantile" validation run.
summary_cols = [
    "_pos_pred_ret_3mo",
    "_fp_ret_3mo",
    "_ap",
    "banksbanks_top5_ret_3mo",
    "capitgoods_top5_ret_3mo",
    "chemiicals_top5_ret_3mo",
    "divercials_top5_ret_3mo",
    "energnergy_top5_ret_3mo",
    "semicpment_top5_ret_3mo",
    "softwvices_top5_ret_3mo",
    "technpment_top5_ret_3mo",
    "transation_top5_ret_3mo",
    "utiliities_top5_ret_3mo",
]
vld = pd.read_csv("data/experiments/validation/quantile-test-quantile_validation_pipeline_62_False_2_6_1000_5_0.4_0.005_1.csv")
vld[summary_cols].describe().drop(index=["std", "count"])

Unnamed: 0,_pos_pred_ret_3mo,_fp_ret_3mo,_ap,banksbanks_top5_ret_3mo,capitgoods_top5_ret_3mo,chemiicals_top5_ret_3mo,divercials_top5_ret_3mo,energnergy_top5_ret_3mo,semicpment_top5_ret_3mo,softwvices_top5_ret_3mo,technpment_top5_ret_3mo,transation_top5_ret_3mo,utiliities_top5_ret_3mo
mean,-0.010663,-0.046792,0.261427,0.001761,-0.022663,0.025,-0.014272,0.030567,-0.00582,-0.017899,0.020484,0.001739,0.005466
min,-0.236,-0.236,0.0,-0.131,-0.31,-0.219,-0.223,-0.145,-0.331,-0.584,-0.429,-0.53,-0.075
25%,-0.04975,-0.0715,0.122,-0.0315,-0.066,-0.02,-0.056,-0.0385,-0.046,-0.042,-0.012,-0.0295,-0.021
50%,0.0035,-0.0255,0.265,-0.0015,-0.015,0.027,-0.007,-0.002,0.014,0.001,0.023,0.0125,0.013
75%,0.02775,-0.009,0.365,0.0335,0.024,0.057,0.016,0.1,0.042,0.033,0.078,0.0585,0.028
max,0.162,0.025,0.832,0.153,0.208,0.355,0.103,0.387,0.12,0.12,0.272,0.267,0.107


In [3]:
# vld = pd.read_csv("data/experiments/validation/dynamic-train-window-sparse_validation_pipeline_62_False_2_6_1000_5_0.4_0.005_1.csv")
# vld[["_pos_pred_ret_3mo", "_fp_ret_3mo", "_ap", "banksbanks_top5_ret_3mo", "capitgoods_top5_ret_3mo", "chemiicals_top5_ret_3mo", "divercials_top5_ret_3mo", "energnergy_top5_ret_3mo", "semicpment_top5_ret_3mo", "softwvices_top5_ret_3mo", "technpment_top5_ret_3mo", "transation_top5_ret_3mo", "utiliities_top5_ret_3mo"]].describe().drop(index=["std", "count"])