# Imports

In [None]:
# !pip install yfinance
# !pip install pmdarima
# !pip install hyperopt
# !pip install xgboost
# !pip install numpy -U

In [1]:
import os
from functools import partial
from datetime import datetime

import typing
from typing import Dict
from typing import Any
from typing import Tuple

import numpy as np
import pandas as pd
import sklearn

import utils
import pipelines
import processing
import evaluate
import predict
import preprocessing

# Download stock daily prices & indexes

In [80]:
### Fetch ticker names, filtered by market cap: a lower bound of 1000 is
### passed as `market_cap_min_mm` and no upper bound is applied.
ticker_list = utils.get_ticker_names(
    market_cap_max_mm=None,
    market_cap_min_mm=1000,
)

In [81]:
### Number of years of data to download, counted backwards from `date_to`.
### The value may be a floating point number.
period_years = 6
### End date of the download window. Today by default; to pin a specific
### date use datetime(Y, M, D), e.g. the 18th of January 2021:
# date_to = datetime(2021, 1, 18)
date_to = datetime.today()

In [82]:
# All three downloads share the same window, so bind it once.
_download_prices = partial(
    utils.download_stonk_prices, period_years=period_years, date_to=date_to
)

# Every call returns a pair that is unpacked into a raw frame and its "_clean" counterpart.
df, df_clean = _download_prices(ticker_list.index)
vix, vix_clean = _download_prices(["^VIX"], fname_prefix="vix")
sp500, sp500_clean = _download_prices(["^GSPC"], fname_prefix="sp500")

[*********************100%***********************]  2820 of 2820 completed

20 Failed downloads:
- BLL: No data found, symbol may be delisted
- MIME: No data found, symbol may be delisted
- MGP: No data found, symbol may be delisted
- EPAY: No data found, symbol may be delisted
- ZNGA: No data found, symbol may be delisted
- JOBS: No data found, symbol may be delisted
- O.WI: No data found, symbol may be delisted
- BIP.PRA: No data found, symbol may be delisted
- RXN WI: No data found, symbol may be delisted
- FOE: No data found, symbol may be delisted
- SGMS: No data found, symbol may be delisted
- NCBS: No data found, symbol may be delisted
- BIP.PRB: No data found, symbol may be delisted
- T WD: No data found, symbol may be delisted
- MRK.WI: No data found, symbol may be delisted
- PFE.WI: No data found, symbol may be delisted
- SNX.WI: No data found, symbol may be delisted
- DELL WI: No data found, symbol may be delisted
- POST WI: No data found, symbol may be delisted
- ANAT: No d

# Run data pipeline

In [83]:
# Industries to run the pipeline for. Commented-out entries are skipped.
industries = [
    # "health_care_equipment_and_services",
    "software_and_services",
    "retailing",
    "telecommunication_services",
    "capital_goods",
    "energy",
    # "pharmaceuticals_biotechnology_and_life_sciences",
    "consumer_staples",
    "banks",
    "diversified_financials",
    "metals_and_mining",
    "technology_hardware_and_equipment",
    "utilities",
    "chemicals",
    "automobiles_and_components",
    "semiconductors_and_semiconductor_equipment",
    "media_and_entertainment",
    "real_estate",
    "consumer_services",
    "consumer_durables_and_apparel",
    "insurance",
    "transportation",
    "commercial_and_professional_services",
    "paper_and_forest_products",
    "containers_and_packaging",
    "construction_materials",
]

# Settings forwarded to processing.get_rolling_residuals by the pipeline loop.
l_reg, l_roll, dt = 3, 2, 10

# Directory the per-industry CSV results are written to.
output_dir = "data"

stonk_model = predict.XGBStonkModel()
# First row of the stored "vix" data; the pipeline loop looks values up in it by column label.
vix = utils.get_stonk_data(disable_filter=True, fname_prefix="vix").iloc[0]

In [91]:
datasets = []
total_industries = len(industries)


def _write_industry_csv(frame, file_stem):
    """Write `frame` to `<output_dir>/<file_stem>.csv`, keeping the index and omitting the header."""
    frame.to_csv(
        os.path.join(output_dir, file_stem + ".csv"), header=False, index=True
    )


# enumerate() ties the progress counter to the loop itself. A manual `i += 1`
# at the bottom of the body is skipped by the early `continue` exits below,
# which made the printed "Industry (i/N)" position fall behind.
for i, industry in enumerate(industries, start=1):
    stonks = utils.get_stonk_data(filter_industries=[industry])
    X, Y = processing.combine_stonk_pairs(stonks)

    print("Industry ({0}/{1}): {2}".format(i, total_industries, industry))

    print("Processing residuals...")
    residuals, betas, _, dates_index = utils.measure_time(
        partial(
            processing.get_rolling_residuals,
            X=X,
            Y=Y,
            l_reg=l_reg,
            l_roll=l_roll,
            dt=dt,
        )
    )

    std_residuals, means, stds = processing.get_standardized_residuals(
        residuals
    )

    # First filter: keep rows whose last-column standardized residual is at
    # least 2.5 in absolute value.
    trades_before = len(std_residuals)
    std_residuals = std_residuals[std_residuals.iloc[:, -1].abs() >= 2.5]
    trades_after = len(std_residuals)
    print(
        "{0} trades selected out of {1} by residual values".format(
            trades_after, trades_before
        )
    )
    if trades_after == 0:
        continue

    # Align the companion frames with the surviving rows.
    residuals = residuals.loc[std_residuals.index]
    betas = betas.loc[std_residuals.index]
    dates_index = dates_index.loc[std_residuals.index]

    print("Processing ADFs...")
    adfs, adfs_raw = utils.measure_time(
        partial(
            processing.get_aggregate_adfs,
            residuals,
            betas=betas,
        )
    )

    # Second filter: keep rows whose aggregate ADF value is at least 0.5.
    selected_by_adf = (adfs >= 0.5).values
    adfs = adfs[selected_by_adf]

    trades_before = len(std_residuals)
    std_residuals = std_residuals[selected_by_adf]
    trades_after = len(std_residuals)
    print(
        "{0} trades selected out of {1} by ADF pass rates".format(
            trades_after, trades_before
        )
    )

    if len(std_residuals) == 0:
        continue

    # Re-align everything with the rows that passed both filters.
    betas = betas.loc[adfs.index]

    residuals = residuals.loc[adfs.index]
    adfs_raw = adfs_raw.loc[adfs.index]

    dates_index = dates_index.loc[std_residuals.index]

    means = means.loc[std_residuals.index]
    stds = stds.loc[std_residuals.index]

    residuals_max_mean = processing.get_mean_residual_magnitude(
        std_residuals.to_numpy(), dt=21
    )
    print(
        "Mean max residual value for {0} after filtering is {1}".format(
            industry, residuals_max_mean
        )
    )

    print("Processing beta stability tests...")
    beta_stability_rsquared_vals = utils.measure_time(
        partial(
            processing.calculate_beta_stability_rsquared,
            prices_X=X, prices_Y=Y, betas=betas, dates_index=dates_index
        )
    )
    # Sanity check: the stability results must line up row-for-row with the selected trades.
    assert np.all(beta_stability_rsquared_vals.index == std_residuals.index)

    print("Processing ARIMA forecasts...")
    arima_forecasts = utils.measure_time(
        partial(
            processing.calculate_arima_forecast,
            std_residuals=std_residuals,
            forecast_months=3,
            eval_models=5,
        )
    )

    print("Preparing data for model...")
    dataset = utils.build_dataset_from_live_data_by_industry(
        std_residuals=std_residuals,
        adfs=adfs,
        subindustry=industry,
        mean_max_residual=residuals_max_mean,
        # VIX value looked up under the label of the last column of `stonks`.
        vix_index=vix.loc[stonks.columns[-1]],
        betas_stability_rsquared=beta_stability_rsquared_vals,
        arima_forecasts=arima_forecasts
    )

    print("Running model...")
    predictions, df_processed = stonk_model.predict(dataset)
    datasets.append((dataset, df_processed))
    predictions = pd.DataFrame(predictions)
    predictions.index = adfs.index

    # Prepend the dates as the first column of the exported frames.
    residuals.insert(0, "dates", dates_index.values)
    betas.insert(0, "dates", dates_index.values)

    print("Writing results to CSV...")
    # Very big industry, exceeds Git file size limit: split its residuals
    # into two files of roughly equal row count.
    if industry == "diversified_financials":
        half = len(residuals) // 2
        _write_industry_csv(residuals.iloc[:half], industry + "_one_residuals")
        _write_industry_csv(residuals.iloc[half:], industry + "_two_residuals")
    else:
        _write_industry_csv(residuals, industry + "_residuals")
    _write_industry_csv(betas, industry + "_betas")
    _write_industry_csv(adfs_raw, industry + "_adfs_raw")
    _write_industry_csv(predictions, industry + "_predictions")
    _write_industry_csv(arima_forecasts, industry + "_arima")
    _write_industry_csv(beta_stability_rsquared_vals, industry + "_rsquared")

print("*** All done ***")

Industry (1/24): software_and_services
Processing residuals...
Done after: 18s
167 trades selected out of 7503 by residual values
Processing ADFs...
Done after: 12s
27 trades selected out of 167 by ADF pass rates
Mean max residual value for software_and_services after filtering is 3.430000066757202
Processing beta stability tests...
Done after: 0s
Processing ARIMA forecasts...
Done after: 9s
Preparing data for model...
Running model...
Writing results to CSV...
Industry (2/24): retailing
Processing residuals...
Done after: 4s
122 trades selected out of 2145 by residual values
Processing ADFs...
Done after: 9s
19 trades selected out of 122 by ADF pass rates
Mean max residual value for retailing after filtering is 3.5999999046325684
Processing beta stability tests...
Done after: 0s
Processing ARIMA forecasts...
Done after: 9s
Preparing data for model...
Running model...
Writing results to CSV...
Industry (3/24): telecommunication_services
Processing residuals...
Done after: 1s
55 trades 

# Data collection

In [2]:
# Load the stored data and keep only the columns labelled up to and
# including "2020-05-08" (label-based slice on the column axis).
stonks = utils.get_stonk_data().loc[:, :"2020-05-08"]

In [None]:
pipelines.data_collection_rolling_pipeline(
    stonks,
    # Market-cap bounds
    market_cap_min_mm=1000,
    market_cap_max_mm=None,
    # Rolling-window settings
    l_reg=3,
    l_roll=2,
    dt=10,
    # Residual filters
    last_residual_cutoff=2.5,
    mean_max_residual_dt=21,
    # ADF filters
    adf_pval_cutoff=0.1,
    adf_pass_rate_filter=0.5,
    # ARIMA settings
    arima_forecast_months=3,
    arima_eval_models=5,
    # Trade schedule
    trade_length_months=3,
    trading_interval_weeks=2,
    # first_n_windows=72,
)

Total data windows: 15


In [2]:
dataset = utils.ingest_trade_pipeline_outputs()

# First row of each stored index file, used below as a series keyed by date.
vix = utils.get_stonk_data(disable_filter=True, fname_prefix="vix").iloc[0]
sp500 = utils.get_stonk_data(disable_filter=True, fname_prefix="sp500").iloc[0]

# Relative change of the S&P 500 over a 63-observation lag, labelled with the
# later of the two dates (63 is presumably ~3 months of trading days; verify).
sp500_chg = pd.Series(
    sp500.iloc[63:].values / sp500.iloc[:-63].values - 1,
    index=sp500.iloc[63:].index,
)

# Attach the VIX level and the S&P 500 change observed on each trade date.
dataset["vix"] = dataset["trade_date"].apply(vix.loc.__getitem__)
dataset["sp500"] = dataset["trade_date"].apply(sp500_chg.loc.__getitem__)
dataset.to_csv("data/dataset.csv", index=False, header=True)

# Model development

In [2]:
import xgboost as xgb
from hyperopt import STATUS_OK, STATUS_FAIL, Trials, fmin, hp, tpe, atpe, rand
import pickle

In [3]:
def train_production_xgb(
    df: pd.DataFrame,
    params: Dict[str, Any],
    noise_level: float = 0,
    output_dir: str = "data",
) -> Tuple[xgb.XGBClassifier, sklearn.base.TransformerMixin]:
    """Fit an XGBoost classifier on all of `df` and persist it with its scalers.

    Args:
        df: Training frame. Must contain a "label" column and whatever
            `preprocessing.transform_features` needs to build the features.
        params: Keyword arguments forwarded to `xgb.XGBClassifier`.
        noise_level: Forwarded to `preprocessing.transform_features`.
        output_dir: Directory that receives "xgb_classifier.json" and
            "scalers.json". Defaults to "data", the previously hard-coded
            location; it is created if it does not exist.

    Returns:
        The fitted classifier and the scalers returned by
        `preprocessing.transform_features`.
    """
    # Create the target directory before training so that a missing
    # directory cannot discard a finished fit at save time.
    os.makedirs(output_dir, exist_ok=True)

    X_train, scalers = preprocessing.transform_features(df, noise_level=noise_level)
    y_train = df["label"]

    clf = xgb.XGBClassifier(**params)

    # The eval set is the training data itself, so the logged metric is the
    # training metric, not a held-out score.
    clf.fit(X_train, y_train, eval_set=[(X_train, y_train)])
    clf.save_model(os.path.join(output_dir, "xgb_classifier.json"))

    # NOTE(review): this file is a binary pickle despite its ".json" name.
    # The name is left unchanged because code outside this notebook may load it
    # by that name. Only unpickle it from a trusted source.
    with open(os.path.join(output_dir, "scalers.json"), "wb") as fp:
        pickle.dump(scalers, fp)

    return clf, scalers

In [4]:
df = pd.read_csv("data/dataset.csv")
# Keep rows with a positive beta and a last residual of magnitude >= 2.5.
df = df.loc[(df["beta"] > 0) & (df["last_residual"].abs() >= 2.5)]
df = preprocessing.assign_labels(df)

In [78]:
# Skip the `drop_dates` earliest distinct trade dates and train on the rest.
drop_dates = 18
selected_dates = np.sort(df["trade_date"].unique())[drop_dates:]
# Shuffle the rows with a fixed seed so the production training set has the
# same row order on every run (an unseeded sample() reorders it each time).
df_prod = df[df.trade_date.isin(selected_dates)].sample(frac=1, random_state=43223293)
print(len(df_prod))
print(df_prod["label"].value_counts())

92655
0    82289
1    10366
Name: label, dtype: int64


In [79]:
# NOTE(review): `params` is not assigned in any cell above this one; it is
# defined by the later `params = {...}` cell, which must be run first.
# Under Restart & Run All this line raises NameError.
clf_prod, scalers_prod = train_production_xgb(df_prod, params, noise_level=0.005)

[0]	validation_0-logloss:0.66739
[1]	validation_0-logloss:0.64578
[2]	validation_0-logloss:0.62685
[3]	validation_0-logloss:0.61101
[4]	validation_0-logloss:0.59703
[5]	validation_0-logloss:0.58501
[6]	validation_0-logloss:0.57426
[7]	validation_0-logloss:0.56452
[8]	validation_0-logloss:0.55590
[9]	validation_0-logloss:0.54830
[10]	validation_0-logloss:0.54174
[11]	validation_0-logloss:0.53544
[12]	validation_0-logloss:0.53016
[13]	validation_0-logloss:0.52465
[14]	validation_0-logloss:0.52014
[15]	validation_0-logloss:0.51577
[16]	validation_0-logloss:0.51189
[17]	validation_0-logloss:0.50843
[18]	validation_0-logloss:0.50519
[19]	validation_0-logloss:0.50218
[20]	validation_0-logloss:0.49960
[21]	validation_0-logloss:0.49689
[22]	validation_0-logloss:0.49455
[23]	validation_0-logloss:0.49263
[24]	validation_0-logloss:0.49037
[25]	validation_0-logloss:0.48866
[26]	validation_0-logloss:0.48669
[27]	validation_0-logloss:0.48523
[28]	validation_0-logloss:0.48320
[29]	validation_0-loglos

In [52]:
splits = preprocessing.split_data(df, 2, 6, 14, random_state=43223293)
# Report the size of each split first, then the label balance of each split.
for split_name in ("train", "validation"):
    print(len(splits[split_name]))
for split_name in ("train", "validation"):
    print(splits[split_name]["label"].value_counts())

93404
3605
0    80060
1    13344
Name: label, dtype: int64
0    2966
1     639
Name: label, dtype: int64


In [53]:
noise_level = 0.005

# The scalers come from the training split (called with the noise level above)
# and are passed to the validation transform, which runs with noise_level=0.
X_train, scalers = preprocessing.transform_features(
    splits["train"],
    noise_level=noise_level,
)
X_valid, _ = preprocessing.transform_features(
    splits["validation"],
    noise_level=0,
    scalers=scalers,
)

y_train, y_valid = splits["train"]["label"], splits["validation"]["label"]

In [20]:
# Continuous ranges.
hyperparameter_space = {
    "gamma": hp.uniform("gamma", 0, 5),
    "scale_pos_weight": hp.uniform("scale_pos_weight", 3, 7),
}
# Ranges quantized with step 1; the objective casts these values to int.
hyperparameter_space.update(
    (name, hp.quniform(name, low, high, 1))
    for name, (low, high) in (
        ("max_depth", (3, 8)),
        ("min_child_weight", (1, 8)),
        ("max_delta_step", (1, 4)),
        ("n_estimators", (25, 80)),
    )
)
# Alternatives that are currently switched off:
# "n_estimators": hp.choice("n_estimators", np.array([50, 75, 100, 150, 200])),
# "subsample": hp.uniform("subsample", 0.9, 1),
# "colsample_bylevel" : hp.uniform("colsample_bylevel", 0.5, 1),

In [21]:
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score


def optimization_objective(space):
    """Hyperopt objective: fit an XGBoost classifier and score it on the validation set.

    Reads the notebook-level `X_train`, `y_train`, `X_valid` and `y_valid`.

    Args:
        space: Sampled hyperparameters with the keys "gamma",
            "scale_pos_weight", "max_depth", "min_child_weight",
            "max_delta_step" and "n_estimators".

    Returns:
        A result dictionary with "loss", "status" and the metrics
        "precision", "f1_score", "ap", "auc" and "pos_preds". The loss is
        `-ap`; when F1 or precision is zero the trial is reported as failed
        with a loss of 100.
    """
    # These values are sampled as floats and must be cast to int.
    integer_params = {
        key: int(space[key])
        for key in ("max_depth", "min_child_weight", "max_delta_step", "n_estimators")
    }
    model = xgb.XGBClassifier(
        gamma=space["gamma"],
        scale_pos_weight=space["scale_pos_weight"],
        # Held fixed rather than tuned (colsample_bylevel, learning_rate, subsample).
        colsample_bylevel=1,
        learning_rate=0.1,
        subsample=0.99,
        tree_method="hist",
        enable_categorical=True,
        max_cat_to_onehot=1,
        # A fresh random seed is drawn for every trial.
        random_state=np.random.randint(9999999),
        **integer_params,
    )
    model.fit(X_train, y_train, verbose=False)

    y_score = model.predict_proba(X_valid)[:, 1]
    y_preds = y_score > 0.5

    f1 = f1_score(y_valid, y_preds, zero_division=0)
    precision = precision_score(y_valid, y_preds, zero_division=0)
    ap = evaluate.average_precision_from_cutoff(y_valid, y_score, 0)
    roc = roc_auc_score(y_valid, y_score)

    pos_preds = int(y_preds.sum())
    pos_labels = int(y_valid.sum())

    # Zero the AP when the model predicts fewer positives than there are positive labels.
    if pos_preds < pos_labels:
        ap = 0

    failed = f1 == 0 or precision == 0
    return {
        "loss": 100 if failed else -ap,
        "precision": precision,
        "f1_score": f1,
        "ap": ap,
        "auc": roc,
        "pos_preds": pos_preds,
        "status": STATUS_FAIL if failed else STATUS_OK,
    }

In [22]:
trials = Trials()

best_hyperparams = fmin(
    fn=optimization_objective,
    space=hyperparameter_space,
    algo=tpe.suggest,
    trials=trials,
    max_evals=1000,
)

# Combine the sampled hyperparameter values with the metrics each trial
# reported, one column per key, and save the table.
trial_vals = trials.vals
for metric in ("f1_score", "precision", "ap", "auc", "pos_preds"):
    trial_vals[metric] = [result[metric] for result in trials.results]

df_trials = pd.DataFrame.from_dict(trial_vals)
df_trials.to_csv("data/experiments/data_window_size_14#1.csv", index=False)

100%|██████████| 1000/1000 [08:12<00:00,  2.03trial/s, best loss: -0.2510937839243924]


In [71]:
params = dict(
    # Regularisation (author's note: default 0)
    gamma=4.159451,
    # L2 / L1 regularisation, left unset (author's note: L2 default 1)
    # reg_lambda=1,
    # reg_alpha=0,
    # Class imbalance (author's note: default 1)
    scale_pos_weight=5.631922,
    # Integers:
    max_depth=6,
    # Regularisation (author's note: default 1)
    min_child_weight=3,
    # Class imbalance (author's note: default 0)
    max_delta_step=2,
    # Choice:
    colsample_bylevel=1,
    n_estimators=55,
    learning_rate=0.1,
    subsample=1,
    # Fixed:
    tree_method="hist",
    enable_categorical=True,
    max_cat_to_onehot=1,
    eval_metric=["logloss"],
    # A new random seed is drawn every time this cell runs.
    random_state=np.random.randint(999929),
)

clf = xgb.XGBClassifier(**params)

# validation_0 in the training log is the validation set, validation_1 the training set.
clf.fit(X_train, y_train, eval_set=[(X_valid, y_valid), (X_train, y_train)])

[0]	validation_0-logloss:0.67560	validation_1-logloss:0.67255
[1]	validation_0-logloss:0.66043	validation_1-logloss:0.65543
[2]	validation_0-logloss:0.64723	validation_1-logloss:0.64112
[3]	validation_0-logloss:0.63553	validation_1-logloss:0.62865
[4]	validation_0-logloss:0.62576	validation_1-logloss:0.61824
[5]	validation_0-logloss:0.61776	validation_1-logloss:0.60855
[6]	validation_0-logloss:0.60987	validation_1-logloss:0.60075
[7]	validation_0-logloss:0.60446	validation_1-logloss:0.59406
[8]	validation_0-logloss:0.59795	validation_1-logloss:0.58724
[9]	validation_0-logloss:0.59369	validation_1-logloss:0.58206
[10]	validation_0-logloss:0.58966	validation_1-logloss:0.57757
[11]	validation_0-logloss:0.58651	validation_1-logloss:0.57159
[12]	validation_0-logloss:0.58383	validation_1-logloss:0.56804
[13]	validation_0-logloss:0.58153	validation_1-logloss:0.56257
[14]	validation_0-logloss:0.58103	validation_1-logloss:0.55868
[15]	validation_0-logloss:0.57927	validation_1-logloss:0.55490
[1

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=True,
              eval_metric=['logloss'], gamma=4.159451, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.1, max_bin=256,
              max_cat_to_onehot=1, max_delta_step=2, max_depth=6, max_leaves=0,
              min_child_weight=3, missing=nan, monotone_constraints='()',
              n_estimators=55, n_jobs=0, num_parallel_tree=1, predictor='auto',
              random_state=55700, reg_alpha=0, reg_lambda=1, ...)

In [77]:
# Decision threshold applied to the predicted probability of class 1.
thres = 0.5

print("**Validation**")
y_score = clf.predict_proba(X_valid)[:, 1]
y_preds = y_score > thres

evaluate.performance_summary(y_score, y_preds, y_valid, auc_cutoff=0)

df_results_valid = evaluate.returns_on_predictions(
    splits["validation"],
    y_preds,
)

evaluate.performance_on_slice(
    splits["validation"],
    y_score,
    y_preds,
    "subindustry",
    False,
)

**Validation**
Precision: 0.26419213973799127
PR-AUC/AP score: 0.23870773454445704
ROC-AUC score: 0.5942958643446805
Total positive predictions: 916

Totals:
        prediction
result            
FN             397
FP             674
TN            2292
TP             242

Means:
        return_one_month  return_two_month  return_three_month
result                                                        
FN              0.068416          0.113670            0.125249
FP             -0.000749          0.001182           -0.004631
TN              0.000474         -0.000233            0.012985
TP              0.082682          0.144562            0.143847

Stds:
        return_one_month  return_two_month  return_three_month
result                                                        
FN              0.051829          0.065684            0.070138
FP              0.051256          0.059817            0.070593
TN              0.046262          0.062113            0.059930
TP              0.05

  recall = tps / tps[-1]
  recall = tps / tps[-1]


In [66]:
# Set the pandas "display.max_rows" option to 100 for the inspection cells.
pd.options.display.max_rows = 100

In [70]:
# df_results_valid[df_results_valid.result == "FP"].iloc[:100]

In [None]:
# df_results_valid[df_results_valid.subindustry == 'consumer_services'].iloc[0:100]

In [49]:
# Print each feature name next to its importance from the fitted classifier.
for feature_and_importance in zip(clf.feature_names_in_, clf.feature_importances_):
    print(*feature_and_importance)

adf_pass_rate 0.0339131
last_residual 0.10790185
residual_mean_max 0.10434068
vix 0.30436495
betas_rsquared 0.054459076
arima_forecast 0.16788594
industry 0.18662527
residual_inter 0.0405091


In [69]:
# Ten best trials of this experiment, ranked by the "ap" column.
df_trials = pd.read_csv("data/experiments/data_window_size_10#3.csv")
df_trials.sort_values(by="ap", ascending=False).iloc[:10]

Unnamed: 0,gamma,max_delta_step,max_depth,min_child_weight,n_estimators,scale_pos_weight,f1_score,precision,ap,auc,pos_preds
839,4.159451,2.0,6.0,3.0,55.0,5.631922,0.318154,0.259921,0.252514,0.592166,1008
741,4.743078,2.0,6.0,2.0,52.0,5.864701,0.320046,0.251567,0.251942,0.590328,1117
978,3.031823,2.0,6.0,3.0,78.0,4.91505,0.309786,0.264381,0.251612,0.587809,904
656,3.738893,3.0,6.0,3.0,54.0,5.07565,0.309054,0.273494,0.250727,0.591386,830
13,0.130878,3.0,6.0,3.0,44.0,4.989235,0.306441,0.270335,0.250427,0.59446,836
720,4.403761,2.0,6.0,1.0,52.0,5.745009,0.318949,0.265625,0.250259,0.593727,960
493,4.536695,2.0,6.0,2.0,68.0,4.795071,0.306306,0.274876,0.25015,0.590077,804
841,3.958092,2.0,6.0,2.0,46.0,5.614887,0.319899,0.26765,0.250112,0.595168,949
910,3.699133,2.0,6.0,3.0,53.0,5.360094,0.31203,0.260188,0.249954,0.587134,957
935,3.690268,2.0,6.0,3.0,56.0,5.907478,0.317073,0.252078,0.249952,0.590871,1083


In [248]:
# Twenty best trials of this experiment, ranked by the "ap" column.
df_trials = pd.read_csv("data/experiments/data_window_size_14#3.csv")
df_trials.sort_values(by="ap", ascending=False).iloc[:20]

Unnamed: 0,gamma,max_delta_step,max_depth,min_child_weight,n_estimators,scale_pos_weight,f1_score,precision,ap,auc,pos_preds
326,1.688328,4.0,6.0,7.0,80.0,5.641467,0.325294,0.253497,0.250155,0.596745,1144
807,1.326854,4.0,8.0,4.0,78.0,4.967942,0.305945,0.2669,0.24962,0.599559,858
438,1.300462,4.0,6.0,6.0,76.0,4.553483,0.294203,0.273954,0.249371,0.599819,741
596,0.735622,4.0,8.0,6.0,65.0,5.516225,0.307049,0.248309,0.249322,0.597955,1035
543,1.27358,3.0,8.0,5.0,69.0,5.197366,0.315589,0.265176,0.248366,0.600261,939
837,1.007793,4.0,8.0,7.0,66.0,4.768494,0.299584,0.268991,0.248091,0.597515,803
950,0.468104,4.0,6.0,7.0,63.0,4.902799,0.308548,0.2775,0.247911,0.599045,800
409,2.426724,4.0,8.0,7.0,77.0,5.217669,0.298718,0.252986,0.247849,0.598017,921
595,0.816746,4.0,8.0,7.0,65.0,5.305828,0.312146,0.261053,0.246886,0.603868,950
582,0.882248,4.0,8.0,7.0,71.0,5.469849,0.311995,0.258763,0.246834,0.595037,970


In [None]:
# Fifty best trials of this experiment, ranked by the "ap" column.
df_trials = pd.read_csv("data/data-window-size-2#6.csv")
df_trials.sort_values(by="ap", ascending=False).iloc[:50]