In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn.metrics
import glob

# Plot styling: "ticks" theme with a faint solid grid, and the "paper"
# context with enlarged fonts.
sns.set_style(
    "ticks",
    {
        "axes.grid": True,
        "grid.color": "0.95",
        "grid.linestyle": "-",
    },
)
sns.set_context("paper", font_scale=1.7)

# Import Results

In [2]:
# Forecasts: one column per model, renamed to `model_<original name>` so
# model columns can later be selected by the substring 'model'.
forecast_results_df = pd.read_parquet('../../data/_temp/20211206_215320_test_.parquet')
forecast_results_df.columns = ['model_' + name for name in forecast_results_df.columns]

# Log object saved alongside the forecasts.
# NOTE(review): read_pickle executes arbitrary code on load; only use it on
# files you produced yourself.
forecast_log_df = pd.read_pickle('../../data/_temp/20211206_215320_test_log.pkl')

## Load true values

In [3]:
# Read the "ff__mkt" column from every `*_all.parquet` file and stack the
# pieces into one frame.
# NOTE(review): this is an absolute, machine-specific path (the forecasts are
# loaded through a relative path) -- consider making it relative/configurable.
truth_files = glob.glob("/Users/au515538/Desktop/HFML//data/proc/_temp/*_all.parquet")
fret_df = pd.concat(
    [pd.read_parquet(path, columns=["ff__mkt"]) for path in truth_files]
)

# Column assignment from a Series is aligned on the index by pandas, so the
# order in which the files were concatenated does not matter here.
forecast_results_df["truth"] = fret_df["ff__mkt"]

In [4]:
# Show the combined frame: the renamed model forecast columns plus `truth`.
forecast_results_df

Unnamed: 0_level_0,model_0,model_1,model_2,truth
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1996-03-04 09:30:00,5.764345e+04,-0.000042,0.000072,0.003246
1996-03-04 09:45:00,2.438620e+05,-0.000182,0.000072,0.001974
1996-03-04 10:00:00,1.573895e+05,-0.000228,0.000072,-0.001224
1996-03-04 10:15:00,1.061418e+05,-0.000253,0.000072,0.000082
1996-03-04 10:30:00,8.807400e+04,-0.000172,0.000072,0.000435
...,...,...,...,...
2020-12-31 15:00:00,-4.887977e+06,0.000081,0.000085,-0.000660
2020-12-31 15:15:00,1.442115e+06,0.000088,0.000085,0.001633
2020-12-31 15:30:00,1.126051e+07,0.000082,0.000085,-0.000421
2020-12-31 15:45:00,-7.838498e+05,0.000080,0.000085,0.000072


In [5]:
# Row count before and after discarding rows that contain any missing value
# (a forecast or the realised return).
print(forecast_results_df.shape[0])
forecast_results_df = forecast_results_df.dropna(axis=0, how='any')
print(forecast_results_df.shape[0])

168804
168804


# Check error

In [6]:
# Mean squared error of the first model's forecasts against the truth.
sklearn.metrics.mean_squared_error(
    y_true=forecast_results_df['truth'],
    y_pred=forecast_results_df['model_0'],
)

939705110223791.1

In [7]:
def compute_rsquared(truth, pred):
    """Return the uncentred R-squared of ``pred`` with respect to ``truth``.

    Computed as ``1 - sum((truth - pred)**2) / sum(truth**2)``. The
    denominator is the raw sum of squared true values (``truth`` is NOT
    demeaned), so an all-zero prediction scores exactly 0.

    Parameters
    ----------
    truth, pred
        Array-likes that support element-wise subtraction with each other
        (e.g. numpy arrays or pandas Series).

    Returns
    -------
    The R-squared as a scalar; negative when the squared prediction error
    exceeds the sum of squared true values.
    """
    squared_error_total = np.sum(np.square(truth - pred))
    squared_truth_total = np.sum(np.square(truth))
    return 1 - squared_error_total / squared_truth_total

# Score every forecast column (any column whose name contains 'model')
# against the realised values in `truth`.
model_cols = [col for col in forecast_results_df.columns if 'model' in col]
truth = forecast_results_df['truth']

# Build the table in one shot with an explicit float dtype instead of
# enlarging an empty frame one cell at a time through `.loc`.
metrics_df = pd.DataFrame(
    {
        'MSE': [
            sklearn.metrics.mean_squared_error(truth, forecast_results_df[col])
            for col in model_cols
        ],
        'R2': [
            compute_rsquared(truth, forecast_results_df[col])
            for col in model_cols
        ],
    },
    index=model_cols,
    dtype=float,
)

# Displayed scaled by 100; note that this scales the MSE column as well as R2.
metrics_df*100

Unnamed: 0,MSE,R2
model_0,9.397051e+16,-1.948717e+22
model_1,0.000482757,-0.1119025
model_2,0.0004840466,-0.3793434


In [15]:
# Columns that are not model forecasts (names without the substring 'model').
[name for name in forecast_results_df.columns if not ('model' in name)]

['truth']