In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import warnings
# Suppress every Python warning for the rest of the session. NOTE(review): this
# also hides pandas warnings about assignments on sliced frames, so real
# problems in later cells stay silent.
warnings.filterwarnings('ignore')
# Render floats with 4 decimal places in displayed DataFrames.
pd.set_option("display.precision", 4)

In [2]:
# load data
# Layout read from disk: <directory>/<model>/predict/<file ending in _d/_w/_M .pkl>
# Result: data_dic[model][period] -> unpickled object, with period in {'D', 'W', 'M'}.
data_dic = {}
directory = '/Users/jinghan/Documents/UCB/Winter/IAQF_Competition/prediction'

# File-name suffix -> period key used by the rest of the notebook.
# (The suffixes are case-sensitive: lower-case d / w, upper-case M.)
suffix_to_period = {'_d.pkl': 'D', '_w.pkl': 'W', '_M.pkl': 'M'}

for subdir in os.listdir(directory):

    predict_dir = os.path.join(directory, subdir, 'predict')

    # Skip macOS metadata and anything that is not a model folder holding a
    # `predict` directory, instead of crashing inside os.listdir.
    if subdir == '.DS_Store' or not os.path.isdir(predict_dir):
        continue

    data_dic[subdir] = {}

    for filename in os.listdir(predict_dir):
        for suffix, period in suffix_to_period.items():
            if filename.endswith(suffix):
                # NOTE: unpickling can execute arbitrary code -- only load
                # prediction files that come from a trusted source.
                data_dic[subdir][period] = pd.read_pickle(os.path.join(predict_dir, filename))
                break

In [3]:
def calculate_accuracy(df, split_date='2017-01-01'):
    """Compute train/test error measures for one table of spread predictions.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Date`` (parsed with format ``%Y-%m-%d``),
        ``pair``, ``actual_spread`` and ``pred_spread``. If the columns are a
        ``MultiIndex`` only the first level is used as the column name.
        The frame passed in is not modified.
    split_date : str or datetime-like, default '2017-01-01'
        Rows with ``Date < split_date`` form the train set; all other rows
        form the test set.

    Returns
    -------
    dict
        Keys ``'train MAE'``, ``'train RMSE'``, ``'test MAE'`` and
        ``'test RMSE'``, each mapped to a one-element list so the dict can be
        passed straight to ``pd.DataFrame``. MAE is the mean absolute error
        over all rows of the split; RMSE is computed per ``pair`` and then
        averaged across pairs. A split with no rows yields NaN.
    """
    # reset_index returns a new frame, so flattening the columns and adding
    # helper columns below never touches the caller's object.
    df = df.reset_index(drop=True)
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = [c[0] for c in df.columns]
    df['Date'] = pd.to_datetime(df['Date'], format="%Y-%m-%d")

    # Compute the errors once on the full frame, before splitting, so no
    # columns are ever assigned onto a filtered slice.
    error = df['actual_spread'] - df['pred_spread']
    df = df.assign(abs_error=error.abs(), sq_error=np.square(error))

    is_train = df['Date'] < split_date
    splits = {'train': df[is_train], 'test': df[~is_train]}

    measures = {}
    for label, subset in splits.items():
        measures[label + ' MAE'] = [subset['abs_error'].mean()]
        # Per-pair mean squared error -> per-pair RMSE -> average over pairs.
        per_pair_mse = subset.groupby('pair')['sq_error'].mean()
        measures[label + ' RMSE'] = [np.sqrt(per_pair_mse).mean()]

    return measures

In [4]:
def summarize_accuracy(data_dic, period):
    """Build the accuracy table for every model that has data for ``period``.

    Parameters
    ----------
    data_dic : dict
        Mapping ``model name -> {period key -> prediction DataFrame}``.
    period : str
        Period key to summarise (the notebook uses ``'D'``, ``'W'``, ``'M'``).
        Models without an entry for this key are skipped.

    Returns
    -------
    pandas.DataFrame
        One row per model, indexed by ``('Period', 'Model')``, with the
        measures returned by ``calculate_accuracy`` as columns. If no model
        has data for ``period`` an empty frame with the same index names is
        returned.
    """
    rows = []
    for model, by_period in data_dic.items():
        if period not in by_period:
            continue
        # Pass a copy so the stored predictions are never altered.
        measures = calculate_accuracy(by_period[period].copy())
        row = pd.DataFrame(measures)
        row['Period'] = period
        row['Model'] = model
        rows.append(row)

    if not rows:
        # Nothing to aggregate: grouping an empty frame by 'Period' would raise.
        empty_index = pd.MultiIndex.from_arrays([[], []], names=['Period', 'Model'])
        return pd.DataFrame(index=empty_index)

    # Concatenate once instead of growing a frame inside the loop.
    results_table = pd.concat(rows, axis=0)
    results_table = results_table.groupby(['Period', 'Model']).max()
    return results_table

In [8]:
# One accuracy table per period (daily, weekly, monthly), in that order.
d_table, w_table, m_table = [
    summarize_accuracy(data_dic, period_key) for period_key in ('D', 'W', 'M')
]

# Stack the per-period tables into a single summary.
results = pd.concat([d_table, w_table, m_table])

In [9]:
# Display the combined accuracy table (indexed by Period and Model).
# NOTE(review): NaN cells presumably mean that split had no usable rows for the model -- confirm.
results

Unnamed: 0_level_0,Unnamed: 1_level_0,train MAE,train RMSE,test MAE,test RMSE
Period,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
D,baseline,,,0.0083,0.0116
D,elastic_net,,,0.0085,0.0121
D,xgboost_model,,,0.0129,0.0191
W,baseline,,,0.0195,0.026
W,elastic_net,,,0.0173,0.0243
W,lstm,0.0106,0.0144,0.0102,0.0151
W,xgboost_model,,,0.0291,0.0434
M,baseline,,,0.0445,0.0564
M,elastic_net,,,0.0335,0.0484
M,xgboost_model,0.0145,0.0208,,
