In [None]:
import sys
from pathlib import Path

# Make the project root (the parent of the notebook's working directory)
# importable, so that project modules can be imported from this notebook.
notebook_dir = Path.cwd()
project_root = notebook_dir.parent
# Guard the append so that re-running this cell does not stack duplicate
# entries on sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

In [None]:
import pandas as pd
import statsmodels.api as sm
# Location of the results CSV that the next cell reads into `results_df`.
# The path is relative, so it is resolved against the kernel's working directory.
path = '../saved/final_results/results_df.csv'

In [None]:
# Load the results table (first CSV column becomes the index), discard every
# row that contains a missing value, and collect the ids of the remaining trials.
results_df = pd.read_csv(path, index_col=0).dropna()
trials_list = results_df['trial_id'].tolist()

In [None]:
import json

# Read the 'params' entry of every trial's JSON file and stack them into one
# frame with one row per trial. Trials whose file cannot be read or parsed are
# skipped (best effort), but the reason is reported instead of being hidden.
file_path = '../saved/final_results/trial_info/'
trial_frames = []
for trial in trials_list:
    file_name = str(trial) + '_trial_full.json'
    try:
        with open(file_path + file_name) as f:
            params = json.load(f)['params']
        trial_df = pd.DataFrame(params, index=[0])
        trial_df['trial_id'] = trial
    except FileNotFoundError:
        print(f'Not found: {file_path + file_name}')
        continue
    except (OSError, ValueError, KeyError, TypeError) as err:
        # ValueError also covers json.JSONDecodeError (malformed file).
        print(f'Skipping {file_name}: {err!r}')
        continue
    trial_frames.append(trial_df)

# Concatenate once instead of growing the frame inside the loop (quadratic).
# An empty frame is kept as the result when no trial could be loaded.
trials_df = pd.concat(trial_frames, ignore_index=True) if trial_frames else pd.DataFrame()

In [None]:
# Hyper-parameters that are left out of the analysis.
unused_params = [
    'epochs', 'batch_size', 'patience', 'huber_delta', 'log_returns',
    'hidden_layer6', 'hidden_layer7', 'hidden_layer8', 'hidden_layer9',
    'hidden_layer10', 'loss_fn',
]
trials_df = trials_df.drop(columns=unused_params)

# A missing value for these settings is encoded as 0.
for col in ['batch_norm', 'l1_lambda1', 'l2_lambda', 'dropout_prob']:
    trials_df[col] = trials_df[col].fillna(0)

# Summarise the leading columns: number of non-zero entries and their sum.
# NOTE(review): presumably the leading columns are the per-layer sizes. Only
# the first FOUR are summarised here while the first FIVE are dropped below;
# confirm that this is intended. Also note that NaN is truthy under
# astype(bool), so a missing layer size would be counted as a layer.
layer_sizes = trials_df.iloc[:, :4]
trials_df['n_layers'] = layer_sizes.astype(bool).sum(axis=1)
trials_df['n_neurons'] = layer_sizes.sum(axis=1)

# Exclude trials that used the Tanh or Sigmoid activation.
trials_df = trials_df.loc[~trials_df['act_func'].isin(['Tanh', 'Sigmoid'])]

# Remove layers info (the first five columns). The former in-place conversion
# of these columns to 0/1 indicators was dropped: its result was discarded by
# this very slice, and it wrote into a filtered frame.
trials_df = trials_df.iloc[:, 5:]

In [None]:
# Attach each trial's 'oosSpearman' value from the results table. The left
# join keeps every row of trials_df, matched on 'trial_id'.
spearman_by_trial = results_df[['trial_id', 'oosSpearman']]
trials_df = pd.merge(trials_df, spearman_by_trial, how='left', on='trial_id')

In [None]:
# Put 'oosSpearman' first, keep the remaining columns in their current order,
# and leave 'trial_id' out of the frame.
columns = trials_df.columns.to_list()
for excluded in ('trial_id', 'oosSpearman'):
    columns.remove(excluded)
columns_new = ['oosSpearman', *columns]
trials_df = trials_df[columns_new]

In [None]:
# One-hot encode the two categorical hyper-parameters. dtype=int is requested
# explicitly: pandas >= 2.0 produces boolean dummies by default, which the
# regressions below would not treat as plain numeric 0/1 columns.
trials_df = pd.get_dummies(trials_df, columns=['act_func', 'optimizer'], dtype=int)

In [None]:
# Display the pairwise correlation matrix of all columns of trials_df
# (pandas computes Pearson correlation by default).
trials_df.corr()

In [None]:
# Ordinary least squares: regress the first column of trials_df on all
# remaining columns plus an intercept.
# NOTE(review): earlier comments in this cell described the columns as long /
# long-short returns, but the column-reordering cell above places
# 'oosSpearman' first, so that is the response used here. Confirm.
predictors = trials_df.iloc[:, 1:]
y = trials_df.iloc[:, 0]

# add_constant prepends the intercept column to the design matrix.
X = sm.add_constant(predictors)
lm = sm.OLS(y, X).fit()


In [None]:
# Display the summary table of the OLS fit from the previous cell.
lm.summary()

In [None]:
# Remove one dummy column per one-hot encoded variable (act_func, optimizer).
baseline_dummies = ['act_func_LeakyReLU', 'optimizer_Adagrad']
trials_df = trials_df.drop(columns=baseline_dummies)

# Multiply the first column by 100.
# Caution: re-running this cell scales the column again.
trials_df.iloc[:, 0] *= 100

In [None]:
import statsmodels.formula.api as smf

# Quantile regressions of the first column of trials_df on all other columns.
# `params` and `tvalues` end up with one row per model term (column 'index')
# and one column per quantile, labelled 'Q10', 'Q25', ...
quantiles = [0.1, 0.25, 0.5, 0.75, 0.9]

# The response is referenced by its column name instead of a loose global.
response, *predictors = trials_df.columns
formula = f"{response} ~ {' + '.join(predictors)}"

# The model does not depend on the quantile, so it is built once and only
# re-fitted inside the loop.
mod = smf.quantreg(formula, trials_df)

params_by_quantile = {}
tvalues_by_quantile = {}
for quantile in quantiles:
    res = mod.fit(q=quantile)
    # round() rather than int(): quantile * 100 is not always exact in
    # floating point (e.g. 0.29 * 100 == 28.999999999999996).
    label = 'Q' + str(round(quantile * 100))
    params_by_quantile[label] = res.params
    tvalues_by_quantile[label] = res.tvalues

# Assemble each table in one step; no first-iteration special case is needed.
params = pd.DataFrame(params_by_quantile).reset_index()
tvalues = pd.DataFrame(tvalues_by_quantile).reset_index()

In [None]:
# Display the quantile-regression coefficients (one column per quantile).
params

In [None]:
# Display the quantile-regression t-values (one column per quantile).
tvalues

In [None]:
# Reload the full results table; unlike the first load, rows with missing
# values are kept. The same relative location as the first load is used
# instead of a machine-specific absolute path.
# NOTE(review): assumes the kernel runs from a direct subfolder of the project
# root, as the first load already requires.
results_df = pd.read_csv('../saved/final_results/results_df.csv', index_col=0)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Tableau 20 palette given as 0-255 RGB triples ...
tableau20 = [(31, 119, 180), (174, 199, 232), (255, 127, 14), (255, 187, 120),
             (44, 160, 44), (152, 223, 138), (214, 39, 40), (255, 152, 150),
             (148, 103, 189), (197, 176, 213), (140, 86, 75), (196, 156, 148),
             (227, 119, 194), (247, 182, 210), (127, 127, 127), (199, 199, 199),
             (188, 189, 34), (219, 219, 141), (23, 190, 207), (158, 218, 229)]
# ... scaled to the [0, 1] range, which is the format matplotlib accepts.
tableau20 = [(r / 255., g / 255., b / 255.) for r, g, b in tableau20]

# Output names for the (currently disabled) savefig call. They are derived
# from `path` without overwriting it, so the CSV path stays usable and this
# cell can be re-run without the suffix piling up.
path_top = path + '_feature_importance_top20.png'
fig_path = path + '_feature_importance.png'


def _plot_density_hist(ax, values, color, xlabel):
    """Draw a density-normalised histogram of `values` on `ax`.

    The top and right spines are hidden, the bottom and left ones shown, and
    dashed grid lines are placed behind the bars (zorder 0 versus 3).
    """
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)
    for side in ('bottom', 'left'):
        ax.spines[side].set_visible(True)
    ax.set_xlabel(xlabel)
    ax.hist(values, density=True, color=color, align='mid', zorder=3)
    ax.xaxis.grid(True, linestyle='--', zorder=0)
    ax.yaxis.grid(True, linestyle='--', zorder=0)


# A single figure is created; the earlier extra plt.figure(...) call only
# left an empty, orphaned figure behind.
fig = plt.figure(figsize=(12, 9))

# The two histograms occupy the first row of a 3x2 grid.
ax = fig.add_subplot(3, 2, 1)
_plot_density_hist(ax, results_df['FF5_Mom_STRev_alpha_VW'],
                   tableau20[18], '7 Factor Model alpha')

ax = fig.add_subplot(3, 2, 2)
_plot_density_hist(ax, results_df['oosSpearman'],
                   tableau20[16], 'Out of Sample Spearman Coefficient')

fig.tight_layout()
# plt.savefig(fig_path)