In [1]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

In [2]:
import warnings
warnings.filterwarnings("ignore")

import glob
import shutil
import itertools
import os
import sys
import importlib  

import pickle

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import random

from sklearn.utils import resample, shuffle

import six
import sys
sys.modules['sklearn.externals.six'] = six

import sklearn.neighbors._base
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base

from sklearn.utils import _safe_indexing
sys.modules['sklearn.utils.safe_indexing'] = _safe_indexing

from pairs_trading_package.clustering import *

import pairs_trading_package as lfl
from pairs_trading_package.utils import flatten, postfix_keys_to_dict, get_current_time_hash, get_random_hash

from pairs_trading_package.pairs_trading_backtester import (
    Trader
)

In [3]:

def get_merged_analytics_as_dataframe(file_list, filter_results=True):
    """Load several backtest-analytics CSVs and stack the matching rows.

    Parameters
    ----------
    file_list : iterable of dict
        One entry per CSV. Each entry must provide 'filepath' and
        'clust_algo'; when ``filter_results`` is true it must also provide
        'distance_measure' and 'n_clusters'.
    filter_results : bool, default True
        If true, rows must match the entry's 'distance_measure' and
        'n_clusters' in addition to its 'clust_algo'. If false, only
        'clust_algo' is used for filtering.

    Returns
    -------
    pandas.DataFrame
        The selected rows of every file, concatenated in ``file_list`` order
        (original row indices are kept).
    """

    def _matching_rows(spec):
        # Read one CSV and keep only the rows described by ``spec``.
        frame = pd.read_csv(spec['filepath'])
        keep = frame['clust_algo'] == spec['clust_algo']
        if filter_results:
            keep = keep & (frame['distance_measure'] == spec['distance_measure'])
            keep = keep & (frame['n_clusters'] == spec['n_clusters'])
        return frame[keep]

    return pd.concat([_matching_rows(spec) for spec in file_list])

def _sample_row_for_seed(period_df, seed):
    """Return one randomly drawn row of ``period_df`` for the given seed.

    Rows whose 'seed' column equals ``seed`` are preferred; when there is no
    such row, the draw is made from all rows of ``period_df`` instead.
    """
    same_seed = period_df[period_df['seed'] == seed]
    pool = same_seed if len(same_seed) != 0 else period_df
    return pool.sample(1)


def get_merged_results_as_dataframe(file_list, meta_files=None):
    """Pair every period-zero run with one run from periods one and two.

    For each entry of ``file_list`` the corresponding backtest CSV is read and
    filtered on 'clust_algo', 'distance_measure' and 'n_clusters'. Every row
    of the first (period-zero) frame is then combined with one randomly
    sampled row from each of the second and third frames, preferring rows
    that share the same 'seed'.

    Parameters
    ----------
    file_list : sequence of dict
        Three entries (period zero, one, two), each with the keys
        'split_idx', 'clust_algo', 'distance_measure' and 'n_clusters'.
        'split_idx' indexes into ``meta_files``.
    meta_files : sequence of str, optional
        Paths of the per-split backtest CSVs. Defaults to the notebook-level
        ``meta_data_files`` list, which was the only source before this
        parameter existed.

    Returns
    -------
    pandas.DataFrame
        One row per period-zero run with the columns 'clust_algo',
        'rand_seed' and the in-/out-of-sample return-series file names of the
        three periods.

    Notes
    -----
    Rows of periods one and two containing NaN are dropped before sampling;
    period-zero rows are used as read. Sampling uses ``DataFrame.sample``
    without an explicit ``random_state``, so results vary between calls.
    """
    if meta_files is None:
        # Backward-compatible default: the list defined at notebook level.
        meta_files = meta_data_files

    results_per_period = []
    for file in file_list:
        period_df = pd.read_csv(meta_files[file['split_idx']])
        keep = (
            (period_df['clust_algo'] == file['clust_algo'])
            & (period_df['distance_measure'] == file['distance_measure'])
            & (period_df['n_clusters'] == file['n_clusters'])
        )
        results_per_period.append(period_df[keep])

    period_zero = results_per_period[0]
    merged_results = []

    if len(period_zero) != 0:
        # The NaN-free candidate pools do not depend on the period-zero row,
        # so build them once instead of once per row.
        period_one_clean = results_per_period[1].dropna()
        period_two_clean = results_per_period[2].dropna()

        for _, zero_row in period_zero.iterrows():
            seed = zero_row['seed']
            # Keep the draw order (period one, then period two) stable.
            one_pick = _sample_row_for_seed(period_one_clean, seed)
            two_pick = _sample_row_for_seed(period_two_clean, seed)

            merged_results.append([
                zero_row['clust_algo'], seed,
                zero_row['portfolio_returns_saved_file_insample'],
                zero_row['portfolio_returns_saved_file_oosample'],
                one_pick['portfolio_returns_saved_file_insample'].values[0],
                one_pick['portfolio_returns_saved_file_oosample'].values[0],
                two_pick['portfolio_returns_saved_file_insample'].values[0],
                two_pick['portfolio_returns_saved_file_oosample'].values[0],
            ])

    merged_results_df = pd.DataFrame(merged_results, columns=[
        'clust_algo', 'rand_seed',
        'period_zero_returns_file_insample', 'period_zero_returns_file_oosample',
        'period_one_returns_file_insample', 'period_one_returns_file_oosample',
        'period_two_returns_file_insample', 'period_two_returns_file_oosample',
    ])

    return merged_results_df

def sample_n_times(N, file_list):
    """Repeat the randomized period merge ``N`` times and stack the results.

    Each repetition calls ``get_merged_results_as_dataframe(file_list)``,
    which samples rows at random, so the repetitions generally differ.

    Parameters
    ----------
    N : int
        Number of independent merges to draw.
    file_list : sequence of dict
        Passed unchanged to ``get_merged_results_as_dataframe``.

    Returns
    -------
    pandas.DataFrame
        All ``N`` merged frames concatenated row-wise.
    """
    draws = [get_merged_results_as_dataframe(file_list) for _ in range(N)]
    return pd.concat(draws)

def get_collated_returns_as_dataframe(
    merged_results_df,
    sample_period='oosample',
    returns_dir='../data_folder/results_visualizations/backtest_results/return_series/',
):
    """Load and stitch the three per-period return series of every sample.

    Parameters
    ----------
    merged_results_df : pandas.DataFrame
        One row per sample with the columns
        'period_zero_returns_file_<sample_period>',
        'period_one_returns_file_<sample_period>' and
        'period_two_returns_file_<sample_period>', each holding a CSV file
        name located in ``returns_dir``.
    sample_period : str, default 'oosample'
        Suffix selecting which file-name columns to read (the notebook uses
        'insample' and 'oosample').
    returns_dir : str, optional
        Directory containing the return-series CSVs. Defaults to the location
        that was previously hard-coded in this function.

    Returns
    -------
    list of pandas.DataFrame
        One frame per row of ``merged_results_df``: the three period CSVs
        concatenated in order (zero, one, two) and indexed by their parsed
        'Date' column.
    """
    period_columns = [
        'period_' + period + '_returns_file_' + sample_period
        for period in ('zero', 'one', 'two')
    ]

    collated_dfs = []
    for _, sample_row in merged_results_df.iterrows():
        period_frames = [
            pd.read_csv(os.path.join(returns_dir, sample_row[column]))
            for column in period_columns
        ]

        collated_df = pd.concat(period_frames)
        collated_df['Date'] = pd.to_datetime(collated_df['Date'])
        collated_df = collated_df.set_index('Date', drop=True)

        collated_dfs.append(collated_df)

    return collated_dfs


def get_sharpe_distribution_from_collated_returns(collated_dfs, rf_rate=0.01):
    """Compute an annualized Sharpe ratio for every collated return series.

    The frames are joined column-wise and each column is treated as a series
    of daily returns with 252 trading days per year.

    Parameters
    ----------
    collated_dfs : list of pandas.DataFrame
        Return series as produced by ``get_collated_returns_as_dataframe``.
    rf_rate : float, default 0.01
        Annual risk-free rate; converted to a daily rate by compounding over
        252 days.

    Returns
    -------
    pandas.Series
        One Sharpe ratio per column of the joined frame.

    Notes
    -----
    The excess return is formed on daily figures (mean daily return minus the
    daily risk-free rate) and only then annualized. The previous version
    subtracted the *daily* risk-free rate from the *annualized* return, which
    mixed units and understated the risk-free adjustment by a factor of ~252.
    """
    # Join once; both the mean and the volatility use the same frame.
    daily_returns = pd.concat(collated_dfs, axis=1)

    rf_daily = (1 + rf_rate) ** (1 / 252) - 1

    annualized_excess_ret = (daily_returns.mean() - rf_daily) * 252
    vol = daily_returns.std() * np.sqrt(252)

    sharpe_ratio_assuming_iid = annualized_excess_ret / vol

    return sharpe_ratio_assuming_iid


def get_mdd_distribution_from_collated_returns(collated_dfs, rf_rate=0.01):
    """Compute one maximum-drawdown figure per collated return series.

    The frames are joined column-wise and cumulatively summed; for every
    column, ``1 + cumulative sum`` is handed to
    ``Trader.calculate_maximum_drawdown`` and the first element of its result
    is kept.

    Parameters
    ----------
    collated_dfs : list of pandas.DataFrame
        Return series as produced by ``get_collated_returns_as_dataframe``.
    rf_rate : float, default 0.01
        Not used by this function; present in the signature only.

    Returns
    -------
    numpy.ndarray
        Drawdown values in column order.
    """
    drawdown_calculator = Trader()

    summed_returns = pd.concat(collated_dfs, axis=1).cumsum()

    # NOTE(review): the meaning of the second positional argument (False) is
    # defined by Trader.calculate_maximum_drawdown — confirm against the package.
    drawdowns = [
        drawdown_calculator.calculate_maximum_drawdown(
            1 + summed_returns.iloc[:, position], False
        )[0]
        for position in range(summed_returns.shape[1])
    ]

    return np.array(drawdowns)

In [4]:
import pandas as pd
import numpy as np

# Backtest result CSVs, one per rolling split; indexed by 'split_idx'.
meta_data_files = ['../data_folder/results_visualizations/backtest_results/a0ffcabf03d9c1320e60b09556b7dd0a_ae.csv', 
                   '../data_folder/results_visualizations/backtest_results/fin_bf672f94b8548e9bb32e9a6a20463a07_ae.csv', 
                   '../data_folder/results_visualizations/backtest_results/38d975761f1e86d3b2002e92d2f83f7b_ae_.csv']

# Human-readable label of each split, indexed the same way as meta_data_files.
meta_data_splits = [ 
    ['2012 - 2016'],
    ['2013 - 2017'],
    ['2014 - 2018'] 
]

# Clustering configuration to report for each split.
cluster_scores = [{'split_idx': 0, 'clust_algo': 'kmeans', 'distance_measure': 'euclidean', 'n_clusters': 30},
                  {'split_idx': 1, 'clust_algo': 'kmeans', 'distance_measure': 'euclidean', 'n_clusters': 30},
                  {'split_idx': 2, 'clust_algo': 'kmeans', 'distance_measure': 'euclidean', 'n_clusters': 30}]

# Metric columns summarized as "mean (std)" in the per-period table.
cols_to_select = ['annual_sharpe_ratio_iid_insample', 'avg_total_roi_insample', 'max_dd_insample',
                  'annual_sharpe_ratio_iid_oosample', 'avg_total_roi_oosample', 'max_dd_oosample']

clust_data_list = []

for first_pass_idx, cs in enumerate(cluster_scores):

    current_clust_data = {
        'split_idx': meta_data_splits[cs['split_idx']][0],
        'clust_algo': cs['clust_algo'],
        'distance_measure': cs['distance_measure'],
    }

    # Load the split's results and discard duplicates and non-finite rows.
    working_df = (
        pd.read_csv(meta_data_files[cs['split_idx']])
        .drop(['Unnamed: 0'], axis=1)
        .reset_index(drop=True)
        .drop_duplicates()
        .replace([np.inf, -np.inf], np.nan)
        .dropna()
    )

    first_pass_df = working_df[working_df['clust_algo'] == cs['clust_algo']]
    second_pass_df = first_pass_df[first_pass_df['distance_measure'] == cs['distance_measure']]
    third_pass_df = second_pass_df[second_pass_df['n_clusters'] == cs['n_clusters']].loc[:, cols_to_select]

    if len(third_pass_df) == 0:
        # No run with the requested cluster count: fall back to the closest
        # available one and record it so later cells filter on the same value.
        available_n_clusters = second_pass_df['n_clusters'].values
        nearest_available = available_n_clusters[np.argmin(np.abs(available_n_clusters - cs['n_clusters']))]

        third_pass_df = second_pass_df[second_pass_df['n_clusters'] == nearest_available].loc[:, cols_to_select]
        # Fixed: this used to index with an undefined `second_pass_idx`,
        # raising NameError whenever the fallback was taken.
        cluster_scores[first_pass_idx]['n_clusters'] = nearest_available

    third_pass_df_means = np.around(third_pass_df.mean(), 2)
    third_pass_df_stds = np.around(third_pass_df.std(), 2)

    for col_to_select in cols_to_select:
        current_clust_data[col_to_select] = str(third_pass_df_means[col_to_select]) + ' (' + str(third_pass_df_stds[col_to_select]) + ')'

    clust_data_list.append(current_clust_data)

In [6]:
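# Export of the per-period table (clust_data_list) is currently disabled;
# uncomment the next line to write it to disk.
# pd.DataFrame(clust_data_list).to_csv('./pretty_results/per_period_performance_table_ae.csv')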

In [18]:
def _mean_std_label(center, spread):
    """Render two numbers as 'center (spread)', each rounded to 2 decimals."""
    return str(np.round(center, 2)) + ' (' + str(np.round(spread, 2)) + ')'


clust_data_list_full_period = []

# Collect the three per-split configurations; `cs` ends up as the last one
# and supplies the labels of the full-period row below.
intermediate_cluster_scores_array = []
for first_pass_idx in range(3):
    cs = cluster_scores[first_pass_idx]
    intermediate_cluster_scores_array.append(cs)

# Two randomized merges of the three periods.
merged_results_pca_df = sample_n_times(2, intermediate_cluster_scores_array)

pca_returns_is_collated_df = get_collated_returns_as_dataframe(merged_results_pca_df, 'insample')
pca_returns_oos_collated_df = get_collated_returns_as_dataframe(merged_results_pca_df, 'oosample')

# Mean daily return of every stitched series, scaled by 252.
pca_is_cumrets_results = pd.concat(pca_returns_is_collated_df, axis=1).mean() * 252
pca_oos_cumrets_results = pd.concat(pca_returns_oos_collated_df, axis=1).mean() * 252

pca_is_sharpe_results = get_sharpe_distribution_from_collated_returns(pca_returns_is_collated_df)
pca_oos_sharpe_results = get_sharpe_distribution_from_collated_returns(pca_returns_oos_collated_df)

pca_is_mdd_results = get_mdd_distribution_from_collated_returns(pca_returns_is_collated_df)
pca_oos_mdd_results = get_mdd_distribution_from_collated_returns(pca_returns_oos_collated_df)

# One summary row; every metric is reported as "mean (std)" across samples,
# with the ROI figures expressed in percent.
current_clust_data_full = {
    'clust_algo': cs['clust_algo'],
    'distance_measure': cs['distance_measure'],
    'insample_roi': _mean_std_label(pca_is_cumrets_results.mean() * 100,
                                    pca_is_cumrets_results.std() * 100),
    'oosample_roi': _mean_std_label(pca_oos_cumrets_results.mean() * 100,
                                    pca_oos_cumrets_results.std() * 100),
    'insample_sharpe': _mean_std_label(pca_is_sharpe_results.mean(),
                                       pca_is_sharpe_results.std()),
    'oosample_sharpe': _mean_std_label(pca_oos_sharpe_results.mean(),
                                       pca_oos_sharpe_results.std()),
    'insample_mdd': _mean_std_label(pca_is_mdd_results.mean(),
                                    pca_is_mdd_results.std()),
    'oosample_mdd': _mean_std_label(pca_oos_mdd_results.mean(),
                                    pca_oos_mdd_results.std()),
}

clust_data_list_full_period.append(current_clust_data_full)

In [20]:
# Persist the full-period summary row(s) as a CSV table.
full_period_performance_df = pd.DataFrame(clust_data_list_full_period)
full_period_performance_df.to_csv('./pretty_results/full_period_performance_table_ae.csv')