In [4]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

In [5]:
import warnings
warnings.filterwarnings("ignore")

import glob
import shutil
import itertools
import os
import sys

import pickle

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import random

from scipy.stats import norm
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster

import sklearn.metrics
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans, AgglomerativeClustering, SpectralClustering, OPTICS
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder
from sklearn.utils import resample, shuffle

import six
import sys
sys.modules['sklearn.externals.six'] = six

import sklearn.neighbors._base
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base

from sklearn.utils import _safe_indexing
sys.modules['sklearn.utils.safe_indexing'] = _safe_indexing

from pairs_trading_package.clustering import *

import pairs_trading_package as lfl
from pairs_trading_package.utils import flatten, postfix_keys_to_dict, get_current_time_hash, get_random_hash


In [6]:
# Load the pre-computed `ticker_segment_dict` object from disk; it is passed to
# `get_results` in the backtest loop further down.
# NOTE(review): `pickle.load` can execute arbitrary code while deserialising, so
# this file must come from a trusted source.
ticker_path = "../data_folder/original/ticker_segment_dict.pickle"
with open(ticker_path, 'rb') as handle:
    ticker_segment_dict = pickle.load(handle)

In [7]:

from pairs_trading_package.pairs_trading_backtester import (
    SeriesAnalyser, DataProcessor, MPSeriesAnalyser, MPTrader, Trader
)

# Shared helper instances used by the cells below.
series_analyser = SeriesAnalyser()
data_processor = DataProcessor()

# Load ETF metadata from the ticker spreadsheet.
etfs, etfs_unique, tickers = data_processor.read_ticker_excel(path='../data_folder/original/commodity_ETFs_long_updated.xlsx')

# Default split index; the backtest loop rebinds SPLIT_IDX while iterating over `splits`.
SPLIT_IDX = 1

# NOTE(review): not referenced anywhere else in the visible cells.
pickle_folders = ['2012-2016', '2013-2017', '2014-2018']

# One entry per experiment:
#   [ (train_start, train_end), (test_start, test_end), train_val_split ]
# The period bounds are written day-first (DD-MM-YYYY); train_val_split is YYYY-MM-DD
# and is used to slice the training prices (`df_prices_train[train_val_split:]`).
splits = [ [('01-01-2012', '31-12-2014'), ('01-01-2015', '31-12-2015'), '2014-01-01'],
[('01-01-2013', '31-12-2015'), ('01-01-2016', '31-12-2016'), '2015-01-01'],
[('01-01-2014', '31-12-2016'), ('01-01-2017', '31-12-2017'), '2016-01-01'] ]

# Forwarded to the pair-property checks through `ex_args['subsample']`.
subsample = 2500

min_half_life = 78 # number of data points in one trading day

max_half_life = 20000 # roughly the number of data points in a year: 78*252

# Intraday data: length of the validation period in years.
# This default is recomputed from the data inside the backtest loop.
n_years_val = 1


In [8]:

# For each clustering family, the names used to report the cluster count, the
# algorithm and the distance measure. The backtest loop looks each value up as a
# key of the argument template and, when the template has no such key, records
# the value itself as a literal label.
naming_scheme = {
    'kmeans': {
        'no_clusters': 'n_clusters',
        'algo': 'kmeans',
        'distance': 'euclidean',
    },
    'agglomerative': {
        'no_clusters': 'n_clusters',
        'algo': 'linkage',
        'distance': 'affinity',
    },
    'spectral': {
        'no_clusters': 'n_clusters',
        'algo': 'spectral',
        'distance': 'affinity',
    },
    'optics': {
        'no_clusters': 'min_samples',
        'algo': 'optics',
        'distance': 'metric',
    },
}
  

In [9]:
from pairs_trading_package.multiple_hypothesis_corrections.fdr import abh

def correct_pairs_fdr(pairs_unsupervised, pvals, q=0.1):
    """Keep only the pairs whose p-value survives the `abh` FDR correction.

    Parameters
    ----------
    pairs_unsupervised : iterable
        Candidate pairs. Each item must expose its statistics mapping at
        position 2, with the pair's p-value stored under ``'p_value'``.
    pvals : numpy.ndarray
        All p-values taking part in the correction. The result of
        ``abh(pvals, q=q)`` is used to index this array.
        NOTE(review): presumably a boolean mask or index array of the
        rejected hypotheses -- confirm against the `fdr` module.
    q : float, default 0.1
        Level forwarded to `abh`.

    Returns
    -------
    list
        The items of `pairs_unsupervised`, in their original order, whose
        p-value is among the p-values selected by the correction.
    """
    candidate_pairs = list(pairs_unsupervised)
    if not candidate_pairs:
        # Nothing to filter: skip the correction entirely.
        return []

    # The set of significant p-values depends only on `pvals` and `q`, so it is
    # computed once instead of re-running the whole procedure for every pair.
    significant_pvals = pvals[abh(pvals, q=q)]

    return [pair for pair in candidate_pairs
            if pair[2]['p_value'] in significant_pvals]

In [11]:
from datetime import timedelta

def save_portfolio_returns(performance_df, number_of_pairs, folder='./backtest_results/portfolio_return_series/'):
    """Write the equally weighted daily portfolio return series to a CSV file.

    Parameters
    ----------
    performance_df : sequence
        One entry per pair; element ``[1]`` of each entry must be a DataFrame
        with a datetime index and an ``'account_balance'`` column.
    number_of_pairs : sized collection
        Only its length is used: it fixes how many leading entries of
        `performance_df` are read and the equal weight ``1 / len`` given to each.
    folder : str
        Output directory. It is created when missing; a trailing separator is
        optional.

    Returns
    -------
    str
        The generated file name (``get_random_hash() + '.csv'``), without the folder.

    Raises
    ------
    ValueError
        If `number_of_pairs` is empty.
    """
    n_pairs = len(number_of_pairs)
    if n_pairs == 0:
        raise ValueError('number_of_pairs must contain at least one pair')

    # Daily returns per pair: last balance of each day, then percentage change.
    # The first day has no predecessor, so its return is set to 0.
    daily_returns = []
    for index in range(n_pairs):
        daily_balance = performance_df[index][1]['account_balance'].resample('D').last().dropna()
        daily_returns.append(daily_balance.pct_change().fillna(0))

    # A single concat always yields a 2-D frame (even for one pair) and avoids
    # re-copying the accumulated frame on every iteration.
    portfolio_returns = pd.concat(daily_returns, axis=1)

    weights = np.full(n_pairs, 1 / n_pairs)

    # Days missing for a given pair contribute a 0 return.
    weighted_portfolio_returns = pd.Series(np.dot(portfolio_returns.fillna(0).to_numpy(), weights),
                                           index=portfolio_returns.index)

    return_file_name = get_random_hash() + '.csv'

    if folder:
        os.makedirs(folder, exist_ok=True)
    weighted_portfolio_returns.to_csv(os.path.join(folder, return_file_name))

    return return_file_name


def save_spread_returns(performance_df, folder='./backtest_results/spread_series/'):
    """Write the daily normalised spread of every pair to a single CSV file.

    Parameters
    ----------
    performance_df : iterable
        One entry per pair. ``entry[0][0]`` and ``entry[0][1]`` are the two
        ticker strings; ``entry[1]`` is a DataFrame with a datetime index and
        a ``'norm_spread'`` column.
    folder : str
        Output directory. It is created when missing; a trailing separator is
        optional.

    Returns
    -------
    str
        The generated file name (``get_random_hash() + '.csv'``), without the folder.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when `performance_df` is empty.
    """
    # One column per pair, named "<ticker1>_<ticker2>", holding the last spread
    # value of each day.
    spreads_series = [
        ptt_spread[1]['norm_spread'].resample('D').last().dropna()
        .rename(ptt_spread[0][0] + '_' + ptt_spread[0][1])
        for ptt_spread in performance_df
    ]

    spreads_series_df = pd.concat(spreads_series, axis=1)

    return_file_name = get_random_hash() + '.csv'

    if folder:
        os.makedirs(folder, exist_ok=True)
    spreads_series_df.to_csv(os.path.join(folder, return_file_name))

    return return_file_name

In [13]:
# Thresholds recorded in every result row; P_VAL_THRESHOLD, HURST_THRESHOLD and
# ZERO_CROSSINGS are also forwarded to the pair checks through `ex_args`.
P_VAL_THRESHOLD = 0.10
# Level passed as `q` to `correct_pairs_fdr` in the backtest loop.
Q_VAL_THRESHOLD = 0.10
HURST_THRESHOLD = 0.5
ZERO_CROSSINGS = 12
# Label stored in the results only.
# NOTE(review): the correction actually applied is `abh` from the fdr module --
# confirm this label describes it accurately.
MHC_METHOD = 'Benjamini Hochberg'
# Only 'PCA' is handled by the backtest loop.
DIM_RED_METHOD = 'PCA'

# Default analyser with seed 109; the backtest loop rebinds this name for every seed.
mp_series_analyser = MPSeriesAnalyser(109)
   
# Template for one result row. The backtest loop copies it per clustering
# template and overwrites the placeholders; the key order here determines the
# column order of the exported CSV.
# Note: the loop also sets a 'dimensionality_reduction_method' key that is not
# part of this template, so that column is appended after all keys listed here.
dict_tobe_filled = { 
    # Describing Dates 
    'train_period_start': '', 'train_period_end': '',  
    'validation_period': '', 
    'test_period_start': '', 'test_period_end': '', 

    # Describing the clustering parameters used
    'seed': 0, 'n_clusters': 0, 'clust_algo': '', 'distance_measure': '', 

    # Describing the clustering achieved
    'infomax_hofs': 0, 'infomax_hofk': 0, 'no_found_pairs_from_clustering': 0, 

    # Describing the dimensionality reduction technique and resulting product
    'n_principal_components': 0, 'explained_variance': 0, 

    # Statistical thresholds used for filtering of FP
    'qvalue_threshold': Q_VAL_THRESHOLD, 'multiple_hypothesis_correction_method': MHC_METHOD, 
    'pvalue_threshold': P_VAL_THRESHOLD, 'hurst_threshold': HURST_THRESHOLD, 
    'min_zero_crossings': ZERO_CROSSINGS,
    
    # Insample trading results
    'n_pairs_insample': 0, 'annual_sharpe_ratio_iid_insample': 0, 'auto_corr_insample': 0, 
    'daily_sharpe_ratio_insample': 0, 'portfolio_vol_insample': 0, 'avg_total_roi_insample': 0, 
    'avg_annual_roi_insample': 0, 'total_trades_insample': 0, 'positive_trades_insample': 0,
    'negative_trades_insample': 0, 'pct_positive_trades_per_pair_insample': 0, 'pct_pairs_with_positive_results_insample': 0, 
    'max_dd_insample': 0, 'max_dd_duration_insample': 0, 'total_dd_duration_insample': 0, 'avg_half_life_insample': 0, 
    'avg_hurst_exponent_insample': 0, 'count_positive_trades_dist_insample': 0, 'mean_positive_trades_dist_insample': 0, 
    'std_positive_trades_dist_insample': 0, 'min_positive_trades_dist_insample': 0, '25%_positive_trades_dist_insample': 0, 
    '50%_positive_trades_dist_insample': 0, '75%_positive_trades_dist_insample': 0, 'max_positive_trades_dist_insample': 0, 
    'count_negative_trades_dist_insample': 0, 'mean_negative_trades_dist_insample': 0, 'std_negative_trades_dist_insample': 0, 
    'min_negative_trades_dist_insample': 0, '25%_negative_trades_dist_insample': 0, '50%_negative_trades_dist_insample': 0, 
    '75%_negative_trades_dist_insample': 0, 'max_negative_trades_dist_insample': 0, 'count_pairs_sharpe_dist_insample': 0, 
    'mean_pairs_sharpe_dist_insample': 0, 'std_pairs_sharpe_dist_insample': 0, 'min_pairs_sharpe_dist_insample': 0, 
    '25%_pairs_sharpe_dist_insample': 0, '50%_pairs_sharpe_dist_insample': 0, '75%_pairs_sharpe_dist_insample': 0, 
    'max_pairs_sharpe_dist_insample': 0, 'portfolio_returns_saved_file_insample': '', 'spreads_saved_file_insample': '',
    
    # Out of Sample trading results
    'n_pairs_oosample': 0, 'annual_sharpe_ratio_iid_oosample': 0, 'auto_corr_oosample': 0, 
    'daily_sharpe_ratio_oosample': 0, 'portfolio_vol_oosample': 0, 'avg_total_roi_oosample': 0, 'avg_annual_roi_oosample': 0,
    'total_trades_oosample': 0, 'positive_trades_oosample': 0, 'negative_trades_oosample': 0, 'pct_positive_trades_per_pair_oosample': 0,
    'pct_pairs_with_positive_results_oosample': 0, 'max_dd_oosample': 0, 'max_dd_duration_oosample': 0, 'total_dd_duration_oosample': 0, 
    'avg_half_life_oosample': 0, 'avg_hurst_exponent_oosample': 0, 'count_positive_trades_dist_oosample': 0, 'mean_positive_trades_dist_oosample': 0, 
    'std_positive_trades_dist_oosample': 0, 'min_positive_trades_dist_oosample': 0, '25%_positive_trades_dist_oosample': 0, 
    '50%_positive_trades_dist_oosample': 0, '75%_positive_trades_dist_oosample': 0, 'max_positive_trades_dist_oosample': 0, 
    'count_negative_trades_dist_oosample': 0, 'mean_negative_trades_dist_oosample': 0, 'std_negative_trades_dist_oosample': 0,
    'min_negative_trades_dist_oosample': 0, '25%_negative_trades_dist_oosample': 0, '50%_negative_trades_dist_oosample': 0, 
    '75%_negative_trades_dist_oosample': 0, 'max_negative_trades_dist_oosample': 0, 'count_pairs_sharpe_dist_oosample': 0, 
    'mean_pairs_sharpe_dist_oosample': 0, 'std_pairs_sharpe_dist_oosample': 0, 'min_pairs_sharpe_dist_oosample': 0, '25%_pairs_sharpe_dist_oosample': 0, 
    '50%_pairs_sharpe_dist_oosample': 0, '75%_pairs_sharpe_dist_oosample': 0, 'max_pairs_sharpe_dist_oosample': 0,
    'portfolio_returns_saved_file_oosample': '', 'spreads_saved_file_oosample': '',
    
    # Coint Test distribution statistics (filled from `describe()` of the flattened p-values)
    'count_coint_pvals_dist': 0, 'mean_coint_pvals_dist': 0, 'std_coint_pvals_dist': 0, 'min_coint_pvals_dist': 0, 
    '25%_coint_pvals_dist': 0, '50%_coint_pvals_dist': 0, '75%_coint_pvals_dist': 0, 'max_coint_pvals_dist': 0
} 

# Extra arguments handed to `mp_apply_check_properties` in the backtest loop:
# half-life bounds, zero-crossing minimum, p-value / Hurst thresholds and the
# subsample setting.
ex_args = {
    'min_half_life': min_half_life,
    'max_half_life': max_half_life,
    'min_zero_crossings': ZERO_CROSSINGS,
    'p_value_threshold': P_VAL_THRESHOLD,
    'hurst_threshold': HURST_THRESHOLD,
    'subsample': subsample,
}


In [None]:
# Exhaustive clustering / backtest sweep.
#
# For every split, PCA dimensionality, random seed and clustering argument
# template this cell:
#   1. reduces the training returns with PCA and clusters the tickers,
#   2. checks the pairs found inside the clusters and applies the FDR correction,
#   3. backtests the surviving pairs in-sample (test_mode=False) and
#      out-of-sample (test_mode=True),
#   4. records one summary row.
# The cumulative result table is written to a new CSV file after each split.

# Single-row frames, one per evaluated clustering template. They are collected
# in a list and concatenated once per split instead of growing `global_frame`
# row by row (which re-copies the whole table on every iteration).
result_rows = []
global_frame = pd.DataFrame()

for SPLIT_IDX in range(0, len(splits)):

    # Pair results already computed for this split, shared across seeds/templates.
    pair_results_cache = []

    for N_PRIN_COMPONENTS in [25]:

        for rand_seed in [109, 112, 1327, 8222, 6985]:

            mp_series_analyser = MPSeriesAnalyser(rand_seed)

            # Training periods starting before 2012 use a different price file.
            if pd.to_datetime(splits[SPLIT_IDX][0][0]) < pd.to_datetime('2012-01-01'):
                df_prices = pd.read_pickle('../data_folder/original/commodity_ETFs_intraday_interpolated_screened_no_outliers.pickle')
            else:
                df_prices = pd.read_pickle('../data_folder/original/commodity_ETFs_from_2014_complete.pickle')

            # split data in training and test
            df_prices_train, df_prices_test = data_processor.split_data(df_prices,
                                                                      splits[SPLIT_IDX][0],
                                                                      splits[SPLIT_IDX][1],
                                                                      remove_nan=True)

            mp_series_analyser.df_prices_train = df_prices_train
            mp_series_analyser.df_prices_test = df_prices_test

            train_val_split = splits[SPLIT_IDX][2]

            # intraday: validation length in years, from the number of rows
            # (240 * 78 rows are counted as one year here)
            n_years_val = round(len(df_prices_train[train_val_split:])/(240*78))

            df_returns = data_processor.get_return_series(df_prices_train)

            # The full price frame is no longer needed once split.
            del df_prices

            full_arg_templates = generate_clustering_arg_templates(2, len(df_returns.columns))

            for alg_templ in full_arg_templates.keys():
                for templ in full_arg_templates[alg_templ]:
                    no_clusters_param = naming_scheme[alg_templ]['no_clusters']
                    algo_param = naming_scheme[alg_templ]['algo']
                    distance_param = naming_scheme[alg_templ]['distance']

                    # Each naming-scheme entry is either a key of the template
                    # (report the template's value) or a literal label.
                    N_CLUSTERS = templ[no_clusters_param] if no_clusters_param in templ else no_clusters_param
                    CLUST_ALGO = templ[algo_param] if algo_param in templ else algo_param
                    DISTANCE = templ[distance_param] if distance_param in templ else distance_param

                    working_analytic_dict = dict_tobe_filled.copy()

                    working_analytic_dict['seed'] = rand_seed
                    working_analytic_dict['train_period_start'] = splits[SPLIT_IDX][0][0]
                    working_analytic_dict['train_period_end'] = splits[SPLIT_IDX][0][1]
                    working_analytic_dict['validation_period'] = splits[SPLIT_IDX][2]
                    working_analytic_dict['test_period_start'] = splits[SPLIT_IDX][1][0]
                    working_analytic_dict['test_period_end'] = splits[SPLIT_IDX][1][1]

                    if DIM_RED_METHOD == 'PCA':

                        X, explained_variance = mp_series_analyser.apply_PCA(N_PRIN_COMPONENTS, df_returns, ignore_first_eigenvector=False)

                        working_analytic_dict['dimensionality_reduction_method'] = 'PCA'
                        working_analytic_dict['n_principal_components'] = N_PRIN_COMPONENTS
                        working_analytic_dict['explained_variance'] = np.sum(explained_variance)

                    else:
                        # Without this guard `X` would be undefined (or stale
                        # from an earlier iteration) for any other method.
                        raise ValueError('Unsupported DIM_RED_METHOD: {!r}'.format(DIM_RED_METHOD))

                    # Map the reported algorithm label onto the name expected by
                    # `apply_clustering_algo`; linkage names all run the
                    # agglomerative algorithm.
                    if CLUST_ALGO in ('agglomerative', 'single', 'complete', 'average'):
                        clustering_name = 'agglomerative'
                    elif CLUST_ALGO in ('spectral', 'kmeans', 'optics', 'dbscan'):
                        clustering_name = CLUST_ALGO
                    else:
                        # Previously an unmatched label fell through the if/elif
                        # chain and silently reused the clustering of the
                        # previous template. Skip such templates instead.
                        print('Skipping unsupported clustering template {!r} for {!r}: {!r}'.format(
                            CLUST_ALGO, alg_templ, templ))
                        continue

                    clustered_series_all, clustered_series, counts, assigned_labels = \
                        mp_series_analyser.apply_clustering_algo(clustering_name, pd.DataFrame(X), df_returns.columns, templ)

                    if clustering_name in ('optics', 'dbscan'):
                        # These algorithms are not given a cluster count, so
                        # report the largest assigned label instead.
                        N_CLUSTERS = clustered_series.values.max()

                    # Number of within-cluster pairs: sum of C(n, 2) over clusters.
                    no_found_pairs = (counts * (counts - 1) / 2).sum()

                    working_analytic_dict['n_clusters'] = N_CLUSTERS
                    working_analytic_dict['clust_algo'] = CLUST_ALGO
                    working_analytic_dict['distance_measure'] = DISTANCE

                    infomax_scores = InfoMax().get_entropies(assigned_labels)
                    working_analytic_dict['infomax_hofs'] = infomax_scores[0]
                    working_analytic_dict['infomax_hofk'] = infomax_scores[1]
                    working_analytic_dict['no_found_pairs_from_clustering'] = no_found_pairs

                    # Only clusterings yielding at most 500 candidate pairs are
                    # tested and backtested; the rest keep the template defaults.
                    if (no_found_pairs <= 500):

                        pairs_unsupervised, unique_tickers, coint_pval_counts = mp_series_analyser.mp_apply_check_properties(pair_results_cache=pair_results_cache,
                                                                                                                             clustered_series=clustered_series,
                                                                                                                             ex_args=ex_args)

                        for _pair in pairs_unsupervised:
                            if not mp_series_analyser.check_if_cached(pair_results_cache, _pair[0], _pair[1]):
                                pair_results_cache.append(_pair)

                        flat_pvals = np.array(flatten(coint_pval_counts))

                        if len(flat_pvals) > 3:

                            significant_pairs_unsupervised = correct_pairs_fdr([valid_pair for valid_pair in pairs_unsupervised if valid_pair[2] is not None],
                                                                               flat_pvals, q=Q_VAL_THRESHOLD)

                            if len(significant_pairs_unsupervised) > 1:

                                # NOTE(review): `MPTrader` is also imported directly
                                # from pairs_trading_backtester; confirm that
                                # `lfl.backtester.MPTrader` is the same class.
                                mp_trader = lfl.backtester.MPTrader(rand_seed)

                                mp_trader.df_prices_train = df_prices_train
                                mp_trader.df_prices_test = df_prices_test

                                # ---- in-sample backtest ----
                                train_results_with_costs, performance_threshold_train = \
                                      mp_trader.mp_apply_trading_strategy_with_costs(significant_pairs_unsupervised,
                                                                      2,#entry_multiplier,
                                                                      0,#exit_multiplier,
                                                                      test_mode=False,
                                                                      train_val_split=train_val_split
                                                                    )

                                insample_statistics = mp_trader.get_results(train_results_with_costs, performance_threshold_train,
                                                                significant_pairs_unsupervised, ticker_segment_dict, n_years_val, 'insample')

                                saved_returns_file = save_portfolio_returns(performance_threshold_train, significant_pairs_unsupervised, './backtest_results/return_series/')
                                insample_statistics['portfolio_returns_saved_file_insample'] = saved_returns_file

                                working_analytic_dict.update(insample_statistics)

                                working_analytic_dict['spreads_saved_file_insample'] = save_spread_returns(performance_threshold_train)

                                del train_results_with_costs
                                del performance_threshold_train

                                # ---- out-of-sample backtest ----
                                # intraday: test length in years, from the number of rows
                                n_years_test = round(len(df_prices_test)/(240*78))

                                results_with_costs, performance_threshold_test = \
                                      mp_trader.mp_apply_trading_strategy_with_costs(significant_pairs_unsupervised,
                                                                      2,#entry_multiplier,
                                                                      0,#exit_multiplier,
                                                                      test_mode=True,
                                                                      train_val_split=train_val_split
                                                                    )

                                # Use the length of the test period here; the
                                # validation length was passed before, leaving
                                # `n_years_test` computed but unused.
                                oosample_statistics = mp_trader.get_results(results_with_costs, performance_threshold_test,
                                                                significant_pairs_unsupervised, ticker_segment_dict, n_years_test, 'oosample')

                                saved_returns_file = save_portfolio_returns(performance_threshold_test, significant_pairs_unsupervised, './backtest_results/return_series/')
                                oosample_statistics['portfolio_returns_saved_file_oosample'] = saved_returns_file

                                working_analytic_dict.update(oosample_statistics)

                                working_analytic_dict['spreads_saved_file_oosample'] = save_spread_returns(performance_threshold_test)

                                del results_with_costs
                                del performance_threshold_test

                                # Summary statistics of all p-values that entered the correction.
                                coint_tests_statistics = postfix_keys_to_dict(dict(pd.Series(flat_pvals).describe()), 'coint_pvals_dist')

                                working_analytic_dict.update(coint_tests_statistics)

                    # A row is recorded for every evaluated template, including
                    # those that never reached the backtest.
                    result_rows.append(pd.DataFrame.from_dict(working_analytic_dict, orient='index').T)

    # Checkpoint: write everything gathered so far (all splits up to this one).
    if result_rows:
        global_frame = pd.concat(result_rows)
    global_frame.to_csv('./backtest_results/' + get_current_time_hash() + '.csv')