In [5]:
from tslearn.metrics import soft_dtw
from sklearn.preprocessing import StandardScaler
from pandas import DataFrame
import pandas as pd
import numpy as np
import extraction_process as ep
import stationary_metrics as sm
import kalman_spread_test as kst

# 1. Processing the data

In [6]:
# Get the original close data, cumulative return, and log return
data = ep.extract_and_process_data(0.2)
cumret = ep.metrics_process(data, "cumret")
logret = ep.metrics_process(data, 'logret')

# Get normalized logret
scaler = StandardScaler()
normal_logret = pd.DataFrame(scaler.fit_transform(logret), columns = data.columns, index = data.index).dropna(how='all')

Data Shape before cleaning = (1310, 503)
Data Shape after cleaning = (1310, 494)


In [7]:
normal_logret

Unnamed: 0_level_0,A,AAL,AAP,AAPL,ABBV,ABC,ABT,ACGL,ACN,ADBE,...,WYNN,XEL,XOM,XRAY,XYL,YUM,ZBH,ZBRA,ZION,ZTS
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-03,1.326301,-0.299344,0.407551,-0.056682,0.833972,0.175977,0.104914,0.015584,0.234294,0.765551,...,-0.298822,-0.465124,0.901744,-0.135488,0.590527,-0.078245,0.348560,0.729280,-0.034100,0.221744
2018-01-04,-0.435991,0.192347,1.659336,0.173123,-0.347323,-0.141910,-0.128547,0.153895,0.636008,0.484357,...,0.160341,-0.537685,0.049924,0.012244,0.317083,0.598736,-0.075971,0.725374,0.172724,0.298412
2018-01-05,0.826376,0.016423,0.479564,0.492730,0.930182,0.620899,0.145372,-0.222611,0.436674,0.464686,...,0.195536,-0.485621,-0.053326,0.647006,-0.109626,0.332687,0.500232,0.566605,0.027195,0.604878
2018-01-08,0.086050,-0.235555,-0.331538,-0.226241,-0.927381,0.856711,-0.199495,-0.023262,0.422346,-0.090141,...,-0.369564,0.464147,0.196216,0.323175,0.164556,0.078690,0.094097,0.352125,-0.179516,0.635647
2018-01-09,1.279448,0.001119,-0.379631,-0.053841,0.388214,0.318427,0.074424,-0.665334,0.162802,0.355901,...,0.198514,-0.794421,-0.215936,-1.899549,-0.001371,-0.188947,-0.826970,1.135295,0.914803,0.620215
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-03-10,-1.291185,-0.681091,-0.644935,-0.716341,0.854081,-0.165596,-0.572983,-1.314466,-1.512983,-1.170295,...,-0.325415,-1.084676,-0.602200,-0.175614,-0.768206,-0.748171,0.485319,-1.181331,-0.948729,-2.081129
2023-03-13,0.354695,-1.036167,-0.006620,0.581512,0.796369,-0.464410,0.726107,-0.028811,-0.023824,-0.672378,...,-0.540807,2.098653,-0.560892,0.018975,-0.564622,-0.025365,0.292805,-0.030229,-11.543801,-0.037053
2023-03-14,0.641300,-0.313464,-2.341520,0.620324,0.661557,0.307931,0.191594,0.505520,-0.128075,1.143286,...,0.554531,0.640239,0.161397,1.458296,0.995892,1.014362,0.961976,0.537728,1.711527,1.011384
2023-03-15,-1.761138,-1.454801,0.019818,0.076764,0.046488,0.091792,-0.483499,-3.497431,-1.442542,0.013839,...,-0.909323,1.823406,-2.421231,-1.080366,-1.722079,0.183400,-0.555697,-0.658532,-0.739927,-0.377470


# 2. Filter out stationary stocks that satisfy at least one metrics

In [12]:
# Set the training period before testing
start_train = '2019-01-01'
end_train = '2021-12-31'

In [15]:
stat_metrics = sm.compute_stationary_long(data.loc[start_train: end_train])
stock_chosen = sm.choose_stocks(stat_metrics, ["Hurst Exponent", "ADF_value"])
filter_list = stock_chosen.index 
filter_list

Index(['ABC', 'AEE', 'AEP', 'ALLE', 'AMGN', 'AMT', 'ATO', 'BAX', 'BDX', 'BF-B',
       ...
       'TJX', 'VRSN', 'VRTX', 'VZ', 'WAB', 'WDC', 'WEC', 'WRB', 'WYNN', 'XEL'],
      dtype='object', length=102)

# 3. Calculate three dictionary in 3 Cases: Euclidean Distance, soft-dtw with only return data, soft-dtw with return & Volatility Data

In [24]:
def calculate_distances(metrics: DataFrame) -> dict:
    # Generate pairs of stocks
    gen = ((x, y) for x in metrics.columns for y in metrics.columns)
    
    # Distance dictionary
    distances = dict()
    
    # Loop through every pairs 
    for x, y in gen:
        if x != y and (f'{x}-{y}' not in distances.keys()) and (f'{x}-{y}' not in distances.keys()):
            
            # Calculate Euclidean Distance
            dist = np.sqrt(np.sum((metrics[x] - metrics[y])**2))
            distances[f'{x}-{y}'] = dist
    
    sorted_distances = {k:v for k,v in sorted(distances.items(), key = lambda item: item[1])}
    return sorted_distances

In [25]:
def calculate_soft_dtw(metrics: DataFrame) -> dict:
    # Generate pairs of stocks
    gen = ((x, y) for x in metrics.columns for y in metrics.columns)
    
    # Distance dictionary
    distances = dict()

    for x, y in gen:
        if x != y and (f'{x}-{y}' not in distances.keys()) and (f'{x}-{y}' not in distances.keys()):
            # Calculate Soft-DTW with gamma = 0.01
            dist = soft_dtw(metrics[x].dropna(), metrics[y].dropna(), gamma = 0.01)
            distances[f'{x}-{y}'] = dist

    sorted_distances = {k:v for k,v in sorted(distances.items(), key = lambda item: item[1])}
    return sorted_distances

In [32]:
def calculate_dtw_with_vol(returns: DataFrame, roll_std: DataFrame) -> dict :
    
    # Generate pairs of stocks
    gen = ((x, y) for x in returns.columns for y in returns.columns)
    # Distance dictionary
    distances = dict()
    
    # List 2 Features
    returns = returns.dropna(how = 'all')
    roll_std = roll_std.dropna(how = 'all')

    for x, y in gen:
        if x != y and (f'{x}-{y}' not in distances.keys()) and (f'{x}-{y}' not in distances.keys()):
            # Calculate soft-DTW from two features, returns & roll_std
            arr1, arr2 = returns[x], roll_std[x]
            arr3, arr4 = returns[y], roll_std[y]
            dist = soft_dtw([[x, y] for x, y in zip(arr1, arr2)], [[x, y] for x, y in zip(arr3, arr4)],gamma = 0.01)
            distances[f'{x}-{y}'] = dist
            
    sorted_distances = {k:v for k,v in sorted(distances.items(), key = lambda item: item[1])}
    return sorted_distances

- Euclidean Distances and its 20 first pairs

In [27]:
dist_euclid = calculate_distances(normal_logret[filter_list].loc[start:end])
euclid_pairs = list(dist_euclid)[0:20]
euclid_pairs = []

['CMS-LNT',
 'LNT-CMS',
 'CMS-XEL',
 'XEL-CMS',
 'AEE-CMS',
 'CMS-AEE',
 'LNT-XEL',
 'XEL-LNT',
 'CMS-WEC',
 'WEC-CMS',
 'AEE-XEL',
 'XEL-AEE',
 'AEP-CMS',
 'CMS-AEP',
 'AEP-LNT',
 'LNT-AEP',
 'NTRS-STT',
 'STT-NTRS',
 'LVS-WYNN',
 'WYNN-LVS']

- Soft-DTW with only return feature and its 20 first pairs

In [29]:
dist_dtw = calculate_soft_dtw(normal_logret[filter_list].loc[start:end])
dtw_return_pairs = list(dist_dtw)[0:20]
dtw_return_pairs

['CMS-XEL',
 'XEL-CMS',
 'CMS-LNT',
 'LNT-CMS',
 'AEE-CMS',
 'CMS-AEE',
 'NTRS-STT',
 'STT-NTRS',
 'AMT-CCI',
 'CCI-AMT',
 'LNT-XEL',
 'XEL-LNT',
 'LMT-NOC',
 'NOC-LMT',
 'AEE-XEL',
 'XEL-AEE',
 'WEC-ES',
 'ES-WEC',
 'FIS-GPN',
 'GPN-FIS']

- Soft-DTW with return & volatility feature and its 20 first pairs

In [30]:
std_2 = data.rolling(200).std()
std_2 = pd.DataFrame(data= scaler.fit_transform(std_2), index = data.index, columns = data.columns)

In [33]:
dist_dtw_2 = calculate_dtw_with_vol(normal_logret[filter_list].loc[start:end], std_2[filter_list].loc[start:end])
dtw_return_vol = list(dist_dtw_2)[0:20]
dtw_return_vol

['FIS-GPN',
 'GPN-FIS',
 'C-MTB',
 'MTB-C',
 'ROST-TJX',
 'TJX-ROST',
 'ATO-CMS',
 'CMS-ATO',
 'MTB-HST',
 'HST-MTB',
 'ATO-ED',
 'ED-ATO',
 'AMT-CCI',
 'CCI-AMT',
 'BAX-VRSN',
 'VRSN-BAX',
 'ATO-NI',
 'NI-ATO',
 'EVRG-LNT',
 'LNT-EVRG']

# 4. Backtest with all selected pairs

In [18]:
euclid_pairs = ['CMS-LNT',
 'LNT-CMS',
 'CMS-XEL',
 'XEL-CMS',
 'AEE-CMS',
 'CMS-AEE',
 'LNT-XEL',
 'XEL-LNT',
 'CMS-WEC',
 'WEC-CMS',
 'AEE-XEL',
 'XEL-AEE',
 'AEP-CMS',
 'CMS-AEP',
 'AEP-LNT',
 'LNT-AEP',
 'NTRS-STT',
 'STT-NTRS',
 'LVS-WYNN',
 'WYNN-LVS']


dtw_return_pairs = ['CMS-XEL',
 'XEL-CMS',
 'CMS-LNT',
 'LNT-CMS',
 'AEE-CMS',
 'CMS-AEE',
 'NTRS-STT',
 'STT-NTRS',
 'AMT-CCI',
 'CCI-AMT',
 'LNT-XEL',
 'XEL-LNT',
 'LMT-NOC',
 'NOC-LMT',
 'AEE-XEL',
 'XEL-AEE',
 'WEC-ES',
 'ES-WEC',
 'FIS-GPN',
 'GPN-FIS']

dtw_return_vol = ['FIS-GPN',
 'GPN-FIS',
 'C-MTB',
 'MTB-C',
 'ROST-TJX',
 'TJX-ROST',
 'ATO-CMS',
 'CMS-ATO',
 'MTB-HST',
 'HST-MTB',
 'ATO-ED',
 'ED-ATO',
 'AMT-CCI',
 'CCI-AMT',
 'BAX-VRSN',
 'VRSN-BAX',
 'ATO-NI',
 'NI-ATO',
 'EVRG-LNT',
 'LNT-EVRG']

In [46]:
test_dict = [euclid_pairs, dtw_return_pairs, dtw_return_vol]

result = pd.DataFrame(columns = ['type', 'return_s1', 'return_s2', 'strategy_return'])
for idx, pairs in enumerate(test_dict):
    print("Type", idx)
    for pair in pairs:
        # Split the stock
        s1 = pair.split('-')[0]
        s2 = pair.split('-')[1]
        
        # Calculate spread
        km_spread = kst.calculate_kalman_spread(cumret, s1, s2, '2022-01-01', '2022-12-31')
        return_s1, return_s2, enhanced_return = kst.backtest_pairs(data, km_spread, s1, s2, '2022-01-01', '2022-12-31')
        
        # Assign its type, and value
        result.loc[f'{pair} - {idx}', "type"] = idx
        result.loc[f'{pair} - {idx}', "return_s1"] = return_s1
        result.loc[f'{pair} - {idx}', "return_s2"] = return_s2
        result.loc[f'{pair} - {idx}', "strategy_return"] = enhanced_return

Type 0
Calculation for CMS - LNT Pairs
Buy-And-Hold Stratgies for CMS generates 0.70%
Buy-And-Hold Stratgies for LNT generates -6.49%
Pair Trading Stratgies for kalman spread generates 29.15%

Calculation for LNT - CMS Pairs
Buy-And-Hold Stratgies for LNT generates -6.49%
Buy-And-Hold Stratgies for CMS generates 0.70%
Pair Trading Stratgies for kalman spread generates 29.15%

Calculation for CMS - XEL Pairs
Buy-And-Hold Stratgies for CMS generates 0.70%
Buy-And-Hold Stratgies for XEL generates 6.07%
Pair Trading Stratgies for kalman spread generates 21.98%

Calculation for XEL - CMS Pairs
Buy-And-Hold Stratgies for XEL generates 6.07%
Buy-And-Hold Stratgies for CMS generates 0.70%
Pair Trading Stratgies for kalman spread generates 21.98%

Calculation for AEE - CMS Pairs
Buy-And-Hold Stratgies for AEE generates 3.04%
Buy-And-Hold Stratgies for CMS generates 0.70%
Pair Trading Stratgies for kalman spread generates 14.20%

Calculation for CMS - AEE Pairs
Buy-And-Hold Stratgies for CMS gen

Calculation for TJX - ROST Pairs
Buy-And-Hold Stratgies for TJX generates 7.25%
Buy-And-Hold Stratgies for ROST generates 3.96%
Pair Trading Stratgies for kalman spread generates 91.06%

Calculation for ATO - CMS Pairs
Buy-And-Hold Stratgies for ATO generates 9.38%
Buy-And-Hold Stratgies for CMS generates 0.70%
Pair Trading Stratgies for kalman spread generates 43.81%

Calculation for CMS - ATO Pairs
Buy-And-Hold Stratgies for CMS generates 0.70%
Buy-And-Hold Stratgies for ATO generates 9.38%
Pair Trading Stratgies for kalman spread generates 43.81%

Calculation for MTB - HST Pairs
Buy-And-Hold Stratgies for MTB generates -5.81%
Buy-And-Hold Stratgies for HST generates -6.07%
Pair Trading Stratgies for kalman spread generates 110.35%

Calculation for HST - MTB Pairs
Buy-And-Hold Stratgies for HST generates -6.07%
Buy-And-Hold Stratgies for MTB generates -5.81%
Pair Trading Stratgies for kalman spread generates 110.35%

Calculation for ATO - ED Pairs
Buy-And-Hold Stratgies for ATO gener

# 5. Result Comparison

In [47]:
result["evaluation"] = result["strategy_return"] - result["return_s1"] - result["return_s2"]
result

Unnamed: 0,type,return_s1,return_s2,strategy_return,evaluation
CMS-LNT - 0,0,0.007048,-0.064948,0.291459,0.349359
LNT-CMS - 0,0,-0.064948,0.007048,0.291459,0.349359
CMS-XEL - 0,0,0.007048,0.060694,0.219802,0.15206
XEL-CMS - 0,0,0.060694,0.007048,0.219802,0.15206
AEE-CMS - 0,0,0.030401,0.007048,0.141965,0.104516
CMS-AEE - 0,0,0.007048,0.030401,0.141965,0.104516
LNT-XEL - 0,0,-0.064948,0.060694,0.257021,0.261275
XEL-LNT - 0,0,0.060694,-0.064948,0.254072,0.258326
CMS-WEC - 0,0,0.007048,0.00563,0.262863,0.250185
WEC-CMS - 0,0,0.00563,0.007048,0.278003,0.265325


In [48]:
name_dict = ['euclid_pairs', 'dtw_return_pairs', 'dtw_return_vol']
for i in range(3):
    print(name_dict[i], "Strategy Return Avg : ", result[result["type"] == i]["strategy_return"].mean())
    print(name_dict[i], "Evaluation Metric Avg : ", result[result["type"] == i]["evaluation"].mean())
    print("\n")

euclid_pairs Strategy Return Avg :  0.32592982530973896
euclid_pairs Evaluation Metric Avg :  0.31726850128044704


dtw_return_pairs Strategy Return Avg :  0.3911389003259028
dtw_return_pairs Evaluation Metric Avg :  0.4600983958038551


dtw_return_vol Strategy Return Avg :  0.7678480532957388
dtw_return_vol Evaluation Metric Avg :  0.9463232279682495




In [45]:
import importlib 
importlib.reload(kst)

<module 'kalman_spread_test' from 'C:\\Users\\johnn\\pair_trade_final\\Pair-Trading\\kalman_spread_test.py'>