In [1]:
import extraction_process as ep
import stationary_metrics as sms
import density_metrics as dm
import centroid_based_metrics as cbm
import pandas as pd
from sklearn.preprocessing import StandardScaler

# 1. PreProcessing of Data

In [2]:
# Get the original close data, cumulative return, and log return
data = ep.extract_and_process_data(0.2)
cumret = ep.metrics_process(data, "cumret")
logret = ep.metrics_process(data, 'logret')

# Get normalized logret
scaler = StandardScaler()
normal_logret = pd.DataFrame(scaler.fit_transform(logret), columns = data.columns, index = data.index).dropna(how='all')

Data Shape before cleaning = (1310, 503)
Data Shape after cleaning = (1310, 494)


In [3]:
data

Unnamed: 0_level_0,A,AAL,AAP,AAPL,ABBV,ABC,ABT,ACGL,ACN,ADBE,...,WYNN,XEL,XOM,XRAY,XYL,YUM,ZBH,ZBRA,ZION,ZTS
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-02,65.095131,51.647560,99.734367,40.888062,77.285172,86.405487,53.904888,29.433332,142.543961,177.699997,...,154.734467,41.177719,64.879845,64.099930,63.911427,74.335281,115.880241,103.709999,43.878445,69.429123
2018-01-03,66.751389,51.014030,100.636848,40.880947,78.494598,86.727074,54.024090,29.459999,143.201859,181.039993,...,153.058090,40.902119,66.154076,63.880337,64.690742,74.271515,116.683548,105.769997,43.826511,69.748352
2018-01-04,66.250664,51.335663,104.350220,41.070835,78.046959,86.534111,53.932404,29.570000,144.897491,183.220001,...,153.886826,40.583443,66.245644,63.870785,65.122620,75.027611,116.515419,107.860001,44.008263,70.164337
2018-01-05,67.309898,51.316174,105.459518,41.538448,79.405594,87.581573,54.088280,29.453333,146.092758,185.339996,...,154.913391,40.299225,66.192230,64.768211,65.000565,75.464859,117.673660,109.540001,44.025570,70.967278
2018-01-08,67.454346,50.809345,104.716873,41.384144,78.133331,89.033318,53.932404,29.456667,147.260239,185.039993,...,152.850906,40.600658,66.489830,65.207420,65.235306,75.592407,117.897827,110.629997,43.809204,71.818565
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-03-10,135.720001,15.460000,128.039993,148.500000,149.710007,149.600006,96.959999,66.610001,252.949997,329.299988,...,108.339996,62.716564,107.779999,36.930000,98.980003,124.580002,123.529999,288.190002,40.349998,161.529999
2023-03-13,136.690002,14.850000,128.039993,150.470001,151.949997,148.369995,98.190002,66.610001,252.949997,324.269989,...,106.250000,64.780006,106.540001,36.930000,97.900002,124.580002,124.250000,288.190002,29.969999,161.529999
2023-03-14,138.399994,14.660000,121.699997,152.589996,153.850006,149.289993,98.550003,67.339996,252.479996,333.329987,...,108.330002,65.440002,106.940002,38.130001,99.900002,126.699997,126.629997,292.459991,31.309999,164.559998
2023-03-15,134.029999,13.860000,121.769997,152.990005,154.059998,149.610001,97.800003,62.740002,246.169998,333.609985,...,104.860001,67.309998,101.620003,37.209999,96.550003,127.129997,125.260002,287.739990,30.709999,163.570007


- Reduce Dimensions using PCA

In [4]:
# Set the training period before testing
start_train = '2019-01-01'
end_train = '2021-12-31'

data_train = data.loc[start_train: end_train]
cumret_train = cumret.loc[start_train: end_train]
cumret_train.apply(lambda x: x / x.iloc[0])
logret_train = logret.loc[start_train: end_train]

- Test for all three return metrics

In [5]:
print("Training for Original Data")
ep.test_PCA_Components(data_train, 5)

Training for Original Data
Components 5 
Explained Variance 0.9015259367400891


Unnamed: 0,0,1,2,3,4
A,-0.018249,-0.011551,0.013026,-0.000556,-0.002674
AAL,0.018321,-0.014100,-0.010367,0.002058,-0.018269
AAP,0.026197,-0.033121,0.037745,0.032945,-0.058576
AAPL,-0.041886,0.009712,-0.019159,-0.021862,-0.010015
ABBV,-0.002245,0.011511,0.007790,-0.005561,-0.007041
...,...,...,...,...,...
YUM,0.020705,-0.008417,-0.007869,0.011614,-0.012152
ZBH,0.024630,0.014703,-0.015457,-0.050489,0.010750
ZBRA,-0.112281,-0.115215,0.048862,-0.012210,0.108502
ZION,0.008549,-0.018451,-0.007175,0.004087,0.000799


In [6]:
print("Training for Cumulative Return")
ep.test_PCA_Components(cumret_train, 5)

Training for Cumulative Return
Components 5 
Explained Variance 0.8863726929201782


Unnamed: 0,0,1,2,3,4
A,-0.031774,-0.017450,0.040522,0.034573,0.000973
AAL,0.044073,-0.041084,-0.013398,0.011115,-0.016598
AAP,0.049486,-0.056166,0.062665,0.018009,-0.005413
AAPL,-0.078188,0.037685,-0.036042,0.013529,-0.019537
ABBV,-0.039816,0.034356,0.049790,0.069261,0.001962
...,...,...,...,...,...
YUM,0.039682,-0.010346,0.000126,-0.046932,0.019342
ZBH,0.009769,0.042734,-0.021998,0.007513,-0.071412
ZBRA,-0.003919,-0.079023,0.005727,0.044153,-0.045209
ZION,0.006607,-0.050749,-0.005049,0.053481,0.002926


In [7]:
print("Training for Log Return")
ep.test_PCA_Components(logret_train, 5)

Training for Log Return
Components 5 
Explained Variance 0.26953941384361363


Unnamed: 0,0,1,2,3,4
A,0.031636,-0.035065,0.010008,0.016626,0.004047
AAL,-0.111901,0.008096,-0.155828,-0.022757,-0.013718
AAP,0.002945,0.029024,0.011880,0.026838,0.002413
AAPL,0.031961,-0.067997,-0.009319,0.004297,0.039132
ABBV,0.030106,0.006573,0.042967,-0.011935,0.011689
...,...,...,...,...,...
YUM,0.023590,0.025384,-0.027480,-0.004652,0.017928
ZBH,0.008684,0.007629,-0.026438,-0.022507,0.019160
ZBRA,0.000486,-0.073851,0.004568,0.019939,-0.009689
ZION,-0.080814,0.034155,0.014265,0.048486,-0.015759


In [8]:
df_pca = ep.test_PCA_Components(data_train, 5)

Components 5 
Explained Variance 0.9015259367400901


# 2. Cluster Time Series with Gaussian Mixture Model

In [54]:
df = cbm.GMM_test(df_pca, 20)

The Score is listed in this order: BIC, Silhouette_score, JS Distance
N-Cluster: 2   (-10718.728465343456, 0.4868327193868394, 0.4389745026819165)
N-Cluster: 3   (-10892.143804636284, 0.4230632742753946, 0.22028778965012386)
N-Cluster: 4   (-10843.192533318444, 0.3721736735675109, 0.34899005609514866)
N-Cluster: 5   (-11114.177453404072, 0.0013768416200202716, 0.19641444187236967)
N-Cluster: 6   (-11024.517656642458, 0.003996207475118673, 0.10020080792858455)
N-Cluster: 7   (-10952.904371093546, 0.013615584269036604, 0.23232163306637954)
N-Cluster: 8   (-10943.391268003237, 0.003189021463038107, 0.21427091753787852)
N-Cluster: 9   (-10872.675863054414, 0.008182541741801792, 0.15368147319273273)
N-Cluster: 10   (-10734.793688769612, 0.02444698936910167, 0.25119114996345004)
N-Cluster: 11   (-10651.519947632716, 0.0446368622726706, 0.22038841248164243)
N-Cluster: 12   (-10617.405640405797, 0.03641990674618551, 0.2055706077785482)
N-Cluster: 13   (-10485.407467088633, 0.018574254060641362

# 3. Pairs Chosen

In [55]:
new = cbm.GMM_select(df_pca, 15)
new

Unnamed: 0_level_0,stocks
labels,Unnamed: 1_level_1
0,"[A, AAP, AAPL, ABBV, ACN, ADI, ADP, AIZ, AJG, ..."
1,"[ADSK, BA, CI, ELV, FDX, FFIV, GWW, ISRG, LLY,..."
2,"[AAL, ABT, ACGL, ADM, AEE, AEP, AES, AFL, AIG,..."
3,"[ALB, AVGO, BLK, CMG, CRL, DE, ENPH, EPAM, ETS..."
4,"[ABC, AKAM, ALL, ALLE, AMZN, APD, ARE, ATO, BA..."
5,"[ADBE, DPZ, FDS, NFLX, NOW, TMO]"
6,[NVR]
7,"[AMGN, BDX, BIIB, CLX, EQIX, HUM, ILMN, MKTX, ..."
8,[BKNG]
9,[AZO]


# 4. Detect Stationarity between each pair

In [56]:
stat_metrics = sms.compute_stationary_long(data_train)

In [57]:
from IPython.display import display
for i in range(len(new.index)):
    df = stat_metrics.T[new.loc[i].stocks].T
    if len(df) > 1:
        avg_adf = df["ADF_value"].mean()
        avg_hurst = df["Hurst Exponent"].mean()
        avg_half_life = df["Half-life mean reversion"].mean()

        print("Label", i, "-----", avg_adf, avg_hurst, avg_half_life)

Label 0 ----- 0.7222576974758049 0.5737824958900688 -0.0444690709902271
Label 1 ----- 0.8126464297673472 0.554436818146761 -1.8128793899968652
Label 2 ----- 0.5919806888142306 0.5351376243179528 -4.684199190767574
Label 3 ----- 0.8973163117499587 0.6218841828855749 -0.6826891113721381
Label 4 ----- 0.4475260601824792 0.49857957994387236 -0.7573870976460786
Label 5 ----- 0.8787049941604476 0.5632152961705281 -0.752395259128192
Label 7 ----- 0.33479569348932514 0.4675350656433711 -0.22206481372573847
Label 10 ----- 0.694603194133688 0.5709577124739279 -0.39610342074555566
Label 14 ----- 0.2492409179971649 0.4965891471651678 -0.2196599678870102


In [58]:
tmp_df = stat_metrics.T[new.loc[7].stocks].T

In [59]:
sms.choose_stocks(tmp_df, ["Hurst Exponent", "ADF_value"])

Unnamed: 0,ADF_value,Hurst Exponent,Half-life mean reversion,Chosen
AMGN,0.387559,0.415204,-0.311609,1.0
BDX,0.000286,0.382119,-0.051336,1.0
BIIB,0.031501,0.341592,-0.075582,1.0
HUM,0.663951,0.477091,-0.319077,1.0
TFX,0.096035,0.482171,-0.111128,1.0
VRTX,0.465396,0.478771,-0.269135,1.0


In [60]:
tmp_df = stat_metrics.T[new.loc[14].stocks].T

In [61]:
sms.choose_stocks(tmp_df, ["Hurst Exponent", "ADF_value"])

Unnamed: 0,ADF_value,Hurst Exponent,Half-life mean reversion,Chosen
ESS,0.678095,0.498918,-0.410799,1.0
FLT,0.015212,0.462047,-0.117369,1.0
LMT,0.009971,0.481956,-0.088545,1.0
NOC,0.190792,0.484317,-0.139489,1.0


- From cluster 7, we filtered out ["BDX", "BIIB"], and in cluster 11, we filtered out ["FLT", "LMT"]

# 5. Backtest with Kalman Spread

In [9]:
import kalman_spread_test as kst
import itertools

In [10]:
# Define the sample array
arr = ["BDX", "BIIB", "FLT", "LMT"]

# Generate all permutations of 3 arrays with 2 elements each
perms = list(itertools.permutations(arr, 2))
result = pd.DataFrame(columns = ['return_s1', 'return_s2', 'strategy_return'])
for perm in perms:
    print("For Pairs", perm)
    km_spread = kst.calculate_kalman_spread(cumret, perm[0], perm[1], '2022-01-01', '2022-12-31' )
    return_s1, return_s2, enhanced_return = kst.backtest_pairs(data, km_spread, perm[0], perm[1], '2022-01-01', '2022-12-31')
    print('\n')
    
    result.loc[f'{perm[0]} - {perm[1]}', "return_s1"] = return_s1
    result.loc[f'{perm[0]} - {perm[1]}', "return_s2"] = return_s2
    result.loc[f'{perm[0]} - {perm[1]}', "strategy_return"] = enhanced_return

For Pairs ('BDX', 'BIIB')
Calculation for BDX - BIIB Pairs
Buy-And-Hold Stratgies for BDX generates 4.36%
Buy-And-Hold Stratgies for BIIB generates 13.43%
Pair Trading Stratgies for kalman spread generates 108.91%



For Pairs ('BDX', 'FLT')
Calculation for BDX - FLT Pairs
Buy-And-Hold Stratgies for BDX generates 4.36%
Buy-And-Hold Stratgies for FLT generates -20.52%
Pair Trading Stratgies for kalman spread generates 83.14%



For Pairs ('BDX', 'LMT')
Calculation for BDX - LMT Pairs
Buy-And-Hold Stratgies for BDX generates 4.36%
Buy-And-Hold Stratgies for LMT generates 40.89%
Pair Trading Stratgies for kalman spread generates 102.93%



For Pairs ('BIIB', 'BDX')
Calculation for BIIB - BDX Pairs
Buy-And-Hold Stratgies for BIIB generates 13.43%
Buy-And-Hold Stratgies for BDX generates 4.36%
Pair Trading Stratgies for kalman spread generates 108.91%



For Pairs ('BIIB', 'FLT')
Calculation for BIIB - FLT Pairs
Buy-And-Hold Stratgies for BIIB generates 13.43%
Buy-And-Hold Stratgies for FLT

In [11]:
result["Evaluation"] = result["strategy_return"] - result["return_s1"] - result["return_s2"]
result

Unnamed: 0,return_s1,return_s2,strategy_return,Evaluation
BDX - BIIB,0.043643,0.134267,1.089069,0.911158
BDX - FLT,0.043643,-0.205227,0.831442,0.993026
BDX - LMT,0.043643,0.408919,1.02926,0.576698
BIIB - BDX,0.134267,0.043643,1.089069,0.911158
BIIB - FLT,0.134267,-0.205227,0.915417,0.986377
BIIB - LMT,0.134267,0.408919,1.525912,0.982725
FLT - BDX,-0.205227,0.043643,0.831442,0.993026
FLT - BIIB,-0.205227,0.134267,0.844514,0.915474
FLT - LMT,-0.205227,0.408919,1.078905,0.875213
LMT - BDX,0.408919,0.043643,0.9712,0.518637


# 6. Revalidate with the Spread Calculation

In [12]:
import similarity_check as sc
sc.calculate_metrics(perms, data, '2022-01-01', '2022-12-31')

Unnamed: 0,Euclidean distance,ADF p-value,Granger Casuality p-value,Hurst Exponent,Half-life of mean reversion,Spread SD,Num zero-crossings,% days within 2-SD band
BDX-BIIB,702.354933,0.700867,0.85086,0.456067,59.18858,41.085847,2,99.601594
BDX-FLT,535.532836,0.535172,0.226568,0.408968,28.710313,18.788991,6,68.924303
BDX-LMT,2871.290419,0.370117,0.539523,0.52839,33.262476,37.882823,0,0.0
BIIB-BDX,702.354933,0.700867,0.000258,0.456067,59.18858,41.085847,2,99.601594
BIIB-FLT,880.266972,0.913202,2e-05,0.543912,233.096115,54.515369,6,97.609562
BIIB-LMT,3122.824907,0.034612,0.439074,0.463734,13.680618,34.389485,0,0.0
FLT-BDX,535.532836,0.535172,0.617444,0.408968,28.710313,18.788991,6,68.924303
FLT-BIIB,880.266972,0.913202,0.449348,0.543912,233.096115,54.515369,6,97.609562
FLT-LMT,3349.740654,0.65245,0.999287,0.535695,54.815099,50.367672,0,0.0
LMT-BDX,2871.290419,0.370117,0.435375,0.52839,33.262476,37.882823,0,0.0
