## Definitions

In [1]:
import os
import shutil

import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import pandas_ta as ta
import quantstats as qs
qs.extend_pandas()

import numpy as np
from math import ceil

from datetime import datetime, timedelta
from tqdm import tqdm
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn

import mlflow

params = {'figure.facecolor': 'w'}
plt.rcParams.update(params)

from IPython.display import display

### Parameter Definition and Directories

In [2]:
def make_dir(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)
    else:
        shutil.rmtree(directory)
        os.makedirs(directory)

In [3]:
# Parameters
date_start = '2010-01-01'
date_breakpoint = '2019-01-01'
strat_class = "Mean Reversion"
std = 1

# MLFlow Parameters
mlflow_experiment_name = "20220628_s-stat-arb_d-lq45"

In [4]:
# Parameters
date_start = "2010-01-01"
date_breakpoint = "2021-01-01"
strat_class = "Mean Reversion"
std = 1
mlflow_experiment_name = "20220628_s-stat-arb_d-lq45"


In [5]:
# Data Directory
data_dir = '/workspace/202205_idx-trading/_data/'
lq45_dir = '/workspace/202205_idx-trading/_data/20220525_lq45/'
lq45_index_file = data_dir + '20220525_lq45_index.csv'
lq45_list = '20220525_lq45-list.txt'

# Visualization Directories
artifact_dir = "/workspace/202205_idx-trading/strats/experiments/" + "20220628_s-stat-arb_d-lq45/" + f's_{date_start}_bp_{date_breakpoint}' + "/"
coint_plot_dir = artifact_dir + "coint/"
bb_plot_dir = artifact_dir + "s_bb/"
kf_plot_dir = artifact_dir + "s_kf/"

## Create Visualization Directories if not exist
make_dir(artifact_dir)
make_dir(coint_plot_dir)
make_dir(bb_plot_dir)
make_dir(kf_plot_dir)

## Helper Functions

In [6]:
# Preprocessing Functions
def handle_nan(df, method='bfill'):
    ## Fill NaN values with the earliest data
    if method == 'bfill':
        return df.fillna(method='bfill', axis=0)
    elif method == 'zerofill':
        return df.fillna(0)
    elif method == 'drop':
        return df.dropna()

def extend_price_df(df):
    '''
    Calculates returns, log_returns, and log_prices to a df with 'price' column
    '''
    df['log-price'] = np.log(df['price'])
    df['return'] = df['price'].pct_change()
    df['log-return'] = np.log(1 + df['return'])
    
    df = handle_nan(df, method='zerofill')
    
    return df
     
def gen_combined_df(df_dict, dict_keys, col, nan_handle_method='bfill', add_pfix=True):
    for i, key in enumerate(dict_keys):
        if i == 0:
            df_buff = pd.DataFrame(index=df_dict[key].index)
        for c in col:
            if add_pfix:
                df_buff[key + '_' + c] = df_dict[key][c]
            else:
                df_buff[key] = df_dict[key][c]
    
    # Handle NaN values from combination of multiple tickers
    # Assumes that NaN values because "stock have not existed" has been handled
    df_buff = handle_nan(df_buff, method=nan_handle_method)
    df_buff = handle_nan(df_buff, method='drop')
            
    return df_buff

# Plotting Functions
def tsplot(y, lags=None, figsize=(20, 8), style='bmh', title='Time Series Analysis Plots'):
    # source: http://www.blackarbs.com/blog/time-series-analysis-in-python-linear-models-to-garch/11/1/2016#GARCH
    
    if not isinstance(y, pd.Series):
        y = pd.Series(y)
    with plt.style.context(style):    
        fig = plt.figure(figsize=figsize)
        layout = (3, 2)
        ts_ax = plt.subplot2grid(layout, (0, 0), colspan=2)
        acf_ax = plt.subplot2grid(layout, (1, 0))
        pacf_ax = plt.subplot2grid(layout, (1, 1))
        qq_ax = plt.subplot2grid(layout, (2, 0))
        pp_ax = plt.subplot2grid(layout, (2, 1))
        
        y.plot(ax=ts_ax)
        ts_ax.set_title(title)
        smt.graphics.plot_acf(y, lags=lags, ax=acf_ax, alpha=0.05, zero=False, auto_ylims=True)
        smt.graphics.plot_pacf(y, lags=lags, ax=pacf_ax, alpha=0.05, zero=False, auto_ylims=True)
        sm.qqplot(y, line='s', ax=qq_ax)
        qq_ax.set_title('QQ Plot')        
        scs.probplot(y, sparams=(y.mean(), y.std()), plot=pp_ax)

        plt.tight_layout()
    return 

## Data Preparation

### Data Loading

In [7]:
# Prepare Stock Tickers
with open(data_dir + lq45_list, "r") as f:
    lq45_tickers = f.read().split('\n')

## Prepare active tickers for international codes
active_tickers = [f + '.JK' for f in lq45_tickers]
active_tickers.append('LQ45')

In [8]:
# Prepare Time Series Data
nan_handle_method = 'bfill'

df_dict = {}
for ticker in tqdm(active_tickers):
    if ticker == 'LQ45':
        df_dict[ticker] = pd.read_csv(lq45_index_file)
    else:
        df_dict[ticker] = pd.read_csv(lq45_dir + ticker + '.csv')
    
    ## Take Only Date and Adjusted Close
    df_dict[ticker] = df_dict[ticker][['Date', 'Adj Close']]
    df_dict['Date'] = pd.to_datetime(df_dict[ticker]['Date'])
    df_dict[ticker].set_index(pd.DatetimeIndex(df_dict[ticker]['Date']), inplace=True)
    
    df_dict[ticker].drop('Date', axis=1, inplace=True)
    
    ## Convert Adj Close to price
    df_dict[ticker]['price'] = df_dict[ticker]['Adj Close']
    df_dict[ticker].drop('Adj Close', axis=1, inplace=True)

100%|████████████████████████████████████████████████████████████████████| 46/46 [00:04<00:00, 10.00it/s]


In [9]:
# Separate Into In Sample and Out Sample
nan_cnt_threshold = 252*2

in_df = {}
out_df = {}
rmv_tickers = []
for ticker in tqdm(active_tickers):
    ## Take In Sample and Out Sample Data
    in_df[ticker] = df_dict[ticker][(df_dict[ticker].index >= date_start) & 
                                                (df_dict[ticker].index < date_breakpoint)]
    out_df[ticker] = df_dict[ticker][df_dict[ticker].index >= date_breakpoint]
    
    ## Check if there are too many NaN values
    if in_df[ticker]['price'].isna().sum() > nan_cnt_threshold:
        rmv_tickers.append(ticker)
        continue
    
    ## Handle NaN Values
    in_df[ticker] = handle_nan(in_df[ticker], method=nan_handle_method)
    out_df[ticker] = handle_nan(out_df[ticker], method=nan_handle_method)
    
    ## Extend price to other values
    in_df[ticker] = extend_price_df(in_df[ticker])
    out_df[ticker] = extend_price_df(out_df[ticker])

# Remove tickers that only have small amounts of data
active_tickers = [t for t in active_tickers if t not in rmv_tickers]

100%|████████████████████████████████████████████████████████████████████| 46/46 [00:00<00:00, 62.85it/s]


In [10]:
# Date Parameters - for logging purposes
str_to_date = lambda d_str: datetime.strptime(d_str, "%Y-%m-%d").date()

date_in_sample_start = date_start
date_in_sample_end = str(str_to_date(date_breakpoint) - timedelta(1))
date_in_sample_len = abs(str_to_date(date_in_sample_start) - str_to_date(date_in_sample_end)).days
date_out_sample_start = date_breakpoint
date_out_sample_end = str(out_df[active_tickers[0]].index[-1])[:-9]
date_out_sample_len = abs(str_to_date(date_out_sample_start) - str_to_date(date_out_sample_end)).days

## Data Exploration
Cointegration exploration is done on in-sample data only, so out of sample data is purely for testing.

*We will revisit cointegration testing on OOS data later on.

Steps:
- Test using existing engle-granger package. 
- Fit Ornstein-Uhlenbeck process to generate lambda (and subsequently, half life)
- Do a separate OLS to get beta of (potentially) cointegrating pair. Visualize Mean Reversion to verify.

### Engle-Granger Cointegration Test

In [11]:
import statsmodels.api as sm
from statsmodels.tsa.stattools import coint
from statsmodels.tsa.stattools import adfuller

def find_coint_pairs(df_dict, tickers, form='normal', form_type='price', alpha=0.05):
    '''
    Finds cointegrated pairs from df_dict serial data, based on given tickers.
    '''
    def coint_ticker_pick(pvalue_1, pvalue_2, tickers, i, j):
        if pvalue_1 <= pvalue_2:
            pvalue = pvalue_1
            coint_tickers = [tickers[i], tickers[j]]
        else:
            pvalue = pvalue_2
            coint_tickers = [tickers[j], tickers[i]]
        
        return pvalue, coint_tickers
    
    n = len(tickers)
    pairs = []
    pvalues = []
    
    for i in range(0, n):
        for j in range(i+1, n):
            
            comb_df = gen_combined_df(df_dict, [tickers[i], tickers[j]], [form_type])
            S1 = comb_df[tickers[i] + "_" + form_type]
            S2 = comb_df[tickers[j] + "_" + form_type]
                
            if(form == 'normal'):
                result_1 = coint(S2, S1)
                pvalue_1 = result_1[1]
                
                result_2 = coint(S1, S2)
                pvalue_2 = result_2[1]
                
                pvalue, coint_tickers = coint_ticker_pick(pvalue_1, pvalue_2, tickers, i, j)
                
            elif(form == 'ratio'):
                result_1 = adfuller(S1/S2)
                pvalue_1 = result_1[1]
                
                result_2 = adfuller(S2/S1)
                pvalue_2 = result_2[1]
                
                pvalue, coint_tickers = coint_ticker_pick(pvalue_1, pvalue_2, tickers, i, j)
                
            if pvalue < alpha:
                pairs.append(coint_tickers)
                pvalues.append(pvalue)
                      
    return pairs, pvalues

def calc_beta_ols(S2, S1, form_type='price'):
    '''
    Calculate beta from two series by doing regression.
    '''
    S1 = sm.add_constant(S1)
    results = sm.OLS(S2, S1).fit()
    S1 = S1[form_type]
    b = results.params[form_type].values[0]
    
    return b

def calc_half_life(S, form_type='price'):
    '''
    Calculate half life from a price series
    '''
    S_lag = S.shift(periods=1).iloc[1:]
    S_diff = S.iloc[1:] - S_lag

    S_lag = sm.add_constant(S_lag)
    results = sm.OLS(S_diff, S_lag).fit()
    S_lag = S_lag[form_type]
    lbd = results.params[form_type].values[0]

    hl = -np.log(2) / lbd
    
    return hl, lbd

In [12]:
# Prepare LQ45 Stock Indexes of the Same Group
stock_groups = {
    "energy_and_mining": ['BRPT.JK', 'HRUM.JK', 'MEDC.JK', 'PGAS.JK', 'TPIA.JK', 'ADRO.JK', 'ITMG.JK', 'PTBA.JK', 'INCO.JK', 'MDKA.JK', 'ANTM.JK'],
    "retail": ['AMRT.JK', 'UNVR.JK', 'ERAA.JK', 'ASII.JK'],
    "food_agri": ['CPIN.JK', 'JPFA.JK', 'ICBP.JK', 'INDF.JK'],
    "paper": ['TKIM.JK', 'INKP.JK'],
    "finance": ['BBCA.JK', 'BBNI.JK', 'BBRI.JK', 'BBTN.JK', 'BMRI.JK', 'BFIN.JK'],
    "media": ['EMTK.JK', 'MNCN.JK'],
    "telcom": ['EXCL.JK', 'TLKM.JK', 'TBIG.JK', 'TOWR.JK'],
    "tobacco": ['GGRM.JK', 'HMSP.JK'],
    "construction": ['INTP.JK', 'PTPP.JK', 'SMGR.JK', 'UNTR.JK', 'WIKA.JK', 'WSKT.JK'],
    "medical": ['KLBF.JK', 'MIKA.JK']
}

# Filter to only those that are active
for key, val in stock_groups.items():
    stock_groups[key] = [t for t in val if t in active_tickers]

In [13]:
# Search for Coint Pair on Different Forms
forms = ['normal', 'ratio']
form_types = ['price', 'log-price']

pair_l = []
for key, val in tqdm(stock_groups.items()):
    tickers = val
    tickers.append('LQ45')
    
    for f in forms:
        for ft in form_types:
    
            pairs, pvalues = find_coint_pairs(in_df, tickers, form=f, form_type=ft, alpha=0.025)
            for pair, pvalue in zip(pairs, pvalues):
                pair_l.append({
                                'ticker_1': pair[0],
                                'ticker_2': pair[1],
                                'form': f,
                                'form_type': ft,
                                'eg_pvalue': pvalue
                            })
    
pair_df = pd.DataFrame(pair_l) 

100%|████████████████████████████████████████████████████████████████████| 10/10 [07:32<00:00, 45.26s/it]


In [14]:
# Calculate Half-Life

pair_l = []
for _, row in pair_df.iterrows():
    # Regression to get Beta for Price Spread Model
    ## Combine Series
    comb_df = gen_combined_df(in_df, [row['ticker_1'], row['ticker_2']], [row['form_type']])
    
    ## Rename Columns
    S1 = comb_df[row['ticker_1'] + "_" + row['form_type']]
    S1.name = row['form_type']
    S2 = comb_df[row['ticker_2'] + "_" + row['form_type']]
    S2.name = row['form_type']
    
    if row['form'] == 'normal':
        b = calc_beta_ols(S2, S1, form_type=[row['form_type']])
        spread = S2 - b * S1
        
    elif row['form'] == 'ratio':
        b = 0
        spread = S1 / S2

    # Ornstein-Uhlenbeck Formula to Calculate Half Life
    hl, lbd = calc_half_life(spread, form_type=[row['form_type']])
    
    pair_l.append({
                    'beta': b,
                    'half_life': hl,
                    'lambda': lbd
                    })
    
full_pair_df = pd.concat([pair_df, pd.DataFrame(pair_l)], axis=1)

In [15]:
# Filter Strategies that Takes too Long to be Profitable
pair_df = full_pair_df[(full_pair_df['half_life'] < 60) & (full_pair_df['lambda'] < 0)].reset_index()

### Visualization
Is used to make sure that price spread is actually mean reversing

In [16]:
def plot_price_spreads_coint(df, pair_df, mode="view", plot_dir=None, col_name=None):
    if mode=="view":
        fig = plt.figure(figsize=(30,20))

        for i, row in pair_df.iterrows():
            # Combine Series
            comb_df = gen_combined_df(df, [row['ticker_1'], row['ticker_2']], [row['form_type']])
            S1 = comb_df[row['ticker_1'] + "_" + row['form_type']]
            S2 = comb_df[row['ticker_2'] + "_" + row['form_type']]

            # Calculate Spread
            spread = S2 - row['beta'] * S1

            # Plot on Specific Axis
            ax = plt.subplot(ceil(len(pair_df)/4), 4, i+1)
            ax.set_title(row['ticker_1'] + "/" + row['ticker_2'] + "_" + row['form'] + "_" + row['form_type'])
            spread.plot(ax=ax)

        plt.tight_layout()
        plt.show()
        
    elif mode=="save":
        assert plot_dir is not None, "plot_dir must be given"
        assert col_name is not None, "col_name must be given"
        
        path_l = []
        for i, row in pair_df.iterrows():
            fig = plt.figure(figsize=(10,5))
            
            # Combine Series
            comb_df = gen_combined_df(df, [row['ticker_1'], row['ticker_2']], [row['form_type']])
            S1 = comb_df[row['ticker_1'] + "_" + row['form_type']]
            S2 = comb_df[row['ticker_2'] + "_" + row['form_type']]

            # Calculate Spread
            spread = S2 - row['beta'] * S1

            # Plot on Specific Axis
            plt.plot(spread)
            plt.title(row['ticker_1'] + "/" + row['ticker_2'] + "_" + row['form'] + "_" + row['form_type'])
            
            # Save Plot
            plot_path = plot_dir + "coint_" + str(i) +  ".png"
            _ = fig.savefig(plot_path)
            plt.close()
            
            path_l.append({col_name: plot_path})
            
        print("Saved Plots at " + plot_dir)
            
        # Add paths to column
        buff_pair_df = pd.concat([pair_df, pd.DataFrame(path_l)], axis=1)
        
        return buff_pair_df

In [17]:
# Visualize Price Spread
pair_df = plot_price_spreads_coint(in_df, pair_df, mode="save", plot_dir=coint_plot_dir, col_name="artifact-coint_plot_path")

Saved Plots at /workspace/202205_idx-trading/strats/experiments/20220628_s-stat-arb_d-lq45/s_2010-01-01_bp_2021-01-01/coint/


In [18]:
display(pair_df)

Unnamed: 0,index,ticker_1,ticker_2,form,form_type,eg_pvalue,beta,half_life,lambda,artifact-coint_plot_path
0,7,ERAA.JK,AMRT.JK,ratio,price,0.0003310704,0.0,52.52434,-0.013197,/workspace/202205_idx-trading/strats/experimen...
1,8,LQ45,AMRT.JK,ratio,price,0.0002212411,0.0,50.995557,-0.013592,/workspace/202205_idx-trading/strats/experimen...
2,15,ICBP.JK,INDF.JK,normal,log-price,0.02199944,0.419276,45.203909,-0.015334,/workspace/202205_idx-trading/strats/experimen...
3,23,BBCA.JK,BBRI.JK,normal,price,0.01393281,0.595569,49.706677,-0.013945,/workspace/202205_idx-trading/strats/experimen...
4,24,BBNI.JK,LQ45,normal,price,0.01172007,0.053991,47.274321,-0.014662,/workspace/202205_idx-trading/strats/experimen...
5,25,BFIN.JK,BMRI.JK,normal,price,0.01358353,7.672912,46.47558,-0.014914,/workspace/202205_idx-trading/strats/experimen...
6,26,BBCA.JK,BBRI.JK,normal,log-price,0.002011757,0.921993,34.977284,-0.019817,/workspace/202205_idx-trading/strats/experimen...
7,27,BBNI.JK,LQ45,normal,log-price,0.01106643,0.233892,49.723041,-0.01394,/workspace/202205_idx-trading/strats/experimen...
8,28,BBRI.JK,BBCA.JK,ratio,price,0.002266286,0.0,40.502506,-0.017114,/workspace/202205_idx-trading/strats/experimen...
9,29,BBRI.JK,BBCA.JK,ratio,log-price,0.0003181514,0.0,33.341311,-0.020789,/workspace/202205_idx-trading/strats/experimen...


## Strategy + Backtest

### Backtest Helper Functions

In [19]:
import scipy.stats as ss

def prob_sr(ret, sr_benchmark=0):
    sr = qs.stats.sharpe(ret)
    n = len(ret)
    skew = ss.skew(ret)
    kurtosis = ss.kurtosis(ret, fisher=False)
    
    # Assuming SR is annualized, we need to change into periodical
    sr = sr/np.sqrt(252)
    sr_benchmark = sr_benchmark/np.sqrt(252)
    
    sr_std = np.sqrt((1 + (0.5 * sr ** 2) - (skew * sr) + (((kurtosis - 3) / 4) * sr ** 2)) / (n - 1))
    psr = ss.norm.cdf((sr - sr_benchmark) / sr_std)

    return psr

def expected_sr_max(trials_sr_std=0, num_trials=0, exp_sr_mean=0):
    emc = 0.5772156649
    max_z = (1 - emc) * ss.norm.ppf(1 - 1./num_trials) + emc * ss.norm.ppf(1 - 1./(num_trials * np.e))
    return exp_sr_mean + (trials_sr_std*max_z)

def def_sr(ret, trials_sr_df):
    sr_std = trials_sr_df.std()
    exp_sr_max = expected_sr_max(trials_sr_std=sr_std, num_trials=len(trials_sr_df))
    d_sr = prob_sr(ret, sr_benchmark=exp_sr_max)
    return d_sr

def is_recently_drawdown(s_ret, delta=4):
    '''
    Check if within the previous delta number of days there is a drawdown
    '''
    dd_details = qs.stats.drawdown_details(s_ret)
    dts = [(datetime.now() - timedelta(days=i)).strftime("%Y-%m-%d") for i in range(delta)]
    
    indicator = False
    for dt in dts:
        if dt in dd_details['end'].values:
            indicator = True
        
    return indicator

### Simple Bollinger-Band

In [20]:
from statsmodels.regression.rolling import RollingOLS

def bb_strategy(df_dict, pair_ticker, lookback, form_type='price', beta=None, beta_lookback=30, std=1):
    '''
    Long-Only Bollinger Band Mean Reversion Strategy
    
    1. Calculate the Rolling Price Spread (or use set beta), by first estimating the beta model of the pair.
       We assume the model is of the form Y = beta * X where Y is the second item in the pair.
    2. Calculate the BBands
    3. Generate Signal, based on BBand (long when price >= bbl, exit when price <= bbm)
    4. Calculate Returns
    '''
    
    # Take the relevant price series from each pair
    df_proc = gen_combined_df(df_dict, [pair_ticker[0], pair_ticker[1]], [form_type], add_pfix=False)
    pair = pair_ticker
    
    # Determine Beta
    ## Add beta / Rolling Beta to main df
    if beta is not None:
        df_proc['beta'] = beta
    else:
        S1 = df_proc[pair[0]]
        S1.name = form_type
        S2 = df_proc[pair[1]]
        S2.name = form_type
        
        S1_indep = sm.add_constant(S1)
        result = RollingOLS(S2, S1_indep, window=beta_lookback).fit()
        rolling_beta = result.params[form_type]
    
        df_proc['beta'] = rolling_beta
    
    ## calculate rolling spread
    df_proc['spread'] = df_proc[pair[1]] - df_proc['beta'] * df_proc[pair[0]]
    
    # Generate Technical Indicators (BBand)
    bbands = ta.bbands(df_proc['spread'], length=lookback, std=std)
    
    bbands_upper_cname = 'BBU' + '_' + str(lookback) + '_' + str(std) + '.0'
    bbands_lower_cname = 'BBL' + '_' + str(lookback) + '_' + str(std) + '.0'
    bbands_mid_cname = 'BBM' + '_' + str(lookback) + '_' + str(std) + '.0'
    
    df_proc['spread_BBU'] = bbands[bbands_upper_cname]
    df_proc['spread_BBL'] = bbands[bbands_lower_cname]
    df_proc['spread_BBM'] = bbands[bbands_mid_cname]
    
    # Signal Rules
    long_signal = lambda price, bbl: (price <= bbl)
    long_close_signal = lambda price, bbm: (price >= bbm)
    short_signal = lambda price, bbu: (price >= bbu) # note: might be better to set absolute differences   
    short_close_signal = lambda price, bbm: (price <= bbm)
    
    # Generate Signals
    ## This Signal generation is for a long-only strategy
    last_signal = ''
    df_proc['signal'] = ''
    df_proc['signal_ticker'] = ''
    for i in range(0, len(df_proc)):
        if i == 0:
            df_proc['signal'][i] = ''
        
        elif last_signal == '':
            if long_signal(df_proc['spread'][i], df_proc['spread_BBL'][i]):
                df_proc['signal'][i] = 'long_entry'
                last_signal = 'long_entry'
                df_proc['signal_ticker'][i] = pair[1]
            elif long_close_signal(df_proc['spread'][i], df_proc['spread_BBM'][i]):
                df_proc['signal'][i] = 'long_close'
                last_signal = 'long_close'
                df_proc['signal_ticker'][i] = pair[1]
            elif short_signal(df_proc['spread'][i], df_proc['spread_BBU'][i]):
                df_proc['signal'][i] = 'short_entry'
                last_signal = 'short_entry'
                df_proc['signal_ticker'][i] = pair[0]
            elif short_close_signal(df_proc['spread'][i], df_proc['spread_BBM'][i]):
                df_proc['signal'][i] = 'short_close'
                last_signal = 'short_close'
                df_proc['signal_ticker'][i] = pair[0]
            else:
                df_proc['signal'][i] = ''
            
        elif last_signal == 'long_entry':
            if long_close_signal(df_proc['spread'][i], df_proc['spread_BBM'][i]):
                df_proc['signal'][i] = 'long_close'
                last_signal = 'long_close'
                df_proc['signal_ticker'][i] = pair[1]
            else:
                df_proc['signal'][i] = ''
                
        elif last_signal == 'short_entry':
            if short_close_signal(df_proc['spread'][i], df_proc['spread_BBM'][i]):
                df_proc['signal'][i] = 'short_close'
                last_signal = 'short_close'
                df_proc['signal_ticker'][i] = pair[0]
            else:
                df_proc['signal'][i] = ''
                
        elif last_signal == 'long_close' or last_signal == 'short_close':
            if long_signal(df_proc['spread'][i], df_proc['spread_BBL'][i]):
                df_proc['signal'][i] = 'long_entry'
                last_signal = 'long_entry'
                df_proc['signal_ticker'][i] = pair[1]
            elif short_signal(df_proc['spread'][i], df_proc['spread_BBU'][i]):
                df_proc['signal'][i] = 'short_entry'
                last_signal = 'short_entry'
                df_proc['signal_ticker'][i] = pair[0]
            else:
                df_proc['signal'][i] = ''

    return df_proc
    
def calc_strategy_returns(df, consider_trx_cost=True, buy_cost_pct=0.05, sell_cost_pct=0.1, form_type='price'):
    '''
    Calculate returns and cumulative returns per entry on dataframe.

    Strategy Returns: 
    - (Short Spread) returns(S2 Buy price, S2 Close Price). 
    - (Long Spread) returns(S1 Buy price, S1 Close Price). 
    '''
    
    df_proc = df.copy()

    last_signal = ''
    last_ticker = ''
    df_proc['return'] = np.nan
    for i in range(0, len(df_proc)):
        if last_signal == 'long_entry' or last_signal == 'short_entry':
            if form_type == 'price':
                df_proc["return"][i] = (df_proc[last_ticker][i] - df_proc[last_ticker][i-1]) / df_proc[last_ticker][i-1]
            elif form_type == 'log-price':
                df_proc["return"][i] = df_proc[last_ticker][i] - df_proc[last_ticker][i-1]
        elif last_signal == 'long_close' or last_signal == 'short_close':
            df_proc["return"][i] = 0
        else:
            df_proc["return"][i] = 0
        
        if not(df["signal"][i] == ''):
            last_signal = df_proc["signal"][i] 
            last_ticker = df_proc['signal_ticker'][i]
            
            # Add trx cost
            if consider_trx_cost:
                if df_proc["signal"][i] == 'long_entry' or df_proc["signal"][i] == 'short_entry':
                    df_proc["return"][i] -= buy_cost_pct / 100
                elif df_proc["signal"][i] == 'long_close' or df_proc["signal"][i] == 'short_close':
                    df_proc["return"][i] -= sell_cost_pct / 100

    df_proc["cum_return"] = (1 + df_proc["return"]).cumprod()
    return df_proc

In [21]:
# Single Walk Forward Backtest per Pair
pair_l = []
s1_ret_arr = []
s1_df_arr = []
for i, row in pair_df.iterrows():
    # Run Strategy
    pair = [row['ticker_1'], row['ticker_2']]
    hl = row['half_life']
    
    s_df = bb_strategy(out_df, pair, round(hl), form_type=row['form_type'], std=std)
    s_df = calc_strategy_returns(s_df, form_type=row['form_type'])
    s_ret = s_df['return']
    
    s1_ret_arr.append(s_ret)
    s1_df_arr.append(s_df)
    
    # Calculate Number of Trades
    entry_trades = len(s_df[s_df['signal'] == 'long_entry']) + len(s_df[s_df['signal'] == 'short_entry'])
    close_trades = len(s_df[s_df['signal'] == 'long_close']) + len(s_df[s_df['signal'] == 'short_close'])
    num_trades = min([entry_trades, close_trades])
    
    pair_l.append({
                    'Turnover': num_trades,
                    'Ann Turnover': num_trades / (len(s_ret)/252),
                    'Max DD': qs.stats.drawdown_details(s_ret)['max drawdown'].min(),
                    'Longest DD': qs.stats.drawdown_details(s_ret)['days'].max(),
                    'Currently DD': is_recently_drawdown(s_ret, delta=4),
                    'Cumulative Return': s_df.iloc[-1]['cum_return'],
                    'CAGR': qs.stats.cagr(s_ret),
                    'Sharpe': qs.stats.sharpe(s_ret),
                    'Prob. Sharpe': prob_sr(s_ret, sr_benchmark=0.025)
                    })

# Calculate Deflated Sharpe
s1_pair_df = pd.concat([pair_df, pd.DataFrame(pair_l)], axis=1)

pair_def_sr = []
for s_ret in s1_ret_arr:
    pair_def_sr.append({'Deflated Sharpe': def_sr(s_ret, 
                                                pd.DataFrame(s1_pair_df['Sharpe'], columns=['Sharpe']))[0]
                       })

s1_pair_df = pd.concat([s1_pair_df, pd.DataFrame(pair_def_sr)], axis=1)

In [22]:
def plot_price_spreads_bb(df_arr, pair_df, mode="view", plot_dir=None, col_name=None):
    if mode=="view":
        fig = plt.figure(figsize=(30,20))

        for i, row in pair_df.iterrows():
            df = df_arr[i]
            df['spread'] = np.nan

            # Combine Series
            S1 = df.iloc[:, 0]
            S2 = df.iloc[:, 1]

            # Calculate Spread
            spread = S2 - df['beta'] * S1

            # Test Stationarity
            result = adfuller(handle_nan(spread, method='drop'))
            pvalue = result[1]
            is_stationary = (pvalue < 0.025)

            # Plot on Specific Axis
            ax = plt.subplot(ceil(len(pair_df)/4), 4, i+1)
            ax.set_title(row['ticker_1'] + "/" + row['ticker_2'] + "_" + row['form'] + "_" + row['form_type'])
            spread.plot(ax=ax)

            # Stationary Test on Spread
            if (is_stationary): 
                ax.get_lines()[0].set_color("blue") 
            else :
                ax.get_lines()[0].set_color("red")

        plt.tight_layout()
        plt.show()
        
    elif mode=="save":
        assert plot_dir is not None, "plot_dir must be given"
        assert col_name is not None, "col_name must be given"
        
        path_l = []
        for i, row in pair_df.iterrows():
            fig = plt.figure(figsize=(10,5))
            
            df = df_arr[i]
            df['spread'] = np.nan

            # Combine Series
            S1 = df.iloc[:, 0]
            S2 = df.iloc[:, 1]

            # Calculate Spread
            spread = S2 - df['beta'] * S1

            # Test Stationarity
            result = adfuller(handle_nan(spread, method='drop'))
            pvalue = result[1]
            is_stationary = (pvalue < 0.025)

            # Plot on Specific Axis
            if (is_stationary):
                plt.plot(spread, color="blue")
            else:
                plt.plot(spread, color="red")
            plt.title(row['ticker_1'] + "/" + row['ticker_2'] + "_" + row['form'] + "_" + row['form_type'])
            
            # Save Plot
            plot_path = plot_dir + "strat_" + str(i) +  ".png"
            _ = fig.savefig(plot_path)
            plt.close()
            
            path_l.append({
                                col_name: plot_path
                            })
            
        print("Saved Plots at " + plot_dir)
            
        # Add paths to column
        buff_pair_df = pd.concat([pair_df, pd.DataFrame(path_l)], axis=1)
        
        return buff_pair_df     

In [23]:
# Visualize Rolling Price Spread
s1_pair_df = plot_price_spreads_bb(s1_df_arr, s1_pair_df, mode="save", plot_dir=bb_plot_dir, col_name="artifact-strat_plot_path")

Saved Plots at /workspace/202205_idx-trading/strats/experiments/20220628_s-stat-arb_d-lq45/s_2010-01-01_bp_2021-01-01/s_bb/


In [24]:
display(s1_pair_df.sort_values(by=['Cumulative Return'], ascending=False))

Unnamed: 0,index,ticker_1,ticker_2,form,form_type,eg_pvalue,beta,half_life,lambda,artifact-coint_plot_path,...,Ann Turnover,Max DD,Longest DD,Currently DD,Cumulative Return,CAGR,Sharpe,Prob. Sharpe,Deflated Sharpe,artifact-strat_plot_path
0,7,ERAA.JK,AMRT.JK,ratio,price,0.0003310704,0.0,52.52434,-0.013197,/workspace/202205_idx-trading/strats/experimen...,...,5.478261,-6.403941,27,True,1.58709,0.359468,1.128845,0.921113,0.650796,/workspace/202205_idx-trading/strats/experimen...
15,56,WIKA.JK,INTP.JK,normal,price,0.0212191,2.591538,46.196628,-0.015004,/workspace/202205_idx-trading/strats/experimen...,...,7.532609,-6.574394,41,True,1.47396,0.294246,0.979533,0.884869,0.576475,/workspace/202205_idx-trading/strats/experimen...
12,42,TBIG.JK,TOWR.JK,normal,log-price,0.007352698,1.469176,39.881043,-0.01738,/workspace/202205_idx-trading/strats/experimen...,...,6.847826,-4.27396,32,True,1.374203,0.235329,0.904925,0.878135,0.541603,/workspace/202205_idx-trading/strats/experimen...
14,45,TBIG.JK,TOWR.JK,ratio,price,1.183009e-09,0.0,46.325593,-0.014963,/workspace/202205_idx-trading/strats/experimen...,...,6.847826,-5.538464,29,True,1.373943,0.235173,0.888886,0.877485,0.533685,/workspace/202205_idx-trading/strats/experimen...
4,24,BBNI.JK,LQ45,normal,price,0.01172007,0.053991,47.274321,-0.014662,/workspace/202205_idx-trading/strats/experimen...,...,8.217391,-6.552707,80,True,1.261842,0.16722,0.958892,0.869909,0.563617,/workspace/202205_idx-trading/strats/experimen...
2,15,ICBP.JK,INDF.JK,normal,log-price,0.02199944,0.419276,45.203909,-0.015334,/workspace/202205_idx-trading/strats/experimen...,...,5.478261,-3.14206,42,True,1.245825,0.157348,1.021544,0.897878,0.598332,/workspace/202205_idx-trading/strats/experimen...
8,28,BBRI.JK,BBCA.JK,ratio,price,0.002266286,0.0,40.502506,-0.017114,/workspace/202205_idx-trading/strats/experimen...,...,8.902174,-6.461538,41,True,1.230577,0.147911,0.80529,0.828111,0.489945,/workspace/202205_idx-trading/strats/experimen...
7,27,BBNI.JK,LQ45,normal,log-price,0.01106643,0.233892,49.723041,-0.01394,/workspace/202205_idx-trading/strats/experimen...,...,7.532609,-6.777262,62,True,1.199628,0.128636,0.743627,0.805065,0.460713,/workspace/202205_idx-trading/strats/experimen...
16,57,WIKA.JK,INTP.JK,normal,log-price,0.009379384,0.245922,38.670674,-0.017924,/workspace/202205_idx-trading/strats/experimen...,...,7.532609,-6.800473,75,True,1.153202,0.099405,0.455107,0.701565,0.324133,/workspace/202205_idx-trading/strats/experimen...
6,26,BBCA.JK,BBRI.JK,normal,log-price,0.002011757,0.921993,34.977284,-0.019817,/workspace/202205_idx-trading/strats/experimen...,...,7.532609,-6.679748,42,True,1.143688,0.093366,0.561437,0.740417,0.375242,/workspace/202205_idx-trading/strats/experimen...


### Kalman Filter

In [25]:
class HedgeRatioKFLinReg():
    # Source: https://www.quantstart.com/articles/kalman-filter-based-pairs-trading-strategy-in-qstrader/
    def __init__(self):
        # Mean of System State, or Beta/Hedge Ratio
        self.theta = np.zeros(2)
        
        # Covariance Matrix of System State
        self.R = None
        
        # Covariance Matrix of System State Noise
        self.delta = 1e-4
        self.wt = self.delta / (1 - self.delta) * np.eye(2)
        
        # Covariance Matrix of Measurement Noise
        self.vt = 1e-3
        
    def update(self, s1_price, s2_price):
        # Create the observation matrix of the latest prices
        # of TLT and the intercept value (1.0) as well as the
        # scalar value of the latest price from IEI
        F = np.asarray([s1_price, 1.0]).reshape((1, 2))
        y = s2_price

        # The prior value of the states \theta_t is
        # distributed as a multivariate Gaussian with
        # mean a_t and variance-covariance R_t
        if self.R is not None:
            self.R = self.C + self.wt
        else:
            self.R = np.zeros((2, 2))

        # Calculate the Kalman Filter update
        # ----------------------------------
        # Calculate prediction of new observation
        # as well as forecast error of that prediction
        yhat = F.dot(self.theta)
        et = y - yhat

        # Q_t is the variance of the prediction of
        # observations and hence \sqrt{Q_t} is the
        # standard deviation of the predictions
        Qt = F.dot(self.R).dot(F.T) + self.vt
        sqrt_Qt = np.sqrt(Qt)

        # The posterior value of the states \theta_t is
        # distributed as a multivariate Gaussian with mean
        # m_t and variance-covariance C_t
        At = self.R.dot(F.T) / Qt
        self.theta = self.theta + At.flatten() * et
        self.C = self.R - At * F.dot(self.R)
        
        return self.theta, sqrt_Qt, et

def kf_strategy(df_dict, pair_ticker, form_type='price', std=1, burn_in=4):
    '''
    Kalman Filter Long-Only Bollinger Band Mean Reversion Strategy
    
    1. Calculate the beta/hedge ratio, std, and price spread from KF.
       We assume the model is of the form Y = beta * X where Y is the second item in the pair.
    2. Generate Signal, based on KF std (long when price spread < -std, exit when price spread >= -std)
    3. Calculate Returns
    '''
    
    # Take the relevant price series from each pair
    df_proc = gen_combined_df(df_dict, [pair_ticker[0], pair_ticker[1]], [form_type], add_pfix=False)
    pair = pair_ticker
    
    # Calculate beta, std, price spread using KalmanFilter
    kf = HedgeRatioKFLinReg()
    S1 = df_proc[pair[0]]
    S2 = df_proc[pair[1]]
    kf_l = []
    
    for p1, p2 in zip(S1,S2):
        _, state_std, spread = kf.update(p1, p2)
        kf_l.append({
                        'std': state_std[0][0],
                        'spread': spread[0]
                    })
        
    ## Combine df
    kf_df = pd.DataFrame(kf_l)
    for col in kf_df:
        df_proc[col] = kf_df[col].values
    
    # Drop burn in periods
    df_proc = df_proc[burn_in:]
    
    # Signal Rules
    long_signal = lambda price, std: (price < -std)
    long_close_signal = lambda price, std: (price >= -std)
    short_signal = lambda price, std: (price > std)  
    short_close_signal = lambda price, std: (price <= std)
    
    # Generate Signals
    ## This Signal generation is for a long-only strategy
    last_signal = ''
    df_proc['signal'] = ''
    df_proc['signal_ticker'] = ''
    for i in range(0, len(df_proc)):
        if i == 0:
            df_proc['signal'][i] = ''
        
        elif last_signal == '':
            if long_signal(df_proc['spread'][i], df_proc['std'][i]):
                df_proc['signal'][i] = 'long_entry'
                last_signal = 'long_entry'
                df_proc['signal_ticker'][i] = pair[1]
            elif long_close_signal(df_proc['spread'][i], df_proc['std'][i]):
                df_proc['signal'][i] = 'long_close'
                last_signal = 'long_close'
                df_proc['signal_ticker'][i] = pair[1]
            elif short_signal(df_proc['spread'][i], df_proc['std'][i]):
                df_proc['signal'][i] = 'short_entry'
                last_signal = 'short_entry'
                df_proc['signal_ticker'][i] = pair[0]
            elif short_close_signal(df_proc['spread'][i], df_proc['std'][i]):
                df_proc['signal'][i] = 'short_close'
                last_signal = 'short_close'
                df_proc['signal_ticker'][i] = pair[0]
            else:
                df_proc['signal'][i] = ''
            
        elif last_signal == 'long_entry':
            if long_close_signal(df_proc['spread'][i], df_proc['std'][i]):
                df_proc['signal'][i] = 'long_close'
                last_signal = 'long_close'
                df_proc['signal_ticker'][i] = pair[1]
            else:
                df_proc['signal'][i] = ''
                
        elif last_signal == 'short_entry':
            if short_close_signal(df_proc['spread'][i], df_proc['std'][i]):
                df_proc['signal'][i] = 'short_close'
                last_signal = 'short_close'
                df_proc['signal_ticker'][i] = pair[0]
            else:
                df_proc['signal'][i] = ''
                
        elif last_signal == 'long_close' or last_signal == 'short_close':
            if long_signal(df_proc['spread'][i], df_proc['std'][i]):
                df_proc['signal'][i] = 'long_entry'
                last_signal = 'long_entry'
                df_proc['signal_ticker'][i] = pair[1]
            elif short_signal(df_proc['spread'][i], df_proc['std'][i]):
                df_proc['signal'][i] = 'short_entry'
                last_signal = 'short_entry'
                df_proc['signal_ticker'][i] = pair[0]
            else:
                df_proc['signal'][i] = ''

    return df_proc

In [26]:
# Single Walk Forward Backtest per Pair
pair_l = []
s2_ret_arr = []
s2_df_arr = []
for i, row in pair_df.iterrows():
    # Run Strategy
    pair = [row['ticker_1'], row['ticker_2']]
    hl = row['half_life']
    
    s_df = kf_strategy(out_df, pair, form_type=row['form_type'])
    s_df = calc_strategy_returns(s_df, form_type=row['form_type'])
    s_ret = s_df['return']
    
    s2_ret_arr.append(s_ret)
    s2_df_arr.append(s_df)
    
    # Calculate Number of Trades
    entry_trades = len(s_df[s_df['signal'] == 'long_entry']) + len(s_df[s_df['signal'] == 'short_entry'])
    close_trades = len(s_df[s_df['signal'] == 'long_close']) + len(s_df[s_df['signal'] == 'short_close'])
    num_trades = min([entry_trades, close_trades])
    
    pair_l.append({
                    'Turnover': num_trades,
                    'Ann Turnover': num_trades / (len(s_ret)/252),
                    'Max DD': qs.stats.drawdown_details(s_ret)['max drawdown'].min(),
                    'Longest DD': qs.stats.drawdown_details(s_ret)['days'].max(),
                    'Currently DD': is_recently_drawdown(s_ret, delta=4),
                    'Cumulative Return': s_df.iloc[-1]['cum_return'],
                    'CAGR': qs.stats.cagr(s_ret),
                    'Sharpe': qs.stats.sharpe(s_ret),
                    'Prob. Sharpe': prob_sr(s_ret, sr_benchmark=0.025)
                    })

# Calculate Deflated Sharpe
s2_pair_df = pd.concat([pair_df, pd.DataFrame(pair_l)], axis=1)

pair_def_sr = []
for s_ret in s2_ret_arr:
    pair_def_sr.append({'Deflated Sharpe': def_sr(s_ret, 
                                                pd.DataFrame(s2_pair_df['Sharpe'], columns=['Sharpe']))[0]
                       })

s2_pair_df = pd.concat([s2_pair_df, pd.DataFrame(pair_def_sr)], axis=1)

In [27]:
def plot_price_spreads_kf(df_arr, pair_df, mode="view", plot_dir=None, col_name=None):
    if mode=="view":
        fig = plt.figure(figsize=(30,20))

        for i, row in pair_df.iterrows():
            df = s2_df_arr[i]
            spread = df['spread']

            # Test Stationarity
            result = adfuller(handle_nan(spread, method='drop'))
            pvalue = result[1]
            is_stationary = (pvalue < 0.025)

            # Plot on Specific Axis
            ax = plt.subplot(ceil(len(pair_df)/4), 4, i+1)
            ax.set_title(row['ticker_1'] + "/" + row['ticker_2'] + "_" + row['form'] + "_" + row['form_type'])
            spread.plot(ax=ax)

            # Stationary Test on Spread
            if (is_stationary): 
                ax.get_lines()[0].set_color("blue") 
            else :
                ax.get_lines()[0].set_color("red")

        plt.tight_layout()
        plt.show()
        
    elif mode=="save":
        assert plot_dir is not None, "plot_dir must be given"
        assert col_name is not None, "col_name must be given"

        path_l = []
        for i, row in pair_df.iterrows():
            fig = plt.figure(figsize=(10,5))

            df = df_arr[i]
            spread = df['spread']

            # Test Stationarity
            result = adfuller(handle_nan(spread, method='drop'))
            pvalue = result[1]
            is_stationary = (pvalue < 0.025)

            # Plot on Specific Axis
            if (is_stationary):
                plt.plot(spread, color="blue")
            else:
                plt.plot(spread, color="red")
            plt.title(row['ticker_1'] + "/" + row['ticker_2'] + "_" + row['form'] + "_" + row['form_type'])

            # Save Plot
            plot_path = plot_dir + "strat_" + str(i) + ".png"
            _ = fig.savefig(plot_path)
            plt.close()

            path_l.append({
                                col_name: plot_path
                            })

        print("Saved Plots at " +  plot_dir)

        # Add paths to column
        buff_pair_df = pd.concat([pair_df, pd.DataFrame(path_l)], axis=1)

        return buff_pair_df 

In [28]:
# Visualize Rolling Price Spread
s2_pair_df = plot_price_spreads_kf(s2_df_arr, s2_pair_df, mode="save", plot_dir=kf_plot_dir, col_name="artifact-strat_plot_path")

Saved Plots at /workspace/202205_idx-trading/strats/experiments/20220628_s-stat-arb_d-lq45/s_2010-01-01_bp_2021-01-01/s_kf/


In [29]:
display(s2_pair_df.sort_values(by=['Cumulative Return'], ascending=False))

Unnamed: 0,index,ticker_1,ticker_2,form,form_type,eg_pvalue,beta,half_life,lambda,artifact-coint_plot_path,...,Ann Turnover,Max DD,Longest DD,Currently DD,Cumulative Return,CAGR,Sharpe,Prob. Sharpe,Deflated Sharpe,artifact-strat_plot_path
0,7,ERAA.JK,AMRT.JK,ratio,price,0.0003310704,0.0,52.52434,-0.013197,/workspace/202205_idx-trading/strats/experimen...,...,94.153846,-6.761561,85,True,2.567449,0.880414,1.911911,0.993764,0.6047628,/workspace/202205_idx-trading/strats/experimen...
1,8,LQ45,AMRT.JK,ratio,price,0.0002212411,0.0,50.995557,-0.013592,/workspace/202205_idx-trading/strats/experimen...,...,88.615385,-6.746033,44,True,1.380567,0.24108,0.935316,0.87951,0.1587875,/workspace/202205_idx-trading/strats/experimen...
14,45,TBIG.JK,TOWR.JK,ratio,price,1.183009e-09,0.0,46.325593,-0.014963,/workspace/202205_idx-trading/strats/experimen...,...,56.769231,-4.095563,51,True,1.311313,0.199032,1.20846,0.932037,0.263201,/workspace/202205_idx-trading/strats/experimen...
16,57,WIKA.JK,INTP.JK,normal,log-price,0.009379384,0.245922,38.670674,-0.017924,/workspace/202205_idx-trading/strats/experimen...,...,10.384615,-6.76314,8,False,1.200537,0.130211,0.890381,0.90115,0.110877,/workspace/202205_idx-trading/strats/experimen...
8,28,BBRI.JK,BBCA.JK,ratio,price,0.002266286,0.0,40.502506,-0.017114,/workspace/202205_idx-trading/strats/experimen...,...,84.461538,-6.487691,28,True,1.189932,0.123516,0.757483,0.812496,0.1240161,/workspace/202205_idx-trading/strats/experimen...
3,23,BBCA.JK,BBRI.JK,normal,price,0.01393281,0.595569,49.706677,-0.013945,/workspace/202205_idx-trading/strats/experimen...,...,49.153846,-9.09248,24,True,1.045223,0.030065,0.272288,0.614947,0.04451714,/workspace/202205_idx-trading/strats/experimen...
11,40,TLKM.JK,TOWR.JK,normal,log-price,0.001864807,1.356685,53.54624,-0.012945,/workspace/202205_idx-trading/strats/experimen...,...,0.692308,-0.1,2,False,1.022234,0.014836,0.778548,0.954184,0.0184031,/workspace/202205_idx-trading/strats/experimen...
4,24,BBNI.JK,LQ45,normal,price,0.01172007,0.053991,47.274321,-0.014662,/workspace/202205_idx-trading/strats/experimen...,...,0.692308,-0.1,2,False,1.006459,0.004321,0.668589,0.895325,0.02098733,/workspace/202205_idx-trading/strats/experimen...
12,42,TBIG.JK,TOWR.JK,normal,log-price,0.007352698,1.469176,39.881043,-0.01738,/workspace/202205_idx-trading/strats/experimen...,...,1.384615,-1.700035,2,False,1.004766,0.00319,0.145733,0.55934,0.02644037,/workspace/202205_idx-trading/strats/experimen...
6,26,BBCA.JK,BBRI.JK,normal,log-price,0.002011757,0.921993,34.977284,-0.019817,/workspace/202205_idx-trading/strats/experimen...,...,0.0,-0.1,1,False,0.999,-0.00067,-0.83205,0.02024,6.014633e-10,/workspace/202205_idx-trading/strats/experimen...


## Save to MLFlow

In [30]:
def save_to_mlflow(pair_df, run_params, param_name_list, metric_name_list, artifact_name_list, 
                   exp_id=mlflow_experiment_name, strat_type=None):
    
    assert strat_type is not None, "strat_type must be defined"
    
    for i, row in pair_df.iterrows():
        with mlflow.start_run(experiment_id=exp_id) as run:
            # Log Run Specific Params
            for key in run_params:
                mlflow.log_param(key, run_params[key])
                
            # Log Parameters
            mlflow.log_param("strat_type", strat_type)
            for cn in param_name_list:
                mlflow.log_param(cn, row[cn])
            
            # Log Metrics
            for cn in metric_name_list:
                mlflow.log_metric(cn, row[cn])
            
            # Log Artifacts
            for cn in artifact_name_list:
                mlflow.log_artifact(row[cn])

In [31]:
# Define Parameter List
run_params = {
                "date_in_sample_start": date_in_sample_start, 
                "date_in_sample_end": date_in_sample_end,
                "date_in_sample_len": date_in_sample_len,
                "date_out_sample_start": date_out_sample_start,
                "date_out_sample_end": date_out_sample_end,
                "date_out_sample_len": date_out_sample_len,
                "strat_class": strat_class,
                "std": std
             }

param_name_list = ["ticker_1", "ticker_2", "form", "form_type"]
metric_name_list = ['eg_pvalue','beta','half_life','lambda',
                    'Turnover','Ann Turnover','Max DD','Longest DD','Currently DD','Cumulative Return','CAGR','Sharpe','Prob. Sharpe','Deflated Sharpe']
artifact_name_list = ["artifact-coint_plot_path", "artifact-strat_plot_path"]

In [32]:
# Save Runs to MLFlow
mlflow.set_tracking_uri("/workspace/mlruns")
exp_id = dict(mlflow.get_experiment_by_name(mlflow_experiment_name))['experiment_id']
save_to_mlflow(s1_pair_df, run_params, param_name_list, metric_name_list, artifact_name_list, 
                   exp_id=exp_id, strat_type="Bollinger Band")
save_to_mlflow(s2_pair_df, run_params, param_name_list, metric_name_list, artifact_name_list, 
                   exp_id=exp_id, strat_type="Kalman Filter")

In [None]:
qs.plots.returns(s_ret)
qs.plots.monthly_heatmap(s_ret)
qs.plots.drawdowns_periods(s_ret)
qs.plots.drawdown(s_ret)