## Definitions

In [1]:
import os
import shutil

import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import pandas_ta as ta
import quantstats as qs
qs.extend_pandas()

import numpy as np
import scipy.stats as ss

from datetime import datetime, timedelta
from tqdm import tqdm
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn

import mlflow

params = {'figure.facecolor': 'w'}
plt.rcParams.update(params)

from IPython.display import display

In [2]:
def make_dir(directory):
    """Create ``directory`` as an empty folder, wiping any existing one first."""
    # Remove whatever is already there so the folder always starts out empty.
    if os.path.exists(directory):
        shutil.rmtree(directory)
    os.makedirs(directory)

In [3]:
# Parameters
date_start = '2010-01-01'  # first date (inclusive) of the in-sample window
date_breakpoint = '2019-01-01'  # in-sample ends before this date; out-of-sample starts on it
strat_class = "Mean Reversion"
std = 1  # forwarded as `std=` to the strategy call in the backtest loop

# MLFlow Parameters
mlflow_experiment_name = "20220628_s-stat-arb_d-lq45"

In [None]:
# NOTE(review): `df` is not referenced by any of the cells visible below — confirm it is still needed.
df = pd.read_csv("20220621_garch-df_proc.csv")

In [6]:
# Prepare Stock Tickers
# One ticker code per line. Blank lines (e.g. from a trailing newline) are
# skipped so that no empty '.JK' ticker ends up in the list.
with open(data_dir + lq45_list, "r") as ticker_file:
    lq45_tickers = [line.strip() for line in ticker_file.read().splitlines() if line.strip()]

## Prepare active tickers for international codes
active_tickers = [code + '.JK' for code in lq45_tickers]
active_tickers.append('LQ45')

In [7]:
# Prepare Time Series Data
nan_handle_method = 'bfill'

# df_dict maps ticker -> DataFrame indexed by date with a single 'price' column.
df_dict = {}
for ticker in tqdm(active_tickers):
    # The index itself lives in a dedicated file; every stock has its own CSV.
    csv_path = lq45_index_file if ticker == 'LQ45' else lq45_dir + ticker + '.csv'
    raw_df = pd.read_csv(csv_path)

    ## Take Only Date and Adjusted Close, and expose Adj Close as 'price'
    price_df = raw_df[['Date', 'Adj Close']].rename(columns={'Adj Close': 'price'})

    ## Index by date. (The previous version also wrote the parsed dates to
    ## df_dict['Date'], leaving a stray non-ticker entry in the dict.)
    price_df.index = pd.DatetimeIndex(price_df['Date'])
    df_dict[ticker] = price_df.drop(columns='Date')

100%|████████████████████████████████████████████████████████████████████| 46/46 [00:02<00:00, 18.52it/s]


In [8]:
# Separate Into In Sample and Out Sample
nan_cnt_threshold = 252*2

in_df = {}
out_df = {}
rmv_tickers = []
for ticker in tqdm(active_tickers):
    ticker_df = df_dict[ticker]

    ## In sample: [date_start, date_breakpoint); out sample: [date_breakpoint, end]
    is_in_sample = (ticker_df.index >= date_start) & (ticker_df.index < date_breakpoint)
    is_out_sample = ticker_df.index >= date_breakpoint
    in_df[ticker] = ticker_df[is_in_sample]
    out_df[ticker] = ticker_df[is_out_sample]

    ## Tickers with too many missing in-sample prices are dropped later on
    too_many_nans = in_df[ticker]['price'].isna().sum() > nan_cnt_threshold
    if too_many_nans:
        rmv_tickers.append(ticker)
        continue

    ## Handle NaN Values
    in_filled = handle_nan(in_df[ticker], method=nan_handle_method)
    out_filled = handle_nan(out_df[ticker], method=nan_handle_method)

    ## Extend price to other values
    in_df[ticker] = extend_price_df(in_filled)
    out_df[ticker] = extend_price_df(out_filled)

# Remove tickers that only have small amounts of data
active_tickers = [kept for kept in active_tickers if kept not in rmv_tickers]

100%|████████████████████████████████████████████████████████████████████| 46/46 [00:00<00:00, 95.09it/s]


In [9]:
# Date Parameters - for logging purposes
def str_to_date(d_str):
    """Parse a 'YYYY-MM-DD' string into a `datetime.date`."""
    return datetime.strptime(d_str, "%Y-%m-%d").date()

date_in_sample_start = date_start
# In-sample is [date_start, date_breakpoint), so its last day is breakpoint - 1 day.
date_in_sample_end = str(str_to_date(date_breakpoint) - timedelta(days=1))
date_in_sample_len = abs(str_to_date(date_in_sample_start) - str_to_date(date_in_sample_end)).days
date_out_sample_start = date_breakpoint
# Format the last out-of-sample timestamp explicitly instead of slicing its
# string form, which breaks for tz-aware or sub-second timestamps.
date_out_sample_end = out_df[active_tickers[0]].index[-1].strftime("%Y-%m-%d")
date_out_sample_len = abs(str_to_date(date_out_sample_start) - str_to_date(date_out_sample_end)).days

In [10]:
import statsmodels.api as sm
from statsmodels.tsa.stattools import coint
from statsmodels.tsa.stattools import adfuller

def find_coint_pairs(df_dict, tickers, form='normal', form_type='price', alpha=0.05):
    '''
    Finds cointegrated pairs from df_dict serial data, based on given tickers.

    Every unordered pair of tickers is tested in both directions and the
    direction with the smaller p-value is kept.

    Input:
    - df_dict  : per-ticker data, passed through to gen_combined_df
    - tickers  : list of ticker names to pair up
    - form     : 'normal' runs coint() on the two series,
                 'ratio' runs adfuller() on the ratio of the two series
    - form_type: column suffix selected from the combined frame (e.g. 'price')
    - alpha    : a pair is kept only when its p-value is strictly below alpha

    Output:
    - pairs  : list of [ticker_a, ticker_b]. For 'normal', ticker_b is the series
               passed first to coint(); for 'ratio', the tested ratio is ticker_a / ticker_b
    - pvalues: p-value of each kept pair, in the same order as pairs

    Raises:
    - ValueError if form is neither 'normal' nor 'ratio'
    '''
    # Fail early: an unknown form used to surface as an unbound `pvalue` inside the loop.
    if form not in ('normal', 'ratio'):
        raise ValueError("form must be 'normal' or 'ratio', got %r" % (form,))

    n = len(tickers)
    pairs = []
    pvalues = []

    for i in range(0, n):
        for j in range(i+1, n):

            comb_df = gen_combined_df(df_dict, [tickers[i], tickers[j]], [form_type])
            S1 = comb_df[tickers[i] + "_" + form_type]
            S2 = comb_df[tickers[j] + "_" + form_type]

            # pvalue_1 belongs to the (i, j) ordering, pvalue_2 to the (j, i) ordering
            if form == 'normal':
                pvalue_1 = coint(S2, S1)[1]
                pvalue_2 = coint(S1, S2)[1]
            else:
                pvalue_1 = adfuller(S1/S2)[1]
                pvalue_2 = adfuller(S2/S1)[1]

            # Keep the direction with the stronger evidence (ties go to (i, j))
            if pvalue_1 <= pvalue_2:
                pvalue = pvalue_1
                coint_tickers = [tickers[i], tickers[j]]
            else:
                pvalue = pvalue_2
                coint_tickers = [tickers[j], tickers[i]]

            if pvalue < alpha:
                pairs.append(coint_tickers)
                pvalues.append(pvalue)

    return pairs, pvalues

def calc_beta_ols(S2, S1, form_type='price'):
    '''
    Calculate beta from two series by doing regression.

    Regresses S2 on S1 plus a constant and returns the slope on S1.

    Input:
    - S2       : dependent series
    - S1       : explanatory series; its name must equal form_type, because the
                 slope is looked up in the fitted params under that name
    - form_type: name of the S1 column, given either as a string or as a
                 one-element list/tuple (both call styles are accepted)

    Output:
    - b: the regression coefficient of S1
    '''
    # The string default used to fail on `.values`; normalise both call styles to one key.
    col = form_type[0] if isinstance(form_type, (list, tuple)) else form_type

    exog = sm.add_constant(S1)
    results = sm.OLS(S2, exog).fit()

    return results.params[col]

def calc_half_life(S, form_type='price'):
    '''
    Calculate half life from a price series

    Regresses the one-step difference of S on its lagged value (plus a
    constant) and converts the slope into a half life as -ln(2) / slope.

    Input:
    - S        : series to analyse; its name must equal form_type, because the
                 slope is looked up in the fitted params under that name
    - form_type: name of the series, given either as a string or as a
                 one-element list/tuple (both call styles are accepted)

    Output:
    - hl : half life, in number of rows of S
    - lbd: the fitted slope on the lagged series
    '''
    # The string default used to fail on `.values`; normalise both call styles to one key.
    col = form_type[0] if isinstance(form_type, (list, tuple)) else form_type

    # Drop the first row: it has no lagged value
    S_lag = S.shift(periods=1).iloc[1:]
    S_diff = S.iloc[1:] - S_lag

    exog = sm.add_constant(S_lag)
    results = sm.OLS(S_diff, exog).fit()
    lbd = results.params[col]

    hl = -np.log(2) / lbd

    return hl, lbd

In [11]:
# Prepare LQ45 Stock Indexes of the Same Group
stock_groups = dict(
    energy_and_mining=['BRPT.JK', 'HRUM.JK', 'MEDC.JK', 'PGAS.JK', 'TPIA.JK', 'ADRO.JK',
                       'ITMG.JK', 'PTBA.JK', 'INCO.JK', 'MDKA.JK', 'ANTM.JK'],
    retail=['AMRT.JK', 'UNVR.JK', 'ERAA.JK', 'ASII.JK'],
    food_agri=['CPIN.JK', 'JPFA.JK', 'ICBP.JK', 'INDF.JK'],
    paper=['TKIM.JK', 'INKP.JK'],
    finance=['BBCA.JK', 'BBNI.JK', 'BBRI.JK', 'BBTN.JK', 'BMRI.JK', 'BFIN.JK'],
    media=['EMTK.JK', 'MNCN.JK'],
    telcom=['EXCL.JK', 'TLKM.JK', 'TBIG.JK', 'TOWR.JK'],
    tobacco=['GGRM.JK', 'HMSP.JK'],
    construction=['INTP.JK', 'PTPP.JK', 'SMGR.JK', 'UNTR.JK', 'WIKA.JK', 'WSKT.JK'],
    medical=['KLBF.JK', 'MIKA.JK'],
)

# Keep, per group, only the tickers that survived the data-quality filter
stock_groups = {
    group: [member for member in members if member in active_tickers]
    for group, members in stock_groups.items()
}

In [12]:
# Search for Coint Pair on Different Forms
forms = ['normal', 'ratio']
form_types = ['price', 'log-price']

pair_l = []
for key, val in tqdm(stock_groups.items()):
    # Build a fresh list: appending to `val` would mutate stock_groups and add
    # another 'LQ45' on every re-run of this cell.
    tickers = list(val) if 'LQ45' in val else list(val) + ['LQ45']

    for f in forms:
        for ft in form_types:

            pairs, pvalues = find_coint_pairs(in_df, tickers, form=f, form_type=ft, alpha=0.025)
            for pair, pvalue in zip(pairs, pvalues):
                pair_l.append({
                                'ticker_1': pair[0],
                                'ticker_2': pair[1],
                                'form': f,
                                'form_type': ft,
                                'eg_pvalue': pvalue
                            })

# Explicit columns keep the frame well-formed even when no pair was found
pair_df = pd.DataFrame(pair_l, columns=['ticker_1', 'ticker_2', 'form', 'form_type', 'eg_pvalue'])

100%|████████████████████████████████████████████████████████████████████| 10/10 [02:12<00:00, 13.29s/it]


In [13]:
# Calculate Half-Life

pair_l = []
for _, row in pair_df.iterrows():
    ticker_1 = row['ticker_1']
    ticker_2 = row['ticker_2']
    form_type = row['form_type']

    # Both series are renamed to the bare form type, which is the name the
    # regression helpers use to look up the fitted coefficient.
    comb_df = gen_combined_df(in_df, [ticker_1, ticker_2], [form_type])
    S1 = comb_df[ticker_1 + "_" + form_type].rename(form_type)
    S2 = comb_df[ticker_2 + "_" + form_type].rename(form_type)

    # Spread definition depends on the form: hedge-ratio residual or plain ratio
    if row['form'] == 'normal':
        b = calc_beta_ols(S2, S1, form_type=[form_type])
        spread = S2 - b * S1

    elif row['form'] == 'ratio':
        b = 0
        spread = S1 / S2

    # Ornstein-Uhlenbeck Formula to Calculate Half Life
    hl, lbd = calc_half_life(spread, form_type=[form_type])

    pair_l.append({'beta': b, 'half_life': hl, 'lambda': lbd})

half_life_df = pd.DataFrame(pair_l)
full_pair_df = pd.concat([pair_df, half_life_df], axis=1)

In [14]:
# Keep only pairs that revert quickly enough: half life under 60 rows and a negative lambda
reverts_fast = full_pair_df['half_life'] < 60
is_mean_reverting = full_pair_df['lambda'] < 0
pair_df = full_pair_df[reverts_fast & is_mean_reverting].reset_index()

In [15]:
def plot_price_spreads_coint(df, pair_df, mode="view", plot_dir=None, col_name=None):
    '''
    Plot the spread of every pair in pair_df, on screen or into PNG files.

    Input:
    - df      : per-ticker data, passed through to gen_combined_df
    - pair_df : one row per pair with columns ticker_1, ticker_2, form, form_type, beta
    - mode    : "view" draws all spreads on one grid figure (4 columns) and shows it,
                "save" writes one PNG per pair into plot_dir
    - plot_dir: target directory, required for mode="save"
    - col_name: name of the column that receives the PNG paths, required for mode="save"

    Output:
    - mode="view": None
    - mode="save": a copy of pair_df with the PNG paths stored in col_name

    Raises:
    - AssertionError if plot_dir or col_name is missing in mode="save"
    - ValueError if mode is neither "view" nor "save"
    '''
    def pair_spread(row):
        # Same spread definition as the half-life step: ratio pairs use S1 / S2,
        # normal pairs use the hedge-ratio residual S2 - beta * S1.
        comb_df = gen_combined_df(df, [row['ticker_1'], row['ticker_2']], [row['form_type']])
        S1 = comb_df[row['ticker_1'] + "_" + row['form_type']]
        S2 = comb_df[row['ticker_2'] + "_" + row['form_type']]
        if row['form'] == 'ratio':
            return S1 / S2
        return S2 - row['beta'] * S1

    def pair_title(row):
        return row['ticker_1'] + "/" + row['ticker_2'] + "_" + row['form'] + "_" + row['form_type']

    if mode == "view":
        n_cols = 4
        n_rows = max(1, int(np.ceil(len(pair_df) / n_cols)))
        fig = plt.figure(figsize=(30, 20))

        # Subplot slots follow row order, so pair_df does not need a 0..n-1 index
        for pos, (_, row) in enumerate(pair_df.iterrows()):
            ax = fig.add_subplot(n_rows, n_cols, pos + 1)
            ax.set_title(pair_title(row))
            pair_spread(row).plot(ax=ax)

        plt.tight_layout()
        plt.show()

    elif mode == "save":
        assert plot_dir is not None, "plot_dir must be given"
        assert col_name is not None, "col_name must be given"

        plot_paths = []
        for i, row in pair_df.iterrows():
            fig = plt.figure(figsize=(10, 5))

            plt.plot(pair_spread(row))
            plt.title(pair_title(row))

            # File name carries the row label of the pair
            plot_path = os.path.join(plot_dir, "coint_" + str(i) + ".png")
            fig.savefig(plot_path)
            plt.close(fig)

            plot_paths.append(plot_path)

        print("Saved Plots at " + plot_dir)

        # Assign by position on a copy: the caller's frame is left untouched and
        # the result does not depend on how pair_df is indexed.
        buff_pair_df = pair_df.copy()
        buff_pair_df[col_name] = plot_paths

        return buff_pair_df

    else:
        raise ValueError('mode must be "view" or "save", got %r' % (mode,))

In [16]:
# Visualize Price Spread
# mode="save" writes one PNG per pair into coint_plot_dir and returns pair_df
# extended with the file paths in the "artifact-coint_plot_path" column.
pair_df = plot_price_spreads_coint(in_df, pair_df, mode="save", plot_dir=coint_plot_dir, col_name="artifact-coint_plot_path")

Saved Plots at /workspace/202205_idx-trading/strats/experiments/20220628_s-stat-arb_d-lq45/s_2010-01-01_bp_2019-01-01/coint/


In [17]:
# Show the filtered pairs together with their statistics and plot paths
display(pair_df)

Unnamed: 0,index,ticker_1,ticker_2,form,form_type,eg_pvalue,beta,half_life,lambda,artifact-coint_plot_path
0,3,ASII.JK,AMRT.JK,normal,log-price,0.008969679,1.947255,37.811525,-0.018332,/workspace/202205_idx-trading/strats/experimen...
1,4,LQ45,AMRT.JK,normal,log-price,0.01132397,2.497965,58.376005,-0.011874,/workspace/202205_idx-trading/strats/experimen...
2,5,UNVR.JK,ASII.JK,normal,log-price,0.0171229,0.416545,58.517623,-0.011845,/workspace/202205_idx-trading/strats/experimen...
3,6,ERAA.JK,AMRT.JK,ratio,price,0.001567197,0.0,52.31712,-0.013249,/workspace/202205_idx-trading/strats/experimen...
4,7,LQ45,AMRT.JK,ratio,price,0.001315995,0.0,48.088972,-0.014414,/workspace/202205_idx-trading/strats/experimen...
5,12,LQ45,ASII.JK,ratio,log-price,0.006540303,0.0,55.133204,-0.012572,/workspace/202205_idx-trading/strats/experimen...
6,17,BBCA.JK,BBRI.JK,normal,price,0.01409072,0.672636,28.294893,-0.024497,/workspace/202205_idx-trading/strats/experimen...
7,18,BBCA.JK,BMRI.JK,normal,price,0.02017991,1.214822,30.314168,-0.022865,/workspace/202205_idx-trading/strats/experimen...
8,19,BBRI.JK,BBNI.JK,normal,price,0.02439975,2.988737,37.86099,-0.018308,/workspace/202205_idx-trading/strats/experimen...
9,20,BBNI.JK,BMRI.JK,normal,price,0.02103181,0.570617,35.747414,-0.01939,/workspace/202205_idx-trading/strats/experimen...


## Backtest

### Backtest Class Definitions

In [None]:
class Backtest():
    '''
    Standard Walk Forward Backtest

    Assumptions:
    - We assume strat_df has standard column names ("price_<ticker>", "signal_<ticker>")
    - signal follows standard of x_entry and x_close for long and short, or '' for no signal
      (the x_exit spelling is accepted as an alias of x_close)
    - Weighting for each instrument is static and granular (not based on a round number of stocks)
    '''

    # Signal vocabulary shared by every metric below
    _ENTRY_SIGNALS = ('long_entry', 'short_entry')
    _LONG_CLOSE_SIGNALS = ('long_close', 'long_exit')
    _SHORT_CLOSE_SIGNALS = ('short_close', 'short_exit')
    _CLOSE_SIGNALS = _LONG_CLOSE_SIGNALS + _SHORT_CLOSE_SIGNALS

    def __init__(self, long_only=False, include_transaction_costs=True, buy_cost_pct=0.05, sell_cost_pct=0.1):
        '''
        Input:
        - long_only: stored on the instance; no metric in this class reads it yet
        - include_transaction_costs: when True, calc_returns charges a cost on every signal row
        - buy_cost_pct : cost in percent charged on entry signals
        - sell_cost_pct: cost in percent charged on close signals
        '''
        self.long_only = long_only
        self.include_transaction_costs = include_transaction_costs
        # Stored unconditionally so calc_transaction_cost can always be called
        self.buy_cost_pct = buy_cost_pct
        self.sell_cost_pct = sell_cost_pct

    def run(self, signal_df, ticker_weights):
        '''
        Algo sketch
        1. Initialize signal
        2. Run all the required metric functions
        3. return relevant information

        Only step 1 is implemented; the metric functions are called by the user.
        '''
        self.init_signal(signal_df, ticker_weights)

    def init_signal(self, signal_df, ticker_weights):
        '''
        Initializes signal dataframe for use in backtesting
        Assumptions:
        - We assume signal_df is of a particular colnames
        - We assume that signal_df also consists of the close price

        Input:
        - signal_df: consists of colnames [price_, signal_, strength_, signal_price]
        - ticker_weights: dict mapping ticker -> weight applied to its positions and returns

        Note: signal_df is stored as-is (not copied), so metric columns are added to it.

        TODO - Change to support other forms of price sources (e.g. market spread)
        '''
        self.strat_df = signal_df

        self.ticker_weights = ticker_weights
        self.signal_tickers = self.get_signal_tickers()
        self.tickers = [self.parse_col_str(signal_t)[1] for signal_t in self.signal_tickers]

    # Calculate Relevant Metrics

    # TODO - Might wanna add capability to calculate PnL based on position
    # TODO - Might wanna add capability to calculate metrics regardless of the class variables

    ## Running Metrics
    def calc_positions(self):
        '''
        Calculates position per point in time for each ticker and adds it to a column in strat_df

        The position is the running sum of +1 (long entry / short close) and
        -1 (short entry / long close) steps, scaled by the ticker weight.
        '''
        def get_spike(x):
            if x == "long_entry" or x in self._SHORT_CLOSE_SIGNALS:
                return 1
            elif x == "short_entry" or x in self._LONG_CLOSE_SIGNALS:
                return -1
            else:
                return 0

        for t in self.tickers:
            position_t = "position_" + t
            signal_t = "signal_" + t

            spike_df = self.strat_df[signal_t].apply(get_spike)

            self.strat_df[position_t] = spike_df.cumsum() * self.ticker_weights[t]

    def calc_returns(self):
        '''
        Calculates returns from price
        Assumptions:
        - price, not log-price

        Adds one "return_<ticker>" column per ticker and their row-wise sum as "return".
        The return of a row is driven by the last signal seen BEFORE that row:
        simple price return after a long entry, its negative after a short entry,
        zero otherwise. The result is scaled by the ticker weight, and a row that
        carries a signal is additionally charged the transaction cost.

        TODO - Might wanna implement using pandas multiindex/hierarchical columns
        TODO - Might wanna implement parallelized pandas processing
        TODO - Might wanna add capability for stop loss
        TODO - Later on, you can implement returns calculation based on different signal price
        TODO - Might wanna rethink this implementation if we use positions
        '''
        n_rows = len(self.strat_df)
        total_returns = np.zeros(n_rows)

        for signal_t in self.signal_tickers:
            t = self.parse_col_str(signal_t)[1]
            # Plain arrays: positional access without chained assignment on the frame
            prices = self.strat_df['price_' + t].to_numpy(dtype=float)
            signals = self.strat_df[signal_t].to_numpy()
            weight = self.ticker_weights[t]

            ticker_returns = np.zeros(n_rows)
            last_signal = None
            for ii in range(n_rows):
                if last_signal == 'long_entry':
                    ticker_return = (prices[ii] - prices[ii-1]) / prices[ii-1]
                elif last_signal == 'short_entry':
                    ticker_return = -(prices[ii] - prices[ii-1]) / prices[ii-1]
                else:
                    # No signal yet, or the last signal closed the position
                    ticker_return = 0.0

                # Rescale ticker_return based on weight for the particular ticker
                ticker_return = ticker_return * weight

                if self._is_signal(signals[ii]):
                    last_signal = signals[ii]

                    # NOTE(review): the cost is added unweighted — confirm that is intended
                    if self.include_transaction_costs:
                        ticker_return += self.calc_transaction_cost(last_signal)

                ticker_returns[ii] = ticker_return

            self.strat_df['return_' + t] = ticker_returns
            total_returns = total_returns + ticker_returns

        self.strat_df['return'] = total_returns

    def calc_cum_returns(self):
        '''
        Calculates cumulative return per point in time and adds it to a column in strat_df
        '''
        assert "return" in self.strat_df.columns.values.tolist(), "calc_return must be run before calc_cum_returns"
        self.strat_df['cum_return'] = (1 + self.strat_df["return"]).cumprod() - 1

    def calc_hits_and_misses(self):
        '''
        Determines if a position is a hit or miss, decided at exit.
        - Hit : Positive Returns
        - Miss: Negative Returns

        Adds one "H/M_<ticker>" column holding "hit" / "miss" on rows that close a
        position and "" on every other row. Returns are compounded from the row
        after the entry up to and including the closing row.
        '''
        assert "return" in self.strat_df.columns.values.tolist(), "calc_return must be run before calc_hits_and_misses"

        n_rows = len(self.strat_df)
        for signal_t in self.signal_tickers:
            t = self.parse_col_str(signal_t)[1]
            returns = self.strat_df["return_" + t].to_numpy(dtype=float)
            signals = self.strat_df[signal_t].to_numpy()

            labels = [""] * n_rows
            cum_return = 0.0
            last_signal = None
            for ii in range(n_rows):
                # Running cumulative return while a position is open
                if last_signal in self._ENTRY_SIGNALS:
                    cum_return = (1 + cum_return) * (1 + returns[ii]) - 1

                # Determine H/M based on running cumulative return
                if self._is_signal(signals[ii]):
                    last_signal = signals[ii]

                    if last_signal in self._CLOSE_SIGNALS:
                        labels[ii] = "hit" if cum_return > 0 else "miss"
                        cum_return = 0.0

            # Assigned by position, so any index on strat_df lines up correctly
            self.strat_df['H/M_' + t] = labels

    ## Positions Metrics
    def _count_round_trips(self, signal_t):
        '''
        Counts completed long and short trades in one signal column.
        A trade is complete when it has both an entry and a close.
        '''
        signals = self.strat_df[signal_t]

        l_entry_trade = int((signals == 'long_entry').sum())
        s_entry_trade = int((signals == 'short_entry').sum())
        l_close_trade = int(signals.isin(self._LONG_CLOSE_SIGNALS).sum())
        s_close_trade = int(signals.isin(self._SHORT_CLOSE_SIGNALS).sum())

        return min(l_entry_trade, l_close_trade), min(s_entry_trade, s_close_trade)

    def calc_turnover(self, mode="aggregate"):
        '''
        Calculates turnover, or number of trades

        Input:
        - mode: "detailed" returns a dict ticker -> trades, "aggregate" returns the total

        TODO - Might wanna check if I eventually use stop loss
        '''
        turnover = {}
        for signal_t in self.signal_tickers:
            l_trade, s_trade = self._count_round_trips(signal_t)

            t = self.parse_col_str(signal_t)[1]
            turnover[t] = l_trade + s_trade

        if mode=="detailed":
            return turnover
        elif mode=="aggregate":
            return sum(turnover.values())

    def calc_ratio_of_longs(self, mode="aggregate"):
        '''
        Calculates ratio of longs for all trades
        Assumptions:
        - signal follows standard of x_entry and x_close for long and short, or '' for no signal

        Input:
        - mode: "detailed" returns a dict ticker -> ratio, "aggregate" returns the overall ratio

        A ratio is NaN when there are no completed trades to divide by.
        '''
        turnover = {}
        longs = {}
        for signal_t in self.signal_tickers:
            l_trade, s_trade = self._count_round_trips(signal_t)

            t = self.parse_col_str(signal_t)[1]
            turnover[t] = l_trade + s_trade
            longs[t] = l_trade

        if mode=="detailed":
            long_ratio = {}
            for t in turnover.keys():
                long_ratio[t] = longs[t] / turnover[t] if turnover[t] else float('nan')
            return long_ratio
        elif mode=="aggregate":
            total_trades = sum(turnover.values())
            return sum(longs.values()) / total_trades if total_trades else float('nan')

    def calc_avg_holding_period(self, mode="aggregate"):
        '''
        Calculates average holding period for each position (short and long combined)

        Assumption:
        - After x_entry will always be x_close
        - longs and shorts will always come in different periods
        - strat_df has a datetime index; periods are measured in days

        Input:
        - mode: "detailed" returns a dict ticker -> average, "aggregate" returns the overall average

        An average is NaN when there is no completed trade.
        '''

        avg_holding_periods = {}
        total_holding_period = 0
        num_period = 0

        for signal_t in self.signal_tickers:
            t = self.parse_col_str(signal_t)[1]
            signals = self.strat_df[signal_t]
            events = signals[signals.isin(self._ENTRY_SIGNALS + self._CLOSE_SIGNALS)]

            # A leading close has no matching entry; a trailing entry is still open
            if len(events) > 0 and events.iloc[0] in self._CLOSE_SIGNALS:
                events = events.iloc[1:]

            if len(events) > 0 and events.iloc[-1] in self._ENTRY_SIGNALS:
                events = events.iloc[:-1]

            if len(events) == 0:
                avg_holding_periods[t] = float('nan')
                continue

            # Days elapsed since the previous event; on a close row that is the holding period
            gaps = events.index.to_series().diff().dt.days
            durations = gaps[events.isin(self._CLOSE_SIGNALS).to_numpy()]
            avg_holding_periods[t] = durations.mean()

            total_holding_period += durations.sum()
            num_period += len(durations)

        if mode=="detailed":
            return avg_holding_periods
        elif mode=="aggregate":
            return total_holding_period / num_period if num_period else float('nan')

    ## Performance Metrics
    def calc_cagr(self):
        '''
        Calculates CAGR

        Assumptions:
        - strat_df is in days
        '''
        return qs.stats.cagr(self.strat_df['return'])

    def calc_sharpe(self):
        '''
        Calculates Sharpe Ratio

        Assumptions:
        - strat_df is in days
        '''
        return qs.stats.sharpe(self.strat_df['return'])

    def calc_prob_sharpe(self, sr_benchmark = 0):
        '''
        Calculates the probabilistic Sharpe ratio: the normal CDF of the gap between
        the strategy Sharpe and sr_benchmark, divided by the Sharpe standard error
        (which accounts for skewness and kurtosis of the returns).

        Input:
        - sr_benchmark: benchmark Sharpe ratio, annualized like the strategy Sharpe
        '''
        ret = self.strat_df['return']

        sr = qs.stats.sharpe(ret)
        n = len(ret)
        skew = ss.skew(ret)
        kurtosis = ss.kurtosis(ret, fisher=False)

        # Assuming SR is annualized, we need to change into periodical
        sr = sr/np.sqrt(252)
        sr_benchmark = sr_benchmark/np.sqrt(252)

        sr_std = np.sqrt((1 + (0.5 * sr ** 2) - (skew * sr) + (((kurtosis - 3) / 4) * sr ** 2)) / (n - 1))
        psr = ss.norm.cdf((sr - sr_benchmark) / sr_std)

        return psr

    def calc_deflated_sharpe(self, sr_std, num_trials):
        '''
        Calculates the deflated Sharpe ratio: the probabilistic Sharpe ratio measured
        against the expected maximum Sharpe across num_trials trials.

        Input:
        - sr_std    : standard deviation of the Sharpe ratios across trials
        - num_trials: number of trials
        '''
        def expected_sr_max(trials_sr_std=0, num_trials=0, exp_sr_mean=0):
            emc = 0.5772156649  # Euler-Mascheroni constant
            max_z = (1 - emc) * ss.norm.ppf(1 - 1./num_trials) + emc * ss.norm.ppf(1 - 1./(num_trials * np.e))
            return exp_sr_mean + (trials_sr_std*max_z)

        exp_sr_max = expected_sr_max(trials_sr_std=sr_std, num_trials=num_trials)
        # calc_prob_sharpe reads the returns from strat_df itself
        d_sr = self.calc_prob_sharpe(sr_benchmark=exp_sr_max)
        return d_sr

    def calc_hit_ratio(self, mode="aggregate"):
        '''
        Calculate the ratio of bets that results in hit

        Input:
        - mode: "detailed" returns a dict ticker -> ratio, "aggregate" returns the overall ratio

        A ratio is NaN when there are no closed bets to divide by.
        '''

        hms = {}
        total_hits = 0
        total_bets = 0

        for signal_t in self.signal_tickers:
            t = self.parse_col_str(signal_t)[1]
            hm_t = "H/M_" + t
            assert hm_t in self.strat_df.columns.values.tolist(), "calc_hits_and_misses must be run before calc_hit_ratio"

            # Only rows labelled hit/miss are bets
            hm_df = self.strat_df[hm_t]
            num_hits = int((hm_df == "hit").sum())
            num_bets = int(hm_df.isin(["hit", "miss"]).sum())

            total_hits += num_hits
            total_bets += num_bets

            hms[t] = num_hits / num_bets if num_bets else float('nan')

        if mode=="detailed":
            return hms
        elif mode=="aggregate":
            return total_hits / total_bets if total_bets else float('nan')

    def calc_avg_returns_from_hits_and_misses(self, mode="aggregate"):
        '''
        Calculate the average returns from both hits and misses

        Not implemented yet; returns None.
        '''
        pass

    ## Drawdowns Metrics
    def _drawdown_details(self):
        '''
        Drawdown periods of the strategy returns.
        drawdown_details works on a drawdown series, so the returns are converted first.
        '''
        dd_series = qs.stats.to_drawdown_series(self.strat_df['return'])
        return qs.stats.drawdown_details(dd_series)

    def calc_dd(self, mode="max", delta=4):
        '''
        Calculates drawdown metrics
        - Maximum Drawdown (max dd) for the maximum drawdown which happens for the strategy
        - Time underwater (time underwater) is the length of maximum drawdown
        - Longest Drawdown (longest dd) for the longest days of drawdown which happens for the strategy
        - Currently Drawdown (currently dd) for checking if recently (within delta days) the strategy is currently in drawdown

        Assumptions:
        - strat_df is in days

        Input
        - mode : can be either "max" for max dd, "tuw" for time underwater, "longest" (or "long") for longest dd, and "current" for currently dd
        - delta: used to determine the time delta to consider if something is in drawdown or not

        "tuw" is not implemented yet and returns None. Unknown modes raise ValueError.
        '''
        if mode=="max":
            # Drawdowns are negative, so the deepest one is the minimum
            return self._drawdown_details()['max drawdown'].min()
        elif mode=="tuw":
            return None
        elif mode in ("longest", "long"):
            return self._drawdown_details()['days'].max()
        elif mode=="current":
            dd_details = self._drawdown_details()
            dts = [(datetime.now() - timedelta(days=i)).strftime("%Y-%m-%d") for i in range(delta)]

            # In drawdown if any drawdown period ends within the last `delta` days
            return any(dt in dd_details['end'].values for dt in dts)
        else:
            raise ValueError('mode must be "max", "tuw", "longest" or "current", got %r' % (mode,))

    def calc_hhi(self):
        '''
        Not implemented yet; returns None.
        '''
        pass

    # Plot Relevant Plots
    # TODO - Add Plot Save Function
    def plot_cum_returns(self, strat_df=None):
        '''
        Plots cumulative returns

        Assumptions:
        - strat_df is in days
        '''
        qs.plots.returns(self._resolve_df(strat_df)['return'])

    def plot_monthly_returns(self, strat_df=None):
        '''
        Plots monthly returns

        Assumptions:
        - strat_df is in days
        '''
        qs.plots.monthly_heatmap(self._resolve_df(strat_df)['return'])

    def plot_drawdown(self, strat_df=None):
        '''
        Plots drawdown chart

        Assumptions:
        - strat_df is in days
        '''
        qs.plots.drawdown(self._resolve_df(strat_df)['return'])

    def plot_buy_sell(self, strat_df=None, positions_plot=True):
        '''
        Plots buy/sell plot and positions plot, one figure per ticker

        Assumptions:
        - That 'positions' has been calculated (only needed when positions_plot is True)
        '''
        buff_s_df = self._resolve_df(strat_df)

        # Marker per signal kind; x_exit is drawn like x_close
        marker_styles = {
            'long_entry': 'go',
            'long_close': 'ro',
            'long_exit': 'ro',
            'short_entry': 'bo',
            'short_close': 'yo',
            'short_exit': 'yo',
        }

        for signal_t in self.get_signal_tickers(strat_df=buff_s_df):
            t = self.parse_col_str(signal_t)[1]
            price_t = "price_" + t
            position_t = "position_" + t

            # A separate figure per ticker keeps tickers from drawing over each other
            fig = plt.figure(figsize=(20,30))

            # Buy/Sell Plot
            ax = fig.add_subplot(5, 1, (1,4))
            ax.set_title(f"Buy/Sell Plot for {t}")

            ## Plot Price Graph
            ax.plot(buff_s_df[price_t])

            ## Plot Buy and Sell Dots (rows without a signal get no dot)
            for signal_name, dot_color in marker_styles.items():
                signal_rows = buff_s_df[buff_s_df[signal_t] == signal_name]
                if len(signal_rows) > 0:
                    ax.plot(signal_rows.index, signal_rows[price_t], dot_color)

            # Positions Plot
            if positions_plot:
                ax = fig.add_subplot(5, 1, 5)
                ax.set_title(f"Position Plot for {t}")
                ax.plot(buff_s_df[position_t])

    # Helper Functions
    @staticmethod
    def _is_signal(value):
        '''
        True when a cell holds a signal: a non-empty string.
        '''
        return isinstance(value, str) and value != ""

    def _resolve_df(self, strat_df=None):
        '''
        Returns strat_df when given, otherwise the frame stored on the instance.
        '''
        return strat_df if strat_df is not None else self.strat_df

    def get_signal_tickers(self, strat_df=None):
        '''
        Returns the column names whose first "_"-separated part is "signal".
        '''
        buff_s_df = self._resolve_df(strat_df)

        colnames = buff_s_df.columns.values.tolist()
        signal_tickers = [cs for cs in colnames
                            if self.parse_col_str(cs)[0]=='signal']
        return signal_tickers

    def calc_transaction_cost(self, last_signal):
        '''
        Returns the (negative) return impact of acting on last_signal:
        buy cost on entries, sell cost on closes, 0.0 for anything else.
        Costs are configured in percent, hence the division by 100.
        '''
        if last_signal in self._ENTRY_SIGNALS:
            return -self.buy_cost_pct / 100
        elif last_signal in self._CLOSE_SIGNALS:
            return -self.sell_cost_pct / 100
        return 0.0

    def parse_col_str(self, col_str):
        '''
        Splits a column name on "_", e.g. "signal_X" -> ["signal", "X"].
        '''
        return col_str.split('_')

    
'''
class MarketConditionBacktest(Backtest):
    def __init__(self, signal_df):
        super().init(signal_df)
        
class RandomizedBacktest(Backtest):
    def __init__(self, signal_df):
        super().init(signal_df)
        
class ModelBacktest(Backtest):
    def __init__(self, signal_df):
        super().init(signal_df)
'''

In [18]:
import scipy.stats as ss

def def_sr(ret, trials_sr_df):
    '''
    Deflated Sharpe: `prob_sr` of `ret` against the expected maximum Sharpe
    ratio of a set of trials.

    Parameters
    ----------
    ret :
        Return series handed to `prob_sr` unchanged.
    trials_sr_df :
        Sharpe ratios of the trials. Its length is the number of trials and
        its `.std()` is the dispersion used for the benchmark (so the result
        has whatever shape `.std()` produces, e.g. one entry per column for a
        DataFrame).

    Returns
    -------
    Whatever `prob_sr` returns for the computed benchmark.
    '''
    euler_mascheroni = 0.5772156649
    exp_sr_mean = 0

    spread_of_trials = trials_sr_df.std()
    n_trials = len(trials_sr_df)

    # Expected maximum of `n_trials` standard normal draws, expressed as a
    # blend of two normal quantiles weighted by the Euler-Mascheroni constant.
    z_inner = ss.norm.ppf(1 - 1./n_trials)
    z_outer = ss.norm.ppf(1 - 1./(n_trials * np.e))
    expected_max_z = (1 - euler_mascheroni) * z_inner + euler_mascheroni * z_outer

    sr_benchmark = exp_sr_mean + (spread_of_trials * expected_max_z)
    return prob_sr(ret, sr_benchmark=sr_benchmark)

In [20]:
# Single Walk Forward Backtest per Pair (Bollinger Band strategy)
# Relies on names defined in earlier cells: pair_df, out_df, std, bb_strategy,
# calc_strategy_returns, is_recently_drawdown, prob_sr and def_sr.
pair_l = []
s1_ret_arr = []
s1_df_arr = []
for i, row in pair_df.iterrows():
    # Run Strategy
    pair = [row['ticker_1'], row['ticker_2']]
    hl = row['half_life']

    s_df = bb_strategy(out_df, pair, round(hl), form_type=row['form_type'], std=std)
    s_df = calc_strategy_returns(s_df, form_type=row['form_type'])
    s_ret = s_df['return']

    s1_ret_arr.append(s_ret)
    s1_df_arr.append(s_df)

    # Calculate Number of Trades
    ## A round trip needs both an entry and a close, hence the min()
    signal_counts = s_df['signal'].value_counts()
    entry_trades = int(signal_counts.get('long_entry', 0) + signal_counts.get('short_entry', 0))
    close_trades = int(signal_counts.get('long_close', 0) + signal_counts.get('short_close', 0))
    num_trades = min(entry_trades, close_trades)

    # Drawdown table is needed for two metrics; compute it once per pair
    dd_details = qs.stats.drawdown_details(s_ret)

    pair_l.append({
                    'Turnover': num_trades,
                    'Ann Turnover': num_trades / (len(s_ret)/252),
                    'Max DD': dd_details['max drawdown'].min(),
                    'Longest DD': dd_details['days'].max(),
                    'Currently DD': is_recently_drawdown(s_ret, delta=4),
                    'Cumulative Return': s_df.iloc[-1]['cum_return'],
                    'CAGR': qs.stats.cagr(s_ret),
                    'Sharpe': qs.stats.sharpe(s_ret),
                    'Prob. Sharpe': prob_sr(s_ret, sr_benchmark=0.025)
                    })

# Attach the metrics to the pair table.
# The metric rows are in pair_df row order, so give them pair_df's index:
# concat(axis=1) aligns on index labels and would otherwise misalign (or pad
# with NaN) whenever pair_df is not indexed 0..n-1.
s1_pair_df = pd.concat([pair_df, pd.DataFrame(pair_l, index=pair_df.index)], axis=1)

# Calculate Deflated Sharpe
## Every pair is deflated against the same set of trial Sharpe ratios
trial_sharpes = s1_pair_df[['Sharpe']]

pair_def_sr = []
for s_ret in s1_ret_arr:
    pair_def_sr.append({'Deflated Sharpe': def_sr(s_ret, trial_sharpes)[0]})

s1_pair_df = pd.concat([s1_pair_df, pd.DataFrame(pair_def_sr, index=s1_pair_df.index)], axis=1)

In [21]:
def plot_price_spreads_bb(df_arr, pair_df, mode="view", plot_dir=None, col_name=None):
    '''
    Plot the price spread of every pair of the Bollinger Band backtest.

    For each pair the spread is `S2 - beta * S1`, where S1 and S2 are the
    first and second columns of the pair's strategy frame and `beta` is that
    frame's 'beta' column. The line is drawn blue when the ADF test on the
    spread has p-value < 0.025 and red otherwise.

    Parameters
    ----------
    df_arr : sequence of DataFrame
        Strategy frames, in the same order as the rows of `pair_df`.
    pair_df : DataFrame
        One row per pair; 'ticker_1', 'ticker_2', 'form' and 'form_type' are
        used for the plot titles.
    mode : {"view", "save"}
        "view" shows all pairs on one 4-column grid figure.
        "save" writes one PNG per pair to `plot_dir`.
    plot_dir : str, required for mode="save"
        Prefix prepended verbatim to "strat_<n>.png" (include the trailing
        separator).
    col_name : str, required for mode="save"
        Name of the column that receives the saved plot paths.

    Returns
    -------
    None for mode="view"; for mode="save", `pair_df` with the extra
    `col_name` column.

    Raises
    ------
    AssertionError
        If `plot_dir` or `col_name` is missing in "save" mode.
    ValueError
        If `mode` is neither "view" nor "save".

    Notes
    -----
    The input frames are no longer modified. The previous version wrote an
    all-NaN 'spread' column into every frame of `df_arr` (clobbering any
    existing one) without ever using it.
    '''
    def spread_and_color(df):
        # Spread of the second series against the beta-scaled first series
        spread = df.iloc[:, 1] - df['beta'] * df.iloc[:, 0]
        # Stationarity test decides the line color
        pvalue = adfuller(handle_nan(spread, method='drop'))[1]
        return spread, ("blue" if pvalue < 0.025 else "red")

    def pair_title(row):
        return row['ticker_1'] + "/" + row['ticker_2'] + "_" + row['form'] + "_" + row['form_type']

    if mode == "view":
        plt.figure(figsize=(30,20))
        n_grid_rows = ceil(len(pair_df)/4)

        # Walk pairs by position: df_arr is a plain list and subplot numbers
        # must run 1..n regardless of how pair_df happens to be indexed.
        for pos, (_, row) in enumerate(pair_df.iterrows()):
            spread, color = spread_and_color(df_arr[pos])

            ax = plt.subplot(n_grid_rows, 4, pos+1)
            ax.set_title(pair_title(row))
            spread.plot(ax=ax, color=color)

        plt.tight_layout()
        plt.show()
        return None

    elif mode == "save":
        assert plot_dir is not None, "plot_dir must be given"
        assert col_name is not None, "col_name must be given"

        path_l = []
        for pos, (_, row) in enumerate(pair_df.iterrows()):
            # Compute first so a failing test does not leave an open figure behind
            spread, color = spread_and_color(df_arr[pos])

            fig = plt.figure(figsize=(10,5))
            plt.plot(spread, color=color)
            plt.title(pair_title(row))

            # Save Plot
            plot_path = plot_dir + "strat_" + str(pos) + ".png"
            fig.savefig(plot_path)
            plt.close(fig)

            path_l.append({col_name: plot_path})

        print("Saved Plots at " + plot_dir)

        # Add paths to column (rows are in pair_df order, so reuse its index)
        buff_pair_df = pd.concat([pair_df, pd.DataFrame(path_l, index=pair_df.index)], axis=1)

        return buff_pair_df

    else:
        raise ValueError("mode must be 'view' or 'save', got " + repr(mode))

In [22]:
# Save one Bollinger Band spread plot per pair under bb_plot_dir and record each
# file path in a new "artifact-strat_plot_path" column of s1_pair_df.
# NOTE: not idempotent - s1_pair_df is replaced by the widened frame, so running
# this cell twice without re-running the backtest cell adds the column twice.
s1_pair_df = plot_price_spreads_bb(s1_df_arr, s1_pair_df, mode="save", plot_dir=bb_plot_dir, col_name="artifact-strat_plot_path")

Saved Plots at /workspace/202205_idx-trading/strats/experiments/20220628_s-stat-arb_d-lq45/s_2010-01-01_bp_2019-01-01/s_bb/


In [24]:
class HedgeRatioKFLinReg():
    '''
    Kalman filter for a time-varying linear regression `s2 = slope * s1 + intercept`.

    The state `theta` is the pair (slope, intercept); the slope is the hedge
    ratio. Feed observations one at a time through `update`.

    Parameters
    ----------
    delta : float, default 1e-4
        Controls the state noise covariance, `delta / (1 - delta) * I`.
    vt : float, default 1e-3
        Measurement noise variance.
    '''
    # Source: https://www.quantstart.com/articles/kalman-filter-based-pairs-trading-strategy-in-qstrader/
    def __init__(self, delta=1e-4, vt=1e-3):
        # Mean of System State: [slope (beta / hedge ratio), intercept]
        self.theta = np.zeros(2)

        # Prior covariance of the system state (None until the first update)
        self.R = None

        # Posterior covariance of the system state. Set by every update;
        # declared here so the attribute always exists.
        self.C = None

        # Covariance Matrix of System State Noise
        self.delta = delta
        self.wt = self.delta / (1 - self.delta) * np.eye(2)

        # Variance of Measurement Noise
        self.vt = vt

    def update(self, s1_price, s2_price):
        '''
        Consume one observation and return the updated filter outputs.

        Parameters
        ----------
        s1_price : float
            Latest value of the explanatory series.
        s2_price : float
            Latest value of the explained series.

        Returns
        -------
        theta : ndarray, shape (2,)
            Posterior state mean [slope, intercept].
        sqrt_Qt : ndarray, shape (1, 1)
            Standard deviation of the one-step prediction of `s2_price`.
        et : ndarray, shape (1,)
            Forecast error `s2_price - prediction`, computed from the state
            as it was before this update.
        '''
        # Observation matrix: the latest s1 price and the intercept term (1.0)
        F = np.asarray([s1_price, 1.0]).reshape((1, 2))
        y = s2_price

        # Prior covariance R_t: the previous posterior plus state noise.
        # The very first call starts from a zero prior, so it leaves theta
        # untouched and only initialises C.
        if self.R is not None:
            self.R = self.C + self.wt
        else:
            self.R = np.zeros((2, 2))

        # Calculate the Kalman Filter update
        # ----------------------------------
        # Prediction of the new observation and its forecast error
        yhat = F.dot(self.theta)
        et = y - yhat

        # Q_t is the variance of the prediction, so sqrt(Q_t) is its
        # standard deviation
        Qt = F.dot(self.R).dot(F.T) + self.vt
        sqrt_Qt = np.sqrt(Qt)

        # Posterior state: mean theta and covariance C, via the Kalman gain A_t
        At = self.R.dot(F.T) / Qt
        self.theta = self.theta + At.flatten() * et
        self.C = self.R - At * F.dot(self.R)

        return self.theta, sqrt_Qt, et

def kf_strategy(df_dict, pair_ticker, form_type='price', std=1, burn_in=4):
    '''
    Kalman Filter band mean reversion strategy (long and short signals).

    1. Run `HedgeRatioKFLinReg` over the pair, regressing the second ticker on
       the first (slope plus intercept). For every row keep the filter's
       forecast error as `spread` and its forecast standard deviation as `std`.
    2. Drop the first `burn_in` rows.
    3. Walk the remaining rows and emit signals against the band
       `std * <row's KF std>`:
         - long_entry  when spread <  -band   (signal_ticker = second ticker)
         - long_close  when spread >= -band
         - short_entry when spread >   band   (signal_ticker = first ticker)
         - short_close when spread <=  band
       The first remaining row never carries a signal.

    Returns are not calculated here.

    Parameters
    ----------
    df_dict :
        Passed through to `gen_combined_df`.
    pair_ticker : sequence of two str
        [first ticker, second ticker]; also the price column names of the
        combined frame.
    form_type : str
        Passed to `gen_combined_df` as the single requested form.
    std : float, default 1
        Band width as a multiple of the KF forecast standard deviation.
        This argument used to be accepted but ignored; the default of 1
        reproduces the previous behavior exactly.
    burn_in : int, default 4
        Number of leading rows to discard.

    Returns
    -------
    DataFrame
        The combined frame without the burn-in rows, plus the columns
        'std', 'spread', 'signal' and 'signal_ticker'.

    NOTE(review): before any trade has happened, a spread inside the band
    emits 'long_close' even though no position is open, and a short entry is
    only possible on a later row. This is kept as-is so existing backtest
    results do not change - confirm whether it is intended.
    '''

    # Take the relevant price series from each pair
    df_proc = gen_combined_df(df_dict, [pair_ticker[0], pair_ticker[1]], [form_type], add_pfix=False)
    pair = pair_ticker

    # Calculate std and price spread using KalmanFilter
    kf = HedgeRatioKFLinReg()
    kf_std = []
    kf_spread = []
    for p1, p2 in zip(df_proc[pair[0]], df_proc[pair[1]]):
        _, state_std, forecast_error = kf.update(p1, p2)
        kf_std.append(state_std[0][0])
        kf_spread.append(forecast_error[0])

    ## Combine df
    df_proc['std'] = np.asarray(kf_std, dtype=float)
    df_proc['spread'] = np.asarray(kf_spread, dtype=float)

    # Drop burn in periods
    ## Copy so the new columns below land on an independent frame, not a slice
    df_proc = df_proc.iloc[burn_in:].copy()

    # Generate Signals
    ## Work on plain arrays and assign the finished columns once. Writing
    ## through chained indexing (df['signal'][i] = ...) is not guaranteed to
    ## reach the frame and stops working under pandas Copy-on-Write.
    spreads = df_proc['spread'].to_numpy()
    bands = std * df_proc['std'].to_numpy()

    signals = []
    signal_tickers = []
    last_signal = ''
    for i, (spread_i, band_i) in enumerate(zip(spreads, bands)):
        signal = ''

        if i == 0:
            # First row after burn-in never trades
            pass

        elif last_signal == '':
            # Nothing traded yet (see NOTE(review) in the docstring).
            # The two tests cover every non-NaN case, so the short branches
            # the original listed here could never be reached.
            if spread_i < -band_i:
                signal = 'long_entry'
            elif spread_i >= -band_i:
                signal = 'long_close'

        elif last_signal == 'long_entry':
            if spread_i >= -band_i:
                signal = 'long_close'

        elif last_signal == 'short_entry':
            if spread_i <= band_i:
                signal = 'short_close'

        else:
            # last_signal is 'long_close' or 'short_close': free to re-enter
            if spread_i < -band_i:
                signal = 'long_entry'
            elif spread_i > band_i:
                signal = 'short_entry'

        if signal == '':
            signal_tickers.append('')
        else:
            last_signal = signal
            # Long signals are tagged with the second ticker, short with the first
            signal_tickers.append(pair[1] if signal.startswith('long') else pair[0])
        signals.append(signal)

    df_proc['signal'] = signals
    df_proc['signal_ticker'] = signal_tickers

    return df_proc

In [25]:
# Single Walk Forward Backtest per Pair (Kalman Filter strategy)
# Relies on names defined in earlier cells: pair_df, out_df, kf_strategy,
# calc_strategy_returns, is_recently_drawdown, prob_sr and def_sr.
pair_l = []
s2_ret_arr = []
s2_df_arr = []
for i, row in pair_df.iterrows():
    # Run Strategy
    ## kf_strategy takes no lookback, so the pair's half life is not needed here
    pair = [row['ticker_1'], row['ticker_2']]

    s_df = kf_strategy(out_df, pair, form_type=row['form_type'])
    s_df = calc_strategy_returns(s_df, form_type=row['form_type'])
    s_ret = s_df['return']

    s2_ret_arr.append(s_ret)
    s2_df_arr.append(s_df)

    # Calculate Number of Trades
    ## A round trip needs both an entry and a close, hence the min()
    signal_counts = s_df['signal'].value_counts()
    entry_trades = int(signal_counts.get('long_entry', 0) + signal_counts.get('short_entry', 0))
    close_trades = int(signal_counts.get('long_close', 0) + signal_counts.get('short_close', 0))
    num_trades = min(entry_trades, close_trades)

    # Drawdown table is needed for two metrics; compute it once per pair
    dd_details = qs.stats.drawdown_details(s_ret)

    pair_l.append({
                    'Turnover': num_trades,
                    'Ann Turnover': num_trades / (len(s_ret)/252),
                    'Max DD': dd_details['max drawdown'].min(),
                    'Longest DD': dd_details['days'].max(),
                    'Currently DD': is_recently_drawdown(s_ret, delta=4),
                    'Cumulative Return': s_df.iloc[-1]['cum_return'],
                    'CAGR': qs.stats.cagr(s_ret),
                    'Sharpe': qs.stats.sharpe(s_ret),
                    'Prob. Sharpe': prob_sr(s_ret, sr_benchmark=0.025)
                    })

# Attach the metrics to the pair table.
# The metric rows are in pair_df row order, so give them pair_df's index:
# concat(axis=1) aligns on index labels and would otherwise misalign (or pad
# with NaN) whenever pair_df is not indexed 0..n-1.
s2_pair_df = pd.concat([pair_df, pd.DataFrame(pair_l, index=pair_df.index)], axis=1)

# Calculate Deflated Sharpe
## Every pair is deflated against the same set of trial Sharpe ratios
trial_sharpes = s2_pair_df[['Sharpe']]

pair_def_sr = []
for s_ret in s2_ret_arr:
    pair_def_sr.append({'Deflated Sharpe': def_sr(s_ret, trial_sharpes)[0]})

s2_pair_df = pd.concat([s2_pair_df, pd.DataFrame(pair_def_sr, index=s2_pair_df.index)], axis=1)

In [26]:
def plot_price_spreads_kf(df_arr, pair_df, mode="view", plot_dir=None, col_name=None):
    '''
    Plot the Kalman Filter spread of every pair.

    The spread is read from the 'spread' column of each pair's strategy
    frame. The line is drawn blue when the ADF test on the spread has
    p-value < 0.025 and red otherwise.

    Parameters
    ----------
    df_arr : sequence of DataFrame
        Strategy frames, in the same order as the rows of `pair_df`.
    pair_df : DataFrame
        One row per pair; 'ticker_1', 'ticker_2', 'form' and 'form_type' are
        used for the plot titles.
    mode : {"view", "save"}
        "view" shows all pairs on one 4-column grid figure.
        "save" writes one PNG per pair to `plot_dir`.
    plot_dir : str, required for mode="save"
        Prefix prepended verbatim to "strat_<n>.png" (include the trailing
        separator).
    col_name : str, required for mode="save"
        Name of the column that receives the saved plot paths.

    Returns
    -------
    None for mode="view"; for mode="save", `pair_df` with the extra
    `col_name` column.

    Raises
    ------
    AssertionError
        If `plot_dir` or `col_name` is missing in "save" mode.
    ValueError
        If `mode` is neither "view" nor "save".
    '''
    def spread_and_color(df):
        spread = df['spread']
        # Stationarity test decides the line color
        pvalue = adfuller(handle_nan(spread, method='drop'))[1]
        return spread, ("blue" if pvalue < 0.025 else "red")

    def pair_title(row):
        return row['ticker_1'] + "/" + row['ticker_2'] + "_" + row['form'] + "_" + row['form_type']

    if mode == "view":
        plt.figure(figsize=(30,20))
        n_grid_rows = ceil(len(pair_df)/4)

        # Walk pairs by position: df_arr is a plain list and subplot numbers
        # must run 1..n regardless of how pair_df happens to be indexed.
        for pos, (_, row) in enumerate(pair_df.iterrows()):
            # Read from the df_arr argument. This branch used to read the
            # notebook-level s2_df_arr and so ignored what the caller passed.
            spread, color = spread_and_color(df_arr[pos])

            ax = plt.subplot(n_grid_rows, 4, pos+1)
            ax.set_title(pair_title(row))
            spread.plot(ax=ax, color=color)

        plt.tight_layout()
        plt.show()
        return None

    elif mode == "save":
        assert plot_dir is not None, "plot_dir must be given"
        assert col_name is not None, "col_name must be given"

        path_l = []
        for pos, (_, row) in enumerate(pair_df.iterrows()):
            # Compute first so a failing test does not leave an open figure behind
            spread, color = spread_and_color(df_arr[pos])

            fig = plt.figure(figsize=(10,5))
            plt.plot(spread, color=color)
            plt.title(pair_title(row))

            # Save Plot
            plot_path = plot_dir + "strat_" + str(pos) + ".png"
            fig.savefig(plot_path)
            plt.close(fig)

            path_l.append({col_name: plot_path})

        print("Saved Plots at " +  plot_dir)

        # Add paths to column (rows are in pair_df order, so reuse its index)
        buff_pair_df = pd.concat([pair_df, pd.DataFrame(path_l, index=pair_df.index)], axis=1)

        return buff_pair_df

    else:
        raise ValueError("mode must be 'view' or 'save', got " + repr(mode))

In [27]:
# Save one Kalman Filter spread plot per pair under kf_plot_dir and record each
# file path in a new "artifact-strat_plot_path" column of s2_pair_df.
# NOTE: not idempotent - s2_pair_df is replaced by the widened frame, so running
# this cell twice without re-running the backtest cell adds the column twice.
s2_pair_df = plot_price_spreads_kf(s2_df_arr, s2_pair_df, mode="save", plot_dir=kf_plot_dir, col_name="artifact-strat_plot_path")

Saved Plots at /workspace/202205_idx-trading/strats/experiments/20220628_s-stat-arb_d-lq45/s_2010-01-01_bp_2019-01-01/s_kf/


In [28]:
# Kalman Filter backtest results per pair, highest cumulative return first
display(s2_pair_df.sort_values(by=['Cumulative Return'], ascending=False))

Unnamed: 0,index,ticker_1,ticker_2,form,form_type,eg_pvalue,beta,half_life,lambda,artifact-coint_plot_path,...,Ann Turnover,Max DD,Longest DD,Currently DD,Cumulative Return,CAGR,Sharpe,Prob. Sharpe,Deflated Sharpe,artifact-strat_plot_path
3,6,ERAA.JK,AMRT.JK,ratio,price,0.001567197,0.0,52.31712,-0.013249,/workspace/202205_idx-trading/strats/experimen...,...,90.312139,-14.86487,141,True,2.841268,0.347801,0.944861,0.959833,0.4568121,/workspace/202205_idx-trading/strats/experimen...
18,36,TBIG.JK,TOWR.JK,ratio,price,6.098125e-07,0.0,45.072916,-0.015378,/workspace/202205_idx-trading/strats/experimen...,...,72.249711,-9.192208,51,True,2.839221,0.347523,1.189809,0.991073,0.6487969,/workspace/202205_idx-trading/strats/experimen...
7,18,BBCA.JK,BMRI.JK,normal,price,0.02017991,1.214822,30.314168,-0.022865,/workspace/202205_idx-trading/strats/experimen...,...,77.784971,-7.090295,30,True,1.576848,0.139026,0.699444,0.907718,0.2759037,/workspace/202205_idx-trading/strats/experimen...
10,21,BBRI.JK,BMRI.JK,normal,price,0.001990189,1.799395,17.506676,-0.039593,/workspace/202205_idx-trading/strats/experimen...,...,87.981503,-7.090295,58,True,1.509379,0.124878,0.603374,0.860687,0.227667,/workspace/202205_idx-trading/strats/experimen...
8,19,BBRI.JK,BBNI.JK,normal,price,0.02439975,2.988737,37.86099,-0.018308,/workspace/202205_idx-trading/strats/experimen...,...,86.524855,-11.718749,59,True,1.450906,0.112246,0.539534,0.829609,0.1959804,/workspace/202205_idx-trading/strats/experimen...
14,25,BBCA.JK,BBRI.JK,ratio,price,3.796269e-05,0.0,21.382339,-0.032417,/workspace/202205_idx-trading/strats/experimen...,...,48.652023,-9.09248,24,True,1.447997,0.111608,0.637045,0.892927,0.2294781,/workspace/202205_idx-trading/strats/experimen...
6,17,BBCA.JK,BBRI.JK,normal,price,0.01409072,0.672636,28.294893,-0.024497,/workspace/202205_idx-trading/strats/experimen...,...,48.652023,-9.09248,24,True,1.447997,0.111608,0.637045,0.892927,0.2294781,/workspace/202205_idx-trading/strats/experimen...
4,7,LQ45,AMRT.JK,ratio,price,0.001315995,0.0,48.088972,-0.014414,/workspace/202205_idx-trading/strats/experimen...,...,79.06463,-8.261279,46,True,1.336307,0.086464,0.470341,0.800594,0.1569375,/workspace/202205_idx-trading/strats/experimen...
17,35,TLKM.JK,TOWR.JK,ratio,price,6.733996e-13,0.0,38.705519,-0.017908,/workspace/202205_idx-trading/strats/experimen...,...,26.802312,-6.647392,13,True,1.147727,0.040168,0.337356,0.728563,0.09773439,/workspace/202205_idx-trading/strats/experimen...
5,12,LQ45,ASII.JK,ratio,log-price,0.006540303,0.0,55.133204,-0.012572,/workspace/202205_idx-trading/strats/experimen...,...,0.592244,-0.1,4,False,1.122984,0.033735,0.5614,0.978317,0.04855226,/workspace/202205_idx-trading/strats/experimen...


In [37]:
def save_to_mlflow(pair_df, run_params, param_name_list, metric_name_list, artifact_name_list, 
                   exp_id=mlflow_experiment_name, strat_type=None):
    '''
    Log every row of `pair_df` as its own MLflow run.

    Parameters
    ----------
    pair_df : DataFrame
        One row per run.
    run_params : dict
        Parameters logged identically on every run.
    param_name_list : list of str
        `pair_df` columns logged as parameters.
    metric_name_list : list of str
        `pair_df` columns logged as metrics.
    artifact_name_list : list of str
        `pair_df` columns holding local file paths to log as artifacts.
    exp_id :
        Forwarded to `mlflow.start_run(experiment_id=...)`.
        NOTE(review): the default is `mlflow_experiment_name`, i.e. a name
        rather than an id; the calls in this notebook pass a resolved
        experiment id instead - confirm the default is ever wanted.
    strat_type : str
        Logged as the "strat_type" parameter. Required.

    Raises
    ------
    AssertionError
        If `strat_type` is None.
    '''
    assert strat_type is not None, "strat_type must be defined"

    for _, pair_row in pair_df.iterrows():
        with mlflow.start_run(experiment_id=exp_id):
            # Run-wide parameters first
            for param_key, param_value in run_params.items():
                mlflow.log_param(param_key, param_value)

            # Then the strategy label and the per-pair parameters
            mlflow.log_param("strat_type", strat_type)
            for column in param_name_list:
                mlflow.log_param(column, pair_row[column])

            # Per-pair metrics
            for column in metric_name_list:
                mlflow.log_metric(column, pair_row[column])

            # Per-pair artifacts (each cell holds a file path)
            for column in artifact_name_list:
                mlflow.log_artifact(pair_row[column])

In [38]:
# Define Parameter List
## Run-level parameters: logged identically on every run
run_params = {
    "date_in_sample_start": date_in_sample_start,
    "date_in_sample_end": date_in_sample_end,
    "date_in_sample_len": date_in_sample_len,
    "date_out_sample_start": date_out_sample_start,
    "date_out_sample_end": date_out_sample_end,
    "date_out_sample_len": date_out_sample_len,
    "strat_class": strat_class,
    "std": std,
}

## Pair table columns logged as per-run parameters
param_name_list = ["ticker_1", "ticker_2", "form", "form_type"]

## Pair table columns logged as per-run metrics
metric_name_list = [
    # Columns already present on the pair table
    'eg_pvalue', 'beta', 'half_life', 'lambda',
    # Columns added by the backtest cells
    # NOTE(review): 'Currently DD' looks boolean in the displayed table - confirm log_metric accepts it
    'Turnover', 'Ann Turnover', 'Max DD', 'Longest DD', 'Currently DD',
    'Cumulative Return', 'CAGR', 'Sharpe', 'Prob. Sharpe', 'Deflated Sharpe',
]

## Pair table columns holding the paths of the plots to upload
artifact_name_list = ["artifact-coint_plot_path", "artifact-strat_plot_path"]

In [39]:
# Save Runs to MLFlow
# NOTE: every execution of this cell logs a fresh set of runs (one per pair and
# strategy); re-running it duplicates them in the tracking store.
mlflow.set_tracking_uri("/workspace/mlruns")

# Resolve the experiment id. get_experiment_by_name returns None when the
# experiment does not exist yet, which used to surface as an opaque TypeError
# from dict(None); create the experiment in that case instead.
experiment = mlflow.get_experiment_by_name(mlflow_experiment_name)
if experiment is None:
    exp_id = mlflow.create_experiment(mlflow_experiment_name)
else:
    exp_id = experiment.experiment_id

save_to_mlflow(s1_pair_df, run_params, param_name_list, metric_name_list, artifact_name_list, 
                   exp_id=exp_id, strat_type="Bollinger Band")
save_to_mlflow(s2_pair_df, run_params, param_name_list, metric_name_list, artifact_name_list, 
                   exp_id=exp_id, strat_type="Kalman Filter")