In [29]:
# Install a blanket "ignore" filter: no category, module or message restriction is given,
# so it applies to every warning raised after this cell runs.
# NOTE(review): this will also hide any numeric or pandas warnings raised by the cells
# below — consider narrowing the filter to the specific categories that are noise.
import warnings
warnings.filterwarnings('ignore')

In [7]:
import numpy as np
import sys
import pandas as pd
import yfinance as yf
import matplotlib.pyplot as plt
from scipy.stats import kendalltau
from scipy.integrate import quad
from scipy.optimize import minimize
from statsmodels.distributions.empirical_distribution import ECDF



def get_uv_from_xy(x,y):
    """
    Convert two samples into pseudo-observations via their empirical CDFs.

    Every element of a sample is evaluated under that sample's own ECDF and the
    result is multiplied by n / (n + 1), where n is the sample length.

    Returns a tuple (u, v) of Python lists, one entry per element of x and y
    respectively, in the original iteration order.
    """
    def _scaled_ecdf_values(sample, ecdf):
        # Same arithmetic as n / (n + 1) * ecdf(value), with the ratio hoisted.
        n_obs = len(sample)
        shrink = n_obs / (n_obs + 1)
        return [shrink * ecdf(value) for value in sample]

    # Build both ECDFs up front, then evaluate x before y.
    ecdf_x = ECDF(x)
    ecdf_y = ECDF(y)
    return _scaled_ecdf_values(x, ecdf_x), _scaled_ecdf_values(y, ecdf_y)


def get_parameter(family, tau):
    """
    Estimate the theta parameter of a copula from Kendall's tau.

    Parameters
    ----------
    family : str
        One of 'clayton', 'frank' or 'gumbel'.
    tau : float
        Kendall's tau of the sample.

    Returns
    -------
    float
        Clayton: 2 * tau / (1 - tau).  Gumbel: 1 / (1 - tau).
        Frank: the theta found by BFGS (started at 4) that minimises the squared
        difference between (tau - 1) / 4 and (D(theta) - 1) / theta, where
        D(theta) = (1 / theta) * integral of t / (exp(t) - 1) over (0, theta].

    Raises
    ------
    ValueError
        If `family` is not one of the supported names (previously the function
        silently returned None in that case).
    """
    if family == 'clayton':
        return 2 * tau / (1 - tau)

    if family == 'gumbel':
        return 1 / (1 - tau)

    if family == 'frank':
        def integrand(t):
            return t / (np.exp(t) - 1)

        def squared_error(theta):
            # The optimiser hands over a 1-element array; unwrap it explicitly
            # instead of relying on implicit array-to-scalar conversion in quad.
            theta = np.ravel(theta)[0]
            # The integrand is 0/0 at t == 0, so integration starts at machine epsilon.
            integral = quad(integrand, sys.float_info.epsilon, theta)[0]
            return ((tau - 1) / 4.0 - (integral / theta - 1) / theta) ** 2

        return minimize(squared_error, 4, method='BFGS', tol=1e-5).x[0]

    raise ValueError(
        "unknown copula family {!r}; expected 'clayton', 'frank' or 'gumbel'".format(family)
    )


def pdf_copula(family, theta, u, v):
    """
    Evaluate the density c(u, v) of one of three Archimedean copulas.

    Parameters
    ----------
    family : str
        One of 'clayton', 'frank' or 'gumbel'.
    theta : float
        Copula parameter.
    u, v : float or array-like
        Points at which to evaluate the density. Only arithmetic and numpy
        ufuncs are applied, so numpy arrays are evaluated element-wise.

    Returns
    -------
    Same shape as the broadcast of u and v. No domain checks are done on
    theta, u or v: out-of-domain inputs propagate as NaN/inf.

    Raises
    ------
    ValueError
        If `family` is not supported (previously this surfaced as an
        UnboundLocalError on the return statement).
    """
    if family == 'clayton':
        return (theta+1) * ((u ** (-theta) + v ** (-theta) - 1) ** (-2 - 1/theta)) * (u ** (-theta-1) * v ** (-theta-1))

    if family == 'frank':
        num = -theta * (np.exp(-theta) - 1) * (np.exp(-theta * (u + v)))
        denom = ((np.exp(-theta * u) - 1) * (np.exp(-theta * v) - 1) + (np.exp(-theta) - 1)) ** 2
        return num / denom

    if family == 'gumbel':
        # A is the sum of the generator values; c is the Gumbel copula C(u, v) itself.
        A = (-np.log(u)) ** theta + (-np.log(v)) ** theta
        c = np.exp(-A ** (1 / theta))
        return c * (u * v) ** (-1) * (A ** (-2 + 2/theta)) * ((np.log(u) * np.log(v)) ** (theta - 1)) * (1 + (theta - 1) * A ** (-1/theta))

    raise ValueError(
        "unknown copula family {!r}; expected 'clayton', 'frank' or 'gumbel'".format(family)
    )


def log_pdf(family, theta, u, v):
    """Return the natural logarithm of pdf_copula(family, theta, u, v)."""
    return np.log(pdf_copula(family, theta, u, v))


def conditional_cdf(family,theta,u,v):
    """
    Conditional copula CDF C(u|v) = dC/dv for three Archimedean families.

    Since u and v are symmetric in C, swap the arguments to obtain
    C(v|u) = dC/du.

    Parameters
    ----------
    family : str
        One of 'clayton', 'frank' or 'gumbel'.
    theta : float
        Copula parameter.
    u, v : float or array-like
        Evaluation points. Only arithmetic and numpy ufuncs are applied, so
        numpy arrays / pandas Series are evaluated element-wise.

    Returns
    -------
    Same shape as the broadcast of u and v. No domain checks are performed.

    Raises
    ------
    ValueError
        If `family` is not supported (previously this surfaced as an
        UnboundLocalError on the return statement).
    """
    if family == 'clayton':
        return v ** (-theta-1) * (u ** (-theta) + v ** (-theta) -1) ** (-1/theta -1)

    if family == 'frank':
        exp_u = np.exp(-theta * u) - 1
        exp_v = np.exp(-theta * v) - 1
        # exp_u * exp_v + exp_u == exp_u * exp(-theta * v)
        return ( exp_u * exp_v + exp_u ) / ( exp_u * exp_v + (np.exp(-theta)-1) )

    if family == 'gumbel':
        A = (-np.log(u)) ** theta + (-np.log(v)) ** theta
        c = np.exp(-A ** (1 / theta))
        return c * A ** ((1-theta)/theta) * (-np.log(v)) ** (theta-1) * (1/v)

    raise ValueError(
        "unknown copula family {!r}; expected 'clayton', 'frank' or 'gumbel'".format(family)
    )


def add_copula_x_variable(df, df_price_1, df_price_2, train_start=None, train_end=None, ret_days=1):
    """
    Fit a copula on training-window returns and add columns MI_u_v and MI_v_u to df.

    Steps:
      1. Turn both price series into `ret_days`-period percentage changes and
         keep only the dates on which both returns are available.
      2. On the training window, map returns to pseudo-observations, compute
         Kendall's tau, derive theta for the Clayton, Frank and Gumbel families
         and keep the family with the lowest AIC (-2 * log-likelihood + 2).
      3. Over the date span of `df`, compute
         MI_u_v = conditional_cdf(family, theta, u, v) and
         MI_v_u = conditional_cdf(family, theta, v, u), and left-merge them
         onto `df` by 'Date'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Date' column; its min/max define the output window and
        the default training window.
    df_price_1, df_price_2 : pandas.Series
        Price series sliced by date label (presumably a sorted DatetimeIndex —
        confirm against the caller).
    train_start, train_end : optional
        Training window bounds; a falsy value falls back to df.Date.min() /
        df.Date.max(). The upper slice label is `train_end + 1 day`.
    ret_days : int
        Number of periods passed to `pct_change`.

    Returns
    -------
    pandas.DataFrame
        `df` with the two new columns, indexed by 'Date'.

    Raises
    ------
    ValueError
        If no date in the training window has a return for both series.
    """
    if not train_start:
        train_start = df.Date.min()
    if not train_end:
        train_end = df.Date.max()

    # Join the two return series on their index before dropping NaNs so that the
    # x and y samples (and the date index used later) can never get out of step
    # when the two series have gaps on different dates.
    returns = pd.concat(
        [df_price_1.pct_change(periods=ret_days), df_price_2.pct_change(periods=ret_days)],
        axis=1,
        keys=['x', 'y'],
    ).dropna()
    one_day = pd.Timedelta(days=1)

    fit = returns.loc[train_start : train_end + one_day]
    if fit.empty:
        raise ValueError("no overlapping return observations inside the training window")

    u_fit, v_fit = get_uv_from_xy(fit['x'], fit['y'])
    u_fit = np.asarray(u_fit, dtype=float)
    v_fit = np.asarray(v_fit, dtype=float)
    tau = kendalltau(fit['x'], fit['y'])[0]

    # family -> [theta, AIC]; every family has exactly one parameter, hence "+ 2".
    AIC = {}
    for family in ['clayton', 'frank', 'gumbel']:
        param = get_parameter(family, tau)
        # NOTE(review): nan_to_num turns NaN log-densities into 0, so observations
        # for which a family's density is undefined do not penalise that family.
        # Confirm this is the intended treatment.
        lpdf = np.nan_to_num(log_pdf(family, param, u_fit, v_fit))
        loglikelihood = lpdf.sum()
        AIC[family] = [param, -2 * loglikelihood + 2]

    fitted_copula = min(AIC.items(), key=lambda a: a[1][1])[0]
    fitted_theta = AIC[fitted_copula][0]

    start_dt = df.Date.min()
    end_dt = df.Date.max()
    full = returns.loc[start_dt : end_dt + one_day]
    # NOTE(review): u and v are ranked against the whole output window, not against
    # the training sample, so values after train_end influence earlier ranks.
    # Confirm whether that look-ahead is acceptable for the downstream use.
    u, v = get_uv_from_xy(full['x'], full['y'])
    u = np.asarray(u, dtype=float)
    v = np.asarray(v, dtype=float)

    df_p_val = pd.DataFrame({
        'Date': full.index,
        'MI_u_v': conditional_cdf(fitted_copula, fitted_theta, u, v),
        'MI_v_u': conditional_cdf(fitted_copula, fitted_theta, v, u),
    })

    df = df.merge(df_p_val, on='Date', how='left')
    return df.set_index('Date')


In [57]:
# Load pairs: per-ETF prices and the per-pair feature set, both sorted by key then date.
raw_price_df = pd.read_csv("Final Price Df.csv", parse_dates=["Date"])
raw_price_df = raw_price_df.sort_values(["ETF_Ticker", "Date"])

feature_df_raw = pd.read_csv("Final TrainingSet.csv", parse_dates=["Date"])
feature_df_raw = feature_df_raw.sort_values(["Ticker_Pair", "Date"])

pair_arr = np.unique(feature_df_raw["Ticker_Pair"].values)

# Make predictions on each pair under the chosen return frequency.
# freq_map gives the number of rows passed as `ret_days` for each frequency code
# (presumably trading days per day / week / month).
freq_map = {'d':1, 'w':5, 'M':21}
price_s = raw_price_df.set_index(["ETF_Ticker", "Date"]).squeeze()
feature_df = feature_df_raw.set_index(["Ticker_Pair", "Date"]).squeeze()


freq = 'M'
mode = 'predict'
output_list = []

for pair in pair_arr:
    print(pair)
    # A pair name is "<ticker1>_<ticker2>".
    pair_list = pair.split("_")
    pair_s = price_s.loc[pair_list, :].copy()
    # One price column per ticker; keep only dates on which both tickers have a price.
    pair_df = pair_s.unstack("ETF_Ticker").dropna()

    feature_df_pair = feature_df.loc[pair, :].copy()

    if mode == "predict":
        feature_df_pair = add_copula_x_variable(feature_df_pair.reset_index(), pair_df[pair_list[0]], pair_df[pair_list[1]],
            train_end=pd.Timestamp('2016-12-31'),ret_days=freq_map[freq])
        # Build the per-pair output as a fresh frame (no in-place edits on a slice),
        # with columns ordered: pair, Date, MI_u_v, MI_v_u.
        output_i_df = (
            feature_df_pair[['MI_u_v', 'MI_v_u']]
            .rename_axis("Date")
            .reset_index()
        )
        output_i_df.insert(0, "pair", pair)
        output_list.append(output_i_df)

# Concatenate once after the loop instead of rebuilding the full frame on every
# iteration; fall back to an empty frame so `output_df` is always defined.
if output_list:
    output_df = pd.concat(output_list)
else:
    output_df = pd.DataFrame(columns=["pair", "Date", "MI_u_v", "MI_v_u"])

*XFN_IJH
*XIC_EZA
*XIU_REM
BKF_IGM
BKF_IVV
EWD_THD
EWG_THD
EWH_IJH
EWM_EZA
EWQ_IYG
EWT_IJH
EWT_IJJ
EWT_IVE
EWT_IVV
EWT_IWB
EWT_IWV
EZU_THD
IDU_IVV
IEV_THD
IFGL_IHE
IFGL_IWM
IGM_IHI
IHE_ACWI
IHE_ACWX
IHE_SCZ
IHE_TOK
IHE_WOOD
IHF_ILCB
IHF_ILCG
IHF_IUSG
IHF_IVW
IHF_IXN
IJH_ACWI
IJH_TOK
IJH_WOOD
IJJ_ACWI
IMCV_IWM
IVV_ACWI
IWB_ACWI
IWM_ACWI
IWM_SCZ
IWM_SUSA
IWM_WOOD
IWV_ACWI
IXP_IYH
IYH_ACWI
IYH_TOK
IYK_ACWI
REM_SUSA
RXI_WOOD


In [58]:
# Persist the concatenated per-pair predictions; the file name embeds the frequency code `freq`.
output_df.to_pickle(f"CopulaPredictions_{freq}.pkl")