In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import itertools
from statistical_functions import preprocess, test_is_I1, test_is_tradable
import matplotlib.pyplot as plt

  from pandas.core import datetools


# Notice
* We haven't considered capital management yet, although it is fairly important.
* Unlisted stocks haven't been considered either.

# Preprocess

In [2]:
%%time
# Load the raw table. The code below relies on a `date` column holding
# "YYYY-MM-DD" strings; later cells also read `code` and `price`.
data = pd.read_csv("ec_data.csv")

# Parse all dates in one vectorized call instead of running
# datetime.strptime once per row through a Python lambda.
data['date'] = pd.to_datetime(data['date'], format="%Y-%m-%d")

# Calendar features used by later cells to slice the data into periods.
data['year'] = data['date'].dt.year
data['month'] = data['date'].dt.month
# `week` stays element-wise on purpose: the vectorized accessor differs
# between pandas versions (`.dt.week` vs `.dt.isocalendar().week`), while
# `Timestamp.week` is available in both.
# NOTE(review): `Timestamp.week` is the ISO week number whereas `year` is the
# calendar year, so dates around New Year can carry week 52/53 or week 1
# together with the "other" year - confirm that downstream (year, week)
# filters tolerate this.
data['week'] = data['date'].map(lambda ts: ts.week)
data['dayofweek'] = data['date'].dt.dayofweek

# Chronological order; rebinding instead of `inplace=True` keeps the cell
# re-runnable without mutating a frame that was already displayed.
data = data.sort_values('date')
data.head()

Wall time: 25 s


In [3]:
# Every unordered pair of stock codes that occurs anywhere in `data`.
possible_pairs = list(itertools.combinations(data.code.unique(), 2))

# Trading calendar: one row per distinct (year, month, week) combination,
# in order of first appearance in `data`, re-indexed from 0.
# NOTE(review): a week that straddles two months produces two rows here,
# while the back-test filters only on year and week - confirm this is intended.
timeframe = (
    data[['year', 'month', 'week']]
    .drop_duplicates()
    .reset_index(drop=True)
)

# Table of the distinct dates for which any data exists.
# It is used to check that a stock has a complete price history before the
# (expensive) cointegration test is run - trading speed against the number
# of candidate pairs.
date = pd.DataFrame({'date': data.date.unique()})
date['year'] = date['date'].map(lambda stamp: stamp.year)
date['month'] = date['date'].map(lambda stamp: stamp.month)
date['week'] = date['date'].map(lambda stamp: stamp.week)
date['dayofweek'] = date['date'].map(lambda stamp: stamp.dayofweek)

# Backtest

In [4]:
# Back-test parameters.

# Cut-offs applied to the candidate pairs: a pair is kept when its
# ADF_statistic is below ADF_threshold and its ASR is at least ASR_threshold.
ASR_threshold = 1.5
ADF_threshold = -2

# Number of `timeframe` rows that make up the trailing estimation window.
trailing_windows = 52

# Position in `timeframe` of the period being evaluated; it starts right
# after the first complete trailing window.
current_step = trailing_windows

In [9]:
%%time
# trailing part
# ---------------------------------------------------------------------------
# 1. Data for the model: the `trailing_windows` periods before `current_step`
# ---------------------------------------------------------------------------
timeframe_for_model = timeframe.iloc[current_step - trailing_windows: current_step]

# Compare (year, week) as ONE sortable key (year * 100 + week; week < 100).
# Testing year and week independently (year >= y0 & week >= w0 & year <= y1 &
# week <= w1) returns an empty or truncated window as soon as the window
# crosses a year boundary, e.g. 2017-W40 .. 2018-W10.
window_start = timeframe_for_model.year.iloc[0] * 100 + timeframe_for_model.week.iloc[0]
window_end = timeframe_for_model.year.iloc[-1] * 100 + timeframe_for_model.week.iloc[-1]

date_period = date.year * 100 + date.week
date_for_model = date[(date_period >= window_start) & (date_period <= window_end)]

data_period = data.year * 100 + data.week
data_for_model = data[(data_period >= window_start) & (data_period <= window_end)]

# ---------------------------------------------------------------------------
# 2. Data for the spread: first available date of the period at `current_step`
# ---------------------------------------------------------------------------
timeframe_for_spread = timeframe.iloc[current_step]

date_for_spread = date[
    (date.year == timeframe_for_spread.year) &
    (date.week == timeframe_for_spread.week)
]

# year/week are derived from the date, so selecting on the date alone picks
# exactly the rows of that period's first date.
spread_date = date_for_spread.date.iloc[0]
data_for_spread = data[data.date == spread_date]

# code -> price on the spread date (first row wins if a code is duplicated).
spread_price = data_for_spread.drop_duplicates('code').set_index('code').price

# ---------------------------------------------------------------------------
# 3. Find tradable pairs
# ---------------------------------------------------------------------------
# `possible_pairs` keeps every combination because later cells iterate over it.
possible_pairs = list(
    itertools.combinations(data_for_model.code.unique(), 2)
)

# Slice the model data once per stock instead of twice per pair, and keep only
# stocks with one row for every date of the window (same completeness rule as
# before: len(stock) == len(date_for_model)).
# Assumes test_is_tradable does not modify the series it receives, since each
# series is now shared by all pairs containing that stock - TODO confirm.
n_model_dates = len(date_for_model)
complete_prices = {
    code: rows.price
    for code, rows in data_for_model.groupby('code', sort=False)
    if len(rows) == n_model_dates
}

# Collect the one-row results and concatenate once; growing a DataFrame with
# pd.concat inside the loop copies the accumulated frame on every hit.
pair_results = []
for pair in possible_pairs:
    if pair[0] not in complete_prices or pair[1] not in complete_prices:
        continue
    temp_pair = test_is_tradable(complete_prices[pair[0]], complete_prices[pair[1]])
    if temp_pair is not None:
        temp_pair.index = [pair]
        pair_results.append(temp_pair)

if pair_results:
    tradable_pairs = pd.concat(pair_results)
else:
    # No tradable pair in this window: keep the columns the statistics below
    # read, so the cell yields an empty table instead of raising.
    tradable_pairs = pd.DataFrame(
        columns=['hedge_ratio', 'intercept', 'sigma', 'ADF_statistic'],
        dtype=float
    )

# ---------------------------------------------------------------------------
# 4. Calculate statistics for tradable pairs
# ---------------------------------------------------------------------------
# Spread-date prices of both legs. reindex() yields NaN for a stock with no
# quote on the spread date (the previous `.values[0]` raised IndexError).
tradable_pairs['stock_1'] = spread_price.reindex(
    [pair[0] for pair in tradable_pairs.index]
).values
tradable_pairs['stock_2'] = spread_price.reindex(
    [pair[1] for pair in tradable_pairs.index]
).values

tradable_pairs['stock_1_log'] = np.log(tradable_pairs.stock_1)
tradable_pairs['stock_2_log'] = np.log(tradable_pairs.stock_2)

# Residual of log(stock_2) against the fitted line
# intercept + hedge_ratio * log(stock_1).
tradable_pairs['spread'] = (
    tradable_pairs.stock_2_log -
    tradable_pairs.intercept -
    tradable_pairs.hedge_ratio * tradable_pairs.stock_1_log
)

# ASR: absolute spread expressed in units of sigma.
tradable_pairs['ASR'] = tradable_pairs.spread.abs() / tradable_pairs.sigma

# PS: ASR raised to (ADF_threshold - ADF_statistic); used for ranking.
tradable_pairs['PS'] = np.power(
    tradable_pairs.ASR,
    ADF_threshold - tradable_pairs.ADF_statistic
)

# Pairs that could not be priced on the spread date cannot be evaluated.
tradable_pairs = tradable_pairs.dropna(subset=['stock_1', 'stock_2'])

# current_step += 1

Wall time: 14.7 s


In [16]:
# Keep only the pairs that clear both cut-offs, then show them ranked by PS
# (largest first). The sorted view is only displayed; `tradable_pairs` itself
# keeps the filtered, unsorted rows.
meets_asr = tradable_pairs.ASR >= ASR_threshold
meets_adf = tradable_pairs.ADF_statistic < ADF_threshold
tradable_pairs = tradable_pairs[meets_asr & meets_adf]
tradable_pairs.sort_values(by='PS', ascending=False)

Unnamed: 0,hedge_ratio,intercept,sigma,ADF_statistic,stock_1,stock_2,stock_1_log,stock_2_log,spread,ASR,PS
"(2413, 2392)",1.067716,-1.382207,0.102011,-3.876335,41.32,15.58,3.721347,2.745988,0.154853,1.518007,2.188418
"(2328, 2327)",0.570884,2.437207,0.105195,-2.128007,8.94,52.25,2.190536,3.95604,0.268292,2.550419,1.127325
"(1471, 2327)",0.641634,2.744702,0.153445,-2.147851,4.6,52.25,1.526056,3.95604,0.232169,1.513043,1.063142


In [None]:
%%time
# Variant of the pair search that aligns both stocks with preprocess()
# before testing, instead of requiring a complete price history.

# Minimum number of rows after alignment: avoids the "maxlag should be < nobs"
# error in the ADF test and ensures there is enough data for the regression.
min_observations = 200

# One-row results are collected in a list and concatenated once; calling
# pd.concat inside the loop re-copies the accumulated frame on every hit.
pair_results = []
for pair in possible_pairs:
    stock_1 = data_for_model[data_for_model.code == pair[0]]
    stock_2 = data_for_model[data_for_model.code == pair[1]]
    # to ensure stock_1 and stock_2 match in date
    stock_1, stock_2 = preprocess(stock_1, stock_2)
    if len(stock_1) > min_observations:
        temp_pair = test_is_tradable(stock_1.price, stock_2.price)
        if temp_pair is not None:
            temp_pair.index = [pair]
            pair_results.append(temp_pair)

# Same result as before when nothing qualifies: an empty DataFrame.
tradable_pairs = pd.concat(pair_results) if pair_results else pd.DataFrame()

In [None]:
%%time
# Variant of the pair search WITHOUT date alignment: the preprocess() call is
# disabled here, so stock_1 and stock_2 are passed to the test as-is and may
# not match in date.

# Minimum number of rows for stock_1: avoids the "maxlag should be < nobs"
# error in the ADF test and ensures there is enough data for the regression.
min_observations = 200

pair_results_2 = []
# pair -> exception for every pair whose test failed; inspect this after the
# run instead of discarding the errors silently.
failed_pairs_2 = {}
for pair in possible_pairs:
    stock_1 = data_for_model[data_for_model.code == pair[0]]
    stock_2 = data_for_model[data_for_model.code == pair[1]]
    if len(stock_1) <= min_observations:
        continue
    try:
        temp_pair = test_is_tradable(stock_1.price, stock_2.price)
        if temp_pair is not None:
            temp_pair.index = [pair]
    except Exception as err:
        # Still best-effort (a failing pair is skipped), but unlike a bare
        # `except:` this no longer swallows KeyboardInterrupt/SystemExit,
        # so the long loop can be interrupted.
        failed_pairs_2[pair] = err
        continue
    if temp_pair is not None:
        pair_results_2.append(temp_pair)

# Concatenate once instead of growing the frame inside the loop; an empty
# DataFrame is produced when nothing qualifies, as before.
tradable_pairs_2 = pd.concat(pair_results_2) if pair_results_2 else pd.DataFrame()