In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import itertools
from statistical_functions import preprocess, test_is_I1, test_is_tradable
import matplotlib.pyplot as plt

  from pandas.core import datetools


# Notice
* We haven't considered capital management yet, although it is fairly important.
* We haven't considered unlisted stocks yet either.

# Preprocess

In [2]:
# Load the price table and derive the calendar columns that later cells use
# to build trading windows.
data = pd.read_csv("ec_data.csv")

# Parse the whole column in one vectorized call instead of one strptime call
# per row; the explicit format keeps the parsing strict.
data['date'] = pd.to_datetime(data['date'], format="%Y-%m-%d")

# Calendar fields come from the vectorized `.dt` accessor. They are cast to
# int64 so the column dtype does not depend on the pandas version
# (`isocalendar()` returns a nullable unsigned integer column).
# NOTE(review): `week` is the ISO week number while `year` is the calendar
# year, so dates around New Year can carry a week number that belongs to the
# neighbouring ISO year - confirm this is acceptable for the windowing below.
data['year'] = data['date'].dt.year.astype('int64')
data['month'] = data['date'].dt.month.astype('int64')
data['week'] = data['date'].dt.isocalendar().week.astype('int64')
data['dayofweek'] = data['date'].dt.dayofweek.astype('int64')

# Chronological order; reassignment instead of `inplace=True` keeps the cell
# safe to re-run.
data = data.sort_values('date')
data.head()

Unnamed: 0,code,name,industry_code,industry,date,price,year,month,week,dayofweek
0,1471,首利,M23D,電子零組件,2000-01-04,8.98,2000,1,1,1
16,2492,華新科,M23D,電子零組件,2000-01-04,35.06,2000,1,1,1
15,2484,希華,M23D,電子零組件,2000-01-04,23.85,2000,1,1,1
14,2483,百容,M23D,電子零組件,2000-01-04,15.29,2000,1,1,1
13,2478,大毅,M23D,電子零組件,2000-01-04,12.86,2000,1,1,1


In [3]:
# Candidate pairs: every unordered combination of two distinct stock codes.
all_codes = data.code.unique()
possible_pairs = list(itertools.combinations(all_codes, 2))

# Trading time frame: one row per distinct (year, month, week) found in the
# data, re-indexed from 0 so it can be sliced by position.
# NOTE(review): `month` is part of the key, so a week that has dates in two
# different months contributes two rows here - confirm that is intended.
timeframe = (
    data[['year', 'month', 'week']]
    .drop_duplicates()
    .reset_index(drop=True)
)

# Calendar of the dates for which any data is available, together with the
# calendar fields already derived on `data`.
# It is used to check whether a stock has a complete price history before the
# (slower) cointegration test is run: a trade-off between speed and the
# number of pairs that get tested.
calendar_columns = ['date', 'year', 'month', 'week', 'dayofweek']
date = (
    data[calendar_columns]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [4]:
# Initial state of the rolling window.
# `trailing_windows` is how many rows of `timeframe` one model window spans;
# `current_step` is the exclusive end position of the current window and
# starts at the first position where a full window is available.
current_step = trailing_windows = 52

In [5]:
# trailing part
# Rows of `timeframe` covered by the current trailing window.
timeframe_for_model = timeframe.iloc[current_step - trailing_windows: current_step]


def _rows_in_window(frame, window):
    """Select the rows of ``frame`` that fall inside ``window``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table with numeric ``year`` and ``week`` columns.
    window : pandas.DataFrame
        Slice of ``timeframe``; its first and last rows give the inclusive
        lower and upper (year, week) bounds.

    Returns
    -------
    pandas.DataFrame
        The rows of ``frame`` whose (year, week) pair lies between the bounds.

    Raises
    ------
    IndexError
        If ``window`` is empty, because its first/last row cannot be read.
    """
    first_row = window.iloc[0]
    last_row = window.iloc[-1]

    # Compare (year, week) as ONE ordered key. Testing the year bounds and
    # the week bounds independently selects nothing as soon as the window
    # crosses a year boundary (e.g. week >= 30 and week <= 29 never both hold).
    # Assumes `week` is a week-of-year number below 100, so that
    # year * 100 + week keeps the (year, week) ordering.
    row_key = frame['year'] * 100 + frame['week']
    lower_key = first_row['year'] * 100 + first_row['week']
    upper_key = last_row['year'] * 100 + last_row['week']

    return frame[(row_key >= lower_key) & (row_key <= upper_key)]


# Prices and available dates restricted to the current window.
data_for_model = _rows_in_window(data, timeframe_for_model)
date_for_model = _rows_in_window(date, timeframe_for_model)

# Only codes that actually appear inside the window can form a pair.
possible_pairs = list(
    itertools.combinations(data_for_model.code.unique(), 2)
)




# current_step += 1

In [8]:
%%time
# Variant 3: only test pairs in which both stocks have one row for every
# available date of the window (same length as `date_for_model`).

# Split the window by stock code once, instead of re-filtering the whole
# frame twice for every pair. Row order inside each group is unchanged.
frames_by_code = {
    code: group for code, group in data_for_model.groupby('code')
}
expected_length = len(date_for_model)

# Collect the per-pair results and concatenate once at the end; calling
# pd.concat inside the loop copies the accumulated frame on every iteration.
tradable_rows_3 = []
for pair in possible_pairs:
    stock_1 = frames_by_code[pair[0]]
    stock_2 = frames_by_code[pair[1]]
    if len(stock_1) == len(stock_2) == expected_length:
        temp_pair = test_is_tradable(stock_1.price, stock_2.price)
        # A None result is skipped; anything else is kept as a row
        # labelled with the pair of codes.
        if temp_pair is not None:
            temp_pair.index = [pair]
            tradable_rows_3.append(temp_pair)

# pd.concat raises on an empty list, so fall back to an empty frame.
tradable_pairs_3 = (
    pd.concat(tradable_rows_3) if tradable_rows_3 else pd.DataFrame()
)

Wall time: 4.88 s


In [9]:
# Compare how many tradable pairs each screening variant found.
# NOTE: `tradable_pairs` and `tradable_pairs_2` are assigned in cells that
# appear further down in this notebook, so those cells have to be executed
# before this one.
print(
    "   ",
    *map(len, (tradable_pairs, tradable_pairs_2, tradable_pairs_3))
)

    99 86 86


In [None]:
%%time
# Variant 1: match the two stocks in date with preprocess() and test every
# pair that still has enough observations.

# Minimum number of observations required after preprocessing; it avoids the
# "maxlag should be < nobs" error in the ADF test and ensures there is
# enough data for the regression.
min_observations = 200

# Split the window by stock code once, instead of re-filtering the whole
# frame twice for every pair. Row order inside each group is unchanged.
frames_by_code = {
    code: group for code, group in data_for_model.groupby('code')
}

# Collect the per-pair results and concatenate once at the end; calling
# pd.concat inside the loop copies the accumulated frame on every iteration.
tradable_rows = []
for pair in possible_pairs:
    stock_1 = frames_by_code[pair[0]]
    stock_2 = frames_by_code[pair[1]]
    # to ensure stock_1 and stock_2 match in date
    stock_1, stock_2 = preprocess(stock_1, stock_2)
    if len(stock_1) > min_observations:
        temp_pair = test_is_tradable(stock_1.price, stock_2.price)
        # A None result is skipped; anything else is kept as a row
        # labelled with the pair of codes.
        if temp_pair is not None:
            temp_pair.index = [pair]
            tradable_rows.append(temp_pair)

# pd.concat raises on an empty list, so fall back to an empty frame.
tradable_pairs = pd.concat(tradable_rows) if tradable_rows else pd.DataFrame()

In [None]:
%%time
# Variant 2: like variant 1, but the preprocess() date-matching step is not
# applied; pairs for which the test raises are skipped.

# Minimum number of observations required for the first stock; it avoids the
# "maxlag should be < nobs" error in the ADF test and ensures there is
# enough data for the regression.
min_observations = 200

# Split the window by stock code once, instead of re-filtering the whole
# frame twice for every pair. Row order inside each group is unchanged.
frames_by_code = {
    code: group for code, group in data_for_model.groupby('code')
}

# Collect the per-pair results and concatenate once at the end; calling
# pd.concat inside the loop copies the accumulated frame on every iteration.
tradable_rows_2 = []
# Pairs whose test raised, with the exception, so the skips stay visible
# instead of being silently discarded.
failed_pairs_2 = []
for pair in possible_pairs:
    stock_1 = frames_by_code[pair[0]]
    stock_2 = frames_by_code[pair[1]]
    if len(stock_1) > min_observations:
        try:
            temp_pair = test_is_tradable(stock_1.price, stock_2.price)
        except Exception as error:
            # Best effort: keep going, but only for ordinary errors. A bare
            # `except:` would also swallow KeyboardInterrupt and make the
            # loop impossible to stop.
            failed_pairs_2.append((pair, error))
            continue
        # A None result is skipped; anything else is kept as a row
        # labelled with the pair of codes.
        if temp_pair is not None:
            temp_pair.index = [pair]
            tradable_rows_2.append(temp_pair)

# pd.concat raises on an empty list, so fall back to an empty frame.
tradable_pairs_2 = (
    pd.concat(tradable_rows_2) if tradable_rows_2 else pd.DataFrame()
)