In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.model_selection import TimeSeriesSplit

import pickle

from OU import OU

Get Data

In [26]:
goog_data = pd.read_csv('./data/goog17-18.csv').iloc[:, 1:]
googl_data = pd.read_csv('./data/googl17-18.csv').iloc[:, 1:]

## Creating Trading Signals

We will create 4 technical trading signals here:

1. Simple Moving Average
2. Exponential Weighted Moving Average
3. Money Flow Index
4. Relative Strength Index

### Simple Moving Average

In [55]:
def sma(prices, window):
    '''
    This SMA function uses Close prices calculated from the window number of periods before but
    does not use the current period as you would only have that at the end of the minute at which
    point you are in the next minute and cannot trade within the last minute
    '''
    sma = prices.rolling(window).mean()[window-1:]
    sma.index += 1
    sma = sma[:-1]
    return sma

In [83]:
def smaOPEN(prices, window):
    '''
    This SMA function uses OPEN prices because you will have this at the open of each minute
    '''
    sma = prices.rolling(window).mean()[window-1:]
    return sma

### Exponential Weighted Moving Average

In [84]:
def ewma(prices, window):
    '''
    This ewma uses just the last window number of close prices
    '''
    ewma = pd.Series.ewm(prices, span=window).mean()[window-1:]
    ewma.index += 1
    ewma = ewma[:-1]
    return ewma

In [85]:
def ewmaOPEN(prices, window):
    '''
    This EWMA function uses OPEN prices because you will have this at the open of each minute
    '''
    ewma = pd.Series.ewm(prices).mean()[window-1:]
    return ewma

### Money Flow Index

In [135]:
def mfi(df, window):
    '''
    This MFI function looks at the the index-1 MFI compared to the index-2 MFI in order to 
    account for look forward bias because this uses volume and close price which are both only
    available when the period ends
    '''
    mf = (df['HIGH'] + df['LOW'] + df['CLOSE']) / 3
    i = 1
    pos_mf = [0, 0]
    neg_mf = [0, 0]
    print(mf[:10])
    while i < df.index[-1]:
        if mf[i] > mf[i - 1]:
            pos_mf.append(mf[i] * df.loc[i, 'VOLUME'])
            neg_mf.append(0)
        else:
            pos_mf.append(0)
            neg_mf.append(mf[i] * df.loc[i, 'VOLUME'])
        i += 1

    pos_mf = pd.Series(pos_mf)
    neg_mf = pd.Series(neg_mf)

    pos_sum = pd.Series(pos_mf.rolling(window).sum())
    neg_sum = pd.Series(neg_mf.rolling(window).sum())
    print(pos_sum[window:])
    mfr = (window - pos_sum) / (window - neg_sum)
    mfi = abs(100 - (100 / (1 + mfr)))

    return mfi[window:]

In [136]:
googMFI = mfi(goog_data, 5)

0    786.201667
1    784.256667
2    783.051000
3    783.648667
4    783.656667
5    784.832000
6    784.780000
7    784.160000
8    784.806667
9    784.733333
dtype: float64
0                  NaN
1                  NaN
2                  NaN
3                  NaN
4         5.055318e+06
              ...     
203448    3.473529e+07
203449    8.224832e+07
203450    1.091384e+08
203451    1.384726e+08
203452    1.241017e+08
Length: 203453, dtype: float64


### Relative Strength Index

In [129]:
def rsi(df, window):
    i = 1
    pos_period = [0]
    neg_period = [0]
    dfOpen = df['OPEN']

    while i < dfOpen.index[-1]:
        if dfOpen[i] > dfOpen[i - 1]:
            pos_period.append(dfOpen[i])
            neg_period.append(0)
        else:
            pos_period.append(0)
            neg_period.append(dfOpen[i])
        i += 1
    print(pos_period[:10])
    pos_period = pd.Series(pos_period)
    neg_period = pd.Series(neg_period)

    pos_sum = pd.Series(pos_period.rolling(window).sum())
    neg_sum = pd.Series(neg_period.rolling(window).sum())

    rs = (window - pos_sum) / (window - neg_sum)
    rsi = abs(100 - (100 / (1 + rs)))
    return rsi[window:]

## Ornstein Uhlenbeck to model residuals

the Ornsteing Ulhenbeck process is used to model the residual term becuase it is a stochastic process such that the object modeled by the process always drifts towards its long term mean

In [7]:
def create_label_func(threshold=0.001, window=5):
    """
    Given the spreads of the residuals between our pair, the label is 1
    if the spread exceeds the threshold, and 0 otherwise.

    :residuals:     spreads of the residuals of the pair
    :window:        how far in the future we want to evaluate our position
    :threshold:     the percent change threshold that determines whether a trade happens

    :ret:           list of labels of 1 or 0
    """

    def create_labels(residuals):
        min_val = residuals[::-1].rolling(window=window).min()[::-1]
        min_val.iloc[-window:] = residuals.iloc[-window:]

        a = (residuals-min_val) > threshold
        labels = a.astype(int)

        return labels
    
    return create_labels

## Feature Creation

using a window of 5 for testing because we want to capitalize on short term trends

In [8]:
feature_window = 5

goog_data['sma'] = sma(goog_data['CLOSE'], feature_window).pct_change()
googl_data['sma'] = sma(googl_data['CLOSE'], feature_window).pct_change()

goog_data['ewma'] = ewma(goog_data['CLOSE'], feature_window).pct_change()
googl_data['ewma'] = ewma(googl_data['CLOSE'], feature_window).pct_change()

goog_data['mfi'] = mfi(goog_data, feature_window).pct_change()
googl_data['mfi'] = mfi(googl_data, feature_window).pct_change()

goog_data['rsi'] = rsi(goog_data, feature_window).pct_change()
googl_data['rsi'] = rsi(googl_data, feature_window).pct_change()

goog_data['price'] = goog_data['CLOSE'].pct_change()
googl_data['price'] = googl_data['CLOSE'].pct_change()

gg_df = goog_data[feature_window+1:].reset_index(drop=True)
ggl_df = googl_data[feature_window+1:].reset_index(drop=True)

In [9]:
gg_df.to_csv('./data/goog_processed.csv')
ggl_df.to_csv('./data/googl_processed.csv')

### Set Hyperparamters

In [10]:
threshold = 0.0005
window = 5

label_func = create_label_func(threshold = threshold, window = window)

In [23]:
OU_transform = OU(gg_df, ggl_df)

:model_size:    How large of a training set we want to use for sliding window cross-validation

:eval_size:        How large of a test we want our sliding window to be evaluated on

EXAMPLE: model_size = 30,000, eval_size = 10,000

TRAIN: [0:30000] TEST: [30000:40000]

TRAIN: [10000:40000] TEST: [40000:50000]

....

In [24]:
OU_transform.split_slide(m_size=2000, e_size=100)

Sliding window split successful


In [13]:
info = OU_transform.get_splits(['price', 'sma', 'ewma', 'mfi', 'rsi'], label_func=label_func, scale=True)

In [14]:
save_dir = './data/info.npy'

np.save(save_dir, info)

In [22]:
labels = np.hstack([fold['test']['labels'].values for fold in info])
np.bincount(labels)

array([132718,  68729])