In [3]:
import pandas as pd
import numpy as np

import math
import scipy.stats
import sklearn
from sklearn import preprocessing
from sklearn import svm

import pickle

#### Load in Data

In [5]:
goog_data = pd.read_csv('./data/goog_processed.csv')
googl_data = pd.read_csv('./data/googl_processed.csv')

info = np.load('./data/info.npy', allow_pickle=True)

Create a Profit/Loss DataFrame 

In [46]:
def create_pl_df(df_ticker1, df_ticker2, info):
    """
    :df_ticker1: Ticker data from first asset
    :df_ticker2: Ticker data from second asset
    :info: Fold dictionary calculated from OU class
    """
    ticker1_name = df_ticker1['TICKER']
    ticker2_name = df_ticker2['TICKER']

    df_test_only = pd.DataFrame()
    df_test_only_labels = pd.Series()

    for i in info:
        idx = i['test']['index']
        residuals = i['test']['residuals_transform_price']
        beta = i['train']['beta_fit_price']
        df_temp = pd.concat([df_ticker1.loc[idx]['CLOSE'],
                             beta * df_ticker2.loc[idx]['CLOSE'],
                             df_ticker1.loc[idx]['price'],
                             beta * df_ticker2.loc[idx]['price'],
                             i['test']['residuals_transform_price'],
                             df_ticker1.loc[idx]['TIMESTAMP']], axis=1)
        df_test_only = df_test_only.append(df_temp)
        df_test_only_labels = df_test_only_labels.append(i['test']['labels'])

    df_test_only['label'] = df_test_only_labels
    df_test_only.columns=['GOOG', 'beta*GOOGL', 'GOOG_return', 'beta*GOOGL_return', 'residual', 'TIMESTAMP', 'label']

    #Find Profit of last trade
    # (y-(1-deltaY)y) - (x-(1-deltaX)x)
    df_test_only['beta*GOOGL_gains'] = df_test_only['beta*GOOGL'] - (1-df_test_only['beta*GOOGL_return']) * df_test_only['beta*GOOGL']
    df_test_only['GOOG_gains'] = df_test_only['GOOG'] - (1-df_test_only['GOOG_return']) * df_test_only['GOOG']
    df_test_only['profit'] = df_test_only['beta*GOOGL_gains'] - df_test_only['GOOG_gains']
    #df_test_only['TIMESTAMP'] = pd.datetime(df_test_only['TIMESTAMP'])
    for i, item in enumerate(df_test_only['TIMESTAMP']):
        df_test_only.loc[i, 'TIMESTAMP'] = pd.to_datetime(item)
    print(df_test_only)
    return df_test_only

In [21]:
def param_format(params):
    """
    Formats a dictionary of paramters into a string that is writeable to a file

    :params:    Paramter dictionary to format

    :ret:       Stringified version of params dict
    """

    param_str = ', '.join("{!s}-{!r}".format(key, val) for (key, val) in params.items())
    param_str = param_str.replace("{", "")
    param_str = param_str.replace("}", "")
    param_str = param_str.replace("'", "")
    param_str = param_str.replace(",", "")
    param_str = param_str.replace(" ", "")
    param_str = param_str.replace(":", "")
    param_str = param_str.replace(".", "")
    param_str = param_str.strip()

    return param_str

In [200]:
def sharpe(df_temp):
    """
    Take in P/L df with labels and finds Sharpe

    Sharpe = (Daily P/L) / stdev(Daily P/L) * sqrt(252)

    :df_temp = P/L df

    :ret: Sharpe Ratio
    """
    
    days = {}
    count = 0
    for i, time in enumerate(df_temp['TIMESTAMP']):
        time = pd.to_datetime(time)
        time = time.date()
        time = time.strftime('%m/%d/%Y')
        number = df_temp.iloc[i]['profit_timeline']
        if time in days.keys():
            days[time] = number + days[time]
        else:
            count += 1
            days[time] = number
    df = pd.DataFrame.from_dict(days, orient='index')
    print(f'Sharpe Count: {count}')
    
    sharpe = (df.mean() / df.std()) * np.sqrt(252)
    print(f'Sharpe: {sharpe}')
    print(f'Daily Profit: {df.mean()}')
    return sharpe

def sortino(df_temp):
    """
    Takes in a profit/loss dataframe with labels and finds the Sortino Ratio according to 
    the following formula: 
    
    Daily P/L / Standard Dev(Negative Daily P/L) x sqrt(252)
    
    :df_temp: Profit/loss dataframe. 
    
    :ret: Sortino Ratio.
    """
    days = {}
    count = 0
    for i , time in enumerate(df_temp['TIMESTAMP']):
        time = pd.to_datetime(time)
        time = time.date()
        time = time.strftime('%m/%d/%Y')
        number = df_temp.iloc[i]['profit_timeline']
        if time in days.keys():
            days[time] = number + days[time]
        else:
            count += 1
            days[time] = number

    df = pd.DataFrame.from_dict(days, orient='index')
    print(f'Sortino Count: {count}')

    sortino = (df.mean() / df[df < 0].std()) * np.sqrt(252)
    print(f'Sortino: {sortino}')
    return sortino

def precision(pred, label):
    return (pred[np.logical_and(pred==1, label==1)].shape[0]/pred[pred==1].shape[0])

def fit(params, info):
    """
    Fit an SVM according to parameters across all folds inside dictionary

    :params:        Model Hyperparamters
    :info_dict:     Associated Info Dict

    :ret:           Numpy array of predictions
    """
    preds = []

    for i in info:
        b_svm = svm.SVC(**params)
        b_svm.fit(i['train']['df_scale'], i['train']['labels'])
        pred = b_svm.predict(i['test']['df_scale'])
        preds.append(pred)

    return np.hstack(preds)

def find_profit_loss(df_test, preds, params, window=5, threshold=0.0005, plot=False, save_dir=None):
    """
    Performs P/L backtesting given prediction labels

    :TO DO:         Transaction cost model

    :df_test:       P/L Dataframe
    :preds:         labels
    :params:        model parameter names
    :window:        Evaluation window
    :threshold:     how much of a residual shift determines a trade opening
    :plot:          flag for plotting
    :save_dir:      Save directory

    :ret:           results dict
    """

    backtesting_results = {}

    param_str = param_format(params)
    print("Finding P/L for model with paramters: %s." % param_str)
    total_profit = 0.0

    profit_timeline = []
    trade_profit_timeline = []
    time_held_timeline = []
    trade_data = []

    df_temp = df_test.copy()
    df_temp['label'] = preds

    for row in df_temp.iterrows():
        cur_profit = 0.0
        profit = row[1]['profit']
        residual = row[1]['residual']

        for position in trade_data:
            position['fresh'] += 1
            position['profit'] += profit
            if(position['residual'] - threshold >= residual) or position['fresh'] >= window:
                cur_profit += position['profit']
                trade_profit_timeline.append(position['profit'])
                time_held_timeline.append(position['fresh'])
                trade_data.remove(position)
        profit_timeline.append(cur_profit)
        total_profit += cur_profit

        if row[1]['label'] == 1 and residual > 0:
            trade_data.append({'profit': 0, 'residual': residual, 'fresh': 0})
        
    df_temp['profit_timeline'] = profit_timeline

    backtesting_results['total_profit'] = total_profit
    backtesting_results['daily_profit_timeline'] = profit_timeline
    backtesting_results['trade_profit_timeline'] = trade_profit_timeline
    backtesting_results['time_held_timeline'] = time_held_timeline
    backtesting_results['trades_executed'] = len(trade_profit_timeline)
    backtesting_results['params'] = params
    backtesting_results['precision'] = precision(df_temp['label'], df_test['label'])
    backtesting_results['mean_profit_per_trade'] = np.mean(trade_profit_timeline)
    backtesting_results['sharpe'] = sharpe(df_temp)
    backtesting_results['sortino'] = sortino(df_temp)

    return backtesting_results

In [26]:
params = {'C': 100,
          'cache_size': 2000,
          'class_weight': {0: 0.5, 1: 0.5},
          'gamma': 1,
          'kernel': 'rbf'}

In [94]:
df_pl = create_pl_df(goog_data, googl_data, info)

  df_test_only_labels = pd.Series()
  df_test_only = df_test_only.append(df_temp)
  df_test_only_labels = df_test_only_labels.append(i['test']['labels'])
  df_test_only = df_test_only.append(df_temp)
  df_test_only_labels = df_test_only_labels.append(i['test']['labels'])
  df_test_only = df_test_only.append(df_temp)
  df_test_only_labels = df_test_only_labels.append(i['test']['labels'])
  df_test_only = df_test_only.append(df_temp)
  df_test_only_labels = df_test_only_labels.append(i['test']['labels'])
  df_test_only = df_test_only.append(df_temp)
  df_test_only_labels = df_test_only_labels.append(i['test']['labels'])
  df_test_only = df_test_only.append(df_temp)
  df_test_only_labels = df_test_only_labels.append(i['test']['labels'])
  df_test_only = df_test_only.append(df_temp)
  df_test_only_labels = df_test_only_labels.append(i['test']['labels'])
  df_test_only = df_test_only.append(df_temp)
  df_test_only_labels = df_test_only_labels.append(i['test']['labels'])
  df_test_only = df_

KeyboardInterrupt: 

In [78]:
df_pl = df_pl[:201447]
df_pl

Unnamed: 0,GOOG,beta*GOOGL,GOOG_return,beta*GOOGL_return,residual,TIMESTAMP,label,beta*GOOGL_gains,GOOG_gains,profit
2000,783.970,663.774914,0.000502,0.000390,0.000111,2016-11-15 11:27:00,1.0,0.259118,0.393197,-0.134080
2001,783.658,663.829402,-0.000398,0.000068,-0.000466,2016-11-15 11:28:00,0.0,0.044987,-0.311876,0.356863
2002,783.590,663.873983,-0.000087,0.000055,-0.000142,2016-11-15 11:29:00,0.0,0.036807,-0.067994,0.104801
2003,784.000,664.105142,0.000523,0.000287,0.000236,2016-11-15 11:30:00,0.0,0.190905,0.410215,-0.219310
2004,784.000,663.964795,0.000000,-0.000174,0.000174,2016-11-15 11:31:00,0.0,-0.115842,0.000000,-0.115842
...,...,...,...,...,...,...,...,...,...,...
203442,1093.740,925.932547,0.003394,0.003140,0.000254,2018-11-30 15:55:00,0.0,2.907780,3.712559,-0.804779
203443,1093.420,925.789648,-0.000293,-0.000130,-0.000163,2018-11-30 15:56:00,0.0,-0.120099,-0.319906,0.199807
203444,1094.120,926.411677,0.000640,0.000565,0.000075,2018-11-30 15:57:00,0.0,0.523217,0.700448,-0.177231
203445,1092.490,925.024720,-0.001490,-0.001258,-0.000231,2018-11-30 15:58:00,0.0,-1.164103,-1.627572,0.463469


In [196]:
labels = fit(params, info)

KeyboardInterrupt: 

In [201]:
pl_info = find_profit_loss(df_pl, labels, params)

Finding P/L for model with paramters: C-100cache_size-2000class_weight-005105gamma-1kernel-rbf.
Sharpe Count: 515
Sharpe: 0    11.261882
dtype: float64
Sortino Count: 515
Sortino: 0    18.752174
dtype: float64


In [202]:
pos_label = np.array([1 if x==1 and residual > 0 else 0 for x, residual in zip(labels, df_pl.residual)])
print(precision(pos_label, df_pl['label']))

0.6975370965274591


In [203]:
print(pl_info['total_profit'])

3565.3867484288025
