In [1]:
import pandas as pd
import numpy as np
import sys, math, os, json, re, random
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.metrics import mean_absolute_error



from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# jupyter magic to display plots directly in the notebook
%matplotlib inline

# use vector graphics format for nicer plots
%config Inline.Backend.figure_format = 'svg'

%config Completer.use_jedi = False

%load_ext autoreload
%autoreload 2

In [2]:
DATA = 'data/RfqData.xlsx'
sheets = pd.ExcelFile(DATA).sheet_names

In [3]:
def add_features(df):
    df['quote_diff'] = (df['QuotedPrice']/df['MidPrice'])
    df['NextMid_diff'] = (df['NextMidPrice']/df['MidPrice'])
    df['Traded'] = df['Traded'].map(target_mapping)
    df['Notional_cuts'] = pd.cut(df['Notional'], [0, 100000, 1000000, 10000000, 50000000], labels=[0, 1, 2, 3], include_lowest=True, right=True)
    
    cols = ['Side','Counterparty','Bond']
    
    for col in cols:
        one_hot_encode = pd.get_dummies(df[col])
        df = df.join(one_hot_encode)
        
    df['Counterparty'] = df['Counterparty'].apply(lambda x: int(x[-1:]))
    df['Bond'] = df['Bond'].apply(lambda x: int(x[-1:]))
    df['Side'] = df['Side'].map({'Offer':0,'Bid':1})
    
    cols_to_drop = ['Time'] + ['Notional', 'QuotedPrice', 'NextMidPrice']
    
    return df.drop(columns=cols_to_drop)


training = pd.read_excel(DATA, sheet_name=sheets[0])
test = pd.read_excel(DATA, sheet_name=sheets[1])
competition = pd.read_excel(DATA, sheet_name=sheets[2])

target_mapping = {'MISSED': 0, 'DONE': 1}

def prep_data_regression():# extract input and target
    to_drop = ['NextMid_diff','Bond','Counterparty','MidPrice',\
               'Notional_cuts','quote_diff','Competitors','Side']
    columns = training.drop(columns=to_drop).columns
    X_train = training.drop(columns=to_drop).values
    
    y_train = training[['NextMid_diff']].values.ravel()
    X_test = test.drop(columns=to_drop).values
    y_test = test[['NextMid_diff']].values.ravel()
    return columns, X_train, y_train, X_test, y_test


def prep_data_classifier():# extract input and target
    to_drop = ['NextMid_diff','Bond','Counterparty','MidPrice','Side',\
               'Ctpy_0', 'Ctpy_1', 'Ctpy_2', 'Ctpy_3',\
              'Bond_0', 'Bond_1', 'Bond_2','Traded']
    columns = training.drop(columns=to_drop).columns
    X_train = training.drop(columns=to_drop).values
    
    y_train = training[['Traded']].values.ravel()
    columns_test = test.drop(columns=to_drop).columns
    X_test = test.drop(columns=to_drop).values
    y_test = test[['Traded']].values.ravel()
    return columns, columns_test, X_train, y_train, X_test, y_test
    
training = add_features(training)
test = add_features(test)

### next mid price prediction

In [4]:
columns, X_train, y_train, X_test, y_test = prep_data_regression()
columns

Index(['Traded', 'Bid', 'Offer', 'Ctpy_0', 'Ctpy_1', 'Ctpy_2', 'Ctpy_3',
       'Bond_0', 'Bond_1', 'Bond_2'],
      dtype='object')

In [5]:
RFmodel = RandomForestRegressor(min_samples_split=100,oob_score=True)
RFmodel.fit(X_train, y_train)

# evaluate performance
abs_error_train = mean_absolute_error(y_train, RFmodel.predict(X_train))
abs_error_test = mean_absolute_error(y_test, RFmodel.predict(X_test))
y_pred = RFmodel.predict(X_test)
test_se = (np.sign(y_pred - np.ones(len(y_pred))) != np.sign(y_test - np.ones(len(y_pred)))).sum()/len(y_test)

info_train = f'Train: abs error = {abs_error_train}'
info_test = f'Test : abs error = {abs_error_test}'
print(info_train)
print(info_test)
print('sign error:', test_se)
print('estimation error for any single point prediction:',RFmodel.oob_score_)

Train: abs error = 0.0007939693745342977
Test : abs error = 0.0008710923595845926
sign error: 0.373
estimation error for any single point prediction: 0.05251795850516172


In [6]:
regressor_test = test.copy(deep=True)

In [7]:
regressor_test['error'] = (y_pred-y_test)*test['MidPrice'].to_numpy()
regressor_test['sign error'] = np.sign(y_pred - np.ones(len(y_pred)))>0

In [8]:
regressor_test.groupby(['Counterparty']).mean()['error']

Counterparty
0   -0.007725
1   -0.004056
2   -0.018843
3    0.004355
Name: error, dtype: float64

In [9]:
regressor_test.groupby(['Bond']).mean()['error']

Bond
0   -0.002975
1    0.011068
2   -0.003779
Name: error, dtype: float64

In [10]:
regressor_test.groupby(['Bond']).mean()['sign error']

Bond
0    0.338115
1    0.561224
2    0.425121
Name: sign error, dtype: float64

### Notes: Our estimation error when predicting the price of bond 1 is 10x bond 0 or 2. Our prediction error against counterparty 2 is 1/10 the error against the counter parties. our sign error when predicting the next price for counterparty 0 is much better than against counterparty 1 and 2

### Trade Classification 

In [11]:
columns, columns_test, X_train, y_train, X_test, y_test = prep_data_classifier()
columns

Index(['Competitors', 'quote_diff', 'Notional_cuts', 'Bid', 'Offer'], dtype='object')

In [15]:
RFmodel = RandomForestClassifier(min_samples_split=100,oob_score=True)
RFmodel.fit(X_train, y_train)

# evaluate performance
log_loss_train = log_loss(y_train, RFmodel.predict_proba(X_train))
log_loss_test = log_loss(y_test, RFmodel.predict_proba(X_test))
accuracy_train = accuracy_score(y_train, RFmodel.predict(X_train))
accuracy_test = accuracy_score(y_test, RFmodel.predict(X_test))

info_train = f'Train: Log-loss = {log_loss_train}, accuracy = {100*accuracy_train:.4f} %'
info_test = f'Test: Log-loss = {log_loss_test}, accuracy = {100*accuracy_test:.4f} %'
print(info_train)
print(info_test)
print('estimation error for any single point prediction:',RFmodel.oob_score_)

Train: Log-loss = 0.31716923926820256, accuracy = 86.4200 %
Test: Log-loss = 0.3549980555536589, accuracy = 85.7000 %
estimation error for any single point prediction: 0.856
