In [183]:
import pandas as pd
import numpy as np
import sys, math, os, json, re, random
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.metrics import mean_absolute_error

from functions import *

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# jupyter magic to display plots directly in the notebook
%matplotlib inline

# use vector graphics format for nicer plots
%config Inline.Backend.figure_format = 'svg'

%config Completer.use_jedi = False

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [190]:
DATA = 'data/RfqData.xlsx'
sheets = pd.ExcelFile(DATA).sheet_names

In [209]:
def add_features(df):
    df['quote_diff'] = (df['QuotedPrice']/df['MidPrice'])
    df['NextMid_diff'] = (df['NextMidPrice']/df['MidPrice'])
    df['Traded'] = df['Traded'].map(target_mapping)
    df['Notional_cuts'] = pd.cut(df['Notional'], [0, 100000, 1000000, 10000000, 50000000], labels=[0, 1, 2, 3], include_lowest=True, right=True)
    
    cols = ['Side','Counterparty','Bond']
    
    for col in cols:
        one_hot_encode = pd.get_dummies(df[col])
        df = df.join(one_hot_encode)
        
    df['Counterparty'] = df['Counterparty'].apply(lambda x: int(x[-1:]))
    df['Bond'] = df['Bond'].apply(lambda x: int(x[-1:]))
    df['Side'] = df['Side'].map({'Offer':-1,'Bid':1})
    
    df['PnL']=(((df['NextMidPrice']-df['QuotedPrice'])>=0).astype(int)*2-1)*df['Side']
    
    cols_to_drop = ['Time'] + ['Notional', 'QuotedPrice', 'NextMidPrice']
    
    return df.drop(columns=cols_to_drop)


training = pd.read_excel(DATA, sheet_name=sheets[0])
test = pd.read_excel(DATA, sheet_name=sheets[1])
competition = pd.read_excel(DATA, sheet_name=sheets[2])

target_mapping = {'MISSED': 0, 'DONE': 1}

def prep_data_regression(train,test):# extract input and target
    to_drop = ['NextMid_diff','Bond','Counterparty','MidPrice',\
               'Notional_cuts','quote_diff','Competitors','Side','PnL']
    columns = training.drop(columns=to_drop).columns
    X_train = training.drop(columns=to_drop).values
    
    y_train = training[['NextMid_diff']].values.ravel()
    X_test = test.drop(columns=to_drop).values
    y_test = test[['NextMid_diff']].values.ravel()
    return columns, X_train, y_train, X_test, y_test


def prep_data_classifier(train,test,y_value):# extract input and target
    to_drop = ['NextMid_diff','Bond','Counterparty','MidPrice','Side',\
               'Ctpy_0', 'Ctpy_1', 'Ctpy_2', 'Ctpy_3',\
              'Bond_0', 'Bond_1', 'Bond_2','Traded','PnL']
    columns = training.drop(columns=to_drop).columns
    X_train = training.drop(columns=to_drop).values  
    y_train = training[[y_value]].values.ravel()
    
    columns_test = test.drop(columns=to_drop).columns
    X_test = test.drop(columns=to_drop).values
    y_test = test[[y_value]].values.ravel()
    return columns, columns_test, X_train, y_train, X_test, y_test

# training=add_features(training[training['Counterparty']=='Ctpy_0'])
# test=add_features(test[test['Counterparty']=='Ctpy_0'])

In [210]:
training = add_features(training)
test = add_features(test)

### next mid price prediction

In [192]:
columns_m, X_train_m, y_train_m, X_test_m, y_test_m = prep_data_regression()
columns_m

Index(['Traded', 'Bid', 'Offer', 'Ctpy_0', 'Ctpy_1', 'Ctpy_2', 'Ctpy_3',
       'Bond_0', 'Bond_1', 'Bond_2'],
      dtype='object')

In [194]:
RFmodel_r = RandomForestRegressor(min_samples_split=100,oob_score=True)
RFmodel_r.fit(X_train_m, y_train_m)

# evaluate performance
abs_error_train = mean_absolute_error(y_train_m, RFmodel_r.predict(X_train_m))
abs_error_test = mean_absolute_error(y_test_m, RFmodel_r.predict(X_test_m))
y_pred = RFmodel_r.predict(X_test_m)
test_se = (np.sign(y_pred - np.ones(len(y_pred))) != np.sign(y_test_m - np.ones(len(y_pred)))).sum()/len(y_test_m)

info_train = f'Train: abs error = {abs_error_train}'
info_test = f'Test : abs error = {abs_error_test}'
print(info_train)
print(info_test)
print('sign error:', test_se)
print('estimation error for any single point prediction:',RFmodel_r.oob_score_)

Train: abs error = 0.0007939211268763759
Test : abs error = 0.0008710984222408261
sign error: 0.373
estimation error for any single point prediction: 0.05279758264069434


In [227]:
Next_mid_price = y_pred*test['MidPrice']

In [196]:
regressor_test = test.copy(deep=True)

In [197]:
regressor_test['error'] = (y_pred-y_test_m)*test['MidPrice'].to_numpy()
regressor_test['sign error'] = np.sign(y_pred - np.ones(len(y_pred)))>0

In [198]:
regressor_test.groupby(['Counterparty']).mean()['error']

Counterparty
0   -0.007830
1   -0.004509
2   -0.019977
3    0.004506
Name: error, dtype: float64

In [199]:
regressor_test.groupby(['Counterparty']).mean()['sign error']

Counterparty
0    0.473404
1    0.182609
2    0.354167
3    0.541766
Name: sign error, dtype: float64

In [200]:
regressor_test.groupby(['Bond']).mean()['error']

Bond
0   -0.003019
1    0.011684
2   -0.004276
Name: error, dtype: float64

In [201]:
regressor_test.groupby(['Bond']).mean()['sign error']

Bond
0    0.338115
1    0.561224
2    0.425121
Name: sign error, dtype: float64

### Notes: Our estimation error when predicting the price of bond 1 is 10x bond 0 or 2. Our prediction error against counterparty 2 is 1/10 the error against the counter parties. our sign error when predicting the next price for counterparty 0 is much better than against counterparty 1 and 2

### Trade Classification 

In [212]:
columns_c, columns_test_c, X_train_c, y_train_c, X_test_c, y_test_c = prep_data_classifier(training,test,'Traded')
columns_c

Index(['Competitors', 'quote_diff', 'Notional_cuts', 'Bid', 'Offer'], dtype='object')

In [214]:
RFmodel1 = RandomForestClassifier(min_samples_split=100,oob_score=True)
RFmodel1.fit(X_train_c, y_train_c)

# evaluate performance
log_loss_train = log_loss(y_train_c, RFmodel1.predict_proba(X_train_c))
log_loss_test = log_loss(y_test_c, RFmodel1.predict_proba(X_test_c))
accuracy_train = accuracy_score(y_train_c, RFmodel1.predict(X_train_c))
accuracy_test = accuracy_score(y_test_c, RFmodel1.predict(X_test_c))

info_train = f'Train: Log-loss = {log_loss_train}, accuracy = {100*accuracy_train:.4f} %'
info_test = f'Test: Log-loss = {log_loss_test}, accuracy = {100*accuracy_test:.4f} %'
print(info_train)
print(info_test)
print('estimation error for any single point prediction:',RFmodel1.oob_score_)

Train: Log-loss = 0.31618931094918196, accuracy = 86.4800 %
Test: Log-loss = 0.35503507597072487, accuracy = 84.6000 %
estimation error for any single point prediction: 0.856


In [222]:
X_test_c

array([[2.        , 0.99713494, 1.        , 1.        , 0.        ],
       [3.        , 0.99964618, 0.        , 1.        , 0.        ],
       [4.        , 0.99988199, 0.        , 1.        , 0.        ],
       ...,
       [1.        , 1.00063354, 0.        , 0.        , 1.        ],
       [3.        , 0.99945583, 0.        , 1.        , 0.        ],
       [4.        , 1.00078827, 0.        , 0.        , 1.        ]])

#### PnL Classification - prob need to update the feature

In [215]:
columns_p, columns_test_p, X_train_p, y_train_p, X_test_p, y_test_p = prep_data_classifier(training,test,'PnL')
columns_p

Index(['Competitors', 'quote_diff', 'Notional_cuts', 'Bid', 'Offer'], dtype='object')

In [216]:
RFmodel2 = RandomForestClassifier(min_samples_split=100,oob_score=True)
RFmodel2.fit(X_train_p, y_train_p)

# evaluate performance
log_loss_train = log_loss(y_train_p, RFmodel2.predict_proba(X_train_p))
log_loss_test = log_loss(y_test_p, RFmodel2.predict_proba(X_test_p))
accuracy_train = accuracy_score(y_train_p, RFmodel2.predict(X_train_p))
accuracy_test = accuracy_score(y_test_p, RFmodel2.predict(X_test_p))

info_train = f'Train: Log-loss = {log_loss_train}, accuracy = {100*accuracy_train:.4f} %'
info_test = f'Test: Log-loss = {log_loss_test}, accuracy = {100*accuracy_test:.4f} %'
print(info_train)
print(info_test)
print('estimation error for any single point prediction:',RFmodel2.oob_score_)

Train: Log-loss = 0.5715609679902964, accuracy = 69.5600 %
Test: Log-loss = 0.5923190821859095, accuracy = 66.8000 %
estimation error for any single point prediction: 0.6414


In [None]:
RFmodel1.predict_proba(test.iloc[0]).values)[:, 1]

#### Best Quote Price

In [229]:
test['next_mid_price'] = Next_mid_price

In [247]:
suggested_delta = []
for idx,row in test.iterrows():
    bond_error = regressor_test.groupby(['Bond']).mean()['error'][row['Bond']] 
    #we can define penalty here
    sample,delta = max_pnl_price(bond_error,row,columns_c,columns_p,RFmodel1,RFmodel2,1)
    #suggested_delta.append(delta*row['MidPrice'])
    #print(sample)
    print(delta)
    if idx==10:   
        break

0.1
0.1
0.1
0.1


TypeError: 'numpy.float64' object is not iterable

In [244]:
test

Unnamed: 0,Bond,Side,Counterparty,MidPrice,Competitors,Traded,quote_diff,NextMid_diff,Notional_cuts,Bid,Offer,Ctpy_0,Ctpy_1,Ctpy_2,Ctpy_3,Bond_0,Bond_1,Bond_2,PnL,next_mid_price
0,2,1,3,108.20,2,0,0.997135,1.001109,1,1,0,0,0,0,1,0,0,1,1,108.177416
1,0,1,3,84.79,3,1,0.999646,0.999410,0,1,0,0,0,0,1,1,0,0,-1,84.782625
2,0,1,3,84.74,4,1,0.999882,0.998466,0,1,0,0,0,0,1,1,0,0,-1,84.732629
3,2,1,1,108.81,1,1,0.999540,0.997059,0,1,0,0,1,0,0,0,0,1,-1,108.799625
4,0,-1,1,84.60,2,1,1.000473,1.000118,2,0,1,0,1,0,0,1,0,0,1,84.592811
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,-1,0,83.01,1,0,1.000361,1.000602,0,0,1,1,0,0,0,1,0,0,-1,83.051143
996,2,1,1,110.52,3,0,0.999095,0.999729,0,1,0,0,1,0,0,0,0,1,1,110.510200
997,2,-1,3,110.49,1,0,1.000634,0.997918,0,0,1,0,0,0,1,0,0,1,1,110.504461
998,2,1,3,110.26,3,1,0.999456,1.001179,0,1,0,0,0,0,1,0,0,1,1,110.247725


In [243]:
pnl(suggested_delta,test)

KeyError: "['QuotedPrice', 'NextMidPrice'] not in index"