In [256]:
import pandas as pd
import numpy as np
import sys, math, os, json, re, random
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.metrics import mean_absolute_error

from functions import *

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# jupyter magic to display plots directly in the notebook
%matplotlib inline

# use vector graphics format for nicer plots
%config Inline.Backend.figure_format = 'svg'

%config Completer.use_jedi = False

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [257]:
DATA = 'data/RfqData.xlsx'
sheets = pd.ExcelFile(DATA).sheet_names

In [258]:
def add_features(df):
    df['quote_diff'] = (df['QuotedPrice']/df['MidPrice'])
    df['NextMid_diff'] = (df['NextMidPrice']/df['MidPrice'])
    df['Traded'] = df['Traded'].map(target_mapping)
    df['Notional_cuts'] = pd.cut(df['Notional'], [0, 100000, 1000000, 10000000, 50000000], labels=[0, 1, 2, 3], include_lowest=True, right=True)
    
    cols = ['Side','Counterparty','Bond']
    
    for col in cols:
        one_hot_encode = pd.get_dummies(df[col])
        df = df.join(one_hot_encode)
        
    df['Counterparty'] = df['Counterparty'].apply(lambda x: int(x[-1:]))
    df['Bond'] = df['Bond'].apply(lambda x: int(x[-1:]))
    df['Side'] = df['Side'].map({'Offer':-1,'Bid':1})
    
    df['PnL']=(((df['NextMidPrice']-df['QuotedPrice'])>=0).astype(int)*2-1)*df['Side']
    
    cols_to_drop = ['Time'] + ['Notional', 'QuotedPrice', 'NextMidPrice']
    
    return df.drop(columns=cols_to_drop)


training = pd.read_excel(DATA, sheet_name=sheets[0])
test = pd.read_excel(DATA, sheet_name=sheets[1])
competition = pd.read_excel(DATA, sheet_name=sheets[2])

target_mapping = {'MISSED': 0, 'DONE': 1}

def prep_data_regression(train,test):# extract input and target
    to_drop = ['NextMid_diff','Bond','Counterparty','MidPrice',\
               'Notional_cuts','quote_diff','Competitors','Side','PnL']
    columns = training.drop(columns=to_drop).columns
    X_train = training.drop(columns=to_drop).values
    
    y_train = training[['NextMid_diff']].values.ravel()
    X_test = test.drop(columns=to_drop).values
    y_test = test[['NextMid_diff']].values.ravel()
    return columns, X_train, y_train, X_test, y_test


def prep_data_classifier(train,test,y_value):# extract input and target
    to_drop = ['NextMid_diff','Bond','Counterparty','MidPrice','Side',\
               'Ctpy_0', 'Ctpy_1', 'Ctpy_2', 'Ctpy_3',\
              'Bond_0', 'Bond_1', 'Bond_2','Traded','PnL']
    columns = training.drop(columns=to_drop).columns
    X_train = training.drop(columns=to_drop).values  
    y_train = training[[y_value]].values.ravel()
    
    columns_test = test.drop(columns=to_drop).columns
    X_test = test.drop(columns=to_drop).values
    y_test = test[[y_value]].values.ravel()
    return columns, columns_test, X_train, y_train, X_test, y_test

# training=add_features(training[training['Counterparty']=='Ctpy_0'])
# test=add_features(test[test['Counterparty']=='Ctpy_0'])

In [259]:
training = add_features(training)
test = add_features(test)

### next mid price prediction

In [260]:
columns_m, X_train_m, y_train_m, X_test_m, y_test_m = prep_data_regression()
columns_m

TypeError: prep_data_regression() missing 2 required positional arguments: 'train' and 'test'

In [261]:
RFmodel_r = RandomForestRegressor(min_samples_split=100,oob_score=True)
RFmodel_r.fit(X_train_m, y_train_m)

# evaluate performance
abs_error_train = mean_absolute_error(y_train_m, RFmodel_r.predict(X_train_m))
abs_error_test = mean_absolute_error(y_test_m, RFmodel_r.predict(X_test_m))
y_pred = RFmodel_r.predict(X_test_m)
test_se = (np.sign(y_pred - np.ones(len(y_pred))) != np.sign(y_test_m - np.ones(len(y_pred)))).sum()/len(y_test_m)

info_train = f'Train: abs error = {abs_error_train}'
info_test = f'Test : abs error = {abs_error_test}'
print(info_train)
print(info_test)
print('sign error:', test_se)
print('estimation error for any single point prediction:',RFmodel_r.oob_score_)

Train: abs error = 0.0007938657042792581
Test : abs error = 0.0008708858483890167
sign error: 0.373
estimation error for any single point prediction: 0.05223879142232901


In [262]:
Next_mid_price = y_pred*test['MidPrice']

In [263]:
regressor_test = test.copy(deep=True)

In [264]:
regressor_test['error'] = (y_pred-y_test_m)*test['MidPrice'].to_numpy()
regressor_test['sign error'] = np.sign(y_pred - np.ones(len(y_pred)))>0

In [265]:
regressor_test.groupby(['Counterparty']).mean()['error']

Counterparty
0   -0.007745
1   -0.004035
2   -0.019968
3    0.004220
Name: error, dtype: float64

In [266]:
regressor_test.groupby(['Counterparty']).mean()['sign error']

Counterparty
0    0.473404
1    0.182609
2    0.354167
3    0.541766
Name: sign error, dtype: float64

In [267]:
regressor_test.groupby(['Bond']).mean()['error']

Bond
0   -0.003279
1    0.012064
2   -0.003915
Name: error, dtype: float64

In [268]:
regressor_test.groupby(['Bond']).mean()['sign error']

Bond
0    0.338115
1    0.561224
2    0.425121
Name: sign error, dtype: float64

### Notes: Our estimation error when predicting the price of bond 1 is 10x bond 0 or 2. Our prediction error against counterparty 2 is 1/10 the error against the counter parties. our sign error when predicting the next price for counterparty 0 is much better than against counterparty 1 and 2

### Trade Classification 

In [269]:
columns_c, columns_test_c, X_train_c, y_train_c, X_test_c, y_test_c = prep_data_classifier(training,test,'Traded')
columns_c

Index(['Competitors', 'quote_diff', 'Notional_cuts', 'Bid', 'Offer'], dtype='object')

In [270]:
RFmodel1 = RandomForestClassifier(min_samples_split=100,oob_score=True)
RFmodel1.fit(X_train_c, y_train_c)

# evaluate performance
log_loss_train = log_loss(y_train_c, RFmodel1.predict_proba(X_train_c))
log_loss_test = log_loss(y_test_c, RFmodel1.predict_proba(X_test_c))
accuracy_train = accuracy_score(y_train_c, RFmodel1.predict(X_train_c))
accuracy_test = accuracy_score(y_test_c, RFmodel1.predict(X_test_c))

info_train = f'Train: Log-loss = {log_loss_train}, accuracy = {100*accuracy_train:.4f} %'
info_test = f'Test: Log-loss = {log_loss_test}, accuracy = {100*accuracy_test:.4f} %'
print(info_train)
print(info_test)
print('estimation error for any single point prediction:',RFmodel1.oob_score_)

Train: Log-loss = 0.3145092619795812, accuracy = 86.5400 %
Test: Log-loss = 0.35181151592895377, accuracy = 84.9000 %
estimation error for any single point prediction: 0.8574


In [271]:
X_test_c

array([[2.        , 0.99713494, 1.        , 1.        , 0.        ],
       [3.        , 0.99964618, 0.        , 1.        , 0.        ],
       [4.        , 0.99988199, 0.        , 1.        , 0.        ],
       ...,
       [1.        , 1.00063354, 0.        , 0.        , 1.        ],
       [3.        , 0.99945583, 0.        , 1.        , 0.        ],
       [4.        , 1.00078827, 0.        , 0.        , 1.        ]])

#### PnL Classification - prob need to update the feature

In [272]:
columns_p, columns_test_p, X_train_p, y_train_p, X_test_p, y_test_p = prep_data_classifier(training,test,'PnL')
columns_p

Index(['Competitors', 'quote_diff', 'Notional_cuts', 'Bid', 'Offer'], dtype='object')

In [273]:
RFmodel2 = RandomForestClassifier(min_samples_split=100,oob_score=True)
RFmodel2.fit(X_train_p, y_train_p)

# evaluate performance
log_loss_train = log_loss(y_train_p, RFmodel2.predict_proba(X_train_p))
log_loss_test = log_loss(y_test_p, RFmodel2.predict_proba(X_test_p))
accuracy_train = accuracy_score(y_train_p, RFmodel2.predict(X_train_p))
accuracy_test = accuracy_score(y_test_p, RFmodel2.predict(X_test_p))

info_train = f'Train: Log-loss = {log_loss_train}, accuracy = {100*accuracy_train:.4f} %'
info_test = f'Test: Log-loss = {log_loss_test}, accuracy = {100*accuracy_test:.4f} %'
print(info_train)
print(info_test)
print('estimation error for any single point prediction:',RFmodel2.oob_score_)

Train: Log-loss = 0.571826745952243, accuracy = 69.1800 %
Test: Log-loss = 0.5942876388137891, accuracy = 66.9000 %
estimation error for any single point prediction: 0.6436


#### Best Quote Price

In [275]:
test['next_mid_price'] = Next_mid_price

In [276]:
suggested_delta = []
for idx,row in test.iterrows():
    bond_error = regressor_test.groupby(['Bond']).mean()['error'][row['Bond']] 
    #we can define penalty here
    sample,delta = max_pnl_price(bond_error,row,columns_c,columns_p,RFmodel1,RFmodel2,1)
    suggested_delta.append(delta*row['MidPrice'])

In [277]:
test = pd.read_excel(DATA, sheet_name=sheets[1])

In [279]:
test_perf=pnl(suggested_delta,test)
test_perf

pos_pnl:562
better_price:439
traded_pos_pnl:1
