## Usual imports

In [1]:
import pandas as pd
import numpy as np
import sys, math, os, json, re, random
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss

from reticulum import AdaptiveBayesianReticulum

# jupyter magic to display plots directly in the notebook
%matplotlib inline

# use vector graphics format for nicer plots
%config Inline.Backend.figure_format = 'svg'

%config Completer.use_jedi = False

%load_ext autoreload
%autoreload 2

In [2]:
# Path of the Excel workbook read by the cells below (relative to the working directory).
DATA = 'data/RfqData.xlsx'

In [3]:
# List the sheet names; the context manager closes the workbook handle
# as soon as the names have been read.
with pd.ExcelFile(DATA) as workbook:
    sheets = workbook.sheet_names
sheets

['Training RFQs', 'OOS RFQs', 'Competition RFQs']

In [4]:
# Parse the first three sheets from a single open workbook instead of
# re-opening the file once per sheet; the handle is closed on exit.
with pd.ExcelFile(DATA) as workbook:
    training = workbook.parse(sheet_name=sheets[0])
    test = workbook.parse(sheet_name=sheets[1])
    competition = workbook.parse(sheet_name=sheets[2])

In [5]:
# Preview the first rows of the raw training sheet.
training.head()

Unnamed: 0,Time,Bond,Side,Notional,Counterparty,MidPrice,QuotedPrice,Competitors,Traded,NextMidPrice
0,25000,Bond_2,Offer,10000000,Ctpy_0,124.01,124.25,1,MISSED,124.24
1,25001,Bond_0,Bid,1000,Ctpy_1,98.07,98.06,1,DONE,98.08
2,25002,Bond_1,Offer,1000,Ctpy_1,170.3,170.4,1,MISSED,170.64
3,25003,Bond_0,Bid,20000,Ctpy_0,98.0,97.98,4,DONE,97.94
4,25004,Bond_1,Offer,1000,Ctpy_3,171.12,171.16,2,DONE,171.46


In [6]:
# Preview only the first rows rather than rendering the whole frame.
competition.head()

Unnamed: 0,Time,Bond,Side,Notional,Counterparty,MidPrice,QuotedPrice,Competitors
0,31000,Bond_0,Bid,1000,Ctpy_0,82.55,,3
1,31001,Bond_2,Bid,1000,Ctpy_3,110.32,,4
2,31004,Bond_1,Offer,300000,Ctpy_0,126.83,,2
3,31008,Bond_0,Bid,1000,Ctpy_1,82.28,,1
4,31009,Bond_2,Offer,1000,Ctpy_1,109.99,,2


In [7]:
# Numeric encoding applied to the `Traded` column by add_features: 'MISSED' -> 0, 'DONE' -> 1.
target_mapping = {'MISSED': 0, 'DONE': 1}

In [21]:
def add_features(df, traded_mapping=None):
    """Return a new frame of model features derived from a raw RFQ sheet.

    The input frame is left untouched, so the function can be called
    repeatedly on the same raw data.

    Derived columns:
      * ``quote_diff``    -- ``QuotedPrice / MidPrice`` (a ratio, despite the name).
      * ``Traded``        -- mapped through ``traded_mapping``; only when the
                             column is present (the competition sheet has none).
      * ``Notional_cuts`` -- ``Notional`` binned into the categories 0..3 with
                             edges 0, 1e5, 1e6, 1e7, 5e7 (right-inclusive);
                             values outside that range become NaN.
      * ``Side_*``        -- one-hot encoding of ``Side``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw RFQ data. Must contain ``Bond``, ``Side``, ``Counterparty``,
        ``Time``, ``Notional``, ``QuotedPrice`` and ``MidPrice``.
        ``Traded`` and ``NextMidPrice`` are optional.
    traded_mapping : dict, optional
        Label -> number mapping for ``Traded``. Defaults to the notebook-level
        ``target_mapping``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the derived columns added and the raw
        identifier / price columns removed.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    features = df.copy()  # never mutate the caller's frame

    features['quote_diff'] = features['QuotedPrice'] / features['MidPrice']

    if 'Traded' in features.columns:
        # Resolve the default lazily so unlabeled frames never need the global.
        mapping = target_mapping if traded_mapping is None else traded_mapping
        features['Traded'] = features['Traded'].map(mapping)

    features['Notional_cuts'] = pd.cut(
        features['Notional'],
        [0, 100000, 1000000, 10000000, 50000000],
        labels=[0, 1, 2, 3],
        include_lowest=True,
        right=True,
    )

    side_dummies = pd.get_dummies(features[['Side']])
    features = features.join(side_dummies)

    # Required raw columns are dropped strictly (a missing one is an error);
    # NextMidPrice only exists on labeled sheets, so drop it when present.
    required_raw = ['Bond', 'Side', 'Counterparty', 'Time',
                    'Notional', 'QuotedPrice', 'MidPrice']
    optional_raw = [col for col in ('NextMidPrice',) if col in features.columns]

    return features.drop(columns=required_raw + optional_raw)

In [9]:
# Number of entries in the training sheet's Notional column.
len(training.Notional)

5000

In [10]:
# Replace the raw training frame with its engineered features.
# NOTE: not idempotent -- `training` is overwritten with a frame that no longer has
# the raw price columns add_features reads, so re-running this cell without
# reloading the data raises a KeyError.
training = add_features(training)
training.head()

Unnamed: 0,Competitors,Traded,quote_diff,Notional_cuts,Side_Bid,Side_Offer
0,1,0,1.001935,2,0,1
1,1,1,0.999898,0,1,0
2,1,0,1.000587,0,0,1
3,4,1,0.999796,0,1,0
4,2,1,1.000234,0,0,1


In [11]:
# Replace the raw out-of-sample frame with its engineered features.
# NOTE: not idempotent -- `test` is overwritten with a frame that no longer has
# the raw price columns add_features reads, so re-running this cell without
# reloading the data raises a KeyError.
test = add_features(test)

test.head()

Unnamed: 0,Competitors,Traded,quote_diff,Notional_cuts,Side_Bid,Side_Offer
0,2,0,0.997135,1,1,0
1,3,1,0.999646,0,1,0
2,4,1,0.999882,0,1,0
3,1,1,0.99954,0,1,0
4,2,1,1.000473,2,0,1


In [12]:
# split each engineered frame into a feature matrix (everything except the
# label column) and a flat label vector
target_col = 'Traded'
X_train, y_train = training.drop(columns=[target_col]).values, training[target_col].to_numpy()
X_test, y_test = test.drop(columns=[target_col]).values, test[target_col].to_numpy()

In [13]:
# train model
model = AdaptiveBayesianReticulum(
    prior=(1, 1),
    pruning_factor=1.01,
    n_iter=100,
    learning_rate_init=0.1,
    n_gradient_descent_steps=1,
    initial_relative_stiffness=20)

# Time the fit with timezone-aware timestamps: `datetime.utcnow()` is deprecated
# since Python 3.12. The difference is still a `timedelta`, so the printed
# duration keeps the same format.
t0 = dt.datetime.now(dt.timezone.utc)
model.fit(X_train, y_train, verbose=True)
t1 = dt.datetime.now(dt.timezone.utc)

print('Model:')
print(model)
print(f'Training took {t1-t0}')

Creating root node at level=0, n_data=5000
Splitting left  child of node at level=0, n_data=5000.00
Splitting left  child of node at level=1, n_data=4520.11
Splitting right child of node at level=0, n_data=5000.00
Splitting right child of node at level=2, n_data=3784.65
Pruning node at level 3
Splitting right child of node at level=2, n_data=3784.65
Splitting left  child of node at level=2, n_data=3784.65
Splitting left  child of node at level=3, n_data=2578.26
Splitting right child of node at level=4, n_data=601.90
Splitting left  child of node at level=5, n_data=302.32
Pruning node at level 6
Splitting left  child of node at level=1, n_data=479.89
Splitting right child of node at level=1, n_data=479.89
Splitting right child of node at level=3, n_data=2578.26
Splitting left  child of node at level=3, n_data=1206.38
Splitting right child of node at level=5, n_data=302.32
Splitting left  child of node at level=4, n_data=1976.36
Splitting right child of node at level=1, n_data=4520.11
Sp

In [14]:
# evaluate performance: log-loss from predicted probabilities, then accuracy
# from hard predictions, on the training and out-of-sample splits
proba_train = model.predict_proba(X_train)
log_loss_train = log_loss(y_train, proba_train)
proba_test = model.predict_proba(X_test)
log_loss_test = log_loss(y_test, proba_test)

labels_train = model.predict(X_train)
accuracy_train = accuracy_score(y_train, labels_train)
labels_test = model.predict(X_test)
accuracy_test = accuracy_score(y_test, labels_test)

info_train = f'Train: Log-loss = {log_loss_train}, accuracy = {100*accuracy_train:.4f} %'
info_test = f'Test: Log-loss = {log_loss_test}, accuracy = {100*accuracy_test:.4f} %'

# model size first, then the per-split metrics, then feature importances
print(f'Depth:  {model.get_depth()}')
print(f'Leaves: {model.get_n_leaves()}')
print(info_train, info_test, sep='\n')
print(f'Feature importance: {model.feature_importance()}')

Depth:  10
Leaves: 45
Train: Log-loss = 0.4127035604190389, accuracy = 81.9000 %
Test: Log-loss = 0.4167728917344216, accuracy = 83.0000 %
Feature importance: [9.07488487e-04 9.96013990e-01 5.02931510e-04 1.34847964e-03
 1.22711042e-03]


## Estimate the probability of executing a trade for a given price (and any other feature available in the data)

In [20]:
# Keep only the second probability column for every out-of-sample row.
# NOTE(review): presumably column 1 is the probability of class 1 ('DONE' under
# target_mapping) -- confirm the column order returned by predict_proba.
trade_execution_prob = model.predict_proba(X_test)[:, 1]