## Usual imports

In [1]:
import pandas as pd
import numpy as np
import sys, math, os, json, re, random
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss

from reticulum import AdaptiveBayesianReticulum

# jupyter magic to display plots directly in the notebook
%matplotlib inline

# use vector graphics format for nicer plots
%config Inline.Backend.figure_format = 'svg'

%config Completer.use_jedi = False

%load_ext autoreload
%autoreload 2

In [2]:
# Relative path of the Excel workbook that the RFQ sheets are read from.
DATA = 'data/RfqData.xlsx'

In [3]:
# List the worksheet names of the workbook. The context manager closes the
# underlying file handle as soon as the names have been read.
with pd.ExcelFile(DATA) as workbook:
    sheets = workbook.sheet_names
sheets

['Training RFQs', 'OOS RFQs', 'Competition RFQs']

In [4]:
# Parse the workbook once: passing a list of sheet names makes `read_excel`
# return a dict {sheet name -> DataFrame}, instead of re-reading the file
# separately for every sheet.
rfq_frames = pd.read_excel(DATA, sheet_name=sheets[:3])

# Sheet order follows `sheets`: training, out-of-sample test, competition.
training, test, competition = (rfq_frames[name] for name in sheets[:3])

In [5]:
# Preview the first rows of the training frame to check the column layout.
training.head()

Unnamed: 0,Time,Bond,Side,Notional,Counterparty,MidPrice,QuotedPrice,Competitors,Traded,NextMidPrice
0,25000,Bond_2,Offer,10000000,Ctpy_0,124.01,124.25,1,MISSED,124.24
1,25001,Bond_0,Bid,1000,Ctpy_1,98.07,98.06,1,DONE,98.08
2,25002,Bond_1,Offer,1000,Ctpy_1,170.3,170.4,1,MISSED,170.64
3,25003,Bond_0,Bid,20000,Ctpy_0,98.0,97.98,4,DONE,97.94
4,25004,Bond_1,Offer,1000,Ctpy_3,171.12,171.16,2,DONE,171.46


In [6]:
# Preview only the first rows instead of rendering the whole frame
# (same style as the training preview).
competition.head()

Unnamed: 0,Time,Bond,Side,Notional,Counterparty,MidPrice,QuotedPrice,Competitors
0,31000,Bond_0,Bid,1000,Ctpy_0,82.55,,3
1,31001,Bond_2,Bid,1000,Ctpy_3,110.32,,4
2,31004,Bond_1,Offer,300000,Ctpy_0,126.83,,2
3,31008,Bond_0,Bid,1000,Ctpy_1,82.28,,1
4,31009,Bond_2,Offer,1000,Ctpy_1,109.99,,2


In [7]:
# Numeric encoding of the RFQ outcome labels found in the `Traded` column.
target_mapping = dict(MISSED=0, DONE=1)

# Natural log of the notional, stored as an extra column on both labelled frames.
training['log_notional'] = np.log(training['Notional'].to_numpy())
test['log_notional'] = np.log(test['Notional'].to_numpy())

In [8]:
def add_features(df, traded_mapping=None):
    """Return a new frame with engineered RFQ features; `df` itself is not modified.

    Added columns:
      * ``quote_diff``    -- ``QuotedPrice / MidPrice`` (a ratio, despite the name).
      * ``Notional_cuts`` -- ``Notional`` bucketed into four bins with edges
        0, 1e5, 1e6, 1e7, 5e7 (labels 0..3); values outside the edges become NaN.
      * ``Side_*``        -- one-hot encoding of ``Side``.

    If a ``Traded`` column is present it is replaced by its numeric encoding.
    The raw identifier / price columns are dropped from the result; columns
    that are not present (e.g. ``NextMidPrice`` on unlabelled frames) are
    simply skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least ``QuotedPrice``, ``MidPrice``, ``Notional``
        and ``Side``.
    traded_mapping : dict, optional
        Mapping from ``Traded`` labels to numeric targets. Defaults to the
        notebook-level ``target_mapping``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the caller's frame is left untouched, so the function
        can be re-run on the same input and give the same result.
    """
    if traded_mapping is None:
        # Resolved at call time so the notebook-level mapping keeps working as before.
        traded_mapping = target_mapping

    # Work on a copy: mutating the caller's frame made a second call map the
    # already-encoded `Traded` values to NaN.
    features = df.copy()

    features['quote_diff'] = features['QuotedPrice'] / features['MidPrice']

    # Unlabelled frames have no outcome column to encode.
    if 'Traded' in features.columns:
        features['Traded'] = features['Traded'].map(traded_mapping)

    features['Notional_cuts'] = pd.cut(
        features['Notional'],
        [0, 100000, 1000000, 10000000, 50000000],
        labels=[0, 1, 2, 3],
        include_lowest=True,
        right=True,
    )

    side_dummies = pd.get_dummies(features[['Side']])
    features = features.join(side_dummies)

    cols_to_drop = ['Bond', 'Side', 'Counterparty', 'Time'] + ['Notional', 'QuotedPrice', 'MidPrice', 'NextMidPrice']

    # errors='ignore': not every frame carries every raw column.
    return features.drop(columns=cols_to_drop, errors='ignore')

In [33]:
# Features: raw notional and number of competing dealers.
# Target: ratio of the next mid price to the current mid price.
x_train = training.loc[:, ['Notional', 'Competitors']]
y_train = training['NextMidPrice'].div(training['MidPrice'])

# Same construction for the out-of-sample frame.
x_test = test.loc[:, ['Notional', 'Competitors']]
y_test = test['NextMidPrice'].div(test['MidPrice'])

In [34]:
from sklearn.metrics import mean_squared_error

from bayesian_decision_tree.hyperplane_optimization import SimulatedAnnealingOptimizer
from bayesian_decision_tree.regression import HyperplaneRegressionTree

In [35]:
# Strength of the prior, expressed as a number of pseudo-observations.
prior_pseudo_observations = 10

# Prior location and spread are taken from the training target itself:
# centred on its mean, with a tenth of its standard deviation.
mu = y_train.mean()
sd_prior = y_train.std() / 10
var_prior = sd_prior**2
tau_prior = 1/var_prior  # precision = inverse variance

# Remaining prior parameters derived from the pseudo-observation count.
kappa = prior_pseudo_observations
alpha = prior_pseudo_observations / 2
beta = alpha/tau_prior

# Packed in the order [mu, kappa, alpha, beta] and passed to the tree as `prior`.
prior = np.array([mu, kappa, alpha, beta])

In [36]:
# Build the hyperplane optimizer separately so its settings are easy to spot.
# NOTE(review): the positional arguments (10, 10, 0.9, 666) are not named here;
# check the optimizer's signature for their meaning — TODO confirm.
annealer = SimulatedAnnealingOptimizer(10, 10, 0.9, 666)

# Regression tree using the prior assembled in the previous cell.
model = HyperplaneRegressionTree(
    partition_prior=0.9,
    prior=prior,
    delta=0,
    optimizer=annealer,
)

# Fit on plain numpy arrays, then report the learned structure.
model.fit(x_train.values, y_train.values)
print(model)
print()

depth, n_leaves = model.get_depth(), model.get_n_leaves()
print('Tree depth and number of leaves: {}, {}'.format(depth, n_leaves))
print('Feature importance:', model.feature_importance())

HP(origin=[9.e+04 2.e+00], normal=[9.99999920e-01 4.01149343e-04])
 ├ back : y=0.9999612784250579, n=3947
 └ front: y=1.0000280247772302, n=1053

Tree depth and number of leaves: 1, 2
Feature importance: [9.99599011e-01 4.00988519e-04]


In [37]:
# Root-mean-squared error of the fitted tree on both data sets.
mse_train = mean_squared_error(model.predict(x_train), y_train)
rmse_train = np.sqrt(mse_train)
mse_test = mean_squared_error(model.predict(x_test), y_test)
rmse_test = np.sqrt(mse_test)

# Format to four decimals and print one line per data set.
info_train = 'RMSE train: {:.4f}'.format(rmse_train)
info_test = 'RMSE test:  {:.4f}'.format(rmse_test)
print(info_train, info_test, sep='\n')

RMSE train: 0.0011
RMSE test:  0.0012


In [38]:
# Realised NextMidPrice / MidPrice ratios of the out-of-sample frame,
# shown for comparison with the model's predictions.
y_test

0      1.001109
1      0.999410
2      0.998466
3      0.997059
4      1.000118
         ...   
995    1.000602
996    0.999729
997    0.997918
998    1.001179
999    1.001419
Length: 1000, dtype: float64

In [39]:
# Predicted ratios for the out-of-sample features. In the recorded run the
# fitted tree has two leaves, so only two distinct values appear.
model.predict(x_test)

array([1.00002802, 0.99996128, 0.99996128, 0.99996128, 1.00002802,
       0.99996128, 1.00002802, 1.00002802, 0.99996128, 0.99996128,
       0.99996128, 0.99996128, 0.99996128, 0.99996128, 0.99996128,
       0.99996128, 1.00002802, 1.00002802, 0.99996128, 0.99996128,
       1.00002802, 0.99996128, 0.99996128, 0.99996128, 0.99996128,
       0.99996128, 0.99996128, 0.99996128, 0.99996128, 1.00002802,
       1.00002802, 0.99996128, 1.00002802, 0.99996128, 0.99996128,
       0.99996128, 0.99996128, 0.99996128, 0.99996128, 0.99996128,
       0.99996128, 0.99996128, 0.99996128, 0.99996128, 0.99996128,
       0.99996128, 0.99996128, 1.00002802, 0.99996128, 1.00002802,
       1.00002802, 0.99996128, 0.99996128, 0.99996128, 0.99996128,
       1.00002802, 0.99996128, 0.99996128, 0.99996128, 0.99996128,
       0.99996128, 0.99996128, 0.99996128, 0.99996128, 1.00002802,
       0.99996128, 0.99996128, 0.99996128, 1.00002802, 0.99996128,
       0.99996128, 0.99996128, 0.99996128, 0.99996128, 0.99996