# Imports

In [44]:
import pandas as pd
import numpy as np
import sys, math, os, json, re, random
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss

from reticulum import AdaptiveBayesianReticulum
from statsmodels.distributions.empirical_distribution import ECDF
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from tqdm import tqdm

# jupyter magic to display plots directly in the notebook
%matplotlib inline

# use vector graphics format for nicer plots
%config Inline.Backend.figure_format = 'svg'

%config Completer.use_jedi = False

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [45]:
DATA = 'RfqData.xlsx'

In [46]:
sheets = pd.ExcelFile(DATA).sheet_names
sheets

['Overview', 'Training RFQs', 'OOS RFQs', 'Competition RFQs']

In [47]:
training = pd.read_excel(DATA, sheet_name=sheets[1])
test = pd.read_excel(DATA, sheet_name=sheets[2])
competition = pd.read_excel(DATA, sheet_name=sheets[3])

In [48]:
training.head()

Unnamed: 0,Time,Bond,Side,Notional,N_group,Counterparty,MidPrice,QuotedPrice,Delta,Competitors,Profit,Status,Adjusted profit,Traded,NextMidPrice
0,25000.0,Bond_2,Offer,10000000.0,big,Ctpy_0,124.01,124.25,0.24,1.0,-0.01,0.0,0.0,MISSED,124.24
1,25001.0,Bond_0,Bid,1000.0,small,Ctpy_1,98.07,98.06,0.01,1.0,0.02,1.0,0.02,DONE,98.08
2,25002.0,Bond_1,Offer,1000.0,small,Ctpy_1,170.3,170.4,0.1,1.0,0.24,0.0,0.0,MISSED,170.64
3,25003.0,Bond_0,Bid,20000.0,big,Ctpy_0,98.0,97.98,0.02,4.0,-0.04,1.0,-0.04,DONE,97.94
4,25004.0,Bond_1,Offer,1000.0,small,Ctpy_3,171.12,171.16,0.04,2.0,0.3,1.0,0.3,DONE,171.46


In [None]:
training = training.dropna()
test = test.dropna()

In [62]:
training.d

Unnamed: 0,Time,Bond,Side,Notional,Counterparty,MidPrice,QuotedPrice,Delta,Competitors,Profit,Status,Adjusted profit,Traded,NextMidPrice
0,25000.0,Bond_2,Offer,10000000.0,Ctpy_0,124.01,124.25,0.24,1.0,-0.01,0.0,0.00,MISSED,124.24
1,25001.0,Bond_0,Bid,1000.0,Ctpy_1,98.07,98.06,0.01,1.0,0.02,1.0,0.02,DONE,98.08
2,25002.0,Bond_1,Offer,1000.0,Ctpy_1,170.30,170.40,0.10,1.0,0.24,0.0,0.00,MISSED,170.64
3,25003.0,Bond_0,Bid,20000.0,Ctpy_0,98.00,97.98,0.02,4.0,-0.04,1.0,-0.04,DONE,97.94
4,25004.0,Bond_1,Offer,1000.0,Ctpy_3,171.12,171.16,0.04,2.0,0.30,1.0,0.30,DONE,171.46
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,29995.0,Bond_0,Bid,1000.0,Ctpy_0,85.15,85.12,0.03,4.0,0.02,1.0,0.02,DONE,85.14
4996,29996.0,Bond_2,Bid,1000.0,Ctpy_3,108.73,108.69,0.04,4.0,-0.12,1.0,-0.12,DONE,108.57
4997,29997.0,Bond_0,Bid,1000.0,Ctpy_3,84.98,84.96,0.02,4.0,-0.09,1.0,-0.09,DONE,84.87
4998,29998.0,Bond_2,Bid,1000.0,Ctpy_2,108.43,108.32,0.11,2.0,-0.01,0.0,0.00,MISSED,108.31


In [50]:
target_mapping = {'MISSED': 0, 'DONE': 1}

In [51]:
def add_features(df):
    df['Notional_cuts'] = pd.cut(df['Notional'], [0, 100000, 1000000, 10000000, 50000000], labels=[0, 1, 2, 3], include_lowest=True, right=True)
    df['Traded'] = df['Traded'].map(target_mapping)
    
    return df

### Trade Classification

In [52]:
def add_features_classifier(df):
    df['quote_diff'] = (df['QuotedPrice'] - df['MidPrice'])
    df['Traded'] = df['Traded'].map(target_mapping)
    # df['Competitors'] = df['Competitors'].apply(lambda x: str(x))
    df['Notional_cuts'] = pd.cut(df['Notional'], [0, 100000, 1000000, 10000000, 50000000], labels=[0, 1, 2, 3], include_lowest=True, right=True)
    
    cols = ['Side']
    one_hot_encode = pd.get_dummies(df[cols])
    df = df.join(one_hot_encode)
    
    cols_to_drop = ['Bond', 'Side', 'Counterparty', 'Time'] + ['Notional', 'QuotedPrice', 'MidPrice', 'NextMidPrice']
    
    return df.drop(columns=cols_to_drop)

In [53]:
def prep_data_classifier(train, test, y_value): # extract input and target
    X_train = train.drop(columns=[y_value]).values  
    y_train = train[[y_value]].values.ravel()
    
    X_test = test.drop(columns=[y_value]).values
    y_test = test[[y_value]].values.ravel()
    
    return X_train, y_train, X_test, y_test

In [54]:
training = training.drop(columns = 'N_group')

In [55]:
test = test.drop(columns = 'N_group')

In [56]:
training_classifier = add_features_classifier(training.copy())
test_classifier = add_features_classifier(test.copy())

In [57]:
X_train_c, y_train_c, X_test_c, y_test_c = prep_data_classifier(training_classifier, test_classifier, 'Traded')

In [58]:
X_train_c

array([[ 0.24,  1.  , -0.01, ...,  2.  ,  0.  ,  1.  ],
       [ 0.01,  1.  ,  0.02, ...,  0.  ,  1.  ,  0.  ],
       [ 0.1 ,  1.  ,  0.24, ...,  0.  ,  0.  ,  1.  ],
       ...,
       [ 0.02,  4.  , -0.09, ...,  0.  ,  1.  ,  0.  ],
       [ 0.11,  2.  , -0.01, ...,  0.  ,  1.  ,  0.  ],
       [ 0.14,  2.  , -0.21, ...,  0.  ,  0.  ,  1.  ]])

In [60]:
X_test_c

array([[ 2.  , -0.31,  1.  ,  1.  ,  0.  ],
       [ 3.  , -0.03,  0.  ,  1.  ,  0.  ],
       [ 4.  , -0.01,  0.  ,  1.  ,  0.  ],
       ...,
       [ 1.  ,  0.07,  0.  ,  0.  ,  1.  ],
       [ 3.  , -0.06,  0.  ,  1.  ,  0.  ],
       [ 4.  ,  0.1 ,  0.  ,  0.  ,  1.  ]])

In [59]:
# train model
ABRmodel = AdaptiveBayesianReticulum(
    prior=(1, 1),
    pruning_factor=1.01,
    n_iter=100,
    learning_rate_init=0.05,
    n_gradient_descent_steps=1,
    initial_relative_stiffness=20)

t0 = dt.datetime.utcnow()
ABRmodel.fit(X_train_c, y_train_c, verbose=False)
t1 = dt.datetime.utcnow()

# print('Model:')
# print(ABRmodel)
print(f'Training took {t1-t0}')

# evaluate performance
log_loss_train = log_loss(y_train_c, ABRmodel.predict_proba(X_train_c))
log_loss_test = log_loss(y_test_c, ABRmodel.predict_proba(X_test_c))
accuracy_train = accuracy_score(y_train_c, ABRmodel.predict(X_train_c))
accuracy_test = accuracy_score(y_test_c, ABRmodel.predict(X_test_c))

info_train = f'Train: Log-loss = {log_loss_train}, accuracy = {100*accuracy_train:.4f} %'
info_test = f'Test: Log-loss = {log_loss_test}, accuracy = {100*accuracy_test:.4f} %'
print(f'Depth:  {ABRmodel.get_depth()}')
print(f'Leaves: {ABRmodel.get_n_leaves()}')
print(info_train)
print(info_test)
print(f'Feature importance: {ABRmodel.feature_importance()}')

Training took 0:00:00.412416


ValueError: Bad input dimensions: Expected 9, got 5

### Mid Price Prediction for Counterparty 0

In [None]:
def add_features_rf(df):
    df['Notional_cuts'] = pd.cut(df['Notional'], [0, 100000, 1000000, 10000000, 50000000], labels=[0, 1, 2, 3], include_lowest=True, right=True)
    df['NextMidP'] = df['NextMidPrice']/df['MidPrice']
    df['Traded'] = df['Traded'].map(target_mapping)
    
    cols = ['Side']
    one_hot_encode = pd.get_dummies(df[cols])
    df = df.join(one_hot_encode)
    
    cols3 = ['Bond']
    one_hot_encode3 = pd.get_dummies(df[cols3])
    df = df.join(one_hot_encode3)
    
    cols_to_drop = ['Bond', 'Side', 'Time', 'QuotedPrice'] + ['NextMidPrice', 'Notional', 'Traded']
    
    return df.drop(columns=cols_to_drop)

In [None]:
def random_forest_hyperparameters(X_train, y_train, X_test, y_test):
    estimators = np.arange(5, 51, 1)
    train_rmses = []
    test_rmses = []
    train_sign_errors = []
    test_sign_errors = []
    
    for e in tqdm(estimators):
        regressor, preds, metrics = random_forest(X_train, y_train, X_test, y_test, e)
        train_rmses.append(metrics[0])
        test_rmses.append(metrics[1])
        train_sign_errors.append(metrics[2])
        test_sign_errors.append(metrics[3])
        
    plt.plot(estimators, train_rmses)
    plt.title('Train RMSE')
    plt.show()
    plt.plot(estimators, test_rmses)
    plt.title('Test RMSE')
    plt.show()
    
    plt.plot(estimators, train_sign_errors, label='Train')
    plt.plot(estimators, test_sign_errors, label='Test')
    plt.title('Sign Errors')
    plt.legend()
    plt.show()
    
    return regressor, [train_rmses, test_rmses, train_sign_errors, test_sign_errors]

In [None]:
def random_forest(X_train, y_train, X_test, y_test, e):
    regressor = RandomForestRegressor(n_estimators=e, random_state=0)
    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)
    train_rmse = (np.sqrt(metrics.mean_squared_error(regressor.predict(X_train), y_train)))
    test_rmse = (np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
    train_se = (np.sign(regressor.predict(X_train) -np.ones(len(X_train))) != np.sign(y_train - np.ones(len(X_train)))).sum()/len(X_train)
    test_se = (np.sign(y_pred - np.ones(len(y_pred))) != np.sign(y_test - np.ones(len(y_pred)))).sum()/len(y_test)
    return regressor, y_pred, [train_rmse, test_rmse, train_se, test_se]

In [None]:
def get_train_test(df_train, df_test):
    X_train = df_train.drop(columns=['NextMidP', 'MidPrice', 'Counterparty']).values
    y_train = df_train[['NextMidP']].values.ravel()
    X_test = df_test.drop(columns=['NextMidP', 'MidPrice', 'Counterparty']).values
    y_test = df_test[['NextMidP']].values.ravel()
    
    return X_train, y_train, X_test, y_test

In [None]:
counterparty_models = {}
counterparty_ecdfs = {}

In [None]:
def random_forest_for_counterparty(training, test, counterparty):
    train = add_features_rf(training.query("Counterparty == @counterparty").copy())
    test = add_features_rf(test.query("Counterparty == @counterparty").copy())
    
    train_midp = train[['MidPrice']].values.ravel()
    
    X_train, y_train, X_test, y_test = get_train_test(train, test)
    
    reg, y_pred, error_metrics = random_forest(X_train, y_train, X_test, y_test, 40)
    print(counterparty)
    print("Train RMSE =", error_metrics[0])
    print("Test RMSE =", error_metrics[1])
    print("Train sign error =", error_metrics[2])
    print("Test sign error =", error_metrics[3])
    
    train_preds = reg.predict(X_train)
    next_midp_pred = train_preds * train_midp
    next_midp_true = y_train * train_midp
    errors = next_midp_pred - next_midp_true
    print('Train RMSE in $ =', np.sqrt(np.mean(np.power(errors, 2))))
    ecdf = ECDF(errors)
    
    counterparty_models[counterparty] = reg
    counterparty_ecdfs[counterparty] = ecdf
    
    #counterparty_models[counterparty] = {'model':reg, 'ecdf':ecdf}

In [None]:
def base_pred_errors_for_counterparty(training, counterparty):
    df = training.query("Counterparty == @counterparty").copy()
    train_midp = df[['MidPrice']].values.ravel()
    train_preds = train_midp
    next_midp_pred = train_midp
    next_midp_true = df[['NextMidPrice']].values.ravel()
    errors = next_midp_pred - next_midp_true
    print('RMSE in $ =', np.sqrt(np.mean(np.power(errors, 2))))
    ecdf = ECDF(errors)
    #plt.hist(errors, bins=50);
    
    counterparty_ecdfs[counterparty] = ecdf

In [None]:
def get_bond_stdevs(training):
    trial_training = add_features(training.copy(deep=True))
    trial_training['MidPrice_diff'] = trial_training['NextMidPrice'] - trial_training['MidPrice']
    bond_stds = trial_training.groupby(['Bond', 'Notional_cuts'])['MidPrice_diff'].std().reset_index()
    return bond_stds

In [None]:
random_forest_for_counterparty(training, test, "Ctpy_0")

In [None]:
random_forest_for_counterparty(training, test, "Ctpy_2")

In [None]:
base_pred_errors_for_counterparty(training, "Ctpy_1")

In [None]:
base_pred_errors_for_counterparty(training, "Ctpy_3")

In [None]:
bond_stds = get_bond_stdevs(training)

In [None]:
def get_bond(trade):
    if trade.loc['Bond_Bond_0'] == 1:
        return 'Bond_0'
    if trade.loc['Bond_Bond_1'] == 1:
        return 'Bond_1'
    if trade.loc['Bond_Bond_2'] == 1:
        return 'Bond_2'

In [None]:
def get_side(trade):
    if trade.loc['Side_Bid'] == 1:
        return 'Bid'
    if trade.loc['Side_Offer'] == 1:
        return 'Offer'

In [None]:
def delta_range(curr_mid, next_mid_pred, sd, notional, side):
    higher = next_mid_pred > curr_mid
    equal = next_mid_pred == curr_mid
    # sd = sd*(notional+1)
    if side == "Offer":
        if higher:
            return np.arange(next_mid_pred, next_mid_pred + sd+0.001, 0.01)
        elif equal:
            return np.arange(next_mid_pred, next_mid_pred + sd+0.001, 0.01)
        else:
            return np.arange(curr_mid, next_mid_pred+0.001, 0.01)
        
    if side == "Bid":
        if higher:
            return np.arange(curr_mid, next_mid_pred)
        elif equal:
            return np.arange(next_mid_pred - sd, next_mid_pred+0.001, 0.01)
        else:
            return np.arange(next_mid_pred - sd, next_mid_pred+0.001, 0.01)

In [None]:
def profit_scenario(side):
    if side == "Offer":
        print('Profit if Quote Price is above Next Mid Price')
    elif side == "Bid":
        print('Profit if Quote Price is below Next Mid Price')

In [None]:
def get_quote_price(trade, identifier, counterparty_lambdas):
    midp = np.round(trade['MidPrice'], 2)
    next_midp_true = np.round(midp*trade['NextMidP'], 2)
    trade_rf = trade.drop(index=['NextMidP', 'MidPrice']).values.reshape(1, -1)
    
    trade_bond = get_bond(trade)
    trade_notional = int(trade['Notional_cuts'])
    trade_side = get_side(trade)
    
    print(f"Trade is for {trade_side} side")
    profit_scenario(trade_side)
    
    if identifier in counterparty_models.keys():
        print('Using model to generate midprice forecast')
        pred = counterparty_models[identifier].predict(trade_rf)[0]
        next_midp_pred = np.round(midp*pred, 2)
    else:
        print('Current midprice is estimate of next midprice')
        next_midp_pred = np.round(midp, 2)
    
    print(f"Current mid price is: {midp}")
    print(f"Predicted next mid price is: {next_midp_pred} and actual next mid price is: {next_midp_true}")
    
    
        
    sd = bond_stds.query("Bond == @trade_bond and Notional_cuts == @trade_notional").iloc[0]['MidPrice_diff']
    
    deltas = delta_range(midp, next_midp_pred, sd, trade_notional, trade_side)
    
    deltas_df = pd.DataFrame(deltas, columns=['Quotes'])
    deltas_df['Competitors'] = trade['Competitors']
    deltas_df['Notional_cuts'] = trade['Notional_cuts']
    deltas_df['Side_Bid'] = trade['Side_Bid']
    deltas_df['Side_Offer'] = trade['Side_Offer']
    deltas_df['quote_diff'] = (deltas_df['Quotes'] - midp)
    
    if trade_side == 'Offer':
        deltas_df['p_neg'] = counterparty_ecdfs[identifier](next_midp_pred - deltas_df['Quotes'])
    if trade_side == 'Bid':
        deltas_df['p_neg'] = 1 - counterparty_ecdfs[identifier](next_midp_pred - deltas_df['Quotes'])
    
    deltas_df['p_pos'] = 1 - deltas_df['p_neg']
    
    deltas_df['p_win'] = ABRmodel.predict_proba(deltas_df.drop(columns=['Quotes', 'p_neg', 'p_pos']).values)[:, 1]
    
    lambd = counterparty_lambdas[identifier]
    deltas_df['objective_func'] = deltas_df['p_win']*deltas_df['p_pos'] - lambd*(deltas_df['p_win']*deltas_df['p_neg'])
    
    if trade_side == "Offer":
        deltas_df = deltas_df[::-1].reset_index(drop=True)
    deltas_df['rate_of_change'] = (deltas_df['objective_func']/deltas_df['objective_func'].shift(-1)).fillna(0)
                                                                                    
    # find point where "utility starts diminishing
    best_quote = np.round(deltas_df.iloc[deltas_df['rate_of_change'].idxmax()-1]['Quotes'], 2)
    print(f"Best Quote price is: {best_quote}")
    
    return deltas_df, (math.floor(best_quote*100)/100)

In [None]:
trial_test = test.copy(deep=True)
trial_test = add_features_rf(trial_test)
trial_trade0 = trial_test.loc[995, :]
trial_trade1 = trial_test.loc[996, :]
trial_trade2 = trial_test.loc[990, :]
trial_trade3 = trial_test.loc[997, :]

In [None]:
trial_trade3

In [None]:
test.loc[995, :]

In [None]:
counterparty_lambdas = {'Ctpy_0': 0, 'Ctpy_1': 0, 'Ctpy_2': 0, 'Ctpy_3': 0}

In [None]:
delta_df, best_q = get_quote_price(trial_trade0.drop(index='Counterparty'), 
                                   trial_trade0.loc['Counterparty'], counterparty_lambdas)

In [None]:
delta_df

In [None]:
delta_df1, best_q1 = get_quote_price(trial_trade1.drop(index='Counterparty'), 
                                   trial_trade1.loc['Counterparty'], counterparty_lambdas)

In [None]:
delta_df1

In [None]:
delta_df2, best_q2 = get_quote_price(trial_trade2.drop(index='Counterparty'), 
                                   trial_trade2.loc['Counterparty'], counterparty_lambdas)

In [None]:
delta_df2

In [None]:
delta_df3, best_q3 = get_quote_price(trial_trade3.drop(index='Counterparty'), 
                                   trial_trade3.loc['Counterparty'], counterparty_lambdas)

In [None]:
delta_df3

### Random Analysis

In [None]:
df_analysis = training.query("Counterparty == 'Ctpy_1'").copy()
df_analysis = add_features(df_analysis)
train_midp = df_analysis[['MidPrice']].values.ravel()
train_preds = train_midp
next_midp_pred = train_midp
next_midp_true = df_analysis[['NextMidPrice']].values.ravel()
errors = next_midp_pred - next_midp_true
plt.hist(errors, bins=50);

In [None]:
df_analysis = training.query("Counterparty == 'Ctpy_3'").copy()
df_analysis = add_features(df_analysis)

In [None]:
train_midp = df_analysis[['MidPrice']].values.ravel()
train_preds = train_midp
next_midp_pred = train_midp
next_midp_true = df_analysis[['NextMidPrice']].values.ravel()
errors = next_midp_pred - next_midp_true

In [None]:
plt.hist(errors, bins=50);

In [None]:
side_dict = {'Offer':1, 'Bid':-1}
traded_dict = {'DONE':1, 'MISSED':np.nan}
target_mapping = {'MISSED': 0, 'DONE': 1}

In [None]:
df_analysis['side_tag'] = 1*(df_analysis['Side'] == 'Offer') + -1*(df_analysis['Side'] == 'Bid')
df_analysis['traded_tag'] = 1*(df_analysis['Traded'] == 1) + 0*(df_analysis['Traded'] == 0)
df_analysis.loc[df_analysis['traded_tag'] == 0,'traded_tag'] = np.nan
df_analysis['profit'] = (df_analysis['QuotedPrice'] - df_analysis['NextMidPrice'])*df_analysis['side_tag']*df_analysis['traded_tag']
df_analysis['profit'] = (df_analysis['QuotedPrice'] - df_analysis['NextMidPrice'])*df_analysis['side_tag']*df_analysis['traded_tag']

In [None]:
df_analysis.groupby(['Bond', 'Notional_cuts'])['profit'].describe().reset_index()

In [None]:
df_analysis['MidP_diff'] = df_analysis['NextMidPrice']/df_analysis['MidPrice'] - 1
df_analysis['Quote_diff'] = df_analysis['QuotedPrice']/df_analysis['MidPrice'] - 1
df_analysis['Abs_MidP_diff'] = np.abs(df_analysis['NextMidPrice']/df_analysis['MidPrice'] - 1)
df_analysis['log_notional'] = np.log(df_analysis['Notional'])

In [None]:
plt.figure(figsize=(12, 8))
sns.scatterplot(x='Quote_diff',y='log_notional', data=df_analysis[df_analysis['Side'] == 'Offer'], hue='Traded')
plt.show()