# Imports

In [None]:
# Basic utils
import pandas as pd
import numpy as np
import os

# Plotting
import seaborn as sns
import matplotlib.pyplot as plt

# Methods from de Prado
from deprado import *

# Read in Vector for Labeling

In [None]:
# Load the 2s20s swap steepness data set from disk
swap2s20s = pd.read_parquet('data/swap2s20s.parquet')

# Labeling input: the MID_PRICE column divided by 100 (decimal format)
y_data = swap2s20s['MID_PRICE'].div(100)

In [None]:
# Plot the steepness series, expressed in basis points
plt.style.use(['science', 'ieee', 'no-latex'])
steepness_bps = y_data * 10000  # decimal -> bps
fig, ax = plt.subplots()
ax.plot(steepness_bps)
ax.set_xlabel('Date')
ax.set_ylabel('Steepness [bps]')
plt.show()

# Read in the Feature Space

In [None]:
# Read in the feature space X.
# The path is a plain string literal; it has no placeholders, so no f-string is needed.
X = pd.read_parquet('data/features.parquet')

In [None]:
# Keep only the timestamps that occur in both y_data and X
print(X.shape, y_data.shape)  # shapes before alignment
mask = y_data.index.intersection(X.index)
X, y_data = X.loc[mask], y_data.loc[mask]
print(X.shape, y_data.shape)  # shapes after alignment

# Labeling
- For labeling the data we use the input vector y_data that contains the 2s20s steepness
- The function getEvents computes the times when barrier hits occur
- The function getBins computes the actual labels based on when the barriers got hit

In [None]:
# Barrier width in decimal format; multiply by 10000 to obtain bps (0.005 -> 50 bps)
trgtval = 0.005

# The constant 50 bps target has to be provided at every time step for the
# following functions, so broadcast it over the index of y_data
trgt = pd.Series(trgtval, index=y_data.index)

In [None]:
# Determine the barrier-hit events; getEvents runs on several worker
# processes (numThreads) to speed up the computation
e = getEvents(
    y_data,
    trgt.index,
    ptSl=[1, 1],
    trgt=trgt,
    minRet=0.00,
    numThreads=16,
    t1=False,
)

# Turn the events into labels
bins = getBins(e, y_data, trgtval)

In [None]:
# Discard every observation that did not receive a label or an event end time
y = bins['bin'].dropna()

# Rows of e whose 't1' is missing are removed by index label
missing_t1 = e['t1'].isnull()
e = e.drop(e.loc[missing_t1].index)

# After the drop above no missing 't1' is left, so t1 is simply an
# independent copy of that column
t1 = e['t1'].copy()

- Not all original points receive a label, because towards the end of the data a barrier may no longer be hit
- So we need to align X and y once more

In [None]:
# Re-align features and labels on the timestamps they have in common
print(X.shape, y.shape)  # shapes before alignment
mask = y.index.intersection(X.index)
X, y = X.loc[mask], y.loc[mask]
print(X.shape, y.shape)  # shapes after alignment

In [None]:
# Show the first and the last timestamp of the data used for the model
for position in (0, -1):
    print(X.index[position])

In [None]:
# Barrier labeling example: illustrate the barriers for one start date

y_copy = y_data.copy() * 10000  # steepness in bps
date = '2009-05-07'
event = e['t1'].loc[date]  # value of e['t1'] recorded for the start date
barrier_bps = trgtval * 10000
upper = y_copy.loc[date] + barrier_bps
lower = y_copy.loc[date] - barrier_bps

start = pd.Timestamp(date)
end = y_copy.index[-1]

# Plot
plt.style.use(['science','ieee','no-latex'])
plt.figure()
plt.plot(y_copy)

# Dotted horizontal lines: upper barrier, lower barrier and the starting level
for level in (upper, lower, y_copy.loc[date]):
    plt.hlines(level, start, end, ls=':', color='r')

# Solid vertical line marking the start date between the two barriers
plt.vlines(start, upper, lower, color='r')

plt.annotate(
    "Lower barrier is hit first",
    xy=(pd.Timestamp(event), y_copy.loc[event]),
    xytext=(pd.Timestamp(event), lower - 200),
    arrowprops=dict(arrowstyle="->", color='r'),
    color='r',
)
plt.ylabel('Steepness [bps]')
plt.xlabel('Date')
plt.show()


In [None]:
# Plot the steepness together with the labels assigned to each point

y_copy = y_data.copy() * 10000  # steepness in bps
labeled_bps = y_copy.loc[bins.index]

plt.style.use(['science','ieee','no-latex'])
plt.figure()
plt.plot(y_copy)

# One scatter layer per label value: -1 in red, 1 in green
for label, colour in ((-1, 'red'), (1, 'green')):
    has_label = bins['bin'] == label
    plt.scatter(bins['bin'].index[has_label], labeled_bps.loc[has_label], c=colour, s=5)

plt.legend(['Steepness', 'Label "-1"', 'Label "1"'])
plt.xlabel('Date')
plt.ylabel('Steepness [bps]')
plt.show()


# Correlation Matrix
- A clustering technique is used to re-sort the features so that they form visible blocks in the matrix
- The clustering algorithm was not included in the thesis because the features could also be sorted manually in this case
- Using the clustering algorithm is therefore not strictly necessary here, but it is a convenient way to reveal blocks of high correlation without re-sorting the features by hand

In [None]:
# Correlation matrix between the feature columns, labelled by feature name
feature_names = X.columns
corr0 = pd.DataFrame(
    np.corrcoef(X.to_numpy(), rowvar=False),
    index=feature_names,
    columns=feature_names,
)

# Cluster the features; corr1 is the matrix that gets plotted
corr1, clstrs, silh = clusterKMeansTop(corr0=corr0, maxNumClusters=12, n_init=100)

# Correlations lie in [-1, 1], so pin the colour scale to that range
sns.heatmap(corr1, vmin=-1, vmax=1)
plt.show()

# Variation of Information Matrix

In [None]:
# Normalised variation-of-information matrix of the features
corr0var = varInfoMat(X, norm=True)

# Same clustering step as used for the correlation matrix
corr1var, clstrsvar, silhvar = clusterKMeansTop(
    corr0=corr0var,
    maxNumClusters=12,
    n_init=100,
)

# Colour scale fixed to [0, 1]
sns.heatmap(corr1var, vmin=0, vmax=1)
plt.show()

# Compute Average Uniqueness
- this step also uses multiprocessing

In [None]:
# Count concurrent events, computed in parallel over the event index
numCoEvents = mpPandasObj(
    mpNumCoEvents,
    ('molecule', e.index),
    numThreads=12,
    closeIdx=y.index,
    t1=e['t1'],
)

# Keep the last entry for every duplicated timestamp, then put the counts on
# the label index (timestamps without a count are set to 0)
is_duplicate = numCoEvents.index.duplicated(keep='last')
numCoEvents = numCoEvents.loc[~is_duplicate]
numCoEvents = numCoEvents.reindex(y.index).fillna(0)

# Per-event uniqueness weights, stored in column 'tW'
out = pd.DataFrame()
out['tW'] = mpPandasObj(
    mpSampleTW,
    ('molecule', e.index),
    numThreads=12,
    t1=e['t1'],
    numCoEvents=numCoEvents,
)

In [None]:
# Mean of the 'tW' column; passed as max_samples to the bagging classifier
avgU = out.loc[:, 'tW'].mean()
avgU

# Drop Features
- Here we drop all features that share a high amount of information, as motivated in the thesis

In [None]:
# Features removed from X (modified in place)
colstodrop = [
    'M1 Money Supply',
    'M2 Money Supply',
    'M3 Money Supply',
    'GDP',
    'M2 Money Velocity',
]
X.drop(columns=colstodrop, inplace=True)

In [None]:
# Plot the remaining features from 2016 onwards
plt.style.use(['science','ieee','no-latex'])

# Solid lines only, cycling through five colours
cycler = plt.cycler(
    linestyle=['solid'] * 5,
    color=['black', 'red', 'blue', 'green', 'grey'],
)
fig, ax = plt.subplots()
ax.set_prop_cycle(cycler)
ax.plot(X.loc['2016-01-01':])
ax.legend(X.columns)
ax.set_xlabel('Date')
ax.set_ylabel('Feature Value')
plt.show()


# Permutation Feature Importance
- Finally we use the feature importance technique
- Beware that the values here can differ slightly from those presented in the thesis
- This is due to the randomized nature of training decision trees in the bagging classifier
- However no large discrepancies should occur because the method is statistically robust

In [None]:
# Set up the base classifier.
# The class names are spelled in plain ASCII. The previous spelling used the
# single-character "fi" ligature (U+FB01), which only resolved because Python
# NFKC-normalises identifiers and which breaks text search and linters.
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

# Single decision tree: entropy criterion, one candidate feature per split,
# balanced class weights
clf = DecisionTreeClassifier(
    criterion='entropy',
    max_features=1,
    class_weight='balanced',
    min_weight_fraction_leaf=0,
)

# Bag 1000 of these trees; each tree sees all features and a fraction avgU
# (the average uniqueness) of the samples
clf = BaggingClassifier(
    estimator=clf,
    n_estimators=1000,
    max_features=1.,
    max_samples=avgU,
    oob_score=False,
)

In [None]:
# Feature importance of clf on (X, y); the result provides the 'mean' and
# 'std' columns that are plotted afterwards.
# NOTE(review): the positional 10 is presumably the number of CV folds - confirm in deprado
imp = featImpMDA(clf, X, y, 10, t1)

In [None]:
# Horizontal dot plot: mean importance per feature with its standard deviation
plt.style.use(['science','ieee','no-latex'])
fig, ax = plt.subplots()

y_pos = np.arange(imp.shape[0])  # one row per feature
ax.scatter(imp['mean'], y_pos, marker='o', s=10)
ax.errorbar(imp['mean'], y_pos, xerr=imp['std'], capsize=2, fmt='none')

# Label every row with its feature name
ax.set_yticks(y_pos)
ax.set_yticklabels(imp.index)
ax.set_xlabel('Feature Importance Value F')
ax.legend(['Mean', 'Standard Deviation'])
plt.show()

# Performance of Base Classifier
- Here we assess the performance of the base classifier by using 10-fold purged cross validation

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Evaluate the base classifier with purged k-fold cross validation.
# NOTE(review): PurgedKFold and tqdm are not imported explicitly in this
# notebook; presumably they come from `from deprado import *` - confirm.
n_splits = 10
cvGen = PurgedKFold(n_splits=n_splits, t1=t1)  # use purged cv
scr0, scr1 = pd.Series(dtype=np.float64), pd.DataFrame(columns=clstrs.keys())  # make empty scorer

# Fix the label set once so that every fold yields a confusion matrix of the
# same shape and ordering, even if a class is absent from a test fold.
# Otherwise the matrices cannot be averaged element-wise afterwards.
class_labels = np.unique(y)

cm = []  # one confusion matrix per fold
for i, (train, test) in tqdm(enumerate(cvGen.split(X=X, y=y)), total=n_splits):

    # train and test by cv folds
    X0, y0 = X.iloc[train, :], y.iloc[train]
    X1, y1 = X.iloc[test, :], y.iloc[test]

    # fit classifier on the training fold
    fit = clf.fit(X=X0, y=y0)
    prob = fit.predict_proba(X1)

    # predict once and reuse the result for both metrics; predicting with
    # the full ensemble is the expensive part of the loop
    pred = fit.predict(X1)

    # compute accuracy
    scr0.loc[i] = accuracy_score(y1, pred)

    # compute confusion matrix
    cm.append(confusion_matrix(y1, pred, labels=class_labels))

    # NOTE: alternative scorer based on the class probabilities:
    # scr0.loc[i] = -log_loss(y1, prob, labels=clf.classes_)


The accuracy will of course also differ, but it should lie in the range presented in the thesis

In [None]:
# Element-wise average of the per-fold confusion matrices
cm_avg = np.mean(cm, axis=0)

# Integer version for display: round to the nearest count instead of
# truncating, which would understate every cell by up to one observation
cm_mean = np.rint(cm_avg).astype(int)

# Accuracy is computed from the un-rounded averages so that integer
# conversion does not bias the result
tp = cm_avg[1, 1]  # True Positives
tn = cm_avg[0, 0]  # True Negatives
total = tp + tn + cm_avg[0, 1] + cm_avg[1, 0]  # Total observations

acc = (tp + tn) / total  # Accuracy
print(f'Accuracy: {acc:.4f}')

In [None]:
# Annotated heatmap of the averaged confusion matrix (rows: actual, columns: predicted)
class_names = ["Negative", "Positive"]
ax = sns.heatmap(
    cm_mean,
    annot=True,
    fmt="d",  # integer cell annotations
    cbar=False,
    xticklabels=class_names,
    yticklabels=class_names,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()