In [1]:
import copy
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import mean_squared_error,accuracy_score, f1_score, recall_score, precision_score, roc_curve, roc_auc_score, classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import TimeSeriesSplit
from math import sqrt
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import PolynomialFeatures
import warnings
warnings.filterwarnings("ignore")

### Reading in Data

In [2]:
days_in_month = 22
days_in_week = 5
days_in_year = 252
    
def getData():
    """
     Reads all the data from the Data file and constructs df_main which holds all the predictors
     input: nothing - Make sure you have the .xlsx file in the same folder as this .py file
     output: 
         df_main- contains all the predictors
         df_spot- Spot price for all the currencies
         df_eurfwd- 1M EUR FWD rates
         df_Implied_vols- ATM Implied Volatilities for all the FX pairs
         df_deporates- 3M deposit rates for the given currencies
         df_realized_vol - 2 month realized vols for all the currencies in df_spot
    """    
    sheetname       = ["FX SPOT", "ATM VOLS","3M 25D RR", "3M DEPOSIT RATES","10Y YIELD","EQUITY INDICES","COMDTY","CREDIT SPREADS","IMM POSITIONING"]
    levels          = ["ATM VOLS", "3M 25D RR"]
    filename        = "DataTables.xlsx"
    
    # Dataframe of Spot prices
    df_spot         = pd.read_excel(filename,"FX SPOT",parse_dates=True, index_col='Dates')
    df_spot         = df_spot[df_spot.index.dayofweek < days_in_week]
    
    # Dataframe of EURUSD 1M forward rates prices
    df_eurfwd       = pd.read_excel(filename,"1M EUR FWD",parse_dates=True, index_col='Dates')
    df_eurfwd       = df_eurfwd[df_eurfwd.index.dayofweek < days_in_week]
    
    # Dataframe of 1M Implied Volatilities
    df_Implied_vols = pd.read_excel(filename,"ATM VOLS",parse_dates=True, index_col='Dates')
    df_Implied_vols = df_Implied_vols[df_Implied_vols.index.dayofweek < days_in_week]
    
    # Dataframe of Deposit rates
    df_deporates    = pd.read_excel(filename,"3M DEPOSIT RATES",parse_dates=True, index_col='Dates')
    df_deporates    = df_deporates[df_deporates.index.dayofweek < days_in_week]
    
    #df_main holds all the data - predictors all 373 of them
    df_main         = pd.DataFrame(index = df_spot.index)
    
    # Calculating spot returns to be further used in calculating 2M realized volatilities
    df_returns              = df_spot.pct_change()
    df_realized_vol         = pd.DataFrame(df_returns.rolling(window = days_in_month*2).std()*np.sqrt(days_in_year), index = df_spot.index, columns = df_spot.columns).shift(1)
    df_realized_vol.columns = [str(col) + 'Vol2M' for col in df_realized_vol.columns]
    
    
    # Calculating 1W change in realized Volatilities
    df_1W_vol_per_change            = (df_realized_vol.astype(float) / df_realized_vol.astype(float).shift(days_in_week) - 1) 
    df_1W_vol_per_change.columns    = [str(col) + '1W' for col in df_1W_vol_per_change.columns]
    
    
    # Calculating 1month change in realized Volatilities
    df_1M_vol_per_change            = (df_realized_vol.astype(float) / df_realized_vol.astype(float).shift(days_in_month) - 1) 
    df_1M_vol_per_change.columns    = [str(col) + '1M' for col in df_1M_vol_per_change.columns]
    
    
    # Adding the Volatilite, 1W change in vols and 1M change in realized vols to the master dataframe
    df_main     =   df_main.join(df_realized_vol)
    df_main     =   df_main.join(df_1W_vol_per_change)
    df_main     =   df_main.join(df_1M_vol_per_change)
    
    
    #Looping through all the sheets and individual predictors to calculate 1week and 1month change 
    #and joining them in the Master dataframe - df_main
    for sheet in sheetname:
        df      =       pd.DataFrame()
        df      =       pd.read_excel(filename,sheet,parse_dates=True, index_col='Dates')
        df      =       df[df.index.dayofweek < days_in_week] # removing all the weekend dates from the dataset
    
        if sheet in levels:
            df_main     =       df_main.join(df.shift(1))
        
        print("Reading sheet", sheet)
        df_1W_per_change            = (df.astype(float) / df.astype(float).shift(days_in_week) - 1) 
        df_1W_per_change.columns    = [str(col) + '1W' for col in df_1W_per_change.columns]
        df_1M_per_change            = (df.astype(float) / df.astype(float).shift(days_in_month) - 1) 
        df_1M_per_change.columns    = [str(col) + '1M' for col in df_1M_per_change.columns]
        
        df_main         =   df_main.join(df_1W_per_change.shift(1))
        df_main         =   df_main.join(df_1M_per_change.shift(1))
        
    print("Reading sheet JPM EASI")
    df_easi         =   pd.read_excel(filename,"JPM EASI",parse_dates=True, index_col='Dates')
    df_easi         =   df_easi[df_easi.index.dayofweek < days_in_week]
    df_easi.fillna(0, inplace = True)
    
    # JPM EASI is an index value between -100 to +100, so we have divided by total range (200) to find out change in 1W and 1M
    df_easi_1W          = (df_easi.astype(float) - df_easi.astype(float).shift(days_in_week))/ 200
    df_easi_1W.columns  = [str(col) + '1W' for col in df_easi_1W.columns]
    df_easi_1M          = (df_easi.astype(float) - df_easi.astype(float).shift(days_in_month))/200 
    df_easi_1M.columns  = [str(col) + '1M' for col in df_easi_1M.columns]
    df_main             = df_main.join(df_easi_1W.shift(1))
    df_main             = df_main.join(df_easi_1M.shift(1))
    
    return df_main, df_spot, df_eurfwd, df_Implied_vols, df_deporates, df_realized_vol

df_main, _, _, _, _, _ = getData()
df_main = df_main.replace([np.inf, -np.inf], np.nan) # replace infinity with nan
df_main = df_main.fillna(0)
df_main['date'] = df_main.index

Reading sheet FX SPOT
Reading sheet ATM VOLS
Reading sheet 3M 25D RR
Reading sheet 3M DEPOSIT RATES
Reading sheet 10Y YIELD
Reading sheet EQUITY INDICES
Reading sheet COMDTY
Reading sheet CREDIT SPREADS
Reading sheet IMM POSITIONING
Reading sheet JPM EASI


### Create Targets

In [3]:
def get_returns_all(cmdata):
    retdata = copy.deepcopy(cmdata)
    cols = [x for x in list(cmdata.columns) if x != 'date']
    retdata[cols] = (cmdata[cols].shift(1)/cmdata[cols].shift(0) - 1) > 0
    retdata[[x+'_1d' for x in cols]] = (cmdata[cols].shift(0)/cmdata[cols].shift(-1) - 1) 
    retdata[[x+'_1w' for x in cols]] = (cmdata[cols].shift(0)/cmdata[cols].shift(-days_in_week) - 1) 
    retdata[[x+'_1m' for x in cols]] = (cmdata[cols].shift(0)/cmdata[cols].shift(-days_in_month) - 1)
    retdata[[x+'_1y' for x in cols]] = (cmdata[cols].shift(0)/cmdata[cols].shift(-days_in_year) - 1) 
    '''
    retdata = retdata.dropna()
    def bin_targets(x):
        if x<0: return -1
        if x>0: return 1
        return 0

    for col in cols:
        retdata[col]=retdata[col].apply(bin_targets)
    '''
    return retdata.dropna().reset_index(drop=True)

def get_returns_overall(cmdata):
    retdata = pd.DataFrame({'date':cmdata.date, 'returns':cmdata.sum(axis=1)})
    cols = [x for x in list(cmdata.columns) if x != 'date']
    retdata['returns'] = (retdata['returns'].shift(1)/retdata['returns'].shift(0) - 1) > 0
    retdata[[x+'_1d' for x in cols]] = (cmdata[cols].shift(0)/cmdata[cols].shift(-1) - 1) 
    retdata[[x+'_1w' for x in cols]] = (cmdata[cols].shift(0)/cmdata[cols].shift(-days_in_week) - 1) 
    retdata[[x+'_1m' for x in cols]] = (cmdata[cols].shift(0)/cmdata[cols].shift(-days_in_month) - 1)
    retdata[[x+'_1y' for x in cols]] = (cmdata[cols].shift(0)/cmdata[cols].shift(-days_in_year) - 1) 
    return retdata.dropna().reset_index(drop=True)

# True = Up
# False = Down or same value/0% return
cmdata = pd.read_excel('Commodity Data.xlsx')
#cmdata = cmdata.dropna()
cmdata=cmdata.fillna(0)
ret_df = get_returns_overall(cmdata)
#ret_df = get_returns_all(cmdata)

### Train Test Split

In [4]:
def traintestsplit(X,Y,split):
    """
     Splits the data set into training & Quarantine/test set from the given dataframes X & Y
     input: X & Y Dataframes as formed in the main, 
            split - the % split between training & test/quarantine dataset
     output: trainX,testX, trainY, testY

    """
    trainX      =   X.iloc[0:round(split*X.shape[0]),:]
    testX       =   X.iloc[round(split*X.shape[0]):,:]
    trainY      =   Y.iloc[0:round(split*Y.shape[0]),:]
    testY       =   Y.iloc[round(split*Y.shape[0]):,:]
    
    return trainX,testX, trainY, testY

### Optimize Hyperparameters

In [5]:
def optimizeHyperParameters(train_X,train_Y, model, cv_folds):
    """
     Optimize the models Hyper Parameter.
     For simplicity we have used only one Hyperparameter for the model to regularize
     input: training and test data set in dataframe format, the algorithm to be applied and the number of folds for cross validation
     output: index of the tuned HyperParameter,training accuracy & test accuracy
    """
    
    tscv = TimeSeriesSplit(n_splits=cv_folds)
    acctrain    = []
    acctest     = []
    error       = []

    # Iterate over all the 10 folds for each parameter     
    for train, test in tscv.split(train_X,train_Y):

        model.fit(train_X[train], train_Y[train])
        pred    =   model.predict(train_X[test])

        # Append the scores to the respective training and test scores list
        acctrain.append(model.score(train_X[train], train_Y[train]))
        acctest.append(model.score(train_X[test], train_Y[test]))

    #Compute accuracy on the training set
    train_accuracy   = np.mean(acctrain)

    #Compute accuracy on the Cross Validation set
    test_accuracy    = np.mean(acctest)

    return train_accuracy, test_accuracy

In [6]:
#Overall
ret_df = get_returns_overall(cmdata)
full_df=pd.merge(left=ret_df, right=df_main, left_on='date', right_on='date')
full_df = full_df[full_df[[x for x in df_main.columns if x!='date']].sum(axis=1)!=0] # get rid of 0-value features
print("Total Matching Dates: {}".format((len(full_df))))
X = full_df[[x for x in full_df.columns if x not in ['date', 'returns']]]
Y = full_df[['date','returns']]
trainX, testX, trainY, testY = traintestsplit(X, Y, 0.8)
trainX, testX, trainY, testY = np.array(trainX), np.array(testX), np.array(trainY['returns']), np.array(testY['returns'])

trainY_ = trainY
testY_ = testY

Total Matching Dates: 2615


In [7]:
#Specific Commodity

#ret_df = get_returns_all(cmdata)
#full_df=pd.merge(left=ret_df, right=df_main, left_on='date', right_on='date')
#full_df = full_df[full_df[[x for x in df_main.columns if x!='date']].sum(axis=1)!=0] # get rid of 0-value features
#print("Total Matching Dates: {}".format((len(full_df))))
#X = full_df[[x for x in full_df.columns if x not in cmdata.columns]]
#Y = full_df[[x for x in cmdata.columns if x!='date']]
#trainX, testX, trainY, testY = traintestsplit(X, Y, 0.8)
#trainX, testX, trainY, testY = np.array(trainX), np.array(testX), np.array(trainY), np.array(testY)

### If using specific commodity
#commodity_idx = 0 # 0 = wheat
#trainY_ = trainY[:,commodity_idx]
#testY_ = testY[:,commodity_idx]

### PCA

In [8]:
from sklearn.decomposition import PCA
pca = PCA(n_components=50)
pca.fit(trainX)
trainX_pca = pca.transform(trainX)
testX_pca = pca.transform(testX)

### Classifiers

In [162]:
clf1 = MLPClassifier(hidden_layer_sizes=(32,16,8,4,2))
clf2 = XGBClassifier(max_depth=20)
clf3 = LogisticRegression()
clf4 = LinearSVC(C=0.1)
clf5 = RandomForestClassifier(max_depth=20)
clf6 = DecisionTreeClassifier(max_depth=20)
clf7 = KNeighborsClassifier(n_neighbors=10)

In [75]:
# First train the neural network onto the data then evalulate the test performance
cv_res1 = optimizeHyperParameters(trainX, trainY_, clf1, 10)
print("Training Accuracy:{:.03f}\t CV Accuracy: {:.03f}".format(cv_res1[0], cv_res1[1]))

Training Accuracy:0.761	 CV Accuracy: 0.531


In [76]:
clf1.fit(trainX, trainY_)
preds1 = clf1.predict(testX)
print(accuracy_score(testY_, preds1))
print(f1_score(testY_, preds1))
print(recall_score(testY_, preds1))
print(precision_score(testY_, preds1))
print(roc_auc_score(testY_, preds1))
print(confusion_matrix(testY_, preds1))
print(classification_report(testY_, preds1))

0.4397705544933078
0.5684830633284241
0.8464912280701754
0.4279379157427938
0.4859574784418674
[[ 37 258]
 [ 35 193]]
              precision    recall  f1-score   support

       False       0.51      0.13      0.20       295
        True       0.43      0.85      0.57       228

   micro avg       0.44      0.44      0.44       523
   macro avg       0.47      0.49      0.39       523
weighted avg       0.48      0.44      0.36       523



In [163]:
# The train XGBoost onto the data then evalulate the test performance
cv_res2 = optimizeHyperParameters(trainX, trainY_, clf2, 10)
print("Training Accuracy:{:.03f}\t CV Accuracy: {:.03f}".format(cv_res2[0], cv_res2[1]))

Training Accuracy:1.000	 CV Accuracy: 0.561


In [164]:
clf2.fit(trainX, trainY_)
preds2 = clf2.predict(testX)
print(accuracy_score(testY_, preds2))
print(f1_score(testY_, preds2))
print(recall_score(testY_, preds2))
print(precision_score(testY_, preds2))
print(roc_auc_score(testY_, preds2))
print(confusion_matrix(testY_, preds2))
print(classification_report(testY_, preds2))

0.5487571701720841
0.48471615720524025
0.4868421052631579
0.4826086956521739
0.5417261373773417
[[176 119]
 [117 111]]
              precision    recall  f1-score   support

       False       0.60      0.60      0.60       295
        True       0.48      0.49      0.48       228

   micro avg       0.55      0.55      0.55       523
   macro avg       0.54      0.54      0.54       523
weighted avg       0.55      0.55      0.55       523



In [124]:
# The train the Logistic Regression onto the data then evalulate the test performance
cv_res3 = optimizeHyperParameters(trainX, trainY_, clf3, 10)
print("Training Accuracy:{:.03f}\t CV Accuracy: {:.03f}".format(cv_res3[0], cv_res3[1]))

Training Accuracy:0.728	 CV Accuracy: 0.506


In [125]:
clf3.fit(trainX, trainY_)
preds3 = clf3.predict(testX)
print(accuracy_score(testY_, preds3))
print(f1_score(testY_, preds3))
print(recall_score(testY_, preds3))
print(precision_score(testY_, preds3))
print(roc_auc_score(testY_, preds3))
print(confusion_matrix(testY_, preds3))
print(classification_report(testY_, preds3))

0.4474187380497132
0.5614567526555386
0.8114035087719298
0.42923433874709976
0.48875260184359204
[[ 49 246]
 [ 43 185]]
              precision    recall  f1-score   support

       False       0.53      0.17      0.25       295
        True       0.43      0.81      0.56       228

   micro avg       0.45      0.45      0.45       523
   macro avg       0.48      0.49      0.41       523
weighted avg       0.49      0.45      0.39       523



In [126]:
# The train the Linear SVC onto the data then evalulate the test performance
cv_res4 = optimizeHyperParameters(trainX, trainY_, clf4, 10)
print("Training Accuracy:{:.03f}\t CV Accuracy: {:.03f}".format(cv_res4[0], cv_res4[1]))

Training Accuracy:0.630	 CV Accuracy: 0.503


In [127]:
clf4.fit(trainX, trainY_)
preds4 = clf4.predict(testX)
print(accuracy_score(testY_, preds4))
print(f1_score(testY_, preds4))
print(recall_score(testY_, preds4))
print(precision_score(testY_, preds4))
print(roc_auc_score(testY_, preds4))
print(confusion_matrix(testY_, preds4))
print(classification_report(testY_, preds4))

0.5009560229445507
0.4727272727272727
0.5131578947368421
0.43820224719101125
0.5023416592328278
[[145 150]
 [111 117]]
              precision    recall  f1-score   support

       False       0.57      0.49      0.53       295
        True       0.44      0.51      0.47       228

   micro avg       0.50      0.50      0.50       523
   macro avg       0.50      0.50      0.50       523
weighted avg       0.51      0.50      0.50       523



In [128]:
# The train the Random Forest Classifier onto the data then evalulate the test performance
cv_res5 = optimizeHyperParameters(trainX, trainY_, clf5, 10)
print("Training Accuracy:{:.03f}\t CV Accuracy: {:.03f}".format(cv_res5[0], cv_res5[1]))

Training Accuracy:0.984	 CV Accuracy: 0.505


In [129]:
clf5.fit(trainX, trainY_)
preds5 = clf5.predict(testX)
print(accuracy_score(testY_, preds5))
print(f1_score(testY_, preds5))
print(recall_score(testY_, preds5))
print(precision_score(testY_, preds5))
print(roc_auc_score(testY_, preds5))
print(confusion_matrix(testY_, preds5))
print(classification_report(testY_, preds5))

0.5411089866156787
0.4230769230769231
0.38596491228070173
0.46808510638297873
0.523490930716622
[[195 100]
 [140  88]]
              precision    recall  f1-score   support

       False       0.58      0.66      0.62       295
        True       0.47      0.39      0.42       228

   micro avg       0.54      0.54      0.54       523
   macro avg       0.53      0.52      0.52       523
weighted avg       0.53      0.54      0.53       523



In [130]:
# The train the Deicision tree Classifier onto the data then evalulate the test performance
cv_res6 = optimizeHyperParameters(trainX, trainY_, clf6, 10)
print("Training Accuracy:{:.03f}\t CV Accuracy: {:.03f}".format(cv_res6[0], cv_res6[1]))

Training Accuracy:1.000	 CV Accuracy: 0.497


In [131]:
clf6.fit(trainX, trainY_)
preds6 = clf6.predict(testX)
print(accuracy_score(testY_, preds6))
print(f1_score(testY_, preds6))
print(recall_score(testY_, preds6))
print(precision_score(testY_, preds6))
print(roc_auc_score(testY_, preds6))
print(confusion_matrix(testY_, preds6))
print(classification_report(testY_, preds6))

0.5105162523900574
0.43612334801762115
0.4342105263157895
0.43805309734513276
0.5018510258697592
[[168 127]
 [129  99]]
              precision    recall  f1-score   support

       False       0.57      0.57      0.57       295
        True       0.44      0.43      0.44       228

   micro avg       0.51      0.51      0.51       523
   macro avg       0.50      0.50      0.50       523
weighted avg       0.51      0.51      0.51       523



In [142]:
# The train the K Nearest Neighbours Classifier onto the data then evalulate the test performance
cv_res7 = optimizeHyperParameters(trainX, trainY_, clf7, 10)
print("Training Accuracy:{:.03f}\t CV Accuracy: {:.03f}".format(cv_res7[0], cv_res7[1]))

Training Accuracy:0.625	 CV Accuracy: 0.518


In [143]:
clf7.fit(trainX, trainY_)
preds7 = clf7.predict(testX)
print(accuracy_score(testY_, preds7))
print(f1_score(testY_, preds7))
print(recall_score(testY_, preds7))
print(precision_score(testY_, preds7))
print(roc_auc_score(testY_, preds7))
print(confusion_matrix(testY_, preds7))
print(classification_report(testY_, preds7))

0.49521988527724664
0.46987951807228917
0.5131578947368421
0.43333333333333335
0.4972569134701159
[[142 153]
 [111 117]]
              precision    recall  f1-score   support

       False       0.56      0.48      0.52       295
        True       0.43      0.51      0.47       228

   micro avg       0.50      0.50      0.50       523
   macro avg       0.50      0.50      0.49       523
weighted avg       0.51      0.50      0.50       523

