In [None]:
import time
import warnings

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib 
%matplotlib inline

#from sklearn import cluster, datasets, mixture
#from sklearn.neighbors import kneighbors_graph
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

from itertools import cycle, islice

np.random.seed(0)

In [None]:
path = "\\Users\\pierreguilleminet\\Google Drive\\IA\\ML EY"
if(path[-1] != "\\"):
    path += "\\"

csv_file = "Price.csv"
print(path + csv_file)


In [None]:
data = pd.DataFrame.from_csv(csv_file, sep='\t')
#data = pd.read_csv(path + csv_file, sep='\t')

In [None]:
print(data.shape)
print(list(data.columns.values))
print(data["Price_outlier"].shape)

In [None]:
input_to_keep = ['Strike', 'Rate', 'Vol', 'Maturity']
input_to_drop = []

for x in data.columns.values:
    if x not in input_to_keep:
        input_to_drop.append(x)
        
X = data.drop(labels=input_to_drop, axis=1)
y = data["Price"]

print(list(X.columns.values))

In [6]:
train_set = 1 # to train the RNN
test_set = 1-train_set # to test the calibration
validation_set = 0 # to test several sets of hyperparameters
final_test_set = 0 # to test the best calibration 

probs = np.random.rand(len(data))
training_mask = probs < train_set
test_mask = (probs>=train_set) & (probs < train_set+test_set)
validation_mask = (probs>=train_set+test_set) & (probs < train_set+test_set+validation_set)
final_test_mask = probs >= train_set+test_set+validation_set

X_train = X[training_mask]
X_test = X[test_mask]
X_validation = X[validation_mask]
X_final_test = X[final_test_mask]

y_train = y[training_mask]
y_test = y[test_mask]
y_validation = y[validation_mask]
y_final_test = y[final_test_mask]

In [7]:
print("train_set:" + str(len(X_train)) + " (" + str(round(len(X_train) / len(data), 2)) + ")")
print("test_set:" + str(len(X_test)) + " (" + str(round(len(X_test) / len(data), 2)) + ")")
print("validation_set:" + str(len(X_validation)) + " (" + str(round(len(X_validation) / len(data), 2)) + ")")
print("final_test_set:" + str(len(X_final_test)) + " (" + str(round(len(X_final_test) / len(data), 2)) + ")")
print("Check: " + str(len(data) == len(X_train) + len(X_test)+len(X_validation)+len(X_final_test)))

train_set:475200 (1.0)
test_set:0 (0.0)
validation_set:0 (0.0)
final_test_set:0 (0.0)
Check: True


In [8]:
# Setting RNN parameters
# http://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html#sklearn.neural_network.MLPRegressor

params_default = {
    "hidden_layer_sizes" : (1, ), 
    "activation" : "relu", 
    "solver" : "adam", 
    "alpha" : 0.0001, 
    "batch_size" : "auto", 
    "learning_rate" : "constant", 
    "learning_rate_init" : 0.001, 
    "power_t" : 0.5, 
    "max_iter" : 200, 
    "shuffle" : True, 
    "random_state" : None, 
    "tol" : 0.0001, 
    "verbose" : False, 
    "warm_start" : False, 
    "momentum" : 0.9, 
    "nesterovs_momentum" : True, 
    "early_stopping" : False, 
    "validation_fraction" : 0.1, 
    "beta_1" : 0.9, 
    "beta_2" : 0.999, 
    "epsilon" : 1e-08
}

In [9]:
algo_params = {
    "hidden_layer_sizes" : (100, 100, 100, 100),
    "batch_size" : "auto",
}

params = params_default.copy()
params.update(algo_params)
print(params)

{'hidden_layer_sizes': (100, 100, 100, 100), 'activation': 'relu', 'solver': 'adam', 'alpha': 0.0001, 'batch_size': 'auto', 'learning_rate': 'constant', 'learning_rate_init': 0.001, 'power_t': 0.5, 'max_iter': 200, 'shuffle': True, 'random_state': None, 'tol': 0.0001, 'verbose': False, 'warm_start': False, 'momentum': 0.9, 'nesterovs_momentum': True, 'early_stopping': False, 'validation_fraction': 0.1, 'beta_1': 0.9, 'beta_2': 0.999, 'epsilon': 1e-08}


In [None]:
rnn = MLPRegressor(
    hidden_layer_sizes =  params["hidden_layer_sizes"], 
    activation =  params["activation"],
    solver =  params["solver"],
    alpha =  params["alpha"], 
    batch_size =  params["batch_size"],
    learning_rate =  params["learning_rate"], 
    learning_rate_init =  params["learning_rate_init"], 
    power_t =  params["power_t"], 
    max_iter =  params["max_iter"], 
    shuffle =  params["shuffle"], 
    random_state =  params["random_state"], 
    tol =  params["tol"], 
    verbose =  params["verbose"], 
    warm_start =  params["warm_start"], 
    momentum =  params["momentum"], 
    nesterovs_momentum =  params["nesterovs_momentum"], 
    early_stopping =  params["early_stopping"], 
    validation_fraction =  params["validation_fraction"], 
    beta_1 =  params["beta_1"], 
    beta_2 =  params["beta_2"], 
    epsilon =  params["epsilon"]
)

In [10]:
X = StandardScaler().fit_transform(X)
if train_set > 0: 
    X_train = StandardScaler().fit_transform(X_train)
    #y_train = StandardScaler().fit_transform(y_train)
if test_set > 0: 
    test_set = StandardScaler().fit_transform(test_set)
    #y_test = StandardScaler().fit_transform(y_test)
if validation_set > 0: 
    validation_set = StandardScaler().fit_transform(validation_set)
    #y_validation = StandardScaler().fit_transform(y_validation)
if final_test_set > 0: 
    final_test_set = StandardScaler().fit_transform(final_test_set)
    #y_final_test = StandardScaler().fit_transform(y_final_test)

In [None]:
rnn.fit(X_train,y_train)
#rnn.fit(X,y)

In [None]:
prediction_In_sample = rnn.predict(X_train)
error_In = prediction_In_sample - y_train
if test_set > 0:
    prediction_Out_sample = rnn.predict(X_test)
    error_Out = prediction_Out_sample - y_test

In [None]:
plt.figure(figsize=(5,5))
sns.distplot(error_In, kde=False, color="blue", hist = True, norm_hist=False)
plt.title("Erreur In sample")
plt.show()

if test_set > 0:
    plt.figure(figsize=(5,5))
    sns.distplot(error_Out, kde=False, color="blue", hist = True, norm_hist=False)
    plt.title("Erreur Out of sample")
    plt.show()

In [None]:
data["Prediction In Sample"] = prediction_In_sample
data["Error In sample"] = error_In
if test_set > 0:
    data["Prediction Out Sample"] = prediction_Out_sample
    data["Error Out sample"] = error_Out

In [None]:
data.to_csv("Prediction.csv", sep='\t')

In [None]:
relative_error = []
for i, x in enumerate(y):
    if np.abs(x) > 0.000001:
        relative_error.append(prediction[i] / y[i] - 1)
#relative_error = prediction / y - 1
#print(np.min(relative_error))
print(np.mean(relative_error))
plt.figure(figsize=(5,5))
sns.distplot(relative_error, kde=False, color="blue", hist = True, norm_hist=False)
plt.title("Error")
plt.show()

In [None]:
mse = np.average(np.power(error,2))
print(mse)

In [11]:
data_qty_min = 0.1
data_qty_max = 1
data_qty_nb = 20
data_qty_step = (data_qty_max - data_qty_min)/(data_qty_nb-1)

layers_min = 1
layers_max = 5
layers_nb = 3
layers_step = (layers_max - layers_min)/(layers_nb-1)

neurones_min = 1
neurones_max = 100
neurones_nb = 3
neurones_step = (neurones_max - neurones_min)/(neurones_nb-1)

In [12]:
test_mode = False
if test_mode:
    data_qty_min = 0.1
    data_qty_max = 1
    data_qty_nb = 2
    data_qty_step = (data_qty_max - data_qty_min)/(data_qty_nb-1)

    layers_min = 1
    layers_max = 2
    layers_nb = 2
    layers_step = (layers_max - layers_min)/(layers_nb-1)

    neurones_min = 1
    neurones_max = 100
    neurones_nb = 2
    neurones_step = (neurones_max - neurones_min)/(neurones_nb-1)

data_qty = np.arange(data_qty_min, data_qty_max+data_qty_step, data_qty_step)
data_qty = [round(x, 2) for x in data_qty]
layers = np.arange(layers_min, layers_max+layers_step, layers_step)
layers = [int(x) for x in layers]
neurones = np.arange(neurones_min, neurones_max+neurones_step, neurones_step)
neurones = [int(x) for x in neurones]

In [13]:
def rnn_layers_neurones(layer, neurone):
    hidden_layer_sizes = (neurone, )
    if layer == 1.0: return hidden_layer_sizes
    elif layer > 1.0:
        for i in range(layer-1):
            hidden_layer_sizes += (neurone,)
        return hidden_layer_sizes

In [3]:
layers = [1, 2, 5]
neurones = [1, 50, 100]

layers_neurones = [[(100, 50, 30, 20)],
                  [(1, 1)]]

for x in layers_neurones:
    print(x)

[(100, 50, 30, 20)]
[(1, 1)]


In [15]:
algo_params = {
"batch_size" : "auto",
}
params = params_default.copy()
params.update(algo_params)

labels = ["data_qty", "layers/neurones", "mse"]
prediction = []

probs = np.random.rand(len(data))
for data_prop in data_qty:
    X_prop = probs < data_prop
    X_data = X[X_prop]
    y_data = y[X_prop]
    X_data = StandardScaler().fit_transform(X_data)
    for layer_neurone in layers_neurones:
        algo_params = {
        "hidden_layer_sizes" : layer_neurone[2] ,
        }
        params = params.copy()
        params.update(algo_params)
        rnn = MLPRegressor(
            hidden_layer_sizes =  params["hidden_layer_sizes"], 
            activation =  params["activation"],
            solver =  params["solver"],
            alpha =  params["alpha"], 
            batch_size =  params["batch_size"],
            learning_rate =  params["learning_rate"], 
            learning_rate_init =  params["learning_rate_init"], 
            power_t =  params["power_t"], 
            max_iter =  params["max_iter"], 
            shuffle =  params["shuffle"], 
            random_state =  params["random_state"], 
            tol =  params["tol"], 
            verbose =  params["verbose"], 
            warm_start =  params["warm_start"], 
            momentum =  params["momentum"], 
            nesterovs_momentum =  params["nesterovs_momentum"], 
            early_stopping =  params["early_stopping"], 
            validation_fraction =  params["validation_fraction"], 
            beta_1 =  params["beta_1"], 
            beta_2 =  params["beta_2"], 
            epsilon =  params["epsilon"]
        )
        rnn.fit(X_data,y_data)
        prediction_temp = rnn.predict(X_data) # prediction in sample
        error = []
        for i, y_temp in enumerate(y_data):
            if np.abs(y_temp) > 0.000001:
                error.append(prediction_temp[i] / y_temp - 1)
        mse_temp = np.average(np.abs(error))
        #mse_temp = np.average(np.power(prediction_temp - y_data,2))
        #MSE_train = mean_squared_error(df_train[use_feature][n_steps:], pred_train[:, 0])
        prediction.append([data_prop, str(layer_neurone[0])+"/"+str(layer_neurone[1]), mse_temp])
        print("Data: " + str(round(data_prop, 2)) + " layer/neurones: " + str(layer_neurone[0])+"/"+str(layer_neurone[1])
             +" error: " + str(mse_temp))
    
output = pd.DataFrame(prediction, columns=labels)

Data: 0.1 layer/neurones: 4/1 error: 32.3042105322
Data: 0.15 layer/neurones: 4/1 error: 19.7477191767
Data: 0.19 layer/neurones: 4/1 error: 25.2980536988
Data: 0.24 layer/neurones: 4/1 error: 18.2534763087
Data: 0.29 layer/neurones: 4/1 error: 18.0436074473
Data: 0.34 layer/neurones: 4/1 error: 33.6019702528
Data: 0.38 layer/neurones: 4/1 error: 11.2667202401
Data: 0.43 layer/neurones: 4/1 error: 19.3395813777
Data: 0.48 layer/neurones: 4/1 error: 8.67793398195
Data: 0.53 layer/neurones: 4/1 error: 30.257952343
Data: 0.57 layer/neurones: 4/1 error: 18.3350685357
Data: 0.62 layer/neurones: 4/1 error: 18.5584110079
Data: 0.67 layer/neurones: 4/1 error: 7.25976082063
Data: 0.72 layer/neurones: 4/1 error: 7.58309771127
Data: 0.76 layer/neurones: 4/1 error: 8.61052349683
Data: 0.81 layer/neurones: 4/1 error: 11.3219175328
Data: 0.86 layer/neurones: 4/1 error: 13.9562794838
Data: 0.91 layer/neurones: 4/1 error: 12.5414879454
Data: 0.95 layer/neurones: 4/1 error: 4.03708854445
Data: 1.0 laye

In [16]:
output.to_csv("Output.csv", sep='\t')

In [None]:
rnn.get_params()