In [None]:
#Basic Modules
import pandas as pd
import numpy as np
#plots
import seaborn as sns
import plotly.graph_objects as go
#Data Handling
from sklearn.preprocessing import (Normalizer,StandardScaler,MinMaxScaler)
from sklearn.preprocessing import OneHotEncoder

# Genetic Modules
from gplearn.genetic import SymbolicTransformer
from sklearn.ensemble import RandomForestClassifier
from matplotlib import rcParams
from cycler import cycler
#Log Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (confusion_matrix, accuracy_score, recall_score, precision_score,
                             f1_score,balanced_accuracy_score)
#Naive Bayes
from sklearn.naive_bayes import GaussianNB, CategoricalNB
# metrics
from sklearn.metrics import roc_curve
from sklearn.metrics import ConfusionMatrixDisplay,r2_score
# Neural Network
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import plot_model
import matplotlib.pyplot as plt

import pickle
import seaborn as sb
import graphviz
import pydot
from tensorflow.keras.utils import plot_model
import tensorflow as tf
from tensorflow.keras import layers
from scikeras.wrappers import KerasClassifier
from sklearn.pipeline import Pipeline
from  sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import RandomizedSearchCV
from keras.callbacks import EarlyStopping
import tensorflow_addons as tfa
from sklearn import tree
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV


from sklearn.feature_selection import SelectKBest

In [None]:
# Folder that holds the OHLC parquet files; later cells also prepend it to
# the file names used for saved network weights.
# NOTE(review): machine-specific absolute path — consider a configurable data dir.
route = 'C:/Users/Emilio/Desktop/TOG Exp/Datasets/'
# Alternative datasets kept for reference (only one is loaded at a time):
# data = pd.read_parquet(route+'ohlcV.parquet')
# data = pd.read_parquet(route+'ohlc-2017.parquet')
data = pd.read_parquet(route + 'ohlc-2021.parquet')
#data = pd.read_parquet(route + 'ohlc1.parquet')
# Remove the 'drop' and 'close_time' columns before any analysis.
data = data.drop(['drop', 'close_time'], axis=1)
data.head()

In [None]:
# Size of the raw frame and number of missing values per column.
print(data.shape)
print(data.isnull().sum())

In [None]:
# %% Quick description of the data
# Summary statistics of every numeric column.
desc = data.describe()
# DataFrame.info() writes its report to stdout itself and returns None, so its
# return value is not printed (doing so only appended a stray "None").
info = data.info()
print(desc)

In [None]:
# %matplotlib inline
# Histograms of every column except the last five.
data.iloc[:,:-5].hist(bins=50, figsize=(20, 15))
plt.show()

# Line plot of all columns against the index.
data.plot()

In [None]:
# Histograms of the columns from position 4 onwards.
data.iloc[:,+4:].hist(bins=50, figsize=(20, 15))
plt.show()


In [None]:
# %% Resampling
# Target bar length.
rs = '5min'
# Aggregation rule per column: prices follow candle semantics (first open,
# highest high, lowest low, last close); every other column (volumes, trade
# counts, ...) is summed over the bar.
agg_rules = {column: 'sum' for column in data.columns}
agg_rules.update({'open': 'first', 'high': 'max', 'low': 'min', 'close': 'last'})
# One pass over the raw bars. The previous version wrote each resampled series
# back onto the original index and then summed the frame, which only worked
# when every bar label existed in the raw index and turned the prices of
# empty bars into 0 instead of NaN.
data1 = data.resample(rs).agg(agg_rules)
print(data1.shape)
data1.head()


In [None]:
# %% Feature Engineering

# Work on a copy so the resampled frame `data1` stays untouched.
ohlcv = data1.copy()
# OHLC Chart
# Interactive plotly chart of the resampled bars, indexed by bar timestamp.
fig = go.Figure(data=go.Ohlc(x=ohlcv.index,
                             open=ohlcv['open'],
                             high=ohlcv['high'],
                             low=ohlcv['low'],
                             close=ohlcv['close']))

fig.show()


In [None]:
# Column arithmetic replaces the former row-by-row loops: same values, no
# Python-level iteration and no integer-position lookups on a labelled Series
# (deprecated in recent pandas).

# Volatility: full high-low range of each bar.
ohlcv['volatility'] = ohlcv['high'] - ohlcv['low']

# micro trends
# Upward excursion from the open.
ohlcv['high_open'] = ohlcv['high'] - ohlcv['open']
# Downward excursion from the open.
ohlcv['open_low'] = ohlcv['open'] - ohlcv['low']
# Net move of the bar.
ohlcv['close_open'] = ohlcv['close'] - ohlcv['open']
ohlcv.head()

# Independent copy used as input for the autoregressive features.
ohlcv2 = ohlcv.copy()
ohlcv2.head()

In [None]:
# %% Rolling stats
# Keep only the OHLCV columns and the derived micro-trend columns.
# errors='ignore' makes the cell safe to re-run: once the columns are gone a
# second execution is a no-op instead of a KeyError.
ohlcv2 = ohlcv2.drop(
    columns=['quote_asset_volume', 'buy_asset_volume', 'taker_buy_asset_volume', 'trades'],
    errors='ignore',
)

def autoregressive_features(p_data, p_memory):
    """
    Build autoregressive features (lags, moving averages, rolling dispersion
    and volume aggregates) from an OHLCV frame.

    Parameters
    ----------
    p_data: pd.DataFrame
        Must contain the columns 'open', 'high', 'low', 'close' and 'volume'.
        The frame itself is not modified.
    p_memory: int
        Number of past periods considered, i.e. the assumed "memory" of the
        prices. Lags 1..p_memory and rolling windows 1..p_memory are produced.

    Returns
    -------
    r_features: pd.DataFrame
        One row per input row, on a fresh 0..n-1 integer index (the caller is
        expected to restore the original index if it needs it). Holds 'co'
        (close - open), the widest moving averages 'ma_ol' / 'ma_ho' / 'ma_hl'
        and, for every n in 1..p_memory, 'lag_*_n', 'sd_*_n', 'lag_vol_n',
        'sum_vol_n' and 'mean_vol_n'. The raw OHLCV columns are removed, and so
        is any column that is entirely NaN (e.g. the one-observation standard
        deviations). Rows with NaN are kept.
    """

    # Work on a copy of the argument. The previous version read the global
    # `ohlcv2` here, so the `p_data` argument was silently ignored.
    data = p_data.copy()

    # period final price "movement"
    data['co'] = data['close'] - data['open']
    # period uptrend movement
    data['ho'] = data['high'] - data['open']
    # period downtrend movement
    data['ol'] = data['open'] - data['low']
    # period volatility measure
    data['hl'] = data['high'] - data['low']

    if p_memory > 0:
        # These three columns used to be rewritten on every loop pass, so only
        # the widest window (p_memory + 1) ever survived; compute it once.
        # NOTE(review): one moving average per window (e.g. 'ma_ol_<n>') may
        # have been intended — confirm before changing the feature set.
        ma_window = p_memory + 1
        data['ma_ol'] = data['ol'].rolling(ma_window).mean()
        data['ma_ho'] = data['ho'].rolling(ma_window).mean()
        data['ma_hl'] = data['hl'].rolling(ma_window).mean()

    # One group of features per look-back length.
    for span in range(1, p_memory + 1):
        suffix = str(span)

        data['lag_ol_' + suffix] = data['ol'].shift(span)
        data['lag_ho_' + suffix] = data['ho'].shift(span)
        data['lag_hl_' + suffix] = data['hl'].shift(span)

        data['sd_ol_' + suffix] = data['ol'].rolling(span).std()
        data['sd_ho_' + suffix] = data['ho'].rolling(span).std()
        data['sd_hl_' + suffix] = data['hl'].rolling(span).std()

        data['lag_vol_' + suffix] = data['volume'].shift(span)
        data['sum_vol_' + suffix] = data['volume'].rolling(span).sum()
        data['mean_vol_' + suffix] = data['volume'].rolling(span).mean()

    # Remove the raw inputs and the helper columns that were only needed to
    # derive the features ('co' is kept on purpose).
    r_features = data.drop(['open', 'high', 'low', 'close', 'hl', 'ol', 'ho', 'volume'], axis=1)
    # Columns with no valid value at all carry no information.
    r_features = r_features.dropna(axis='columns', how='all')
    # r_features = r_features.dropna(axis='rows')
    # Every feature after 'co' is stored as float.
    r_features.iloc[:, 1:] = r_features.iloc[:, 1:].astype(float)
    # The index is discarded; rows stay in their original order.
    r_features = r_features.reset_index(drop=True)

    return r_features

In [None]:
# Number of past periods used for lags and rolling windows.
p_memory = 24
data_ar = autoregressive_features(p_data=ohlcv2, p_memory=p_memory)
# The function returns a 0..n-1 index; put the bar timestamps back so the
# features can be joined with `ohlcv`.
data_ar.index = ohlcv2.index

In [None]:
# Join the engineered features to the bars on 'timestamp'.
# NOTE(review): presumably 'timestamp' is the name of the datetime index of
# both frames — confirm. The cell rebinds `ohlcv`, so it is not re-runnable.
ohlcv = pd.merge(ohlcv, data_ar, on='timestamp')
# Discard every row that still has a missing value (e.g. the warm-up rows of
# the lags and rolling windows).
ohlcv = ohlcv.dropna(axis='rows')
ohlcv

In [None]:
# Distribution of the close-open move; its quartiles define the class limits below.
ohlcv['co'].describe()

In [None]:
# Sum of the absolute first and third quartiles of the close-open move.
# (Not referenced by the cells visible in this notebook section.)
quantile= abs(ohlcv['co'].quantile(.25)) + abs(ohlcv['co'].quantile(.75))
# Quartiles of the close-open move. The 'treshold' spelling is kept because
# the target-engineering cells refer to these names.
treshold1 =ohlcv['co'].quantile(.25)
treshold2 =ohlcv['co'].quantile(.50)
treshold3 = ohlcv['co'].quantile(.75)

In [None]:
# Target
# %% Target Engineering y_hat:CO_{t}

# Explicit copy, so adding target columns can never leak back into `ohlcv`.
ohlc = ohlcv.copy()

# Close-open move of every bar, computed column-wise instead of with
# integer-position lookups on a datetime-indexed Series.
co_move = ohlc["close"] - ohlc["open"]

# Four ordered classes; conditions are evaluated in order, first match wins:
#   -2 : move below the first quartile
#   -1 : move in [first quartile, 0) or exactly 0
#    1 : move in (0, third quartile]
#    2 : everything else (move above the third quartile)
y_hat = np.select(
    [
        co_move < treshold1,
        (treshold1 <= co_move) & (co_move < 0),
        co_move == 0,
        (0 < co_move) & (co_move <= treshold3),
    ],
    [-2, -1, -1, 1],
    default=2,
).tolist()

ohlc['y_hat'] = y_hat
# Unshifted classes, kept as ground truth for the martingale baseline.
y_hat_test = y_hat
ohlc.head()

In [None]:
# Re-assign the unshifted classes, then move them one row up so that each row
# carries the class of the NEXT bar (the value to be predicted).
ohlc['y_hat'] = y_hat
ohlc['y_hat'] = ohlc['y_hat'].shift(-1)
# The last row has no next bar and therefore holds NaN at this point.
# ohlc.dropna(inplace = True, axis=0)
ohlc.head()
ohlc['y_hat'].value_counts()


In [None]:
# TODO: turn the thresholds into variables instead of hard-coding them, so the
# class boundaries can be asymmetric.

# Close-open move of every bar, computed column-wise instead of with
# integer-position lookups on a datetime-indexed Series.
bar_move = ohlc["close"] - ohlc["open"]

# Same class rule as the target; conditions are evaluated in order:
#   -2 : move below the first quartile
#   -1 : move in [first quartile, 0) or exactly 0
#    1 : move in (0, third quartile]
#    2 : everything else
mart = np.select(
    [
        bar_move < treshold1,
        (treshold1 <= bar_move) & (bar_move < 0),
        bar_move == 0,
        (0 < bar_move) & (bar_move <= treshold3),
    ],
    [-2, -1, -1, 1],
    default=2,
).tolist()


In [None]:
# Martingale
# Naive baseline: the class of a bar is predicted to equal the class of the
# previous bar, hence the one-row downward shift.

ohlc['martingale'] = mart
ohlc['martingale'] = ohlc['martingale'].shift(+1)
# Fills EVERY remaining NaN in the frame with -1; this includes the first
# 'martingale' value and the last (shifted) 'y_hat' value.
ohlc = ohlc.fillna(-1)
ohlc2 = pd.DataFrame(ohlc)
# NOTE(review): machine-specific absolute output path.
ohlc2.to_csv('C:/Users/Emilio/Desktop/Thesis/ohlc2.csv')
ohlc2['martingale'].value_counts()


In [None]:
# Summary statistics of the (shifted) target column.
print(ohlc['y_hat'].describe())

In [None]:
# Scores of the martingale baseline in percent: class of each bar
# (`y_hat_test`, unshifted) against the class of the previous bar.
accuracy_martingala = accuracy_score(y_hat_test, ohlc2['martingale']) * 100
recall_martingala = recall_score(y_hat_test, ohlc2['martingale'], average='weighted') * 100
precision_martingala = precision_score(y_hat_test, ohlc2['martingale'], average='weighted') * 100
f1_martingala = f1_score(y_hat_test, ohlc2['martingale'], average='weighted') * 100
balanced_martingala = balanced_accuracy_score(y_hat_test, ohlc2['martingale']) * 100

print('Accuracy:', accuracy_martingala, '%')
print('Recall:', recall_martingala, '%')
print('Precision:', precision_martingala, '%')
print('F1:', f1_martingala, '%')
print('Balanced Accuracy:', balanced_martingala, '%')


In [None]:
# Confusion matrix of the martingale baseline, against the same ground truth
# as the baseline scores (`y_hat_test`: class of the current bar vs. class of
# the previous bar). The 'y_hat' column is shifted one bar into the future,
# so using it here compared classes two bars apart.
confusion_mart = confusion_matrix(y_hat_test, ohlc2['martingale'])
print(confusion_mart)

In [None]:
fig, px = plt.subplots(figsize=(4, 4))
px.matshow(confusion_mart, cmap=plt.cm.YlOrRd, alpha=0.5)
# matshow paints element [row, col] at x=col, y=row, so every count is written
# at those same coordinates (rows = actual class, columns = predicted class).
# Writing it at x=row, y=col showed the transposed matrix over the colours.
for row in range(confusion_mart.shape[0]):
    for col in range(confusion_mart.shape[1]):
        px.text(x=col, y=row, s=confusion_mart[row, col], va='center', ha='center', size='large')

# Sets the labels
plt.xlabel('Predictions', fontsize=16)
plt.ylabel('Actual', fontsize=16)
plt.title('Confusion Matrix Martingale', fontsize=15)
plt.show()

In [None]:
# Class frequencies of the target.
ohlc2['y_hat'].hist()


In [None]:
# Target class over time.
plt.plot(ohlc2['y_hat'])

In [None]:
# Box plot of the target; every styling option is passed as None.
plt.boxplot(ohlc2['y_hat'], notch=None, vert=None, patch_artist=None, widths=None)

In [None]:
# Absolute correlation of every column with the target.
corr_y = ohlc2.corrwith(ohlc2['y_hat']).abs()
corr_y

In [None]:
# Feature matrix: every column except the last two.
# NOTE(review): relies on 'y_hat' and 'martingale' being the two last columns.
X = ohlc2.iloc[:, :-2]
X.head()
# Target (class of the next bar) and an untouched copy of it.
y = ohlc2['y_hat'].copy()
y_true = y.copy()
y.head()

# %% Heatmaps
# Heat map
# Pairwise correlation between all candidate features.
correlation = X.corr()
print(correlation)

heat = sns.heatmap(
    correlation,
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True
)
# Tilt the x tick labels so long feature names stay readable.
heat.set_xticklabels(
    heat.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
)
plt.show()

In [None]:
# Same correlation heat map as above, with non-square cells and vertical labels.
correlation = X.corr() 
print(correlation)

heat = sns.heatmap( correlation, vmin=-1, vmax=1, center=0, cmap=sns.diverging_palette(20, 220, n=200), square=False ) 
heat.set_xticklabels( heat.get_xticklabels(), rotation=90, horizontalalignment='right' ) 
plt.show()


In [None]:
# Keep only correlations of at least 0.80; every other cell becomes NaN.
corr = correlation[correlation >= .80]
# Spearman correlation computed on that masked correlation matrix.
corr2 = corr.corr(method='spearman')
print(corr2)
plt.figure(figsize=(12, 8))
# Note: the heat map shows the masked matrix `corr`, not `corr2`.
sns.heatmap(corr, cmap="Reds")

In [None]:
# Keep only correlations of at least 0.80; every other cell becomes NaN.
corr = correlation[correlation >= .80]
# Pearson correlation computed on that masked correlation matrix.
corr2 = corr.corr(method='pearson')
print(corr2)
plt.figure(figsize=(12, 8))
# Note: the heat map shows the masked matrix `corr`, not `corr2`.
sns.heatmap(corr, cmap="Reds")


In [None]:
# Absolute pairwise correlation between features.
cor_matrix = X.corr().abs()
print(cor_matrix)
# Keep only the strict upper triangle (k=1 skips the diagonal) so each pair is
# inspected once. The builtin `bool` replaces `np.bool`, an alias that recent
# NumPy releases no longer provide.
upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape), k=1).astype(bool))
print(upper_tri)

In [None]:
# .5 according to JF, but not many features after that.
# A column is marked for removal when its absolute correlation with any
# earlier column (upper triangle) is at least 0.80.
to_drop = [column for column in upper_tri.columns if any(upper_tri[column] >= .80)]
print();
print(to_drop)

In [None]:
# Remove the highly correlated columns by name. Passing the sub-frame
# `X[to_drop]` as labels only worked because iterating a DataFrame yields its
# column names.
x = X.drop(columns=to_drop)
print();
print(x.head())
x.head()

In [None]:

# Keep at most 50 features; the cap avoids an invalid `k` when fewer than 50
# columns are left after the correlation filter.
n_keep = min(50, x.shape[1])
best = SelectKBest(k=n_keep)
# NOTE(review): the selector is fitted on every row, hold-out rows included —
# consider fitting it on the training rows only.
x_new = best.fit_transform(x, y)
print(x_new.shape)
# Positions of the retained columns in `x`.
selected = best.get_support(indices=True)
print(x.columns[selected])

In [None]:
# Names of the columns kept by the selector, and their mutual correlation.
used_features = x.columns[selected]
plt.title('Correlación de Características Seleccionadas');
sb.heatmap(x[used_features].astype(float).corr(),
           linewidths=0.1,
           vmax=1.0, 
           square=False, 
           cmap='viridis', 
           linecolor='white', 
           annot=True);

In [None]:
# Feature matrix restricted to the selected columns.
x_neo = x[used_features]



In [None]:
#  Min Max Scaler
# Learn the min/max on the training rows only (the first 70 %, matching the
# chronological split performed further down), then apply it to every row.
# Fitting on all rows let the hold-out period influence the scaling.
# Consequence: scaled hold-out values may fall outside [0, 1].
n_holdout = int(len(x_neo) * .30)
scale = MinMaxScaler().fit(x_neo.iloc[:len(x_neo) - n_holdout])
X_scale = scale.transform(x_neo)
X_scale = pd.DataFrame(X_scale, index=x_neo.index, columns=x_neo.columns)
X_scale.head()

In [None]:
# Normalizer
# Applied with norm='max' on top of the min-max scaled features.
# NOTE(review): Normalizer rescales each sample (row), not each feature —
# confirm that this is intended.
transformer = Normalizer(norm='max').fit(X_scale)
transformed = transformer.transform(X_scale)
X_normalized = pd.DataFrame(transformed, index=x_neo.index, columns=x_neo.columns)
X_normalized.head()

In [None]:
# From here on `x` is the scaled + normalised matrix of selected features
# (the name previously held the unscaled, correlation-filtered frame).
x = X_normalized.copy()
x.head()

In [None]:
# Global matplotlib style for the following figures.
rcParams['figure.figsize'] = 15, 5
rcParams['axes.spines.top'] = False
rcParams['axes.spines.right'] = False
rcParams['axes.prop_cycle'] = cycler(color=['#365977'])
rcParams['lines.linewidth'] = 2.5
plt.title('x', size=20)
# NOTE(review): plots `X` (the full unscaled frame) although the title says
# 'x' — confirm which matrix was meant.
plt.plot(X);

In [None]:
# Same style as the previous figure, with thinner lines for the target series.
rcParams['figure.figsize'] = 15, 5
rcParams['axes.spines.top'] = False
rcParams['axes.spines.right'] = False
rcParams['axes.prop_cycle'] = cycler(color=['#365977'])
rcParams['lines.linewidth'] = .5
plt.title('Y', size=20)
plt.plot(y);

In [None]:
test_size = int(len(x) * .30)
trial_size = int(len(x) * .10)
X_train = x[:-test_size].copy()
X_test = x[-test_size:].copy()
X_test = X_test[:-trial_size].copy()
X_trial = X_test[-trial_size:].copy()

X_train2 = X_train.copy()
X_test2 = X_test.copy()
X_trial2 = X_trial.copy()

X_train3 = X_train.copy()
X_test3 = X_test.copy()
X_trial3 = X_trial.copy()

X_train4 = X_train.copy()
X_test4 = X_test.copy()
X_trial4 = X_trial.copy()

X_train5 = X_train.copy()
X_test5 = X_test.copy()
X_trial5 = X_trial.copy()

In [None]:
plt.title('X', size=20)
plt.plot(X_train)  # ,label='Training set')
plt.plot(X_test, label='Test set', color='orange')
plt.plot(X_trial,label= 'Trial set',color='green')
plt.legend;


In [None]:
y_train = y[:-test_size].copy()
y_test = y[-test_size:].copy()
y_test = y_test[:-trial_size].copy()
y_trial = y_test[-trial_size:].copy()

y_train2 = y_train.copy()
y_test2 = y_test.copy()
y_trial2 = y_trial.copy()

y_train3 = y_train.copy()
y_test3 = y_test.copy()
y_trial3 = y_trial.copy()

y_train4 = y_train.copy()
y_test4 = y_test.copy()
y_trial4 = y_trial.copy()

y_train5 = y_train.copy()
y_test5 = y_test.copy()
y_trial5 = y_trial.copy()

Y_train_dum = pd.get_dummies(y_train5, prefix='y')
Y_test_dum = pd.get_dummies(y_test5, prefix='y')
Y_trial_dum = pd.get_dummies(y_trial5,prefix='y')

In [None]:
# Target series coloured by split.
plt.title('y', size=20)
plt.plot(y_train, label='Training set')
plt.plot(y_test, label='Test set', color='orange')
plt.plot(y_trial,label='Trial set',color='green')
# The legend has to be called; the bare attribute `plt.legend;` drew nothing.
plt.legend();


In [None]:
#naive bayes
# %% Naive Bayes
print(X_train.shape, X_test.shape)
# instantiate the model
gnb = GaussianNB()
# Alternatives that were considered:
# gnb = CategoricalNB()
# pgmpy()

# fit the model
gnb.fit(X_train2, y_train2)

In [None]:
# Visual check of the training target.
y_train

In [None]:
# Naive Bayes predictions for the test set.
y_pred2 = gnb.predict(X_test2)

In [None]:
# Test-set scores of the Naive Bayes model, expressed in percent.
accuracy_naive = 100 * accuracy_score(y_test2, y_pred2)
recall_naive = 100 * recall_score(y_test2, y_pred2, average='weighted')
precision_naive = 100 * precision_score(y_test2, y_pred2, average='weighted', zero_division=0)
f1_naive = 100 * f1_score(y_test2, y_pred2, average='weighted')
balanced_naive = 100 * balanced_accuracy_score(y_test2, y_pred2)

# One line per metric, in a fixed order.
naive_report = (
    ('Accuracy:', accuracy_naive),
    ('Recall:', recall_naive),
    ('Precision:', precision_naive),
    ('F1:', f1_naive),
    ('Balanced Accuracy:', balanced_naive),
)
for metric_name, metric_value in naive_report:
    print(metric_name, metric_value, '%')

In [None]:
# Training-set predictions (not used by the plot below).
y_pred_train = gnb.predict(X_train2)
# Rows = actual class, columns = predicted class.
conf_mat = confusion_matrix(y_test2, y_pred2)
fig, px = plt.subplots(figsize=(3.5, 3.5))
px.matshow(conf_mat, cmap=plt.cm.YlOrRd, alpha=0.5)
# matshow paints element [row, col] at x=col, y=row, so every count is written
# at those same coordinates. Writing it at x=row, y=col showed the transposed
# matrix over the colours.
for row in range(conf_mat.shape[0]):
    for col in range(conf_mat.shape[1]):
        px.text(x=col, y=row, s=conf_mat[row, col], va='center', ha='center', size='large')
# Sets the labels
plt.xlabel('Predictions', fontsize=16)
plt.ylabel('Actual', fontsize=16)
plt.title('Confusion Matrix Naive Bayes', fontsize=15)
plt.show()

In [None]:
# Naive Bayes predictions for the trial set.
y_pred2_trial = gnb.predict(X_trial2)

In [None]:
# Trial-set scores of the Naive Bayes model, expressed in percent.
accuracy_naive2 = 100 * accuracy_score(y_trial2, y_pred2_trial)
recall_naive2 = 100 * recall_score(y_trial2, y_pred2_trial, average='weighted')
precision_naive2 = 100 * precision_score(y_trial2, y_pred2_trial, average='weighted', zero_division=0)
f1_naive2 = 100 * f1_score(y_trial2, y_pred2_trial, average='weighted')
balanced_naive2 = 100 * balanced_accuracy_score(y_trial2, y_pred2_trial)

# One line per metric, in a fixed order.
naive_trial_report = (
    ('Accuracy:', accuracy_naive2),
    ('Recall:', recall_naive2),
    ('Precision:', precision_naive2),
    ('F1:', f1_naive2),
    ('Balanced Accuracy:', balanced_naive2),
)
for metric_name, metric_value in naive_trial_report:
    print(metric_name, metric_value, '%')

In [None]:
# Logistic regression simple
# Library defaults except for a very high iteration cap.
logistic_model_simple = LogisticRegression(max_iter=1000000)
logistic_model_simple.fit(X_train,y_train)

In [None]:
# logistic_model.predict_proba((X[:2, :])
# Test-set predictions and accuracy (percent) of the simple model.
y_pred = logistic_model_simple.predict(X_test)
accuracy = accuracy_score(y_test, y_pred) * 100
# The result of this call is discarded; it scores the model on ALL rows.
logistic_model_simple.score(x, y)
confusion_mat = confusion_matrix(y_test, y_pred)
print(accuracy)


In [None]:
# Hyper-parameters of the fitted simple model.
logistic_model_simple.get_params(deep=True)

In [None]:
# Coefficients equal to 0 would have been removed by the effect of L1
# regularisation; they are:
# NOTE(review): this model is built without an explicit penalty argument —
# confirm whether L1 actually applies here.
coef = logistic_model_simple.coef_
print(logistic_model_simple.intercept_)
print(coef)

In [None]:
# Class probabilities on the training set.
predict_ = logistic_model_simple.predict_proba(X_train)
print(predict_)

In [None]:
# Predicted class (red) against the test features and true class (blue)
# against the full feature frame `X`, on fixed axis limits.
# NOTE(review): the x axis carries feature values, not time — confirm intent.
plt.plot(X_test, y_pred, 'r')
plt.plot(X, y_true, 'b')
plt.axis([1, 30000, -3, 3])


In [None]:
# Test-set report of the simple logistic model. Every score is computed from
# the predictions made in this cell; the previous version computed
# `y_pred_simple` but scored `y_pred`, a variable left over from another cell.
# (A `score(x, y)` call whose result was discarded has been dropped.)
y_pred_simple = logistic_model_simple.predict(X_test)
accuracy_logistic_simple = accuracy_score(y_test, y_pred_simple) * 100
confusion_mat = confusion_matrix(y_test, y_pred_simple)
recall_logistic_simple = recall_score(y_test, y_pred_simple, average='weighted') * 100
precision_logistic_simple = precision_score(y_test, y_pred_simple, average='weighted', zero_division=0) * 100
f1_logistic_simple = f1_score(y_test, y_pred_simple, average='weighted') * 100
balanced_logistic_simple = balanced_accuracy_score(y_test, y_pred_simple) * 100

print('Accuracy:', accuracy_logistic_simple, '%')
print('Recall:', recall_logistic_simple, '%')
print('Precision:', precision_logistic_simple, '%')
print('F1:', f1_logistic_simple, '%')
print('Balanced Accuracy:', balanced_logistic_simple, '%')
print("Confusion Matrix:")

print(confusion_mat)


In [None]:
# Trial-set predictions and accuracy (percent) of the simple model.
y_pred_simple2 = logistic_model_simple.predict(X_trial)
accuracy = accuracy_score(y_trial, y_pred_simple2) * 100
# The result of this call is discarded; it scores the model on ALL rows.
logistic_model_simple.score(x, y)
confusion_mat = confusion_matrix(y_trial, y_pred_simple2)
print(accuracy)

In [None]:
# Regression with Elastic Net and hiper parameters optimization


In [None]:
# Elastic-net logistic regression wrapped in a pipeline so the grid keys can
# use the 'logisticregression__' prefix. random_state=0 is the integer value
# that the former `random_state=False` was interpreted as.
pipelineLR = make_pipeline(
    LogisticRegression(random_state=0, penalty='elasticnet', solver='saga',
                       max_iter=1000000, C=1))
# Search space: L1/L2 mix and inverse regularisation strength.
param_grid_lr = [{
    'logisticregression__l1_ratio':[.10,.20,.30,.40,.50,.60,.70,.80,.90],
    'logisticregression__class_weight':[None],
    'logisticregression__C':[0.001, 0.01, 0.05, 0.1, 0.5, 1.0, 10.0],
    'logisticregression__solver':['saga']
}]
# The rows are chronological, so each validation fold must lie after its
# training fold; plain k-fold (cv=10) validated on the past with models
# trained on the future.
gd_lm = GridSearchCV(estimator=pipelineLR,
                     param_grid=param_grid_lr,
                     scoring='accuracy',
                     cv=TimeSeriesSplit(n_splits=10),
                     n_jobs=-1)
gd_lm.fit(X_train3, y_train3)
print(gd_lm.best_score_)

In [None]:
# Hyper-parameter combination with the best cross-validated accuracy.
gd_lm.best_params_

In [None]:
# Pipeline refitted with the best hyper-parameters found by the search.
clfLR = gd_lm.best_estimator_
clfLR

In [None]:
# Test-set accuracy of the tuned pipeline.
clfLR.score(X_test3, y_test3)

In [None]:
# Elastic-net model rebuilt by hand with fixed hyper-parameters.
# NOTE(review): l1_ratio and C are hard-coded rather than read from
# gd_lm.best_params_ — confirm they match the current search result.
logistic_model = LogisticRegression(random_state=False, penalty='elasticnet',
                                            solver='saga', l1_ratio=0.4, max_iter=1000000, C=10,
                                            class_weight= None, )
logistic_model.fit(X_train3, y_train3)

In [None]:
# Test-set predictions, accuracy (percent) and confusion matrix.
y_pred3 = logistic_model.predict(X_test3)
accuracy = accuracy_score(y_test3, y_pred3) * 100
# The result of this call is discarded; it scores the model on ALL rows.
logistic_model.score(x, y)
confusion_mat = confusion_matrix(y_test3, y_pred3)

# Trial-set predictions and accuracy (percent).
y_pred_trial = logistic_model.predict(X_trial3)
accuracy_trial = accuracy_score(y_trial3, y_pred_trial) * 100
# Discarded as well.
logistic_model.score(x, y)
print(accuracy)
print(accuracy_trial)

In [None]:
# Persist the tuned pipeline (`clfLR`), not the hand-built `logistic_model`.
# The file handle is left for the interpreter to close.
pickle.dump(clfLR, open('logg1.1.pkl', 'wb'))

In [None]:
# Test-set scores of the elastic-net logistic model, expressed in percent.
accuracy_logistic = 100 * accuracy_score(y_test3, y_pred3)
recall_logistic = 100 * recall_score(y_test3, y_pred3, average='weighted')
precision_logistic = 100 * precision_score(y_test3, y_pred3, average='weighted', zero_division=0)
f1_logistic = 100 * f1_score(y_test3, y_pred3, average='weighted')
balanced_logistic = 100 * balanced_accuracy_score(y_test3, y_pred3)

# One line per metric, in a fixed order.
logistic_report = (
    ('Accuracy:', accuracy_logistic),
    ('Recall:', recall_logistic),
    ('Precision:', precision_logistic),
    ('F1:', f1_logistic),
    ('Balanced Accuracy:', balanced_logistic),
)
for metric_name, metric_value in logistic_report:
    print(metric_name, metric_value, '%')
print("Confusion Matrix:")

# `confusion_mat` is the matrix computed by an earlier cell.
print(confusion_mat)


In [None]:
fig, px = plt.subplots(figsize=(3.5, 3.5))
px.matshow(confusion_mat, cmap=plt.cm.YlOrRd, alpha=0.5)
# matshow paints element [row, col] at x=col, y=row, so every count is written
# at those same coordinates (rows = actual class, columns = predicted class).
# Writing it at x=row, y=col showed the transposed matrix over the colours.
for row in range(confusion_mat.shape[0]):
    for col in range(confusion_mat.shape[1]):
        px.text(x=col, y=row, s=confusion_mat[row, col], va='center', ha='center', size='large')

# Sets the labels
plt.xlabel('Predictions', fontsize=16)
plt.ylabel('Actuals', fontsize=16)
plt.title('Confusion Matrix Regresion Log', fontsize=15)
plt.show()

# Random Forest

In [None]:
# Seed intended for the stochastic models of this section.
seed = 4

In [None]:
# Random forest wrapped in a pipeline so the grid keys can use the
# 'randomforestclassifier__' prefix.
# - max_features='sqrt' is what 'auto' meant for classifiers; 'auto' is no
#   longer accepted by recent scikit-learn releases.
# - random_state=seed makes the search reproducible.
pipelineRFC = make_pipeline(
    RandomForestClassifier(max_depth=None,
                           min_samples_split=2,
                           max_features='sqrt',
                           n_estimators=1000,
                           bootstrap=True,
                           oob_score=False,
                           random_state=seed,
                           verbose=1))

# Search space: features considered per split and split criterion.
param_grid_rfc = [{
    'randomforestclassifier__max_features':['sqrt', 'log2', None],
    'randomforestclassifier__criterion':['gini','entropy','log_loss'],
    #'randomforestclassifier__n_estimators':[100, 500,1000]
}]

# The rows are chronological, so each validation fold must lie after its
# training fold; plain k-fold (cv=10) validated on the past with models
# trained on the future.
gsrfc = GridSearchCV(estimator=pipelineRFC,
                     param_grid=param_grid_rfc,
                     scoring='accuracy',
                     cv=TimeSeriesSplit(n_splits=10),
                     n_jobs=1)
gs_rfc = gsrfc.fit(X_train4, y_train4)
print(gs_rfc.best_score_)

In [None]:
# Hyper-parameter combination with the best cross-validated accuracy.
print(gs_rfc.best_params_)


In [None]:
# Pipeline refitted with the best hyper-parameters found by the search.
clfRFC = gs_rfc.best_estimator_
clfRFC

In [None]:
# Test-set accuracy of the tuned pipeline.
clfRFC.score(X_test4, y_test4)

In [None]:
# Create a Random forest Classifier
# Hyper-parameters are fixed by hand; random_state=seed makes the bootstrap
# samples and feature draws — and therefore every score below — reproducible.
clf = RandomForestClassifier(n_estimators=1000, max_depth=None,
                             max_features='sqrt', min_samples_split=2,
                             oob_score=True, n_jobs=4, criterion='gini',
                             random_state=seed)

# Train the model using the training sets
clf.fit(X_train4, y_train4)


In [None]:
# Persist the tuned pipeline (`clfRFC`); note that the evaluation cells that
# follow use the hand-built `clf` instead.
pickle.dump(clfRFC, open('forest.pkl', 'wb'))

In [None]:
# performing predictions on the test dataset
y_pred4 = clf.predict(X_test4)
# Accuracy in percent, plus the same accuracy as a fraction via score().
accuracy_forest = accuracy_score(y_test4, y_pred4) * 100
print(clf.score(X_test4, y_test4))

In [None]:
# performing predictions on the trial dataset
y_pred4_trial = clf.predict(X_trial4)
# Accuracy in percent, plus the same accuracy as a fraction via score().
accuracy_forest_trial = accuracy_score(y_trial4, y_pred4_trial) * 100
print(clf.score(X_trial4, y_trial4))

In [None]:
# Confusion matrix of the forest on the test set (rebinds `confusion_mat`).
confusion_mat = confusion_matrix(y_test4, y_pred4)

In [None]:
# Test-set scores of the random forest, expressed in percent.
recall_forest = 100 * recall_score(y_test4, y_pred4, average='weighted')
precision_forest = 100 * precision_score(y_test4, y_pred4, average='weighted', zero_division=0)
f1_forest = 100 * f1_score(y_test4, y_pred4, average='weighted')
balanced_forest = 100 * balanced_accuracy_score(y_test4, y_pred4)

# One line per metric; `accuracy_forest` was computed by an earlier cell.
forest_report = (
    ('Accuracy:', accuracy_forest),
    ('Recall:', recall_forest),
    ('Precision:', precision_forest),
    ('F1:', f1_forest),
    ('Balanced Accuracy:', balanced_forest),
)
for metric_name, metric_value in forest_report:
    print(metric_name, metric_value, '%')
print("Confusion Matrix:")

print(confusion_mat)

In [None]:
#%% Attributes of the model bag
# Base estimator: template configuration of each tree. Recent scikit-learn
# exposes it as `estimator_`; older releases used `base_estimator_`, which has
# since been removed, so look up whichever one exists.
base_model = clf.estimator_ if hasattr(clf, 'estimator_') else clf.base_estimator_
fitted_trees = clf.estimators_  # list of individual models created in each iteration
tree_param_names = clf.estimator_params  # names of the configurable parameters of each estimator
score = clf.score(x,y) # Model fit metric (computed on ALL rows, training included)
#clf.oob_score_ # Out-of-the-bag data score
#modelo.oob_score_ # Out-of-the-bag data score

In [None]:
#%% Visualise the decision tree
# Draws the first tree of the fitted forest.

tree.plot_tree(clf.estimators_[0]) 

In [None]:
# Decision tree
# Probability of the class stored at position 1 of the forest's class list,
# for the training and test sets.
# NOTE(review): the target has more than two classes; confirm that a single
# probability column is what is wanted here.
Yprob_train_rf = clf.predict_proba(X_train4)[:,1]
Yprob_test_rf = clf.predict_proba(X_test4)[:,1]

## Multilayer perceptron for Classification

In [None]:
# Search space for the multilayer perceptron.
# Number of input features.
n_inputs = len(list(X_train5.columns))
# Candidate hidden-layer widths, growing from the input width.
neuron_quant = [n_inputs, n_inputs+2,n_inputs+4,n_inputs+8,n_inputs+16,n_inputs+32,n_inputs+64]
# Candidate numbers of hidden layers.
quant_hidden = [1,2]
# Candidate learning rates (the search loop below does not iterate over them).
learning_rate = [0.01,0.1,0.5,1]
# Candidate mini-batch sizes.
batch_size = [1,8,16]



In [None]:
# Exhaustive search over hidden width (i), number of hidden layers (j) and
# batch size (k). The model with the highest mean per-class F1 on the test set
# is kept in `model_best`; every training history is collected in `recc`.
recc = []
# Start below any attainable score so that the first model is always kept.
fitness = -np.inf
model_best = []
n_inputs = len(list(X_train5.columns))
for i in neuron_quant:
    for j in quant_hidden:
        for k in batch_size:
            # TODO: also search learning rate, momentum, Nesterov, dropout and
            # a per-layer activation function.

            # Neural network structure
            model = Sequential()
            model.add(Dense(n_inputs, activation='sigmoid', input_shape=(n_inputs,)))
            # model.add(Dense(10, activation='softplus'))
            # `j` hidden layers of `i` units each. The layer count used to be
            # ignored, so every configuration was simply trained twice.
            for _ in range(j):
                model.add(Dense(i, activation='sigmoid'))
            # One output unit per target class.
            model.add(Dense(4, activation='softmax'))

            # Optimizer configuration. The targets are one-hot encoded, hence
            # categorical cross-entropy; with integer labels the loss would be
            # 'sparse_categorical_crossentropy' instead.
            model.compile(loss='categorical_crossentropy',
                          optimizer='Adam',
                          metrics=['accuracy'])
            model_history = model.fit(X_train5, Y_train_dum, epochs=50, batch_size=k, verbose=1,
                                      validation_data=(X_test5, Y_test_dum))
            score = model.evaluate(X_test5, Y_test_dum, verbose=1)
            Y_prob = model.predict(X_test5)
            R2_score_val = r2_score(Y_test_dum, Y_prob)

            # Per-class F1 on the test set, averaged into one fitness value.
            metric = tfa.metrics.F1Score(num_classes=4, threshold=0.5)
            metric.update_state(Y_test_dum, Y_prob)
            result = metric.result()
            result_mean = np.mean(result)

            # The best score is updated only when it is beaten. It used to be
            # overwritten on every pass, so a model was "best" as soon as it
            # outscored the previous one rather than all earlier ones.
            if result_mean > fitness:
                model_best = model
                fitness = result_mean

            recc.append(model_history)
# Weights of the LAST trained model (not necessarily the best one).
model.save_weights(route+'weights')

In [None]:
# Weights of the model selected by the search.
model_best.save_weights(route+'weights1.1')

In [None]:
# Display the selected model object.
model_best

In [None]:
# Serialise the selected network with pickle.
# NOTE(review): pickling Keras models is not reliable across versions —
# consider the model's own save() method; confirm what consumers expect.
pickle.dump(model_best, open('mlp1.1.pkl', 'wb'))

In [None]:
# %% View the training performance
# `model_history` is the history of the LAST model trained by the search loop.

fig = plt.figure(figsize=(20, 10))
# Left panel: training loss per epoch.
plt.subplot(121)
plt.plot(model_history.history['loss'])
plt.xlabel('Epochs'), plt.ylabel('Loss function')
# Right panel: training accuracy per epoch.
plt.subplot(122)
plt.plot(model_history.history['accuracy'])
plt.xlabel('Epochs'), plt.ylabel('Accuracy function')

In [None]:

# %% Neural network weights
# Weights and biases of the first layer of the selected model.
model_best.layers[0].get_weights()
# model.get_config() #model configuration
# %% View the model
plot_model(model_best)
# plot_model(model, to_file='../figures/P9_fig/model.png', show_shapes=True)
# %% Use the model
# Latest version
# NOTE(review): predictions come from `model` (the last model trained), not
# from `model_best` — confirm which one is intended.
Y_prob = model.predict(X_test5)
# Index of the most probable output unit per row (a position, not a class label).
Y_pred = np.argmax(Y_prob, axis=1)

In [None]:

#see the inputs and outputs
# Diagram with layer shapes of `model` (the last model trained).
plot_model(model,show_shapes=True)

In [None]:
# Loss and accuracy of the selected model on the test set.
# `X_test4` is a copy of the same test features as `X_test5`.
score = model_best.evaluate(X_test4, Y_test_dum, verbose=1)
print(score)

In [None]:
# %% More metrics

# argmax yields the POSITION of the winning output unit (0..3), whereas the
# targets hold the class labels. The one-hot column names ('y_<label>') give
# the label behind each position; without this mapping the scores compared
# positions with labels and were meaningless.
class_labels = np.array([float(str(column).split('_', 1)[1]) for column in Y_train_dum.columns])

# NOTE(review): `model` is the last network trained by the search loop, not
# necessarily `model_best` — confirm which one should be reported.
Yhat_train = class_labels[np.argmax(model.predict(X_train5), axis=1)]
Yhat_test = class_labels[np.argmax(model.predict(X_test5), axis=1)]
accu_train = accuracy_score(y_train5, Yhat_train)
prec_train = precision_score(y_train5, Yhat_train, average='weighted',zero_division = 1)
reca_train = recall_score(y_train5, Yhat_train, average='weighted',zero_division = 1)
accu_test = accuracy_score(y_test5, Yhat_test)
prec_test = precision_score(y_test5, Yhat_test, average='weighted',zero_division = 0)
reca_test = recall_score(y_test5, Yhat_test, average='weighted',zero_division = 0)
print(' \t\t Accu \t Prec \t Reca\n Train \t %0.3f \t %0.3f \t %0.3f\n  Test \t %0.3f \t %0.3f \t %0.3f' % (accu_train,
                                                                                                            prec_train,
                                                                                                            reca_train,
                                                                                                            accu_test,
                                                                                                            prec_test,
                                                                                                            reca_test))

In [None]:
# Side-by-side comparison (percent) of the baseline and the three classical
# models; each list follows the row order given in `index` below.
results = {'Accuracy': [accuracy_martingala, accuracy_logistic, accuracy_naive, accuracy_forest],
           'Recall': [recall_martingala, recall_logistic, recall_naive, recall_forest],
           'Precision': [precision_martingala, precision_logistic, precision_naive, precision_forest],
           'F1': [f1_martingala, f1_logistic, f1_naive, f1_forest],
           'Balanced Accuracy': [balanced_martingala, balanced_logistic, balanced_naive, balanced_forest]
           }

table = pd.DataFrame(results, index=['Martingale', 'Logistic Regression', 'Naive Bayes', 'Random Forest'])
print(table)