### Imports

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier

import xgboost as xgb
from imblearn.over_sampling import SMOTE

from MachineLearning import models

### Read Databases

In [2]:
# Load the two preprocessed views.
# Forward slashes are used instead of backslashes: "\p" is an invalid escape
# sequence in a normal string literal, and "/" works on Windows as well as on
# POSIX systems, so the relative paths stay portable.
local_view = pd.read_csv("Preprocessed/preprocessed_local_view.csv", sep=",")
global_view = pd.read_csv("Preprocessed/preprocessed_global_view.csv", sep=",")

# "Unnamed: 0" is the name pandas gives to a header-less first column
# (presumably an index saved alongside the data -- TODO confirm); it is not a feature.
local_view.drop(columns=["Unnamed: 0"], inplace=True)
global_view.drop(columns=["Unnamed: 0"], inplace=True)

# Discard every row that contains at least one missing value, in both views.
dropna_list = [local_view, global_view]

for var in dropna_list:
    var.dropna(inplace=True)

print("\n============================================================================================================")
print("Checking base balance: ")

# Stack the two label columns under distinct names so that a per-column
# value_counts yields one table with the class counts of each view side by side.
targets = pd.concat(
    [
        local_view[['label']].rename(columns={'label': 'target_local'}),
        global_view[['label']].rename(columns={'label': 'target_global'}),
    ],
    axis=0,
    ignore_index=True,
)
# A class absent from one view would produce NaN; report it as a count of 0.
counts = targets.apply(pd.Series.value_counts).fillna(0).astype(int)

print(counts)


Checking base balance: 
                target_local  target_global
FALSE POSITIVE          4744           4215
CONFIRMED               2639           2429


### Transform target column values into 0 and 1

In [3]:
# Integer encoding of the string labels: CONFIRMED -> 0, FALSE POSITIVE -> 1.
target_map = {'CONFIRMED': 0, 'FALSE POSITIVE': 1}

# Apply the same encoding to the 'label' column of both views.
for frame in (local_view, global_view):
    frame['label'] = frame['label'].map(target_map)

### Separating into training and testing

In [4]:
# ------------- Targets and features -------------
# The target is the 'label' column; the features are every column except the
# last one (assumes 'label' is the last column -- TODO confirm).
y_local = local_view['label']
y_global = global_view['label']

X_local = local_view.iloc[:, :-1]
X_global = global_view.iloc[:, :-1]

# ------------- 70 / 30 stratified hold-out split -------------
# Stratifying on the target keeps the class proportions equal in both parts;
# the fixed seed makes the partition repeatable.
(X_train_local, X_test_local,
 y_train_local, y_test_local) = train_test_split(
    X_local,
    y_local,
    test_size=0.3,
    random_state=42,
    stratify=y_local,
)

(X_train_global, X_test_global,
 y_train_global, y_test_global) = train_test_split(
    X_global,
    y_global,
    test_size=0.3,
    random_state=42,
    stratify=y_global,
)

### SMOTE balancing

In [5]:
# SMOTE balancing
# Oversample only the training partitions so the test sets keep the original
# class distribution. The sampler is seeded with the same value used by the
# split and the classifiers (42); an unseeded SMOTE() draws different synthetic
# samples on every run, which makes results impossible to reproduce.
smote = SMOTE(random_state=42)
X_train_local, y_train_local = smote.fit_resample(X_train_local, y_train_local)  # local view
X_train_global, y_train_global = smote.fit_resample(X_train_global, y_train_global)  # global view

### Classification models and their hyperparameter grids

In [6]:
# Registry of candidate classifiers. Each entry maps a display name to
#   'clf'        -> an unfitted estimator (seeded with 42 for repeatability)
#   'parameters' -> the hyperparameter values to explore for that estimator
models_and_parameters_C = {}

models_and_parameters_C['AdaBoostClassifier'] = {
    'clf': AdaBoostClassifier(random_state=42),
    'parameters': {
        'n_estimators': range(60, 220, 40),  # 60, 100, 140, 180
    },
}

models_and_parameters_C['XGBClassifier'] = {
    'clf': xgb.XGBClassifier(objective="binary:logistic", random_state=42),
    'parameters': {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'max_depth': [3, 4, 5],
    },
}

models_and_parameters_C['SVM'] = {
    'clf': SVC(probability=True, random_state=42),
    'parameters': {
        'C': [1, 3, 5, 10, 15],
        'kernel': ['linear', 'rbf'],
        'tol': [1e-3, 1e-4],
    },
}

models_and_parameters_C['MLPClassifier'] = {
    'clf': MLPClassifier(random_state=42),
    'parameters': {
        'solver': ['sgd', 'adam'],
        'max_iter': [1000, 1300, 1500, 2000],
        'alpha': 10.0 ** -np.arange(1, 10),  # 1e-1 down to 1e-9
        'hidden_layer_sizes': np.arange(10, 15),  # single hidden layer of 10..14 units
        'tol': [1e-3, 1e-4],
    },
}

### Classifier models

In [7]:
# NOTE(review): both calls are currently disabled, so this cell does nothing.
# Presumably they run the search over models_and_parameters_C for each view
# (models.defining_classifiers is not visible here -- confirm before re-enabling).
# They expect the 2-D train/test data, i.e. they must run before the cell
# further down that flattens X_train_local / X_test_local into 1-D arrays.
# models.defining_classifiers(models_and_parameters_C, X_train_local, y_train_local, X_test_local, y_test_local, "local")
# models.defining_classifiers(models_and_parameters_C, X_train_global, y_train_global, X_test_global, y_test_global, "global")

### LSTM

In [8]:
# Preview the first five rows of the local-view feature matrix.
print(X_local.head())

          0         1         2         3         4         5         6  \
0  1.837927  1.802662  1.232059  1.512449  1.569320  1.405640  1.000000   
1  1.000000  1.476023  2.462279  1.453719  1.024518  1.866817  1.159148   
2  1.078824  1.181373  2.207187  1.587193  0.980563  0.577552  1.163036   
3  1.081321  1.580252  0.362759  0.644941  0.309738  0.704545  1.099352   
4  0.680829  0.319184 -0.042460 -0.404105 -0.765749 -0.415386 -0.065023   

          7         8         9  ...       191       192       193       194  \
0  1.496697  0.705534  1.123828  ...  1.601433  1.375619  1.492625  1.445211   
1  1.171658  2.225083  2.339150  ...  1.587496 -0.104021  0.708417  2.168780   
2  1.401895  2.084558  0.594387  ...  1.928679  0.972320  1.613166  2.017149   
3  1.028856  0.958360  0.887864  ...  1.378002  1.048320  2.519353 -0.161869   
4  0.285340  0.635703  0.986067  ...  1.318989  1.396417  1.473845  1.551273   

        195       196       197       198       199       200  
0  1

In [9]:
# Preview the first five encoded labels of the local view.
print(y_local.head())

0    0
1    0
2    0
3    0
4    0
Name: label, dtype: int64


In [10]:
# Flattened layout of the local view: the values of one row (201 feature
# values followed by its label), then the values of the next row, and so on.
#
# uni_data = [1.837927 1.802662 1.232059 ... LABEL(1 or 0) 1.569320 1.405640 ... LABEL(1 or 0) ...]

# DataFrame -> 2-D array -> 1-D array (row-major, so rows stay contiguous).
uni_data = local_view.to_numpy().ravel()

# The variables below ensure standardization and reproducibility.

# Fraction of the rows that goes to the training part of the series.
TRAIN_FRACTION = 0.7

# Derive the split index from the data instead of hard-coding it. The previous
# constant (1038788) was computed by hand as 70 % of 7383 * 201, but each
# flattened row holds every column of local_view (features plus the label),
# so that value neither matched the real array length nor fell on a row
# boundary. Multiplying a whole number of rows by the row width keeps the cut
# aligned with the start of a row.
n_rows, row_width = local_view.shape
TRAIN_SPLIT = int(n_rows * TRAIN_FRACTION) * row_width

In [11]:
# Flatten the local train/test data into 1-D NumPy arrays.
# np.asarray accepts both pandas objects and arrays, so re-running this cell is
# harmless; the previous `.values.ravel()` raised AttributeError on a second
# run because the names already pointed at ndarrays.
# NOTE: the 2-D frames are overwritten here, so any cell that needs the
# tabular train/test data must run before this one.
X_train_local = np.asarray(X_train_local).ravel()
X_test_local = np.asarray(X_test_local).ravel()
y_train_local = np.asarray(y_train_local).ravel()
y_test_local = np.asarray(y_test_local).ravel()

In [12]:
# Inspect the first 203 values of the flattened local view.
uni_data[:203]

array([ 1.83792682,  1.80266181,  1.23205949,  1.51244916,  1.56932018,
        1.40564018,  1.        ,  1.49669697,  0.70553375,  1.12382756,
        1.1114299 ,  1.44419017,  1.50309692,  2.12711933,  1.52828495,
        1.16212154,  2.40785409,  1.51900931,  1.4633947 ,  1.76495743,
        1.38783919,  1.25999072,  1.88723364,  1.70934591,  1.23361735,
        1.42847913,  1.03088031,  1.17966405,  1.29297629,  1.17453201,
        1.94744208,  1.68249518,  1.34438724,  2.08653443,  1.4763223 ,
        2.11519395,  1.68377202,  1.97412647,  1.89862413,  1.11353673,
        1.92158557,  0.84935254,  1.71099146,  1.92820263,  1.42726483,
        0.99790787,  1.23127348,  1.25001944,  1.25288871,  1.32347451,
        1.85450614,  0.96068753,  1.35640173,  0.51606193,  0.4945905 ,
        0.23911974,  0.86746737, -0.16382229,  1.04232218,  0.33728038,
        0.71143815,  0.45230076, -0.26494808,  0.60850513,  0.48261954,
        0.63432782,  0.23709672,  0.21006438,  0.32733172, -0.47

In [16]:
# The string below is Portuguese for "History window size".
'''Tamanho da Janela do Historico'''
# Number of past values fed to the model per sample.
univariate_past_history = 201
# Both names are bound to the same value (1) and passed on as the target offset/size.
future = univariate_future_target = 1

# Build the training windows from uni_data[0:TRAIN_SPLIT] and the validation
# windows from uni_data[TRAIN_SPLIT:].
# NOTE(review): models.univariate_data is not visible here; presumably it
# slides a window of `univariate_past_history` values over the flat array --
# confirm that windows and targets line up with the row layout of uni_data.
x_train_uni, y_train_uni = models.univariate_data(uni_data, 0, TRAIN_SPLIT,
                                        univariate_past_history,
                                        univariate_future_target)
x_val_uni, y_val_uni = models.univariate_data(uni_data, TRAIN_SPLIT, None,
                                    univariate_past_history,
                                    univariate_future_target)


# Test LSTM data local
# NOTE(review): the recorded run of this cell stops at epoch 1 with
# "ValueError: `logits` and `labels` must have the same shape, received
# ((None, 2) vs (None, 1))". The printed model summary ends in a Dense layer
# with output shape (None, 2) while the traceback goes through
# binary_crossentropy, so the model output and the targets disagree in shape.
# The model is built inside models.method_LSTM, so the fix belongs there
# (e.g. a single output unit, or a loss/target encoding matching two units).
# models.method_LSTM(X_train_local, y_train_local, X_test_local, y_test_local, univariate_past_history, future)
models.method_LSTM(x_train_uni, y_train_uni, x_val_uni, y_val_uni, univariate_past_history, future)

# Test LSTM data global
# models.method_LSTM(x_train_uni, y_train_uni, x_val_uni, y_val_uni)

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_2 (LSTM)               (None, 8)                 320       
                                                                 
 dense_2 (Dense)             (None, 2)                 18        
                                                                 
Total params: 338
Trainable params: 338
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10


ValueError: in user code:

    File "c:\Users\alex-\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\engine\training.py", line 1284, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\alex-\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\engine\training.py", line 1268, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\alex-\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\engine\training.py", line 1249, in run_step  **
        outputs = model.train_step(data)
    File "c:\Users\alex-\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\engine\training.py", line 1051, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "c:\Users\alex-\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\engine\training.py", line 1109, in compute_loss
        return self.compiled_loss(
    File "c:\Users\alex-\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\engine\compile_utils.py", line 265, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "c:\Users\alex-\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\losses.py", line 142, in __call__
        losses = call_fn(y_true, y_pred)
    File "c:\Users\alex-\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\losses.py", line 268, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "c:\Users\alex-\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\losses.py", line 2156, in binary_crossentropy
        backend.binary_crossentropy(y_true, y_pred, from_logits=from_logits),
    File "c:\Users\alex-\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\backend.py", line 5707, in binary_crossentropy
        return tf.nn.sigmoid_cross_entropy_with_logits(

    ValueError: `logits` and `labels` must have the same shape, received ((None, 2) vs (None, 1)).
