# Neural Network Methods 1


<div class="alert alert-block alert-warning">
  
<b>Notebook objectives:</b>
    
* Apply NNs methods to train and test generalization classification on:

    - Baseline data set
    - Class Imbalance treatment data set
    - Evolutionary Algorithm regularizaion
    
* Compile the results of the performance metrics

    
    

In [7]:
# if Notebook set up throws and error uncomment and install keras and reset kernel
# !pip install keras

In [8]:
# if Notebook set up throws and error uncomment and install tensorflow and reset kernel
# !pip install tensorflow

In [9]:
# Upgrade  tensorflow and tensflow gpu to latest version
# !pip install --upgrade tensorflow
# !pip install --upgrade tensorflow-gpu

In [4]:
# Try downgrading pandas version if pickle throws an error while loading and reset kernel
# !pip install pandas==1.4.1

# 1. Notebook set up

In [8]:
###### Import packages

### Data handling
import numpy as np
import pandas as pd
import datetime as dt
#from IPython.display import HTML, Image #display formatted texts
import warnings
warnings.filterwarnings('ignore')

### Plotting packages
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('seaborn') # pretty graphs
import matplotlib.ticker as mticker
from matplotlib.ticker import FormatStrFormatter, StrMethodFormatter, FuncFormatter
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go


### Files to pickle
import pickle
import bz2
import _pickle as cPickle

### sampling
from sklearn.model_selection import train_test_split

### sk-learn pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer

### Model selection

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier

# if xgboost throws and error uncomment and install xgboost package
# !pip install xgboost
# from xgboost import XGBClassifier
# from xgboost import plot_importance, to_graphviz

# Pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_transformer

# Cross-validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

# Dimensionality reduction
from sklearn.decomposition import PCA


from sklearn.metrics import confusion_matrix, roc_curve, auc, roc_auc_score, accuracy_score, precision_score, recall_score, average_precision_score, f1_score
from sklearn.inspection import partial_dependence


# NN methods
import tensorflow as tf
from tensorflow import keras 
from keras.models import Sequential 
from keras.layers import Dense, Dropout,Conv1D
from keras.utils  import np_utils 

# time progress bar
from tqdm.notebook import tqdm_notebook
from tqdm import tqdm
import time

# Google path set up
pickles = "/content/drive/MyDrive/pickles/"
pickles_w1 = "/content/drive/MyDrive/pickles/w1/"
pickles_w2 = "/content/drive/MyDrive/pickles/w2/"
pickles_w3 = "/content/drive/MyDrive/pickles/w3/"
pickles_w4 = "/content/drive/MyDrive/pickles/w4/"
pickles_w5 = "/content/drive/MyDrive/pickles/w5/"
pickles_imbalance = "/content/drive/MyDrive/pickles/class_imbalance_learn/"

# Path set up
path = "/project/data/"
path_w1 = "/project/data/w1/"
path_w2 = "/project/data/w2/"
path_w3 = "/project/data/w3/"
path_w4 = "/project/data/w4/"
path_w5 = "/project/data/w5/"
path_feature = "/project/data/feature_importance/"


# fixed values
seed = 2323
colors = {'c1':['blue', 'red'], 'c2': ['red', 'blue', 'grey', 'purple']}
bar_width = 0.3
bin_num = 25
size = {'small_tick': 9, 'tick': 10 , 'label': 14, 'sub_title': 16, 'main_title': 20}
fig_size = {'large': (30,25), 'small': (10, 5)}

# pandas display set up
pd.options.display.max_columns = None

# 2. Loading pre-processed X and y

In [2]:
# Loading X
pickled_data_X = bz2.BZ2File(path_w2 + 'X_train_dense_w2_1pct', 'rb')
X = cPickle.load(pickled_data_X)
pickled_data_X.close()
print(F"X_train shape: rows {X.shape[0]}, columns {X.shape[1]}")

X_train shape: rows 338282, columns 304


In [3]:
# Loading y
pickled_data_y = bz2.BZ2File(path_w2 + 'y_train_dense_w2_1pct', 'rb')
y = cPickle.load(pickled_data_y)
pickled_data_X.close()
print(F"X_train shape: rows {y.shape[0]}, columns 1")

X_train shape: rows 338282, columns 1


In [4]:
# Loading X resampled
pickled_data_X = bz2.BZ2File(path_w2 + 'X_resampled_w2_1pct', 'rb')
X_resampled = cPickle.load(pickled_data_X)
pickled_data_X.close()
print(F"X_train shape: rows {X_resampled.shape[0]}, columns {X_resampled.shape[1]}")

X_train shape: rows 101415, columns 304


In [5]:
# Loading X resampled
pickled_data_y = bz2.BZ2File(path_w2 + 'y_resampled_w2_1pct', 'rb')
y_resampled = cPickle.load(pickled_data_y)
pickled_data_X.close()
print(F"X_train shape: rows {y_resampled.shape[0]}, columns 1")

X_train shape: rows 101415, columns 1


In [6]:
# Loading X test
pickled_data_X = bz2.BZ2File(path_w2 + 'X_test_dense_w2_1pct', 'rb')
X_test_dense = cPickle.load(pickled_data_X)
pickled_data_X.close()
print(F"X_train shape: rows {X_test_dense.shape[0]}, columns {X_test_dense.shape[1]}")

X_train shape: rows 59697, columns 304


In [7]:
# Loading y test
pickled_data_y = bz2.BZ2File(path_w2 + 'y_test_dense_w2_1pct', 'rb')
y_test_dense = cPickle.load(pickled_data_y)
pickled_data_y.close()
print(F"y_train shape: rows {y_test_dense.shape[0]}, columns 1")
# y_train = np.array(y_train)

y_train shape: rows 59697, columns 1


# 3. Neural Network methods

In [None]:
def models_function(X):

  # Model 1 
  # Input layer: Number of features
  # Hiden layer: 2*Num of features
  # Turns off 40% of neurons
  # Capa oculta: 2*Num of features
  # Turns off 40% of neurons
  # Hiden layer: Num of features
  # Turns off 40% of neurons
  # Output layer: 1 neuron
  # Hyper Parameters: Loss function -> binary cross entropy, metrics: Recall y AUC, Optimizer: Adam 
  n_columns = X.shape[1]

  model = Sequential()
  model.add(Dense(units=n_columns, activation='relu'))
  model.add(Dropout(0.4))
  model.add(Dense(units=n_columns*2, activation='relu'))
  model.add(Dropout(0.4))
  model.add(Dense(units=n_columns*2, activation='relu'))
  model.add(Dropout(0.4))
  model.add(Dense(units=n_columns, activation='relu'))
  model.add(Dropout(0.4))
  model.add(Dense(units=1, activation = 'sigmoid'))
  model.compile(loss = 'binary_crossentropy', metrics=['Recall', 'AUC'], 
                optimizer = 'adam')



  # Model 2
  # Input layer: Number of features 
  # Turns off 40% of neurons 
  # Hiden layer: Num of features
  # Turns off 40% of neurons
  # Output layer: 1 neuron
  # Characteristics: Loss function -> binary cross entropy, metrics: Recall y AUC, Optimizer: Adam 
  n_columns = X.shape[1]

  model2 = Sequential()
  model2.add(Dense(units=n_columns, activation='relu'))
  model2.add(Dropout(0.4))
  model2.add(Dense(units=n_columns, activation='relu'))
  model2.add(Dropout(0.4))
  model2.add(Dense(units=1, activation = 'sigmoid'))
  model2.compile(loss = 'binary_crossentropy', metrics=['Recall', 'AUC'], 
                optimizer = 'adam')



  # Model 3
  # Input layer: Number of features 
  # Turns off 40% of neurons 
  # Hiden layer: Num of features divided by 2  
  # Turns off 40% of neurons 
  # Hiden layer: Num of features divided by 4 
  # Turns off 40% of neurons 
  # Hiden layer: Num of features divided by 2  
  # Turns off 40% of neurons 
  # Input layer: Number of features 
  # Turns off 40% of neurons 
  # Output layer: 1 neuron
  # Characteristics: Loss function -> binary cross entropy, metrics: Recall y AUC, Optimizer: Adam 

  n_columns = X.shape[1]

  model3 = Sequential()
  model3.add(Dense(units=n_columns, activation='relu'))
  model3.add(Dropout(0.4))
  model3.add(Dense(units=int(n_columns/2), activation='relu'))
  model3.add(Dropout(0.4))
  model3.add(Dense(units=int(n_columns/4), activation='relu'))
  model3.add(Dropout(0.4))
  model3.add(Dense(units=int(n_columns/2), activation='relu'))
  model3.add(Dropout(0.4))
  model3.add(Dense(units=n_columns, activation='relu'))
  model3.add(Dropout(0.4))
  model3.add(Dense(units=1, activation = 'sigmoid'))
  model3.compile(loss = 'binary_crossentropy', metrics=['Recall', 'AUC'], 
                optimizer = 'adam')

  # Modelo 4
  # Input layer: Number of features 
  # Turns off 40% of neurons 
  # Hiden layer: Num of features divided by 2  
  # Turns off 40% of neurons 
  # Hiden layer: Num of features divided by 4 
  # Turns off 40% of neurons 
  # Hiden layer: Num of features divided by 8
  # Turns off 40% of neurons 
  # Hiden layer: Num of features divided by 4 
  # Turns off 40% of neurons 
  # Hiden layer: Num of features divided by 2  
  # Turns off 40% of neurons 
  # Input layer: Number of features  
  # Turns off 40% of neurons 
  # Output layer: 1 neuron 
  # Characteristics: Loss function -> binary cross entropy, metrics: Recall y AUC, Optimizer: Adam 
  n_columns = X.shape[1]

  model4 = Sequential()
  model4.add(Dense(units=n_columns, activation='relu'))
  model4.add(Dropout(0.4))
  model4.add(Dense(units=int(n_columns/2), activation='relu'))
  model4.add(Dropout(0.4))
  model4.add(Dense(units=int(n_columns/4), activation='relu'))
  model4.add(Dropout(0.4))
  model4.add(Dense(units=int(n_columns/8), activation='relu'))
  model4.add(Dropout(0.4))
  model4.add(Dense(units=int(n_columns/4), activation='relu'))
  model4.add(Dropout(0.4))
  model4.add(Dense(units=int(n_columns/2), activation='relu'))
  model4.add(Dropout(0.4))
  model4.add(Dense(units=n_columns, activation='relu'))
  model4.add(Dropout(0.4))
  model4.add(Dense(units=1, activation = 'sigmoid'))
  model4.compile(loss = 'binary_crossentropy', metrics=['Recall', 'AUC'], 
                optimizer = 'adam')
  return model, model2, model3, model4

## 3.1 Baseline predictions



In [None]:
X = X_data
y = y_data

In [None]:
models = models_function(X)

In [None]:
Results = pd.DataFrame()
recall = []
auc = []
name= []
i = 1
for model in models:
    model.fit(X, y, batch_size = 128, epochs = 20, verbose = 1)
    pred = model.predict(X_test_dense)
    score = model.evaluate(X_test_dense, y_test_dense, verbose=1)
    print('Loss Function:', score[0])
    print('Test recall:', score[1])
    print('Test auc:', score[2])
    print(model.summary())
    auc.append(score[2])
    recall.append(score[1])
    name.append('model_number_{}'.format(i))
    i = i+1

Results['Modelo'] = name
Results['AUC'] = auc 
Results['Recall'] = recall 


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Loss Function: 0.008671252988278866
Test recall: 0.0
Test auc: 0.5
Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_44 (Dense)            (None, 304)               92720     
                                                                 
 dropout_36 (Dropout)        (None, 304)               0         
                                                                 
 dense_45 (Dense)            (None, 608)               185440    
                                                                 
 dropout_37 (Dropout)        (None, 608)               0         
                                                                 
 dense_46 (Dense)   

In [None]:
display(Results)

Unnamed: 0,Modelo,AUC,Recall
0,model_number_1,0.5,0.0
1,model_number_2,0.499227,0.0
2,model_number_3,0.5,0.0
3,model_number_4,0.5,0.0


## 3.2 Results with imbalance class transformations

Training model 1

In [None]:
X = X_resampled
y = y_resampled

In [None]:
Results = pd.DataFrame()
recall = []
auc = []
name= []
i = 1
for model in models:
  model.fit(X, y, batch_size = 128, epochs = 20, verbose = 1)
  pred = model.predict(X_test_dense)
  score = model.evaluate(X_test_dense, y_test_dense, verbose=1)
  print('Loss Function:', score[0])
  print('Test recall:', score[1])
  print('Test auc:', score[2])
  print(model.summary())
  auc.append(score[2])
  recall.append(score[1])
  name.append('model_number_{}'.format(i))
  i = i+1

Results['Modelo'] = name
Results['AUC'] = auc 
Results['Recall'] = recall 


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Loss Function: 0.31309032440185547
Test recall: 0.0
Test auc: 0.507856011390686
Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_44 (Dense)            (None, 304)               92720     
                                                                 
 dropout_36 (Dropout)        (None, 304)               0         
                                                                 
 dense_45 (Dense)            (None, 608)               185440    
                                                                 
 dropout_37 (Dropout)        (None, 608)               0         
                                                                 
 dense_

In [None]:
display(Results)

Unnamed: 0,Modelo,AUC,Recall
0,model_number_1,0.507856,0.0
1,model_number_2,0.50738,0.018868
2,model_number_3,0.533857,0.056604
3,model_number_4,0.516062,0.0


## 3.3 Results Genetic Algorithm regularization


In [None]:
variables = ['total_conversions', 'device_make', 'inventory_source', 'tod', 'ad_type',
 'browser', 'os', 'insertion_order', 'creative_name', 'device_type',
 'device_make', 'inventory_source', 'post_click_conversions',
 'post_view_conversions', 'total_conversions', 'viewable_impressions',
 'creative_size', 'billable_impressions', 'tod', 'device_type',
 'media_cost_log', 'impressions', 'viewable_impressions', 'creative_size',
 'post_view_conversions', 'day_of_week']

In [None]:
X_data.columns.tolist()

['impressions',
 'billable_impressions',
 'viewable_impressions',
 'total_conversions',
 'post_view_conversions',
 'media_cost_log',
 'total_media_cost_log',
 'device_model',
 'app_url',
 'city',
 'date_03/04/2022',
 'date_06/04/2022',
 'date_08/04/2022',
 'date_11/04/2022',
 'date_16/04/2022',
 'date_17/04/2022',
 'tod_tod_10',
 'tod_tod_11',
 'tod_tod_12',
 'tod_tod_13',
 'tod_tod_14',
 'tod_tod_15',
 'tod_tod_16',
 'tod_tod_17',
 'tod_tod_18',
 'tod_tod_19',
 'tod_tod_2',
 'tod_tod_20',
 'tod_tod_21',
 'tod_tod_22',
 'tod_tod_23',
 'tod_tod_24',
 'tod_tod_3',
 'tod_tod_4',
 'tod_tod_5',
 'tod_tod_6',
 'tod_tod_7',
 'tod_tod_8',
 'tod_tod_9',
 'insertion_order_insertion_order2',
 'insertion_order_insertion_order3',
 'line_item_line_item10',
 'line_item_line_item11',
 'line_item_line_item12',
 'line_item_line_item13',
 'line_item_line_item14',
 'line_item_line_item15',
 'line_item_line_item16',
 'line_item_line_item17',
 'line_item_line_item18',
 'line_item_line_item19',
 'line_item_l