**Analysis owner**: Felipe Matheus <br>
**Date**: 12/10/2021 <br>
**Related Links**:

The objective of this notebook is to load the processed data produced by the EDA notebook and build a simple classification (identification) model.

# Index

1. Setup
2. Loading data
3. Launching H2O AutoML
4. Verifying results
5. Conclusions

# 1. Setup

In [1]:
import os
import h2o
import sys
import json
import pickle
import pandas as pd
import numpy as np

%load_ext autoreload
%autoreload 2

In [2]:
# Connect to an H2O cluster at the default local address; as the log below
# shows, a local server is started when none is already running.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.11" 2021-04-20; OpenJDK Runtime Environment (build 11.0.11+9-Ubuntu-0ubuntu2.20.04); OpenJDK 64-Bit Server VM (build 11.0.11+9-Ubuntu-0ubuntu2.20.04, mixed mode, sharing)
  Starting server from /local/home/ff268477/projects/phd-data-wrangling/.eda-jeff-data/lib/python3.8/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpavg8rmg1
  JVM stdout: /tmp/tmpavg8rmg1/h2o_ff268477_started_from_python.out
  JVM stderr: /tmp/tmpavg8rmg1/h2o_ff268477_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,Europe/Paris
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.34.0.3
H2O_cluster_version_age:,15 days
H2O_cluster_name:,H2O_from_python_ff268477_7ocojs
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.820 Gb
H2O_cluster_total_cores:,12
H2O_cluster_allowed_cores:,12


In [14]:
# Add the parent directory to sys.path (only once) so the project-local `src`
# package imported below can be resolved -- assumes `src` lives one level above
# the directory this notebook is run from.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

# Project helpers used in the rest of the notebook.
from src.feature_engineering.FeatureEngineering import FeatureEngineering
from src.modeling.Modeling import Modeling
from src.processing.Processing import Processing

Variables

In [4]:
# Limit DataFrame previews to 6 rows so displayed frames stay short.
pd.options.display.max_rows = 6
# Read the project configuration file.
with open('../config/variables.json') as json_file:
    var = json.load(json_file)
# Path prefix of the structured data files. .get() yields None when the key is
# absent, which would only surface later, when the prefix is used to build a path.
STRUCTURED_DATA = var.get('STRUCTURED_DATA')

Objects

In [15]:
# Instantiate the project helpers with their default constructor arguments.
feng = FeatureEngineering()  # used below for label_dataframe
modeling = Modeling()  # used below for spliting and training
processing = Processing()  # used below for scale and reduce_dimension

# 2. Loading Data

In [6]:
# Load the processed dataframe described in the notebook introduction.
# os.path.join yields a valid path whether or not STRUCTURED_DATA ends with a
# path separator (plain string concatenation silently requires the trailing one).
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
df = pd.read_pickle(os.path.join(STRUCTURED_DATA, "dataframe_21_1200.pkl"))

# 3. Launching H2O AutoML

In [7]:
# Label the samples for the 'bd' target. The return value is discarded and the
# frames displayed below carry a boolean `has_bd` column, so this presumably
# adds that column to `df` in place -- confirm in FeatureEngineering.label_dataframe.
feng.label_dataframe(df, 'bd')

In [8]:
# Remove the "-1.0" and "-2.0" columns and replace missing values with 0.
# errors="ignore" keeps this cell re-runnable: `df` is rebound here, so without
# it a second execution raises KeyError because the columns are already gone.
df = df.drop(columns=["-1.0", "-2.0"], errors="ignore").fillna(0)

In [16]:
# Preview the cleaned frame (the number of rows shown is capped by display.max_rows).
df

Unnamed: 0,1.0,10.0,100.0,101.0,102.0,103.0,104.0,105.0,106.0,107.0,...,889.0,899.0,990.0,875.0,944.0,945.0,967.0,-0.0,992.0,has_bd
eu_bd_0,0.000030,0.009258,0.003755,0.003283,0.003688,0.003548,0.003445,0.003445,0.003541,0.003799,...,0.00000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,True
eu_bd_1,0.000044,0.009922,0.003445,0.003703,0.003305,0.003681,0.003578,0.003467,0.003497,0.003740,...,0.00000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,True
eu_bd_2,0.000059,0.009604,0.003548,0.003482,0.003489,0.003290,0.003460,0.003585,0.003076,0.003445,...,0.00003,0.000007,0.000007,0.0,0.0,0.0,0.0,0.0,0.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ba_co_1197,0.000000,0.008867,0.001977,0.001837,0.001571,0.001778,0.001734,0.001497,0.001571,0.001999,...,0.00000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,False
ba_co_1198,0.000000,0.008579,0.001992,0.001586,0.001483,0.001770,0.001468,0.001608,0.001748,0.001866,...,0.00000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,False
ba_co_1199,0.000000,0.008439,0.001815,0.001556,0.001711,0.001497,0.001520,0.001549,0.001903,0.001719,...,0.00000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,False


In [21]:
# Preview the feature columns only, i.e. every column except the `has_bd` target.
df.loc[:, df.columns != "has_bd"]

Unnamed: 0,1.0,10.0,100.0,101.0,102.0,103.0,104.0,105.0,106.0,107.0,...,871.0,889.0,899.0,990.0,875.0,944.0,945.0,967.0,-0.0,992.0
eu_bd_0,0.000030,0.009258,0.003755,0.003283,0.003688,0.003548,0.003445,0.003445,0.003541,0.003799,...,0.000000,0.00000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
eu_bd_1,0.000044,0.009922,0.003445,0.003703,0.003305,0.003681,0.003578,0.003467,0.003497,0.003740,...,0.000000,0.00000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
eu_bd_2,0.000059,0.009604,0.003548,0.003482,0.003489,0.003290,0.003460,0.003585,0.003076,0.003445,...,0.000007,0.00003,0.000007,0.000007,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ba_co_1197,0.000000,0.008867,0.001977,0.001837,0.001571,0.001778,0.001734,0.001497,0.001571,0.001999,...,0.000000,0.00000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
ba_co_1198,0.000000,0.008579,0.001992,0.001586,0.001483,0.001770,0.001468,0.001608,0.001748,0.001866,...,0.000000,0.00000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
ba_co_1199,0.000000,0.008439,0.001815,0.001556,0.001711,0.001497,0.001520,0.001549,0.001903,0.001719,...,0.000000,0.00000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# Scale every feature column (all columns except the `has_bd` target).
# The preview further down suggests the scaled values end up in [0, 1].
# NOTE(review): the scaler is given the full dataset, before the train/test
# split performed later, so held-out rows influence the transform -- confirm
# this is intended.
scaled_features = processing.scale(df.loc[:, df.columns != "has_bd"])



In [23]:
# Wrap the scaled values in a DataFrame. As the preview below shows, the result
# carries plain integer row and column labels rather than the labels of `df`.
df_scaled = pd.DataFrame(scaled_features)

In [24]:
# Preview the scaled features.
df_scaled

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,944,945,946,947,948,949,950,951,952,953
0,0.687187,0.310310,0.875375,0.773273,0.871872,0.860360,0.815315,0.806807,0.848849,0.875876,...,0.000000,0.000000,0.00000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0
1,0.794294,0.508008,0.819319,0.869870,0.784284,0.881381,0.846847,0.812312,0.840340,0.866867,...,0.000000,0.000000,0.00000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0
2,0.881882,0.403403,0.843343,0.830330,0.834835,0.807307,0.819319,0.838839,0.737738,0.810310,...,0.666667,0.992492,0.69019,0.92993,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25197,0.000000,0.280280,0.578579,0.589089,0.505005,0.592092,0.577077,0.467467,0.475475,0.577578,...,0.000000,0.000000,0.00000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0
25198,0.000000,0.255255,0.582082,0.494494,0.463964,0.590591,0.468468,0.517518,0.547047,0.544044,...,0.000000,0.000000,0.00000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0
25199,0.000000,0.234234,0.523524,0.482482,0.568569,0.482482,0.489990,0.489990,0.588589,0.497998,...,0.000000,0.000000,0.00000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
# Reduce the dimensionality of the scaled features.
# NOTE(review): 0.8 is presumably the fraction of explained variance to retain
# (the result previewed below has 44 columns) -- confirm in
# Processing.reduce_dimension and consider naming this constant.
pca_features = processing.reduce_dimension(df_scaled, 0.8)

In [30]:
# Wrap the reduced features in a DataFrame so they can be previewed and joined
# with the target column.
df_pca = pd.DataFrame(pca_features)

In [31]:
# Preview the reduced features.
df_pca

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,34,35,36,37,38,39,40,41,42,43
0,-4.971151,-1.410942,1.968420,0.760675,2.475709,1.166081,0.498858,0.349708,0.515057,0.358442,...,0.166726,0.319509,0.655591,-0.139559,-0.184487,0.179352,-0.218999,-0.016956,-0.710284,-0.579743
1,-5.562099,-1.574601,1.536875,0.809586,2.331037,1.249744,0.953034,-0.554801,0.684451,0.018334,...,-0.237098,-0.163459,-0.150259,-0.417085,-0.132615,-0.187714,0.163930,-0.245515,-0.745143,-0.229617
2,-5.532205,-1.649871,1.395735,0.732571,2.114463,1.121459,1.017321,-0.399947,-0.271746,-0.165159,...,-0.165380,0.125509,0.335697,-0.118711,-0.202447,0.653231,-0.244227,0.369147,0.067211,-0.106821
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25197,11.726173,-1.739515,1.618560,0.737163,-2.502996,1.404979,0.432592,0.036466,-0.083602,0.337082,...,-0.035256,0.005396,0.000571,-0.021619,0.038810,0.002856,-0.004027,-0.004836,-0.009952,-0.005025
25198,11.647468,-1.683714,1.688236,0.686666,-2.497269,1.549103,0.435204,0.064752,-0.080622,0.321426,...,-0.033161,-0.022233,0.010067,-0.019753,0.014145,0.009122,0.036148,0.007490,-0.002718,-0.035748
25199,11.728495,-1.781068,1.694701,0.728633,-2.451190,1.472627,0.446997,0.073618,-0.094535,0.343201,...,-0.037578,-0.002649,-0.004191,-0.030963,0.034273,0.021465,0.023552,0.007531,0.010461,-0.013217


In [34]:
# Assemble the training table: reduced features side by side with the target.
df_to_train = pd.concat(
    [
        # Re-attach the sample labels of `df`, replacing the integer index of df_pca.
        df_pca.set_index(df.index),
        # Boolean column mask keeps the target as a (single-column) DataFrame.
        df.loc[:, df.columns == "has_bd"],
    ],
    axis=1,
)

In [35]:
# Preview the training table: reduced features plus the `has_bd` target.
df_to_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,35,36,37,38,39,40,41,42,43,has_bd
eu_bd_0,-4.971151,-1.410942,1.968420,0.760675,2.475709,1.166081,0.498858,0.349708,0.515057,0.358442,...,0.319509,0.655591,-0.139559,-0.184487,0.179352,-0.218999,-0.016956,-0.710284,-0.579743,True
eu_bd_1,-5.562099,-1.574601,1.536875,0.809586,2.331037,1.249744,0.953034,-0.554801,0.684451,0.018334,...,-0.163459,-0.150259,-0.417085,-0.132615,-0.187714,0.163930,-0.245515,-0.745143,-0.229617,True
eu_bd_2,-5.532205,-1.649871,1.395735,0.732571,2.114463,1.121459,1.017321,-0.399947,-0.271746,-0.165159,...,0.125509,0.335697,-0.118711,-0.202447,0.653231,-0.244227,0.369147,0.067211,-0.106821,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ba_co_1197,11.726173,-1.739515,1.618560,0.737163,-2.502996,1.404979,0.432592,0.036466,-0.083602,0.337082,...,0.005396,0.000571,-0.021619,0.038810,0.002856,-0.004027,-0.004836,-0.009952,-0.005025,False
ba_co_1198,11.647468,-1.683714,1.688236,0.686666,-2.497269,1.549103,0.435204,0.064752,-0.080622,0.321426,...,-0.022233,0.010067,-0.019753,0.014145,0.009122,0.036148,0.007490,-0.002718,-0.035748,False
ba_co_1199,11.728495,-1.781068,1.694701,0.728633,-2.451190,1.472627,0.446997,0.073618,-0.094535,0.343201,...,-0.002649,-0.004191,-0.030963,0.034273,0.021465,0.023552,0.007531,0.010461,-0.013217,False


In [36]:
# Split the training table with a 0.8 ratio for the `has_bd` target.
# The returned dict is read with the keys "X" and "test" in the training cell.
# NOTE(review): which rows each entry holds is decided inside Modeling.spliting
# -- confirm that "X" contains only the training part.
split_dict = modeling.spliting(df=df_to_train, split_ratio= 0.8, target_variable="has_bd")

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [37]:
# Train H2O AutoML for the `has_bd` target through the project wrapper.
# NOTE(review): the confusion matrix reported further down totals 25,200 rows,
# the size of the whole dataset, so split_dict["X"] may be the full frame rather
# than the 80% training part -- confirm what Modeling.spliting returns under "X".
# NOTE(review): `features` lists every column of split_dict["X"]; check that the
# target column is not among them (or is excluded inside Modeling.training).
aml = modeling.training(
    model_name= "bd_identification_model",
    target_variable= "has_bd",
    features=list(split_dict["X"].columns),
    train= split_dict["X"],
    test= split_dict["test"],
    maxmodels = 5,
    seed = 1
)

AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%


In [38]:
# Show the AutoML leaderboard: one row per trained model with its metrics.
aml.leaderboard

model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
XGBoost_1_AutoML_1_20211022_173918,1,0.00130026,1,0,0.00460872,2.12403e-05
StackedEnsemble_AllModels_1_AutoML_1_20211022_173918,1,0.000479919,1,0,0.000540762,2.92423e-07
StackedEnsemble_BestOfFamily_3_AutoML_1_20211022_173918,1,7.68422e-05,1,0,8.51535e-05,7.25112e-09
StackedEnsemble_BestOfFamily_2_AutoML_1_20211022_173918,1,0.000479919,1,0,0.000540762,2.92424e-07
StackedEnsemble_BestOfFamily_4_AutoML_1_20211022_173918,1,0.00265791,1,0,0.00290721,8.45187e-06
GLM_1_AutoML_1_20211022_173918,1,0.000434548,1,0,0.000744857,5.54812e-07
StackedEnsemble_AllModels_4_AutoML_1_20211022_173918,1,5.33704e-05,1,0,6.09103e-05,3.71006e-09
XGBoost_2_AutoML_1_20211022_173918,1,0.000606363,1,0,0.00138517,1.91868e-06
StackedEnsemble_BestOfFamily_1_AutoML_1_20211022_173918,1,0.000560118,1,0,0.000616293,3.79817e-07
GBM_1_AutoML_1_20211022_173918,1,3.70748e-17,1,0,2.7148e-16,7.37016e-32




In [41]:
# Confusion matrix of the best model on the leaderboard.
# NOTE(review): called without arguments, H2O reports this on the training
# data, not on held-out data. For an unbiased check consider
# aml.leader.model_performance(split_dict["test"]).confusion_matrix() -- confirm
# that split_dict["test"] is an H2OFrame disjoint from the training rows.
aml.leader.confusion_matrix()


Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.8570694923400879: 


Unnamed: 0,Unnamed: 1,False,True,Error,Rate
0,False,18000.0,0.0,0.0,(0.0/18000.0)
1,True,0.0,7200.0,0.0,(0.0/7200.0)
2,Total,18000.0,7200.0,0.0,(0.0/25200.0)




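# Suspicious: the model appears to be right 100% of the time, so something is probably wrong
- Note that the confusion matrix above covers all 25,200 rows of the dataset, so it most likely reflects data the model was trained on rather than a held-out set
- Manually inspect the predictions (run `model.predict` on some random cases)
- Generate new data and test the model with it
- Test the models on other elements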