**Analysis owner**: Felipe Matheus <br>
**Date**: 12/10/2021 <br>
**Related Links**:

The objective of this notebook is to load the processed data produced by the EDA notebook and build a simple classification (identification) model.

# Index

1. Setup
2. Loading data
3. Preparing data
4. Launching H2O AutoML
5. Verifying results
6. Conclusions

# 1. Setup

In [1]:
import os
import h2o
import sys
import json
import pickle
import pandas as pd
import numpy as np

%load_ext autoreload
%autoreload 2

In [2]:
# Connect to an H2O cluster at the default local address; as the log below shows,
# a local server is started first when none is already running.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.11" 2021-04-20; OpenJDK Runtime Environment (build 11.0.11+9-Ubuntu-0ubuntu2.20.04); OpenJDK 64-Bit Server VM (build 11.0.11+9-Ubuntu-0ubuntu2.20.04, mixed mode, sharing)
  Starting server from /local/home/ff268477/projects/phd-data-wrangling/.eda-jeff-data/lib/python3.8/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmp6joyvxk0
  JVM stdout: /tmp/tmp6joyvxk0/h2o_ff268477_started_from_python.out
  JVM stderr: /tmp/tmp6joyvxk0/h2o_ff268477_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,Europe/Paris
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.34.0.3
H2O_cluster_version_age:,19 days
H2O_cluster_name:,H2O_from_python_ff268477_ugo4gy
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.820 Gb
H2O_cluster_total_cores:,12
H2O_cluster_allowed_cores:,12


In [3]:
# Add the parent of the current working directory to the import path so that
# the `src` package imported below can be resolved.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

from src.feature_engineering.FeatureEngineering import FeatureEngineering
from src.DataLoader import DataLoader
from src.modeling.Modeling import Modeling
from src.processing.Processing import Processing
from src.utils import read_files

Variables

In [4]:
# Cap DataFrame previews at 6 rows so displayed frames stay short.
pd.options.display.max_rows = 6

# Read the project paths from the shared JSON configuration file.
with open('../config/variables.json', encoding='utf-8') as json_file:
    var = json.load(json_file)

# Index the keys directly: a missing entry raises KeyError right here instead of
# silently yielding None and failing later when the value is used as a path.
STRUCTURED_DATA = var['STRUCTURED_DATA']
RAW_DATA = var['RAW_DATA']

Objects

In [5]:
# Helper objects from the project's `src` package, created once and reused below.
feng = FeatureEngineering()  # used below for label_dataframe
modeling = Modeling()  # used below for spliting / training
processing = Processing()  # used below for get_energy_dict / reduce_dimension
load = DataLoader(RAW_DATA)  # used below for load_fits; built from the RAW_DATA config value

# 2. Loading Data

In [6]:
# Structured DataFrame pickled under STRUCTURED_DATA (only unpickle files you trust).
dataframe_path = STRUCTURED_DATA + "dataframe_21_1200.pkl"
df = pd.read_pickle(dataframe_path)

# Whatever read_files returns for RAW_DATA is handed to the loader; the resulting
# `energies` mapping supplies the label keys used in the following sections.
raw_files = read_files(RAW_DATA)
fits = load.load_fits(raw_files)
energies = processing.get_energy_dict(fits)

# 3. Preparing Data

## 3.1 Dividing and labeling DataFrames

In [9]:
# Build a zero-filled frame with the columns "0.0" ... "999.0" on the same rows as
# `df`, then overwrite it in place with the values `df` has for matching
# rows/columns; columns absent from `df` stay at 0.0.
thousand_columns = [str(float(position)) for position in range(1000)]
df_thousand = pd.DataFrame(0.0, index=df.index, columns=thousand_columns)
df_thousand.update(df)

In [11]:
# Label the frame once per key of `energies`. Judging by the preview further down,
# each call presumably adds a boolean `has_<label>` column to df_thousand in place
# (the return value is not used).
for label in energies:
    feng.label_dataframe(df_thousand, label)

In [16]:
# Preview the padded, labelled frame (display.max_rows was set to 6 in the setup).
df_thousand

Unnamed: 0,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,...,997.0,998.0,999.0,has_eu,has_bd,has_cs,has_sp,has_am,has_ba,has_co
eu_bd_0,0.000000,0.000030,0.000162,0.000671,0.004574,0.009486,0.011242,0.008645,0.007259,0.009656,...,0.0,0.0,0.0,True,True,False,False,False,False,False
eu_bd_1,0.000022,0.000044,0.000111,0.000701,0.004352,0.009405,0.010069,0.008380,0.007296,0.009641,...,0.0,0.0,0.0,True,True,False,False,False,False,False
eu_bd_2,0.000007,0.000059,0.000074,0.000775,0.004219,0.008771,0.011427,0.008933,0.007568,0.009427,...,0.0,0.0,0.0,True,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ba_co_1197,0.000000,0.000000,0.000007,0.000022,0.000015,0.000037,0.000184,0.002597,0.013042,0.015698,...,0.0,0.0,0.0,False,False,False,False,False,True,True
ba_co_1198,0.000000,0.000000,0.000015,0.000007,0.000030,0.000037,0.000229,0.002619,0.012821,0.015867,...,0.0,0.0,0.0,False,False,False,False,False,True,True
ba_co_1199,0.000000,0.000000,0.000022,0.000007,0.000030,0.000037,0.000125,0.002545,0.013145,0.015277,...,0.0,0.0,0.0,False,False,False,False,False,True,True


In [18]:
# Partition the column names in a single pass: names starting with 'has' are the
# targets, everything else is an input feature. Column order is preserved.
mask_label = []
mask_features = []
for column in df_thousand:
    if column.startswith('has'):
        mask_label.append(column)
    else:
        mask_features.append(column)

In [19]:
# Split the frame column-wise into target columns and input columns.
df_labels = df_thousand.loc[:, mask_label]
df_features = df_thousand.loc[:, mask_features]

In [20]:
# Preview the target columns only.
df_labels

Unnamed: 0,has_eu,has_bd,has_cs,has_sp,has_am,has_ba,has_co
eu_bd_0,True,True,False,False,False,False,False
eu_bd_1,True,True,False,False,False,False,False
eu_bd_2,True,True,False,False,False,False,False
...,...,...,...,...,...,...,...
ba_co_1197,False,False,False,False,False,True,True
ba_co_1198,False,False,False,False,False,True,True
ba_co_1199,False,False,False,False,False,True,True


In [21]:
# Preview the input columns only.
df_features

Unnamed: 0,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,...,990.0,991.0,992.0,993.0,994.0,995.0,996.0,997.0,998.0,999.0
eu_bd_0,0.000000,0.000030,0.000162,0.000671,0.004574,0.009486,0.011242,0.008645,0.007259,0.009656,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
eu_bd_1,0.000022,0.000044,0.000111,0.000701,0.004352,0.009405,0.010069,0.008380,0.007296,0.009641,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
eu_bd_2,0.000007,0.000059,0.000074,0.000775,0.004219,0.008771,0.011427,0.008933,0.007568,0.009427,...,0.000007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ba_co_1197,0.000000,0.000000,0.000007,0.000022,0.000015,0.000037,0.000184,0.002597,0.013042,0.015698,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ba_co_1198,0.000000,0.000000,0.000015,0.000007,0.000030,0.000037,0.000229,0.002619,0.012821,0.015867,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ba_co_1199,0.000000,0.000000,0.000022,0.000007,0.000030,0.000037,0.000125,0.002545,0.013145,0.015277,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 3.2 Applying PCA

In [22]:
# Target dimensionality passed to processing.reduce_dimension
# (the reduced frame previewed further down has 42 columns).
PCA_FEATURES = 42

In [24]:
# Reduce the feature columns to PCA_FEATURES dimensions; the call also returns the
# object that is pickled as "pca.pkl" further down.
features, pca_object = processing.reduce_dimension(df_features, PCA_FEATURES)

# Later cells read `df_pca`, but no remaining cell assigned it (it only existed as
# leftover kernel state). Wrapping with pd.DataFrame works whether `features` is
# an array or already a DataFrame.
df_pca = pd.DataFrame(features)

In [33]:
# Persist the PCA object alongside the structured data. The context manager closes
# the file handle deterministically; the bare open(...) never closed it explicitly.
with open(STRUCTURED_DATA + "pca.pkl", "wb") as pca_file:
    pickle.dump(pca_object, pca_file)

In [34]:
# Preview the reduced features (component columns, positional row index).
df_pca

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,32,33,34,35,36,37,38,39,40,41
0,-0.002719,0.072220,0.075179,0.016850,-0.011178,0.022665,-0.000026,-0.000007,0.000010,0.000089,...,0.000284,-0.000005,-0.000266,-8.959821e-05,-0.000070,0.000180,-0.000109,0.000033,-0.000219,4.172518e-06
1,-0.002878,0.072140,0.074975,0.016540,-0.010832,0.021969,0.000181,-0.000069,0.000083,-0.000118,...,-0.000173,-0.000115,0.000105,-9.387291e-05,-0.000175,-0.000207,0.000877,0.000065,0.000199,-1.306129e-04
2,-0.003182,0.072197,0.076247,0.016395,-0.011064,0.022309,0.000752,-0.000193,0.000290,0.000091,...,-0.000146,-0.000146,0.000194,9.058734e-05,-0.000259,0.000249,0.000136,0.000063,0.000332,-5.555392e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25197,-0.076303,0.002078,-0.072666,-0.060643,-0.021367,-0.001885,0.000592,-0.000272,0.000032,0.000060,...,0.000113,-0.000329,0.000239,8.753508e-07,-0.000194,0.000316,-0.000173,0.000053,0.000035,-1.573625e-04
25198,-0.075832,0.002867,-0.072420,-0.059704,-0.021012,-0.001615,-0.001055,-0.000475,0.000176,-0.000595,...,-0.000062,0.000060,0.000117,-4.086103e-04,-0.000206,-0.000063,-0.000076,-0.000014,-0.000192,-2.155709e-05
25199,-0.076801,0.001736,-0.072476,-0.060628,-0.020649,-0.001944,-0.000134,-0.001058,0.000245,0.000312,...,0.000066,-0.000095,-0.000390,-4.150010e-04,-0.000242,0.000310,-0.000255,-0.000229,-0.000440,-2.654166e-04


## 3.3 Creating dict with all 7 labels

In [27]:
# The reduced features carry a positional index, so re-attach the sample names once
# (not once per label). df_labels shares the row index of the loaded data, and
# taking it from there keeps this cell correct on re-runs even if the name `df`
# has been rebound since loading.
df_pca_named = df_pca.set_index(df_labels.index)

# One frame per label: every PCA column plus that label's own target column.
# Selecting f"has_{label}" by exact name avoids the substring matching done by
# filter(regex=label), which would also pick up any other column containing the
# label text, and fails loudly if the target column is missing.
labeled_dfs = {
    label: pd.concat([df_pca_named, df_labels[[f"has_{label}"]]], axis=1)
    for label in energies.keys()
}

In [28]:
# Spot-check one entry: PCA columns followed by the has_eu target.
labeled_dfs['eu']

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,33,34,35,36,37,38,39,40,41,has_eu
eu_bd_0,-0.002719,0.072220,0.075179,0.016850,-0.011178,0.022665,-0.000026,-0.000007,0.000010,0.000089,...,-0.000005,-0.000266,-8.959821e-05,-0.000070,0.000180,-0.000109,0.000033,-0.000219,4.172518e-06,True
eu_bd_1,-0.002878,0.072140,0.074975,0.016540,-0.010832,0.021969,0.000181,-0.000069,0.000083,-0.000118,...,-0.000115,0.000105,-9.387291e-05,-0.000175,-0.000207,0.000877,0.000065,0.000199,-1.306129e-04,True
eu_bd_2,-0.003182,0.072197,0.076247,0.016395,-0.011064,0.022309,0.000752,-0.000193,0.000290,0.000091,...,-0.000146,0.000194,9.058734e-05,-0.000259,0.000249,0.000136,0.000063,0.000332,-5.555392e-07,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ba_co_1197,-0.076303,0.002078,-0.072666,-0.060643,-0.021367,-0.001885,0.000592,-0.000272,0.000032,0.000060,...,-0.000329,0.000239,8.753508e-07,-0.000194,0.000316,-0.000173,0.000053,0.000035,-1.573625e-04,False
ba_co_1198,-0.075832,0.002867,-0.072420,-0.059704,-0.021012,-0.001615,-0.001055,-0.000475,0.000176,-0.000595,...,0.000060,0.000117,-4.086103e-04,-0.000206,-0.000063,-0.000076,-0.000014,-0.000192,-2.155709e-05,False
ba_co_1199,-0.076801,0.001736,-0.072476,-0.060628,-0.020649,-0.001944,-0.000134,-0.001058,0.000245,0.000312,...,-0.000095,-0.000390,-4.150010e-04,-0.000242,0.000310,-0.000255,-0.000229,-0.000440,-2.654166e-04,False


# 4. Launching H2O AutoML

In [30]:
# AutoML settings: forwarded as `maxmodels` / `seed` to modeling.training and
# embedded in the file name of each saved model.
MAX_MODELS = 3
SEED = 1

In [31]:
# Train one AutoML run per label and export each run's leader as a MOJO.
# The loop variable is `labeled_df`, not `df`: reusing `df` rebinds the DataFrame
# loaded at the top of the notebook, so earlier cells that read `df` would silently
# operate on the last labelled frame when re-run.
for label, labeled_df in labeled_dfs.items():
    target = f"has_{label}"
    split_dict = modeling.spliting(df=labeled_df, split_ratio=0.8, target_variable=target)

    aml = modeling.training(
        model_name=f"{label}_identification_model",
        target_variable=target,
        features=list(split_dict["X"].columns),
        train=split_dict["X"],
        test=split_dict["test"],
        maxmodels=MAX_MODELS,
        seed=SEED,
    )

    # The file name records the label and the AutoML settings used for the run.
    aml.leader.save_mojo(f"../data/structured/{label}_best-model_{MAX_MODELS}-models_seed-{SEED}")

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |█████████████████████████████████

In [None]:

# NOTE(review): leftover snippet. `boston_model` is not defined in any visible cell
# of this notebook, so this cell cannot run as written; confirm whether it should
# use `aml.leader` or be removed.

# Binary
h2o.save_model(boston_model, path = 'gs://hurb_transient/INTERNAL/DATA_SCIENCE/tutorials/tutorial_training', force = True)

# MOJO (closing quote added: the f-string was unterminated, which is a SyntaxError)
boston_model.save_mojo(f"../data/structured/{label}_model")

In [36]:
# `df_to_train` is not assigned by any cell in this notebook; the frame for this
# target is the "bd" entry of labeled_dfs (PCA columns plus has_bd).
split_dict = modeling.spliting(df=labeled_dfs["bd"], split_ratio= 0.8, target_variable="has_bd")

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [37]:
# Same AutoML call as in the per-label loop, here for the has_bd target only.
# NOTE(review): `features` is taken from every column of the training frame; if
# that frame still contains has_bd, confirm modeling.training excludes the target.
bd_train = split_dict["X"]
bd_test = split_dict["test"]
aml = modeling.training(
    model_name="bd_identification_model",
    target_variable="has_bd",
    features=list(bd_train.columns),
    train=bd_train,
    test=bd_test,
    maxmodels=MAX_MODELS,
    seed=SEED,
)

AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%


In [38]:
# Leaderboard of the models produced by the AutoML run above.
aml.leaderboard

model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
XGBoost_1_AutoML_1_20211022_173918,1,0.00130026,1,0,0.00460872,2.12403e-05
StackedEnsemble_AllModels_1_AutoML_1_20211022_173918,1,0.000479919,1,0,0.000540762,2.92423e-07
StackedEnsemble_BestOfFamily_3_AutoML_1_20211022_173918,1,7.68422e-05,1,0,8.51535e-05,7.25112e-09
StackedEnsemble_BestOfFamily_2_AutoML_1_20211022_173918,1,0.000479919,1,0,0.000540762,2.92424e-07
StackedEnsemble_BestOfFamily_4_AutoML_1_20211022_173918,1,0.00265791,1,0,0.00290721,8.45187e-06
GLM_1_AutoML_1_20211022_173918,1,0.000434548,1,0,0.000744857,5.54812e-07
StackedEnsemble_AllModels_4_AutoML_1_20211022_173918,1,5.33704e-05,1,0,6.09103e-05,3.71006e-09
XGBoost_2_AutoML_1_20211022_173918,1,0.000606363,1,0,0.00138517,1.91868e-06
StackedEnsemble_BestOfFamily_1_AutoML_1_20211022_173918,1,0.000560118,1,0,0.000616293,3.79817e-07
GBM_1_AutoML_1_20211022_173918,1,3.70748e-17,1,0,2.7148e-16,7.37016e-32




In [41]:
# NOTE(review): called without arguments, H2O's confusion_matrix() appears to report
# metrics on the training frame rather than on held-out data. Confirm this, and
# evaluate on split_dict["test"] before trusting the perfect score shown below.
aml.leader.confusion_matrix()


Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.8570694923400879: 


Unnamed: 0,Unnamed: 1,False,True,Error,Rate
0,False,18000.0,0.0,0.0,(0.0/18000.0)
1,True,0.0,7200.0,0.0,(0.0/7200.0)
2,Total,18000.0,7200.0,0.0,(0.0/25200.0)




# Weird: the model is reported as right 100% of the time, so something is probably wrong
- Manually inspect the predictions (run `model.predict` on some random cases)
- Generate new data and test the model with it
- Test the models on the other elements