## Importar Librerías

**Librerías básicas**

In [401]:
import pandas as pd

**Módulos locales**

In [402]:
from Src.cleaner import *
from Src.featureSelection import *

**Librerías de apoyo**

In [478]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.pipeline import make_pipeline

import webbrowser

**Librerías de Machine Learning**

In [404]:
import h2o
h2o.init(nthreads = -1, max_mem_size = 8)

from h2o.automl import H2OAutoML

from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator

from h2o.estimators.glm import H2OGeneralizedLinearEstimator

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O cluster uptime:,1 hour 21 mins
H2O cluster timezone:,Europe/Madrid
H2O data parsing timezone:,UTC
H2O cluster version:,3.28.0.3
H2O cluster version age:,26 days
H2O cluster name:,H2O_from_python_alex_7b22ja
H2O cluster total nodes:,1
H2O cluster free memory:,7.906 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8


## Establecer Variables principales del proyecto

In [449]:
# Name of the target column to predict.
y = "author"

# Path passed to h2o.load_model() when `model == 4`. It is empty here, so it
# must be filled in before that option can load anything.
model_path = ""

# Raw training dataset read with pandas.
trainSet = "Inputs/training_dataset.csv"

# Raw test dataset read with pandas.
inputTest = "Inputs/test_dataset.csv"

# Where the cleaned test dataset is written before being imported into H2O.
outputTest = "Outputs/bible_cleaned_test(V3).csv"

# Final file with the `id` and predicted target columns.
submission = "Outputs/submission.csv"


## Elegir modelo a aplicar

In [450]:
# Selects which estimator cell runs in the "Machine Learning" section:
#   0 = multinomial GLM, 1 = deep learning, 2 = gradient boosting,
#   3 = random forest, 4 = load a previously saved model from `model_path`.
model = 4

## Limpieza de datos

**Ver procedimiento de limpieza detallado en documento "cleaner.ipynb" en este mismo repositorio**

In [406]:
# Load the raw training dataset from the path set in the configuration cell.
df = pd.read_csv(trainSet)

In [407]:
df.head(3)

Unnamed: 0,id,0,1,2,3,4,5,6,7,8,9,author
0,0,0.142636,0.43107,0.032912,-0.486796,0.261718,-1.013906,-0.081827,0.330352,0.438262,-0.761088,Ubuntius
1,1,0.019197,0.525284,0.068476,0.166658,0.261718,0.231839,0.635431,0.133209,0.313536,-0.002374,Philippus
2,2,0.09326,-0.087108,-2.268081,-0.164963,0.261718,1.477585,0.635431,-0.370641,0.126447,1.078357,Marcus


In [408]:
class cleanBible(TransformerMixin):
    """Keep only the predictor columns of the training dataset.

    The transformer checks that the expected columns are present and removes
    the row identifier (``id``) and the target (``author``), leaving the
    feature columns ``"0"`` .. ``"9"``.

    Attributes:
        features: Columns the frame passed to ``fit`` must contain.
        cols: Columns returned by ``transform``; set by ``fit``.
        X: Cleaned version of the frame passed to ``fit``. Kept only for
            backward compatibility; ``transform`` no longer returns it.
    """

    def __init__(self):
        self.features = ["id", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "author"]

    def fit(self, df, y=None):
        """Learn which columns to keep from the training frame.

        Args:
            df: pandas DataFrame containing every column in ``self.features``.
            y: Ignored; accepted so the transformer follows the scikit-learn
                ``fit(X, y=None)`` convention and works inside pipelines that
                pass a target.

        Returns:
            The fitted transformer.

        Raises:
            KeyError: If any column listed in ``self.features`` is missing.
        """
        # Start cleaning: selecting by `features` validates the schema, then the
        # identifier and the target are dropped so only predictors remain.
        cleaned = df[self.features].drop(columns=["id", "author"])
        self.X = cleaned
        self.cols = cleaned.columns
        return self

    def transform(self, X):
        """Return the predictor columns of ``X``.

        The frame passed in is the one that gets transformed (previously the
        data cached during ``fit`` was returned regardless of the argument).

        Args:
            X: pandas DataFrame containing at least the columns in ``self.cols``.

        Returns:
            DataFrame restricted to ``self.cols``.
        """
        return X[self.cols]

In [409]:
# Wrap the cleaning transformer in a scikit-learn pipeline; it is the only step.
cleaner = cleanBible()
pipe = make_pipeline(cleaner)

In [410]:
# Show up to 28 columns whenever a frame is displayed.
pd.set_option("display.max_columns", 28)

# Clean the training frame, then put the target column back next to the
# predictors so the exported file contains both.
X = pd.DataFrame(pipe.fit_transform(df), columns=cleaner.cols).join(df[y])
X.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,author
0,0.142636,0.43107,0.032912,-0.486796,0.261718,-1.013906,-0.081827,0.330352,0.438262,-0.761088,Ubuntius
1,0.019197,0.525284,0.068476,0.166658,0.261718,0.231839,0.635431,0.133209,0.313536,-0.002374,Philippus
2,0.09326,-0.087108,-2.268081,-0.164963,0.261718,1.477585,0.635431,-0.370641,0.126447,1.078357,Marcus


In [411]:
# Persist the cleaned training data (predictors + target) without the index;
# this file is what gets imported into H2O in the next section.
df_export = X.copy()
df_export.to_csv("Outputs/bible_cleaned(V1).csv", index=False)

## Transformación de dataset

In [412]:
# Load the cleaned training CSV into an H2O frame; header=1 tells the parser
# that the first line holds the column names.
loan_csv = "Outputs/bible_cleaned(V1).csv"
data = h2o.import_file(path=loan_csv, header=1)

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [413]:
data.shape

(12017, 11)

In [414]:
data.head(3)

0,1,2,3,4,5,6,7,8,9,author
0.142636,0.43107,0.032912,-0.486796,0.261718,-1.01391,-0.081827,0.330352,0.438262,-0.761088,Ubuntius
0.019197,0.525284,0.068476,0.166658,0.261718,0.231839,0.635431,0.133209,0.313536,-0.002374,Philippus
0.09326,-0.087108,-2.26808,-0.164963,0.261718,1.47758,0.635431,-0.370641,0.126447,1.07836,Marcus




## Partición de los datos

In [415]:
# Split into roughly 70 % train, 15 % validation and the remainder as test;
# the fixed seed makes the partition repeatable.
splits = data.split_frame(ratios=[0.7, 0.15], seed=1)
train, valid, test = splits

In [416]:
# Row count of each partition, one per line: train, valid, test.
print(train.nrow, valid.nrow, test.nrow, sep="\n")

8451
1792
1774


## Extracción de Features

In [417]:
# The target name comes from `y` in the configuration cell; it is not
# hard-coded again here so the whole notebook uses a single definition.
if y not in data.columns:
    raise ValueError(f"Target column {y!r} not found in the H2O frame")

# Predictors are every column of the frame except the target.
x = [column for column in data.columns if column != y]

In [418]:
x

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']

## Machine Learning!!

In [419]:
# Option 0: multinomial generalized linear model fitted on the training split.
# (Despite the `rf_` prefix in the variable name, this is a GLM.)
if model == 0:

    rf_fit0 = H2OGeneralizedLinearEstimator(family='multinomial', model_id='rf_fit0')
    rf_fit0.train(x=x, y=y, training_frame=train)

In [420]:
# Option 1: deep learning estimator fitted on the training split with a fixed seed.
# (Despite the `rf_` prefix in the variable name, this is a neural network.)
if model == 1:

    rf_fit1 = H2ODeepLearningEstimator(model_id='rf_fit1', seed=1)
    rf_fit1.train(x=x, y=y, training_frame=train)


In [421]:
# Option 2: gradient boosting with early stopping on the validation split.
if model == 2:
    rf_fit2 = H2OGradientBoostingEstimator(model_id='gbm_fit3',
                                            ntrees=500,                 # upper bound; early stopping may use fewer
                                            score_tree_interval=5,      # score every 5 trees
                                            stopping_rounds=3,          # stop after 3 scoring rounds without improvement
                                            # The target has more than two classes, so AUC (a binomial
                                            # metric in this H2O version) is replaced by logloss.
                                            stopping_metric='logloss',
                                            stopping_tolerance=0.0005,
                                            seed=1)
    rf_fit2.train(x=x, y=y, training_frame=train, validation_frame=valid)

In [422]:
# Option 3: random forest with 200 trees and 2-fold cross-validation.
# NOTE(review): this trains on the full `data` frame, which includes the rows
# of the `test` split scored in the metrics section, so those hold-out
# metrics are presumably optimistic. Confirm whether training on `train`
# was intended for evaluation.
if model == 3:

    rf_fit3 = H2ORandomForestEstimator(model_id='rf_fit3', ntrees=200, nfolds=2, seed=1)
    rf_fit3.train(x=x, y=y, training_frame=data)

drf Model Build progress: |███████████████████████████████████████████████| 100%


In [None]:
# Option 4: reuse a previously saved H2O model instead of training one.
if model == 4:
    # `model_path` comes from the configuration cell; fail early with a clear
    # message when it was left empty instead of letting H2O fail on "".
    if not model_path:
        raise ValueError("model_path is empty: set it to a saved H2O model to use model = 4")
    saved_model = h2o.load_model(model_path)

## Métricas de rendimiento

In [424]:
# Score the selected model on the held-out test split and print its metrics.
if model == 0:
    rf_perf0 = rf_fit0.model_performance(test)
    print(rf_perf0)
elif model == 1:
    rf_perf1 = rf_fit1.model_performance(test)
    print(rf_perf1)
elif model == 2:
    rf_perf2 = rf_fit2.model_performance(test)
    print(rf_perf2)
elif model == 3:
    rf_perf3 = rf_fit3.model_performance(test)
    print(rf_perf3)
elif model == 4:
    # The loaded model is bound to `saved_model`; `rf_fit4` is never defined.
    rf_perf4 = saved_model.model_performance(test)
    print(rf_perf4)

## Generación de resultado final

**Importación de dataset de testeo**


In [464]:
# Load the raw test dataset (same layout as training, without the target) and preview it.
data_test = pd.read_csv(inputTest)
data_test.head(2)


Unnamed: 0,id,0,1,2,3,4,5,6,7,8,9
0,0,0.019197,0.038511,0.19295,0.323367,0.261718,-0.10036,0.106925,0.463833,0.064084,-0.025146
1,1,0.006853,0.462474,-0.016878,0.649795,-1.347089,-0.307984,-0.119577,0.855597,0.375899,-0.09204


**Limpieza de dataset**

In [472]:
class cleanBible_test(TransformerMixin):
    """Keep only the predictor columns of the test dataset.

    The transformer checks that the expected columns are present and removes
    the row identifier (``id``), leaving the feature columns ``"0"`` .. ``"9"``.

    Attributes:
        features: Columns the frame passed to ``fit`` must contain.
        cols: Columns returned by ``transform``; set by ``fit``.
        X: Cleaned version of the frame passed to ``fit``. Kept only for
            backward compatibility; ``transform`` no longer returns it.
    """

    def __init__(self):
        self.features = ["id", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]

    def fit(self, df, y=None):
        """Learn which columns to keep from the test frame.

        Args:
            df: pandas DataFrame containing every column in ``self.features``.
            y: Ignored; accepted so the transformer follows the scikit-learn
                ``fit(X, y=None)`` convention and works inside pipelines that
                pass a target.

        Returns:
            The fitted transformer.

        Raises:
            KeyError: If any column listed in ``self.features`` is missing.
        """
        # Start cleaning: selecting by `features` validates the schema, then
        # the identifier is dropped so only predictors remain.
        cleaned = df[self.features].drop(columns=["id"])
        self.X = cleaned
        self.cols = cleaned.columns
        return self

    def transform(self, X):
        """Return the predictor columns of ``X``.

        The frame passed in is the one that gets transformed (previously the
        data cached during ``fit`` was returned regardless of the argument).

        Args:
            X: pandas DataFrame containing at least the columns in ``self.cols``.

        Returns:
            DataFrame restricted to ``self.cols``.
        """
        return X[self.cols]

In [473]:
# Rebind `cleaner` and `pipe` to the test-set transformer; it is the only pipeline step.
cleaner = cleanBible_test()
pipe = make_pipeline(cleaner)

In [474]:
# Show up to 28 columns whenever a frame is displayed.
pd.set_option("display.max_columns", 28)

# Clean the test frame; only the predictor columns remain.
X = pd.DataFrame(pipe.fit_transform(data_test), columns=cleaner.cols)
X.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.019197,0.038511,0.19295,0.323367,0.261718,-0.10036,0.106925,0.463833,0.064084,-0.025146
1,0.006853,0.462474,-0.016878,0.649795,-1.347089,-0.307984,-0.119577,0.855597,0.375899,-0.09204
2,0.229043,-0.369751,0.484575,0.250174,0.261718,-0.01731,0.446679,0.232984,-0.123005,-0.126856


In [476]:
# Persist the cleaned test predictors (no index column) to the configured
# path; the next section imports this file into H2O.
df_export = X.copy()
df_export.to_csv(outputTest, index=False)

**Transformación a dataframe de H2O**

In [434]:
# Load the cleaned test CSV into an H2O frame (header=1: first line is the header).
# NOTE(review): the preview stored under this cell shows an `author` column,
# which the exported test file does not have; this cell was presumably last
# run against an older file and should be re-run after the export above.
loan_csv = outputTest
data_test_h2o = h2o.import_file(path=loan_csv, header=1)

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [477]:
data_test_h2o.head(2)

0,1,2,3,4,5,6,7,8,9,author
0.125799,0.10976,0.0347863,-0.477907,0.266038,-0.917224,-0.0900011,0.328784,0.414138,-0.752526,Ubuntius
-0.000328204,0.135538,0.067771,0.183015,0.266038,0.202721,0.482713,0.132677,0.298764,0.0179226,Philippus




**Extracción de predicciones**

In [443]:
# Predict with the model chosen through `model`. Previously `rf_fit3` was
# always used, which is undefined unless option 3 was trained in this session.
if model == 0:
    selected_model = rf_fit0
elif model == 1:
    selected_model = rf_fit1
elif model == 2:
    selected_model = rf_fit2
elif model == 3:
    selected_model = rf_fit3
elif model == 4:
    selected_model = saved_model
else:
    raise ValueError(f"Unsupported model option: {model!r}")

y_pred = selected_model.predict(test_data=data_test_h2o)

# Convert to pandas and rename the predicted-label column to the target name;
# the remaining columns hold the per-class probabilities.
conclusion = y_pred.as_data_frame().rename(columns={"predict": y})
conclusion.head()


drf prediction progress: |████████████████████████████████████████████████| 100%


Unnamed: 0,author,Blanquius,Clarius,FerrumEffractarius,Marcus,Ovionicus,Paithonius,Philippus,Ubuntius
0,Marcus,0.007392,0.0,0.0,0.471379,0.007392,0.04435,0.210782,0.258706
1,Marcus,0.0,0.0,0.0,0.691429,1e-05,0.010603,0.243617,0.054341
2,Marcus,0.0,0.011668,0.0,0.671853,0.221693,0.018848,0.070103,0.005834
3,Clarius,0.0,0.829206,0.004319,0.110043,0.051825,0.004607,0.0,0.0
4,Philippus,0.0,0.0,0.0,0.240231,1.3e-05,0.021934,0.723198,0.014623


In [444]:
# Pair every test id with its predicted label; the join aligns on the row index.
result = data_test.loc[:, ["id"]].join(conclusion[y])
result.head()

Unnamed: 0,id,author
0,0,Marcus
1,1,Marcus
2,2,Marcus
3,3,Clarius
4,4,Philippus


**Generación de reporte final**

In [445]:
# Final report: just the id and the predicted target, in that order.
resultDef = result.loc[:, ["id", y]]
resultDef.head(2)

Unnamed: 0,id,author
0,0,Marcus
1,1,Marcus


In [446]:
# Write the submission file with a header row and without the index column.
resultDef.to_csv(submission, header=True, index=False)

## Comprobación final

In [440]:
# Read the submission file back from disk to verify what was actually written.
df_check = pd.read_csv(submission)

In [441]:
df_check.head()

Unnamed: 0,id,author
0,0,Philippus
1,1,Marcus
2,2,Marcus
3,3,Clarius
4,4,Marcus


In [447]:
data_test.head()

Unnamed: 0,id,0,1,2,3,4,5,6,7,8,9
0,0,0.019197,0.038511,0.19295,0.323367,0.261718,-0.10036,0.106925,0.463833,0.064084,-0.025146
1,1,0.006853,0.462474,-0.016878,0.649795,-1.347089,-0.307984,-0.119577,0.855597,0.375899,-0.09204
2,2,0.229043,-0.369751,0.484575,0.250174,0.261718,-0.01731,0.446679,0.232984,-0.123005,-0.126856
3,3,-0.104241,-0.055704,0.299642,0.391249,0.17234,0.314889,0.106925,0.220694,0.219991,0.368972
4,4,1.043737,-0.401156,0.473906,0.506215,0.261718,0.06574,-0.195078,0.51544,-0.808997,0.315303
