## Importar Librerías

**Librerías básicas**

In [98]:
import pandas as pd

**Módulos locales**

In [99]:
from Src.cleaner import *
from Src.featureSelection import *

**Librerías de apoyo**

In [100]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.pipeline import make_pipeline

**Librerías de Machine Learning**

In [101]:
import h2o
# Start (or attach to) a local H2O cluster before any H2O frame or estimator is used.
# nthreads=-1 requests every available core; an integer max_mem_size is read as gigabytes.
# NOTE(review): the recorded output shows this call connected to an already-running
# instance, so these limits presumably only apply when a new server is launched — confirm.
h2o.init(nthreads = -1, max_mem_size = 8)

from h2o.automl import H2OAutoML

from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O cluster uptime:,20 mins 28 secs
H2O cluster timezone:,Europe/Madrid
H2O data parsing timezone:,UTC
H2O cluster version:,3.28.0.3
H2O cluster version age:,26 days
H2O cluster name:,H2O_from_python_alex_7b22ja
H2O cluster total nodes:,1
H2O cluster free memory:,7.998 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8


## Limpieza de datos

**Ver procedimiento de limpieza detallado en documento "cleaner.ipynb" en este mismo repositorio**

In [102]:
# Load the raw training set; the preview that follows shows an id column,
# ten numeric columns named "0".."9" and a string `author` column.
df = pd.read_csv("Inputs/training_dataset.csv")

In [103]:
# Preview the first three rows to check the column layout before cleaning.
df.head(3)

Unnamed: 0,id,0,1,2,3,4,5,6,7,8,9,author
0,0,0.142636,0.43107,0.032912,-0.486796,0.261718,-1.013906,-0.081827,0.330352,0.438262,-0.761088,Ubuntius
1,1,0.019197,0.525284,0.068476,0.166658,0.261718,0.231839,0.635431,0.133209,0.313536,-0.002374,Philippus
2,2,0.09326,-0.087108,-2.268081,-0.164963,0.261718,1.477585,0.635431,-0.370641,0.126447,1.078357,Marcus


In [104]:
class cleanBible(TransformerMixin):
    """Prepare the raw training frame for scaling.

    Drops the ``id`` column and one-hot encodes the remaining non-numeric
    columns with ``pd.get_dummies`` (in the training data shown in this
    notebook that is ``author``).

    Attributes set by ``fit``:
        cols: column index of the encoded training frame; ``transform``
            aligns its output to these columns.
        X: the encoded training frame, kept only for backward compatibility.
    """

    def __init__(self):
        # Columns every input frame must provide.
        self.features = ["id", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]

    def _encode(self, df):
        """Validate ``df``, drop ``id`` and one-hot encode categorical columns."""
        missing = [col for col in self.features if col not in df.columns]
        if missing:
            raise KeyError(f"Input frame is missing expected columns: {missing}")
        # The previous implementation selected ``self.features`` and then
        # immediately overwrote the selection, so every column other than
        # ``id`` (including ``author``) reached ``get_dummies``. That effective
        # behaviour is kept because the exported CSV depends on it.
        return pd.get_dummies(df.drop(columns="id"))

    def fit(self, df, y=None):
        """Record the encoded column layout of ``df``.

        Args:
            df: raw training DataFrame containing at least ``self.features``.
            y: ignored; accepted for scikit-learn pipeline compatibility.

        Returns:
            self

        Raises:
            KeyError: if any column in ``self.features`` is absent from ``df``.
        """
        encoded = self._encode(df)
        self.cols = encoded.columns
        self.X = encoded
        return self

    def transform(self, X):
        """Encode ``X`` and align it to the columns seen during ``fit``.

        Unlike the previous version, the frame passed in is the one that is
        transformed (the cached training frame is no longer returned).
        Dummy columns absent from ``X`` are added and filled with 0; columns
        not seen during ``fit`` are discarded.

        Raises:
            KeyError: if any column in ``self.features`` is absent from ``X``.
        """
        return self._encode(X).reindex(columns=self.cols, fill_value=0)

In [105]:
# Cleaning followed by standardisation. The cleaner is kept in its own variable
# because its fitted `cols` attribute is read once the pipeline has run.
cleaner = cleanBible()
pipe = make_pipeline(cleaner, StandardScaler())

In [106]:
# Run the pipeline and label its result with the columns the cleaner recorded during fit.
X = pd.DataFrame(pipe.fit_transform(df), columns=cleaner.cols)
# Widen the display limit so every column is visible in the preview.
pd.set_option("display.max_columns", 28)
X.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,author_Blanquius,author_Clarius,author_FerrumEffractarius,author_Marcus,author_Ovionicus,author_Paithonius,author_Philippus,author_Ubuntius
0,0.125799,0.10976,0.034786,-0.477907,0.266038,-0.917224,-0.090001,0.328784,0.414138,-0.752526,-0.304063,-0.233833,-0.192409,-0.861305,-0.348736,-0.237766,-0.492395,4.61465
1,-0.000328,0.135538,0.067771,0.183015,0.266038,0.202721,0.482713,0.132677,0.298764,0.017923,-0.304063,-0.233833,-0.192409,-0.861305,-0.348736,-0.237766,2.030891,-0.216701
2,0.075348,-0.032019,-2.09933,-0.152396,0.266038,1.322667,0.482713,-0.368525,0.125703,1.115368,-0.304063,-0.233833,-0.192409,1.161029,-0.348736,-0.237766,-0.492395,-0.216701


In [107]:
# Persist an independent copy of the scaled frame; the index is left out of the CSV.
df_export = X.copy(deep=True)
df_export.to_csv(path_or_buf="Outputs/bible_cleaned(V1).csv", index=False)

## Transformación de dataset

In [138]:
# Path of the cleaned CSV exported earlier in this notebook.
loan_csv = "Outputs/bible_cleaned(V1).csv"
# header=1 tells H2O that the first line of the file holds the column names.
data = h2o.import_file(path=loan_csv, header=1)

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [140]:
# (rows, columns) of the imported H2OFrame.
data.shape

(12017, 18)

In [141]:
# Preview the H2OFrame to confirm the header row was parsed as column names.
data.head(3)

0,1,2,3,4,5,6,7,8,9,author_Blanquius,author_Clarius,author_FerrumEffractarius,author_Marcus,author_Ovionicus,author_Paithonius,author_Philippus,author_Ubuntius
0.125799,0.10976,0.0347863,-0.477907,0.266038,-0.917224,-0.0900011,0.328784,0.414138,-0.752526,-0.304063,-0.233833,-0.192409,-0.861305,-0.348736,-0.237766,-0.492395,4.61465
-0.000328204,0.135538,0.067771,0.183015,0.266038,0.202721,0.482713,0.132677,0.298764,0.0179226,-0.304063,-0.233833,-0.192409,-0.861305,-0.348736,-0.237766,2.03089,-0.216701
0.0753478,-0.0320192,-2.09933,-0.152396,0.266038,1.32267,0.482713,-0.368525,0.125703,1.11537,-0.304063,-0.233833,-0.192409,1.16103,-0.348736,-0.237766,-0.492395,-0.216701




## Partición de los datos

In [111]:
# Roughly 70 % train / 15 % validation; the rows left over form the test frame.
# The seed makes the random partition repeatable.
splits = data.split_frame(ratios=[0.7, 0.15], seed=1)
train, valid, test = splits

In [112]:
# Row counts of the three partitions, one per line.
# NOTE(review): the recorded counts add up to 12018 while the recorded data.shape
# reports 12017 rows; the execution counters show the import cell (In [138]) was
# re-run after this split (In [111]) — re-run top to bottom to confirm.
print(train.nrow, valid.nrow, test.nrow, sep="\n")

8452
1792
1774


## Extracción de Features

In [113]:
# Choose the response column and use every other column as a predictor.
# NOTE(review): the recorded run of this cell fails with
# "ValueError: list.remove(x): x not in list" — `data` has no 'price' column
# (its columns are "0".."9" plus the author_* dummies). 'price' looks like a
# leftover from the diamonds notebook; the response for this dataset is
# presumably the author — confirm and set `y` accordingly.
# NOTE(review): list.remove also makes this cell fail if it is executed twice.
y = 'price'
x = list(data.columns)

x.remove(y)

ValueError: list.remove(x): x not in list

In [None]:
# Display the predictor column names.
x

## Machine Learning!!

In [None]:
# rf_fit1 = H2ODeepLearningEstimator(model_id='rf_fit1', seed=1)
# rf_fit1.train(x=x, y=y, training_frame=train)


In [None]:
# gbm_fit3 = H2OGradientBoostingEstimator(model_id='gbm_fit3', 
                                        # ntrees=500, 
                                        # score_tree_interval=5,     #used for early stopping
                                        # stopping_rounds=3,         #used for early stopping
                                        # stopping_metric='AUC',     #used for early stopping
                                        # stopping_tolerance=0.0005, #used for early stopping
                                        # seed=1)
# gbm_fit3.train(x=x, y=y, training_frame=train, validation_frame=valid)

In [None]:
# rf_fit3 = H2ORandomForestEstimator(model_id='rf_fit3', ntrees=500, nfolds=8, seed=1)
# rf_fit3.train(x=x, y=y, training_frame=data)

In [None]:
# rf_fit3

## Métricas de rendimiento

In [None]:
# rf_perf1 = rf_fit1.model_performance(test)
# rf_perf2 = rf_fit2.model_performance(test)
# rf_perf3 = rf_fit3.model_performance(test)

In [None]:
# print(rf_perf1)

In [None]:
# print(rf_perf2)

In [None]:
# print(rf_perf3)

## Auto Machine Learning!!!

**Imposible en mi PC, migramos a Google Colab (ver documento "Colab.ipynb" en el repositorio)**

## Generación de resultado final

**Importación de dataset de testeo**


In [None]:

# Load the test set and preview its first two rows.
# NOTE(review): this is the diamonds test file, whereas the training data loaded
# earlier in this notebook is Inputs/training_dataset.csv (author prediction) —
# this section looks carried over from another notebook; confirm the intended file.
data_test = pd.read_csv("Inputs/diamonds_test.csv")

data_test.head(2)


**Limpieza de dataset**

In [None]:
class cleanDiamonds_test(TransformerMixin):
    """Clean the diamonds test frame.

    Keeps ``id``, ``carat``, ``y``, ``cut``, ``color`` and ``clarity`` and
    replaces the three categorical columns with integer codes. Values that
    are not listed in a mapping are left unchanged.

    Attributes set by ``fit``:
        X: the cleaned frame produced from the data passed to ``fit``
            (kept for backward compatibility).
        cols: column index of that frame.
    """

    # Columns that are required on input but removed from the output.
    DROPPED = ["table", "depth", "x", "z"]

    # Code 4 is unused: the original chain mapped "Very Good" a second time to 4,
    # which could never match because the value had already been replaced by 3.
    # The codes are kept as they were so the encoding stays the same as before.
    CUT_CODES = {"Premium": 1, "Ideal": 2, "Very Good": 3, "Fair": 5, "Good": 6}
    COLOR_CODES = {"D": 1, "E": 2, "F": 3, "G": 4, "H": 5, "I": 6, "J": 7}
    CLARITY_CODES = {
        "IF": 1, "VVS1": 2, "VVS2": 3, "VS1": 4,
        "VS2": 5, "SI1": 6, "SI2": 7, "I1": 8,
    }

    def __init__(self):
        self.features = ["id", "carat", "table", "depth", "y", "z", "cut", "color", "clarity", "x"]

    def _clean(self, df):
        """Return a new cleaned frame; ``df`` itself is not modified."""
        # drop() returns a new frame, which avoids the in-place drop on a column
        # selection that the previous version performed (SettingWithCopyWarning).
        selected = df[self.features].drop(columns=self.DROPPED)
        return selected.assign(
            cut=selected["cut"].replace(self.CUT_CODES),
            color=selected["color"].replace(self.COLOR_CODES),
            clarity=selected["clarity"].replace(self.CLARITY_CODES),
        )

    def fit(self, df, y=None):
        """Clean ``df`` and record the resulting frame and its columns.

        Args:
            df: raw DataFrame containing every column in ``self.features``.
            y: ignored; accepted for scikit-learn pipeline compatibility.

        Returns:
            self

        Raises:
            KeyError: if a column in ``self.features`` is absent from ``df``.
        """
        self.X = self._clean(df)
        self.cols = self.X.columns
        return self

    def transform(self, X):
        """Clean and return ``X``.

        Unlike the previous version, the frame passed in is the one that is
        transformed (the frame cached by ``fit`` is no longer returned).
        """
        return self._clean(X)

In [None]:
# Test-set pipeline: cleaning only, no scaling or normalisation step.
cleaner = cleanDiamonds_test()
pipe = make_pipeline(cleaner)

In [None]:
# Run the cleaning pipeline on the test set and label the result with the cleaner's columns.
X = pd.DataFrame(pipe.fit_transform(data_test), columns=cleaner.cols)
# Widen the display limit so every column is visible in the preview.
pd.set_option("display.max_columns", 28)
X.head(3)

In [None]:
# Persist an independent copy of the cleaned test frame; the index is left out of the CSV.
df_export = X.copy(deep=True)
df_export.to_csv(path_or_buf="Outputs/diamonds_cleaned_test(V3).csv", index=False)

**Transformación a dataframe de H2O**

In [None]:
# Read back the file that the export cell of this notebook writes ("...(V3).csv").
# The previous path pointed at "...(V1).csv", which this notebook never creates,
# so a fresh run would load a stale file or fail.
loan_csv = "Outputs/diamonds_cleaned_test(V3).csv"

data_test_h2o = h2o.import_file(loan_csv)

In [None]:
# Preview the H2OFrame built from the cleaned test CSV.
data_test_h2o.head(2)

**Extracción de predicciones**

In [None]:
# NOTE(review): rf_fit3 is only created in a commented-out cell of this notebook,
# so on a fresh kernel this line raises NameError — confirm where the model comes from.
y_pred = rf_fit3.predict(test_data=data_test_h2o)

# Bring the predictions into pandas and rename H2O's "predict" column to "price".
price = y_pred.as_data_frame().rename(columns={"predict": "price"})
price.head()

In [None]:
# Attach the predictions to the test ids; DataFrame.join aligns the two frames on their index.
# NOTE(review): this assumes `price` rows are in the same order as `data_test` rows — confirm.
result = data_test[["id"]].join(price)

result.head()

**Generación de reporte final**

In [None]:
# Keep only the two columns that go into the submission file.
submission_columns = ["id", "price"]
resultDef = result.loc[:, submission_columns]

resultDef.head(2)

In [None]:
# Write the submission with a header row and without the index column.
resultDef.to_csv(path_or_buf="Outputs/submission.csv", header=True, index=False)

## Comprobación final

In [None]:
# Read the submission back from disk to verify what was actually written.
df_check = pd.read_csv("Outputs/submission.csv")

In [None]:
# First rows of the submission as stored on disk.
df_check.head()

In [None]:
# First rows of the raw test frame, presumably to compare ids with the submission — confirm.
data_test.head()