## Importar Liberías

**Librerías básicas**

In [85]:
import pandas as pd

**Módulos locales**

In [86]:
from Src.cleaner import *
from Src.featureSelection import *

**Librerías de apoyo**

In [87]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.pipeline import make_pipeline

**Librerías de Machine Learning**

In [88]:
import h2o
h2o.init(nthreads = -1, max_mem_size = 8)

from h2o.automl import H2OAutoML

from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.6" 2020-01-14; OpenJDK Runtime Environment (build 11.0.6+10-post-Ubuntu-1ubuntu118.04.1); OpenJDK 64-Bit Server VM (build 11.0.6+10-post-Ubuntu-1ubuntu118.04.1, mixed mode, sharing)
  Starting server from /home/alex/.local/lib/python3.6/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmprwxzuysy
  JVM stdout: /tmp/tmprwxzuysy/h2o_alex_started_from_python.out
  JVM stderr: /tmp/tmprwxzuysy/h2o_alex_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O cluster uptime:,01 secs
H2O cluster timezone:,Europe/Madrid
H2O data parsing timezone:,UTC
H2O cluster version:,3.28.0.3
H2O cluster version age:,25 days
H2O cluster name:,H2O_from_python_alex_vpln75
H2O cluster total nodes:,1
H2O cluster free memory:,8 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8


## Limpieza de datos

In [89]:
df = pd.read_csv("Inputs/diamonds_train.csv")

In [90]:
df.head(3)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,0.78,Premium,F,VS1,61.5,58.0,5.93,5.98,3.66,3446
1,1,0.31,Ideal,D,SI1,60.8,56.0,4.37,4.32,2.64,732
2,2,0.3,Ideal,F,SI1,62.3,54.0,4.3,4.34,2.69,475


In [91]:
class cleanDiamonds(TransformerMixin):
    def __init__(self):
        self.features =["price", "carat","table","depth","y","z","cut","color","clarity","x"]
    
    def fit(self, df):
        #Iniciamos limpieza
        X = df[self.features]
        X.drop(columns=["table","depth","x","z"], axis=1, inplace=True)
        X["cut"] = X["cut"].replace("Premium", 1).replace("Ideal", 2).replace("Very Good", 3).replace("Very Good", 4).replace("Fair", 5).replace("Good", 6)
        X["color"] = X["color"].replace("D", 1).replace("E", 2).replace("F", 3).replace("G", 4).replace("H", 5).replace("I", 6).replace("J", 7)
        X["clarity"] = X["clarity"].replace("IF", 1).replace("VVS1", 2).replace("VVS2", 3).replace("VS1", 4).replace("VS2", 5).replace("SI1", 6).replace("SI2", 7).replace("I1", 8)
        self.cols = self.X.columns
        return self
    
    def transform(self, X):
        return self.X

In [92]:
cleaner = cleanDiamonds()
pipe = make_pipeline(*[
    cleaner,
    #StandardScaler(),
    #Normalizer(),
])

In [93]:
X = pipe.fit_transform(df)
X= pd.DataFrame(X, columns=cleaner.cols)
pd.options.display.max_columns =28
X.head(3)

Unnamed: 0,price,carat,table,depth,y,z,x,cut_Fair,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_D,color_E,color_F,color_G,color_H,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,3446,0.78,58.0,61.5,5.98,3.66,5.93,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0
1,732,0.31,56.0,60.8,4.32,2.64,4.37,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0
2,475,0.3,54.0,62.3,4.34,2.69,4.3,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0


In [94]:
df_export = X.copy()
df_export.to_csv("Outputs/diamonds_cleaned(V1).csv", index=False)


## Transformación de dataset

In [95]:
loan_csv = "Outputs/diamonds_cleaned(V1).csv"

data = h2o.import_file(loan_csv) 

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [96]:
data.shape

(40345, 27)

In [97]:
data.head(3)

price,carat,table,depth,y,z,x,cut_Fair,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_D,color_E,color_F,color_G,color_H,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
3446,0.78,58,61.5,5.98,3.66,5.93,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0
732,0.31,56,60.8,4.32,2.64,4.37,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0
475,0.3,54,62.3,4.34,2.69,4.3,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0




## Partición de los datos

In [None]:
splits = data.split_frame(ratios=[0.7, 0.15], seed=1)  

train = splits[0]
valid = splits[1]
test = splits[2]

In [None]:
print(train.nrow)
print(valid.nrow)
print(test.nrow)

## Extracción de Features

In [None]:
y = 'price'
x = list(data.columns)

x.remove(y)

In [None]:
x

## Machine Learning!!

In [None]:
# rf_fit1 = H2ODeepLearningEstimator(model_id='rf_fit1', seed=1)
# rf_fit1.train(x=x, y=y, training_frame=train)


In [None]:
# gbm_fit3 = H2OGradientBoostingEstimator(model_id='gbm_fit3', 
                                        # ntrees=500, 
                                        # score_tree_interval=5,     #used for early stopping
                                        # stopping_rounds=3,         #used for early stopping
                                        # stopping_metric='AUC',     #used for early stopping
                                        # stopping_tolerance=0.0005, #used for early stopping
                                        # seed=1)
# gbm_fit3.train(x=x, y=y, training_frame=train, validation_frame=valid)

In [None]:
# rf_fit3 = H2ORandomForestEstimator(model_id='rf_fit3', ntrees=500, nfolds=8, seed=1)
# rf_fit3.train(x=x, y=y, training_frame=data)

In [None]:
# rf_fit3

## Métricas de rendimiento

In [None]:
# rf_perf1 = rf_fit1.model_performance(test)
# rf_perf2 = rf_fit2.model_performance(test)
# rf_perf3 = rf_fit3.model_performance(test)

In [None]:
# print(rf_perf1)

In [None]:
# print(rf_perf2)

In [None]:
# print(rf_perf3)

## Generación de resultado final

**Importación de dataset de testeo**


In [None]:

data_test = pd.read_csv("Inputs/diamonds_test.csv")

data_test.head(2)


**Limpieza de dataset**

In [None]:
class cleanDiamonds_test(TransformerMixin):
    def __init__(self):
        self.features =["id","carat","table","depth","y","z","cut","color","clarity","x"]
    
    def fit(self, df):
        #Iniciamos limpieza
        X = df[self.features]
        X.drop(columns=["id","table","depth","x","z"], axis=1, inplace=True)
        X["cut"] = X["cut"].replace("Premium", 1).replace("Ideal", 2).replace("Very Good", 3).replace("Very Good", 4).replace("Fair", 5).replace("Good", 6)
        X["color"] = X["color"].replace("D", 1).replace("E", 2).replace("F", 3).replace("G", 4).replace("H", 5).replace("I", 6).replace("J", 7)
        X["clarity"] = X["clarity"].replace("IF", 1).replace("VVS1", 2).replace("VVS2", 3).replace("VS1", 4).replace("VS2", 5).replace("SI1", 6).replace("SI2", 7).replace("I1", 8)
        self.cols = self.X.columns
        return self
    
    def transform(self, X):
        return self.X

In [None]:
cleaner = cleanDiamonds_test()
pipe = make_pipeline(*[
    cleaner,
    #StandardScaler(),
    #Normalizer(),
])

In [None]:
X = pipe.fit_transform(data_test)
X= pd.DataFrame(X, columns=cleaner.cols)
pd.options.display.max_columns =28
X.head(3)

In [None]:
df_export = X.copy()
df_export.to_csv("Outputs/diamonds_cleaned_test(V1).csv", index=False)

**Transformación a dataframe de H2O**

In [None]:
loan_csv = "Outputs/diamonds_cleaned_test(V1).csv"

data_test_h2o = h2o.import_file(loan_csv) 

In [None]:
data_test_h2o.head(2)

**Extracción de predicciones**

In [None]:
y_pred = rf_fit3.predict(test_data=data_test_h2o)

price = y_pred.as_data_frame()

price = price.rename(columns={"predict":"price"})
price.head()

In [None]:
result = data_test[["id"]].join(price)

result.head()

**Generación de reporte final**

In [None]:
resultDef = result[["id","price"]]

resultDef.head(2)

In [None]:
resultDef.to_csv ("Outputs/submission.csv", index = False, header=True)

## Última comprobación final

In [None]:
df_check = pd.read_csv("Outputs/submission.csv")

In [None]:
df_check.head()

In [None]:
data_test.head()