## Importar Liberías

**Librerías básicas**

In [288]:
import pandas as pd

**Módulos locales**

In [289]:
from Src.cleaner import *
from Src.featureSelection import *

**Librerías de apoyo**

In [290]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.pipeline import make_pipeline

**Librerías de Machine Learning**

In [291]:
import h2o
h2o.init(nthreads = -1, max_mem_size = 8)

from h2o.automl import H2OAutoML

from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O cluster uptime:,9 mins 29 secs
H2O cluster timezone:,Europe/Madrid
H2O data parsing timezone:,UTC
H2O cluster version:,3.28.0.3
H2O cluster version age:,25 days
H2O cluster name:,H2O_from_python_alex_zxa7e2
H2O cluster total nodes:,1
H2O cluster free memory:,7.997 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8


## Limpieza de datos

In [292]:
df = pd.read_csv("Inputs/diamonds_train.csv")

In [293]:
df.head(3)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,0.78,Premium,F,VS1,61.5,58.0,5.93,5.98,3.66,3446
1,1,0.31,Ideal,D,SI1,60.8,56.0,4.37,4.32,2.64,732
2,2,0.3,Ideal,F,SI1,62.3,54.0,4.3,4.34,2.69,475


In [294]:
class cleanDiamonds(TransformerMixin):
    def __init__(self):
        self.features =["price", "carat","table","depth","y","z","cut","color","clarity","x"]
    
    def fit(self, df):
        #Iniciamos limpieza
        X = df[self.features]
        X.drop(columns=["table","depth","x","z"], axis=1, inplace=True)
        X["cut"] = X["cut"].replace("Premium", 1).replace("Ideal", 2).replace("Very Good", 3).replace("Very Good", 4).replace("Fair", 5).replace("Good", 6)
        X["color"] = X["color"].replace("D", 1).replace("E", 2).replace("F", 3).replace("G", 4).replace("H", 5).replace("I", 6).replace("J", 7)
        X["clarity"] = X["clarity"].replace("IF", 1).replace("VVS1", 2).replace("VVS2", 3).replace("VS1", 4).replace("VS2", 5).replace("SI1", 6).replace("SI2", 7).replace("I1", 8)
        self.X=X
        self.cols = self.X.columns
        return self
    
    def transform(self, X):
        return self.X

In [295]:
cleaner = cleanDiamonds()
pipe = make_pipeline(*[
    cleaner,
    #StandardScaler(),
    #Normalizer(),
])

In [296]:
X = pipe.fit_transform(df)
X= pd.DataFrame(X, columns=cleaner.cols)
pd.options.display.max_columns =28
X.head(3)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https:

Unnamed: 0,price,carat,y,cut,color,clarity
0,3446,0.78,5.98,1,3,4
1,732,0.31,4.32,2,1,6
2,475,0.3,4.34,2,3,6


In [297]:
df_export = X.copy()
df_export.to_csv("Outputs/diamonds_cleaned(V2).csv", index=False)


## Transformación de dataset

In [298]:
loan_csv = "Outputs/diamonds_cleaned(V1).csv"

data = h2o.import_file(loan_csv) 

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [299]:
data.shape

(40345, 6)

In [300]:
data.head(3)

price,carat,y,cut,color,clarity
3446,0.78,5.98,1,3,4
732,0.31,4.32,2,1,6
475,0.3,4.34,2,3,6




## Partición de los datos

In [301]:
splits = data.split_frame(ratios=[0.7, 0.15], seed=1)  

train = splits[0]
valid = splits[1]
test = splits[2]

In [302]:
print(train.nrow)
print(valid.nrow)
print(test.nrow)

28346
5960
6039


## Extracción de Features

In [303]:
y = 'price'
x = list(data.columns)

x.remove(y)

In [304]:
x

['carat', 'y', 'cut', 'color', 'clarity']

## Machine Learning!!

In [305]:
# rf_fit1 = H2ODeepLearningEstimator(model_id='rf_fit1', seed=1)
# rf_fit1.train(x=x, y=y, training_frame=train)


In [306]:
# gbm_fit3 = H2OGradientBoostingEstimator(model_id='gbm_fit3', 
                                        # ntrees=500, 
                                        # score_tree_interval=5,     #used for early stopping
                                        # stopping_rounds=3,         #used for early stopping
                                        # stopping_metric='AUC',     #used for early stopping
                                        # stopping_tolerance=0.0005, #used for early stopping
                                        # seed=1)
# gbm_fit3.train(x=x, y=y, training_frame=train, validation_frame=valid)

In [307]:
# rf_fit3 = H2ORandomForestEstimator(model_id='rf_fit3', ntrees=500, nfolds=8, seed=1)
# rf_fit3.train(x=x, y=y, training_frame=data)

In [308]:
# rf_fit3

## Métricas de rendimiento

In [309]:
# rf_perf1 = rf_fit1.model_performance(test)
# rf_perf2 = rf_fit2.model_performance(test)
# rf_perf3 = rf_fit3.model_performance(test)

In [310]:
# print(rf_perf1)

In [311]:
# print(rf_perf2)

In [312]:
# print(rf_perf3)

## Auto Machine Learning!!!

**Imposible en mi PC, migramos a Google Colab**

## Generación de resultado final

**Importación de dataset de testeo**


In [313]:

data_test = pd.read_csv("Inputs/diamonds_test.csv")

data_test.head(2)


Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
0,0,1.1,Premium,H,SI2,62.2,58.0,6.69,6.6,4.13
1,1,0.51,Ideal,I,SI1,62.5,57.0,5.07,5.1,3.18


**Limpieza de dataset**

In [327]:
class cleanDiamonds_test(TransformerMixin):
    def __init__(self):
        self.features =["id","carat","table","depth","y","z","cut","color","clarity","x"]
    
    def fit(self, df):
        #Iniciamos limpieza
        X = df[self.features]
        X.drop(columns=["table","depth","x","z"], axis=1, inplace=True)
        X["cut"] = X["cut"].replace("Premium", 1).replace("Ideal", 2).replace("Very Good", 3).replace("Very Good", 4).replace("Fair", 5).replace("Good", 6)
        X["color"] = X["color"].replace("D", 1).replace("E", 2).replace("F", 3).replace("G", 4).replace("H", 5).replace("I", 6).replace("J", 7)
        X["clarity"] = X["clarity"].replace("IF", 1).replace("VVS1", 2).replace("VVS2", 3).replace("VS1", 4).replace("VS2", 5).replace("SI1", 6).replace("SI2", 7).replace("I1", 8)
        self.X = X
        self.cols = self.X.columns
        return self
    
    def transform(self, X):
        return self.X

In [328]:
cleaner = cleanDiamonds_test()
pipe = make_pipeline(*[
    cleaner,
    #StandardScaler(),
    #Normalizer(),
])

In [329]:
X = pipe.fit_transform(data_test)
X= pd.DataFrame(X, columns=cleaner.cols)
pd.options.display.max_columns =28
X.head(3)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https:

Unnamed: 0,id,carat,y,cut,color,clarity
0,0,1.1,6.6,1,5,7
1,1,0.51,5.1,2,6,6
2,2,2.03,8.09,1,4,6


In [330]:
df_export = X.copy()
df_export.to_csv("Outputs/diamonds_cleaned_test(V3).csv", index=False)

**Transformación a dataframe de H2O**

In [318]:
loan_csv = "Outputs/diamonds_cleaned_test(V1).csv"

data_test_h2o = h2o.import_file(loan_csv) 

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [319]:
data_test_h2o.head(2)

carat,y,cut,color,clarity
1.1,6.6,1,5,7
0.51,5.1,2,6,6




**Extracción de predicciones**

In [320]:
y_pred = rf_fit3.predict(test_data=data_test_h2o)

price = y_pred.as_data_frame()

price = price.rename(columns={"predict":"price"})
price.head()

NameError: name 'rf_fit3' is not defined

In [None]:
result = data_test[["id"]].join(price)

result.head()

**Generación de reporte final**

In [None]:
resultDef = result[["id","price"]]

resultDef.head(2)

In [None]:
resultDef.to_csv ("Outputs/submission.csv", index = False, header=True)

## Última comprobación final

In [None]:
df_check = pd.read_csv("Outputs/submission.csv")

In [None]:
df_check.head()

In [None]:
data_test.head()