In [78]:
# Data handling
# ==============================================================================
import numpy as np
import pandas as pd
import statistics
import pickle

# EDA & standardisation
# ==============================================================================
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OneHotEncoder 

# Warning configuration
# ==============================================================================
# NOTE: this silences every warning for the rest of the session, including
# deprecation notices raised by pandas or scikit-learn.
import warnings
warnings.filterwarnings('ignore')

In [88]:
# Load the training data from datos/train_eda.csv and preview the first rows.
df=pd.read_csv("datos/train_eda.csv")
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,0.3,Premium,D,SI2,62.4,58.0,4.31,4.28,2.68,6.353
1,1,1.01,Ideal,E,VVS2,62.7,56.0,6.42,6.46,4.04,9.183
2,2,0.72,Ideal,F,VS2,61.8,59.0,5.71,5.74,3.54,7.983
3,3,1.08,Very Good,G,SI2,63.2,57.0,6.54,6.5,4.12,8.371
4,4,0.36,Premium,G,VS1,62.3,59.0,4.5,4.55,2.82,6.588


In [89]:
# Remove the `id` column from the working frame.
# `df` is re-bound (instead of dropped in place) and errors="ignore" is used so
# that running this cell a second time is a no-op rather than a KeyError.
df = df.drop(columns=["id"], errors="ignore")
df.head(1)

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,0.3,Premium,D,SI2,62.4,58.0,4.31,4.28,2.68,6.353


In [90]:
# Build the scaler object (it is fitted in a later cell).
robust = RobustScaler()

In [91]:
# Columns to scale: everything except the string-valued columns
# (color, clarity, cut) and `price`.
df_num_col = df.drop(columns=['color', 'clarity', 'cut', 'price'])
df_num_col.head(2)

Unnamed: 0,carat,depth,table,x,y,z
0,0.3,62.4,58.0,4.31,4.28,2.68
1,1.01,62.7,56.0,6.42,6.46,4.04


In [92]:
# Fit the scaler on the numeric columns of our data set.
robust.fit(df_num_col)

In [93]:
# Save the fitted robust scaler to datos/robust.pkl
# (presumably so later steps can re-apply the same scaling; confirm downstream).
with open('datos/robust.pkl', 'wb') as robust_escaler:
        pickle.dump(robust, robust_escaler)

In [94]:
# Transform the data with the fitted scaler; the result is kept in X_robust.
X_robust = robust.transform(df_num_col)

In [95]:
# Finally, write the returned array back into `df`, overwriting the original
# values of the columns that were scaled.

df[df_num_col.columns] = X_robust
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,-0.625,Premium,D,SI2,0.4,0.333333,-0.754098,-0.78022,-0.75,6.353
1,0.484375,Ideal,E,VVS2,0.6,-0.333333,0.398907,0.417582,0.464286,9.183
2,0.03125,Ideal,F,VS2,0.0,0.666667,0.010929,0.021978,0.017857,7.983
3,0.59375,Very Good,G,SI2,0.933333,0.0,0.464481,0.43956,0.535714,8.371
4,-0.53125,Premium,G,VS1,0.333333,0.666667,-0.650273,-0.631868,-0.625,6.588


In [97]:
# Collapse the eight clarity grades into three ordinal levels:
#   2 -> I1, SI2      1 -> SI1, VS2, VS1      0 -> VVS2, VVS1, IF
clarity_dict = {"I1": 2, "SI2" :2, "SI1" :1, "VS2" :1, "VS1" :1, "VVS2" :0, "VVS1": 0, "IF": 0}

# `Series.map` is a plain dictionary lookup, so the result is numeric without
# relying on `Series.replace` silently downcasting an object column (behaviour
# that recent pandas versions deprecate, and whose warning is hidden by the
# global warnings filter). `replace` would also leave any unknown grade in the
# column as a string; here that case is reported explicitly instead.
clarity_ord = df['clarity'].map(clarity_dict)
unknown = df.loc[clarity_ord.isna() & df['clarity'].notna(), 'clarity'].unique()
if len(unknown) > 0:
    raise ValueError(
        f"clarity values missing from clarity_dict: {sorted(map(str, unknown))}"
    )

df['clarity_ord'] = clarity_ord
# Drop the original column
df.drop(['clarity'], axis=1, inplace=True)

In [98]:
# Preview the frame after the previous cell: `clarity` is gone and `clarity_ord` was added.
df.head()

Unnamed: 0,carat,cut,color,depth,table,x,y,z,price,clarity_ord
0,-0.625,Premium,D,0.4,0.333333,-0.754098,-0.78022,-0.75,6.353,2
1,0.484375,Ideal,E,0.6,-0.333333,0.398907,0.417582,0.464286,9.183,0
2,0.03125,Ideal,F,0.0,0.666667,0.010929,0.021978,0.017857,7.983,1
3,0.59375,Very Good,G,0.933333,0.0,0.464481,0.43956,0.535714,8.371,2
4,-0.53125,Premium,G,0.333333,0.666667,-0.650273,-0.631868,-0.625,6.588,1


In [99]:
# Helper that applies one-hot encoding to the given column(s) of a DataFrame.

def one_hot_encoder(dff, columna):
    """One-hot encode ``columna`` in ``dff`` and pickle the fitted encoder.

    Parameters
    ----------
    dff : pandas.DataFrame
        Frame to transform. It is modified in place: the indicator columns
        are appended and the source column(s) are dropped.
    columna : list of str
        Column label(s) to encode. A list is required because the encoder is
        fitted on the two-dimensional selection ``dff[columna]``.

    Returns
    -------
    pandas.DataFrame
        The same ``dff`` object that was passed in, after modification.

    Notes
    -----
    The fitted ``OneHotEncoder`` is written to ``datos/{columna}scaler.pkl``.
    Because ``columna`` is a list, the file name contains the list's repr
    (e.g. ``datos/['cut']scaler.pkl``). The ``datos`` directory must exist.
    """
    oh = OneHotEncoder()

    transformados = oh.fit_transform(dff[columna])

    # Give the indicator frame the same index as `dff`. The assignment below
    # aligns on index labels, so a fresh 0..n-1 index would produce NaN /
    # misaligned rows whenever `dff` does not carry a default RangeIndex
    # (for example after filtering rows).
    oh_df = pd.DataFrame(
        transformados.toarray(),
        columns=oh.get_feature_names_out(),
        index=dff.index,
        dtype=int,
    )

    dff[oh_df.columns] = oh_df

    dff.drop(columna, axis=1, inplace=True)

    # The path is intentionally left unchanged so that any code loading the
    # existing pickle by this name keeps working.
    with open(f'datos/{columna}scaler.pkl', 'wb') as one_hot:
        pickle.dump(oh, one_hot)

    return dff

In [100]:
# One-hot encode `cut`, then `color`. Each call also pickles its fitted
# encoder under datos/ (see one_hot_encoder).
df=one_hot_encoder(df,["cut"])
df=one_hot_encoder(df,["color"])
df.head()

Unnamed: 0,carat,depth,table,x,y,z,price,clarity_ord,cut_Fair,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_D,color_E,color_F,color_G,color_H,color_I,color_J
0,-0.625,0.4,0.333333,-0.754098,-0.78022,-0.75,6.353,2,0,0,0,1,0,1,0,0,0,0,0,0
1,0.484375,0.6,-0.333333,0.398907,0.417582,0.464286,9.183,0,0,0,1,0,0,0,1,0,0,0,0,0
2,0.03125,0.0,0.666667,0.010929,0.021978,0.017857,7.983,1,0,0,1,0,0,0,0,1,0,0,0,0
3,0.59375,0.933333,0.0,0.464481,0.43956,0.535714,8.371,2,0,0,0,0,1,0,0,0,1,0,0,0
4,-0.53125,0.333333,0.666667,-0.650273,-0.631868,-0.625,6.588,1,0,0,0,1,0,0,0,0,1,0,0,0


In [102]:
# Save the preprocessed frame. NOTE(review): the DataFrame index is written too
# (to_csv default), so reading this file back without `index_col=0` yields an
# extra "Unnamed: 0" column — confirm that downstream readers expect this.
df.to_csv("datos/train_preproc_robust_onehot_map.csv")

In [103]:
# List the column names of the final preprocessed frame.
df.columns

Index(['carat', 'depth', 'table', 'x', 'y', 'z', 'price', 'clarity_ord',
       'cut_Fair', 'cut_Good', 'cut_Ideal', 'cut_Premium', 'cut_Very Good',
       'color_D', 'color_E', 'color_F', 'color_G', 'color_H', 'color_I',
       'color_J'],
      dtype='object')