In [1]:
# Tratamiento de datos
# ==============================================================================
import numpy as np
import pandas as pd
import statistics
import pickle

# EDA & estandarización
# ==============================================================================
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder 

# Warnings configuration
# ==============================================================================
import warnings
# Install a global 'ignore' filter: no category or module restriction is given,
# so every warning raised after this line is suppressed.
warnings.filterwarnings('ignore')

In [2]:
# Load the dataset from disk and preview its first rows.
df = pd.read_csv("datos/train_eda.csv")
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,0.3,Premium,D,SI2,62.4,58.0,4.31,4.28,2.68,6.353
1,1,1.01,Ideal,E,VVS2,62.7,56.0,6.42,6.46,4.04,9.183
2,2,0.72,Ideal,F,VS2,61.8,59.0,5.71,5.74,3.54,7.983
3,3,1.08,Very Good,G,SI2,63.2,57.0,6.54,6.5,4.12,8.371
4,4,0.36,Premium,G,VS1,62.3,59.0,4.5,4.55,2.82,6.588


In [3]:
# Drop the id, depth and table columns, then show the first row of the result.
df = df.drop(columns=["id", "depth", "table"])
df.head(1)

Unnamed: 0,carat,cut,color,clarity,x,y,z,price
0,0.3,Premium,D,SI2,4.31,4.28,2.68,6.353


In [4]:
# Instantiate the scaler (a StandardScaler with its default settings)

scaler = StandardScaler()

In [5]:
# Build the frame that will be scaled: everything in df except the three
# string-valued columns (color, clarity, cut) and price.
df_num_col = df.drop(columns=['color', 'clarity', 'cut', 'price'])
df_num_col.head(2)

Unnamed: 0,carat,x,y,z
0,0.3,4.31,4.28,2.68
1,1.01,6.42,6.46,4.04


In [6]:
# Fit the scaler on the numeric columns selected in df_num_col

scaler.fit(df_num_col)

In [7]:
# Save the fitted StandardScaler to disk with pickle so it can be reloaded later.
with open('datos/scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [8]:
# Apply the fitted scaler to the numeric columns; the result is a NumPy array

X_escaladas = scaler.transform(df_num_col)
X_escaladas


array([[-1.04551475, -1.26480579, -1.30251693, -1.23816884],
       [ 0.44729666,  0.61503006,  0.65394796,  0.72445751],
       [-0.16244321, -0.01752134,  0.00777607,  0.0029037 ],
       ...,
       [ 0.00576089,  0.11611628,  0.15136983,  0.21936984],
       [ 0.44729666,  0.59721171,  0.67189718,  0.6090089 ],
       [ 1.05703653,  1.22085394,  1.26422142,  1.07080333]])

In [9]:
# Finally, write the scaled array back into df, overwriting the original
# values of the columns that were passed to the scaler.

df[df_num_col.columns] = X_escaladas
df.head()

Unnamed: 0,carat,cut,color,clarity,x,y,z,price
0,-1.045515,Premium,D,SI2,-1.264806,-1.302517,-1.238169,6.353
1,0.447297,Ideal,E,VVS2,0.61503,0.653948,0.724458,9.183
2,-0.162443,Ideal,F,VS2,-0.017521,0.007776,0.002904,7.983
3,0.594475,Very Good,G,SI2,0.72194,0.689846,0.839906,8.371
4,-0.919362,Premium,G,VS1,-1.095531,-1.060202,-1.036134,6.588


In [10]:
# Preview df again; nothing modifies it between the previous cell and this one.
df.head()

Unnamed: 0,carat,cut,color,clarity,x,y,z,price
0,-1.045515,Premium,D,SI2,-1.264806,-1.302517,-1.238169,6.353
1,0.447297,Ideal,E,VVS2,0.61503,0.653948,0.724458,9.183
2,-0.162443,Ideal,F,VS2,-0.017521,0.007776,0.002904,7.983
3,0.594475,Very Good,G,SI2,0.72194,0.689846,0.839906,8.371
4,-0.919362,Premium,G,VS1,-1.095531,-1.060202,-1.036134,6.588


In [11]:
# Helper that one-hot encodes the given column(s) and saves the fitted encoder

def one_hot_encoder(dff, columna):
    """One-hot encode column(s) of ``dff`` and pickle the fitted encoder.

    Parameters
    ----------
    dff : pandas.DataFrame
        Frame holding the columns to encode. It is modified in place: the
        indicator columns are appended and the source columns are dropped.
    columna : list of str or str
        Name(s) of the column(s) to encode. A single name may also be given
        as a plain string.

    Returns
    -------
    pandas.DataFrame
        The same ``dff`` object that was passed in, after modification.

    Notes
    -----
    The fitted ``OneHotEncoder`` is written to ``f'datos/{columna}scaler.pkl'``.
    ``columna`` is interpolated exactly as passed, so a list argument shows up
    in the file name with its brackets and quotes (``["cut"]`` gives
    ``datos/['cut']scaler.pkl``). The path is deliberately left unchanged so
    that code loading the previously written files keeps working.
    """
    # dff[...] must return a 2-D frame for the encoder, so wrap a bare string.
    columnas = [columna] if isinstance(columna, str) else columna

    oh = OneHotEncoder()

    transformados = oh.fit_transform(dff[columnas])

    # Reuse dff's index: the assignment below aligns on index labels, so a
    # fresh 0..n-1 index would yield NaN / misplaced rows whenever dff has
    # been filtered, shuffled or otherwise carries a non-default index.
    oh_df = pd.DataFrame(
        transformados.toarray(),
        columns=oh.get_feature_names_out(),
        index=dff.index,
        dtype=int,
    )

    dff[oh_df.columns] = oh_df

    dff.drop(columnas, axis=1, inplace=True)

    with open(f'datos/{columna}scaler.pkl', 'wb') as one_hot:
        pickle.dump(oh, one_hot)

    return dff

In [12]:
# Encode each categorical column in turn (same order as before: cut, color, clarity).
for categorical_col in ("cut", "color", "clarity"):
    df = one_hot_encoder(df, [categorical_col])
df.head()

Unnamed: 0,carat,x,y,z,price,cut_Fair,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,-1.045515,-1.264806,-1.302517,-1.238169,6.353,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
1,0.447297,0.61503,0.653948,0.724458,9.183,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
2,-0.162443,-0.017521,0.007776,0.002904,7.983,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0.594475,0.72194,0.689846,0.839906,8.371,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
4,-0.919362,-1.095531,-1.060202,-1.036134,6.588,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0


In [13]:
# Save the scaled and one-hot encoded frame to CSV.
# NOTE(review): to_csv writes the index by default, so the file gains an
# unnamed first column — confirm that downstream readers expect it.
df.to_csv("datos/train_preproc_estandard_onehot_map.csv")

In [14]:
# List the column names of the resulting frame
df.columns

Index(['carat', 'x', 'y', 'z', 'price', 'cut_Fair', 'cut_Good', 'cut_Ideal',
       'cut_Premium', 'cut_Very Good', 'color_D', 'color_E', 'color_F',
       'color_G', 'color_H', 'color_I', 'color_J', 'clarity_I1', 'clarity_IF',
       'clarity_SI1', 'clarity_SI2', 'clarity_VS1', 'clarity_VS2',
       'clarity_VVS1', 'clarity_VVS2'],
      dtype='object')