In [36]:
# Tratamiento de datos
# -----------------------------------------------------------------------
import pandas as pd

# Encoding of the categorical variables
# -----------------------------------------------------------------------
from sklearn.preprocessing import LabelEncoder # para realizar el Label Encoding 
from sklearn.preprocessing import OneHotEncoder  # para realizar el One-Hot Encoding

# Para evitar que salgan los warnings en jupyter
# -----------------------------------------------------------------------
import warnings
warnings.filterwarnings('ignore')

In [37]:
# Load the dataset stored at data/05-primas_norm_est_1.csv; the first CSV
# column is used as the row index.
df = pd.read_csv(
    'data/05-primas_norm_est_1.csv',
    index_col=0,
)
df.head(2)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges_Sklearn
0,-1.387916,female,-0.380152,-0.885991,yes,southwest,0.836552
1,-1.459488,male,0.611376,-0.069805,no,southeast,0.032037


Happy coding 🤯📊

# Pair Programming Encoding

## Ana Campos y Ana González

En el pair programming de hoy usaremos el set de datos que guardasteis en el pair programming de normalización y estandarización.

Vuestro set de datos debería tener al menos una variable categórica, el objetivo del pair programming de hoy:

- Hacer una codificación de la/las variables categóricas que tengáis en vuestro set de datos.
- Recordad que lo primero que deberéis hacer es decidir si vuestras variables tienen o no orden, para que en función de esto uséis una aproximación u otra.
- Guardad el dataframe, donde deberíais tener las variables estandarizadas, normalizadas y codificadas, en un csv para usarlo en el próximo pair programming.

In [38]:
# Peek at five random rows; the fixed seed makes the displayed sample
# reproducible across kernel restarts and full re-runs.
df.sample(5, random_state=42)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges_Sklearn
687,-0.171194,female,-1.73907,-0.885991,no,northeast,0.230123
203,0.902384,male,-0.96882,2.378751,no,northwest,0.551839
365,1.260244,male,-0.712069,-0.885991,no,northeast,0.533016
630,0.043522,female,1.96776,-0.885991,no,southeast,0.240959
122,1.1171,male,1.596148,-0.069805,no,southwest,0.495077


Tenemos dos casos. En el de los hábitos tabáquicos asumimos que nuestra variable tiene orden y le asignaremos más peso a la condición fumador.
En la variable de sexo también asumimos orden: según el ANOVA, la condición masculina obtiene un coeficiente negativo y un p-valor superior a 0.05. Nos quedamos con mayor peso para femenino y probamos get_dummies; sospechamos que han contestado más varones, pero las mujeres, en general, deberían ocasionar más gastos por embarazos, partos, revisiones regulares anuales... toda la condición preventiva y clínica asociada a su sexo.

In [39]:
# Numeric codes for the smoker labels: 'no' maps to 0 and 'yes' maps to 1.
mapa = dict(no=0, yes=1)

In [40]:
# Translate the labels of "smoker" into their numeric codes and store them in
# a new "fumadores" column (labels that are not keys of `mapa` become NaN).
codigos_fumador = df["smoker"].map(mapa)
df["fumadores"] = codigos_fumador

In [41]:
# Display the first five rows of the dataframe.
df.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges_Sklearn,fumadores
0,-1.387916,female,-0.380152,-0.885991,yes,southwest,0.836552,1
1,-1.459488,male,0.611376,-0.069805,no,southeast,0.032037,0
2,-0.743769,male,0.481312,1.562566,no,southeast,0.176597,0
3,-0.457482,male,-0.214616,-0.885991,no,northwest,0.145677,0
4,-0.529054,female,-0.745008,-0.885991,no,southeast,0.139827,0


In [42]:
# One-hot encode the "sex" column into integer indicator columns whose names
# are built as sex_<category>.
columna_sexo = df["sex"]
dummies = pd.get_dummies(columna_sexo, prefix="sex", prefix_sep="_", dtype=int)
dummies.head(2)

Unnamed: 0,sex_female,sex_male
0,1,0
1,0,1


In [43]:
# Place the sex indicator columns to the right of the existing columns.
partes = [df, dummies]
df_dummies = pd.concat(partes, axis="columns")
df_dummies.head(2)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges_Sklearn,fumadores,sex_female,sex_male
0,-1.387916,female,-0.380152,-0.885991,yes,southwest,0.836552,1,1,0
1,-1.459488,male,0.611376,-0.069805,no,southeast,0.032037,0,0,1


In [44]:
# Count how many rows take each value of the female indicator column.
conteo_female = df_dummies['sex_female'].value_counts()
conteo_female

1    547
0    517
Name: sex_female, dtype: int64

Codificamos la variable region, que aún no hemos quitado del dataframe a pesar de que, según el ANOVA y los gráficos, aparentemente no influye en nuestra variable respuesta. Asignamos categorías de igual peso con One-Hot Encoding.

In [45]:
# Create the One-Hot encoder with its default settings
oh = OneHotEncoder()

In [46]:
# Fit the encoder on the "region" column (the double brackets keep it as a
# one-column dataframe) and encode it in the same call.
region_frame = df[['region']]
transformados = oh.fit_transform(region_frame)

In [47]:
# Turn the encoded matrix into a dataframe with one indicator column per
# category of "region". Reusing df's index keeps the rows aligned when this
# frame is concatenated column-wise with the rest of the data; a fresh
# 0..n-1 index would misalign rows (and create NaN rows) if df's index has
# gaps.
oh_df = pd.DataFrame(transformados.toarray(), index=df.index)
oh_df.head(10)

Unnamed: 0,0,1,2,3
0,0.0,0.0,0.0,1.0
1,0.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0
3,0.0,1.0,0.0,0.0
4,0.0,0.0,1.0,0.0
5,0.0,0.0,1.0,0.0
6,0.0,1.0,0.0,0.0
7,1.0,0.0,0.0,0.0
8,1.0,0.0,0.0,0.0
9,0.0,0.0,0.0,1.0


In [48]:
# Name the encoded columns after the categories learnt by the encoder.
# `get_feature_names` was removed in scikit-learn 1.2, and its replacement
# `get_feature_names_out` uses a different prefix when the encoder was fitted
# on a dataframe. Building the names from `categories_` keeps the "x0_"
# prefix regardless of the installed scikit-learn version.
oh_df.columns = [f"x0_{categoria}" for categoria in oh.categories_[0]]
oh_df.columns

Index(['x0_northeast', 'x0_northwest', 'x0_southeast', 'x0_southwest'], dtype='object')

In [49]:
# Join the dataframe that already holds the sex indicators with the
# one-hot encoded region columns, side by side.
bloques = [df_dummies, oh_df]
final = pd.concat(bloques, axis="columns")
final.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges_Sklearn,fumadores,sex_female,sex_male,x0_northeast,x0_northwest,x0_southeast,x0_southwest
0,-1.387916,female,-0.380152,-0.885991,yes,southwest,0.836552,1,1,0,0.0,0.0,0.0,1.0
1,-1.459488,male,0.611376,-0.069805,no,southeast,0.032037,0,0,1,0.0,0.0,1.0,0.0
2,-0.743769,male,0.481312,1.562566,no,southeast,0.176597,0,0,1,0.0,0.0,1.0,0.0
3,-0.457482,male,-0.214616,-0.885991,no,northwest,0.145677,0,0,1,0.0,1.0,0.0,0.0
4,-0.529054,female,-0.745008,-0.885991,no,southeast,0.139827,0,1,0,0.0,0.0,1.0,0.0


In [50]:
# List the column labels of the combined dataframe
final.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges_Sklearn',
       'fumadores', 'sex_female', 'sex_male', 'x0_northeast', 'x0_northwest',
       'x0_southeast', 'x0_southwest'],
      dtype='object')

In [51]:
# Rename only the two sex indicator columns, addressing them by name.
# Assigning a full list to `final.columns` relabels by position, so it would
# silently mislabel data (or fail) if the column order or count ever changed.
final = final.rename(columns={'sex_female': 'mujeres', 'sex_male': 'hombres'})
final.head(2)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges_Sklearn,fumadores,mujeres,hombres,x0_northeast,x0_northwest,x0_southeast,x0_southwest
0,-1.387916,female,-0.380152,-0.885991,yes,southwest,0.836552,1,1,0,0.0,0.0,0.0,1.0
1,-1.459488,male,0.611376,-0.069805,no,southeast,0.032037,0,0,1,0.0,0.0,1.0,0.0


In [52]:
# Write the encoded dataframe (row index included) to a CSV file.
final.to_csv(path_or_buf='data/08-primas_categorizadas_normalizadas.csv')