In [2]:
# Tratamiento de datos
# -----------------------------------------------------------------------
import pandas as pd

# Para la codificación de las variables numéricas
# -----------------------------------------------------------------------
from sklearn.preprocessing import LabelEncoder # para realizar el Label Encoding 
from sklearn.preprocessing import OneHotEncoder  # para realizar el One-Hot Encoding

# Para evitar que salgan los warnings en jupyter
# -----------------------------------------------------------------------
import warnings
warnings.filterwarnings('ignore')

In [3]:
df=pd.read_csv('data/05-primas_norm_est_1.csv',index_col=0)

Happy coding 🤯📊

# Pair Programming Encoding

## Ana Campos y Ana González

En el pair programming de hoy usaremos el set de datos que guardastéis en el pair programming de normalización y estandarización.

Vuestro set de datos debería tener al menos una variable categórica, el objetivo del pair programming de hoy:

- Hacer una códificación de la/las variables categóricas que tengáis en vuestro set de datos.
- Recordad que lo primero que deberéis hacer es decidir su vuestras variables tienen o no orden, para que en función de esto uséis una aproximación u otra.
- Guardad el dataframe, donde deberíais tener las variables estadandarizas, normalizadas y codificadas en un csv para usarlo en el próximo pairprogramming

In [5]:
df.sample(5)

Unnamed: 0,index,age,sex,bmi,children,smoker,region,charges_Sklearn
113,-0.330441,1.051498,male,1.879556,-0.12074,yes,southeast,0.655059
227,1.144464,-1.262327,male,0.259067,0.770124,yes,southeast,0.368251
47,-1.0591,-1.406941,female,0.874116,0.770124,yes,northeast,0.422798
262,1.604273,0.183813,male,-0.287849,-1.011604,yes,southeast,0.973091
10,-1.564136,1.268419,male,1.46891,-1.011604,yes,southwest,0.643584


Tenemos dos, en el caso de hábitos tabáquicos, asumimos que nuestra variable tiene orden, y le asignaremos más peso a la condición fumador.
La variable de sexo también tenamos orden, según ANOVA la condición masculina obtiene un coeficiente negativo y p> a 0.05, nos quedamos con mayor peso a femenino para probar get dummies, sospechamos que han contestado más varones, pero las mujeres en general, deberían ocasionar más gastos por embarazos, partos, revisiones regulares anuales...toda la condición preventiva y clínica asociada a su sexo.

In [9]:
mapa = {'no':0, 'yes':1} 

In [10]:
df["fumadores"] = df["smoker"].map(mapa)

In [11]:
df.head()

Unnamed: 0,index,age,sex,bmi,children,smoker,region,charges_Sklearn,fumadores
0,-1.65459,-0.683871,male,-1.697491,-1.011604,no,northwest,0.045128,0
1,-1.639514,1.268419,female,-1.120191,-1.011604,no,northwest,0.203687,0
2,-1.634489,1.413033,female,-1.037325,-1.011604,yes,southeast,0.178221,1
3,-1.626951,-1.117713,male,1.879556,-1.011604,yes,southeast,0.447938,1
4,-1.614388,-0.900792,male,0.621835,-1.011604,yes,southwest,0.384541,1


In [12]:
# lo haremos para la columna "owners"

dummies = pd.get_dummies(df["sex"], prefix_sep = "_", prefix = "sex", dtype = int)
dummies.head(2)

Unnamed: 0,sex_female,sex_male
0,0,1
1,1,0


In [13]:
# juntamos los datos condf_dummies

df_dummies = pd.concat([df, dummies], axis = 1)
df_dummies.head(2)

Unnamed: 0,index,age,sex,bmi,children,smoker,region,charges_Sklearn,fumadores,sex_female,sex_male
0,-1.65459,-0.683871,male,-1.697491,-1.011604,no,northwest,0.045128,0,0,1
1,-1.639514,1.268419,female,-1.120191,-1.011604,no,northwest,0.203687,0,1,0


In [14]:
df_dummies['sex_female'].value_counts()

0    158
1    115
Name: sex_female, dtype: int64

Categorizamos la variable region, que aún no la hemos quitado del dataframe a pesar de que según anova y gráficos no influye aparentemente en nuestra variable respuesta. Asignamos categorias de igual peso con oneHot.

In [15]:
# iniciamos el método de OneHot Encoder
oh = OneHotEncoder()

In [16]:
# hacemos la codificación de los datos para la variable dada 
transformados = oh.fit_transform(df[['region']])

In [20]:
# convertimos nuestro array con la codificación hecha en un dataframe, donde tendremos dos columnas, una para los "Yes" y otra para los "No"
oh_df = pd.DataFrame(transformados.toarray())
oh_df.head(10)

Unnamed: 0,0,1,2,3
0,0.0,1.0,0.0,0.0
1,0.0,1.0,0.0,0.0
2,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,1.0
5,1.0,0.0,0.0,0.0
6,0.0,0.0,0.0,1.0
7,0.0,0.0,0.0,1.0
8,0.0,0.0,0.0,1.0
9,1.0,0.0,0.0,0.0


In [18]:
# el método get_feature_names nos va a dar el nombre de las columnas nuevas que se nos generarán
oh_df.columns = oh.get_feature_names()
oh_df.columns

Index(['x0_northeast', 'x0_northwest', 'x0_southeast', 'x0_southwest'], dtype='object')

In [22]:
# concatenamos el dataframe original con el dataframe que acabamos de crear
final = pd.concat([df,oh_df],axis=1)
final.head()

Unnamed: 0,index,age,sex,bmi,children,smoker,region,charges_Sklearn,fumadores,0,1,2,3
0,-1.65459,-0.683871,male,-1.697491,-1.011604,no,northwest,0.045128,0,0.0,1.0,0.0,0.0
1,-1.639514,1.268419,female,-1.120191,-1.011604,no,northwest,0.203687,0,0.0,1.0,0.0,0.0
2,-1.634489,1.413033,female,-1.037325,-1.011604,yes,southeast,0.178221,1,0.0,0.0,1.0,0.0
3,-1.626951,-1.117713,male,1.879556,-1.011604,yes,southeast,0.447938,1,0.0,0.0,1.0,0.0
4,-1.614388,-0.900792,male,0.621835,-1.011604,yes,southwest,0.384541,1,0.0,0.0,0.0,1.0


In [25]:
final.columns

Index([          'index',             'age',             'sex',
                   'bmi',        'children',          'smoker',
                'region', 'charges_Sklearn',       'fumadores',
                       0,                 1,                 2,
                       3],
      dtype='object')

In [26]:
final.columns=['index','age','sex','bmi','children','smoker','region', 'charges_Sklearn','fumadores','x0_northeast', 'x0_northwest', 'x0_southeast', 'x0_southwest']
final.head(2)

Unnamed: 0,index,age,sex,bmi,children,smoker,region,charges_Sklearn,fumadores,x0_northeast,x0_northwest,x0_southeast,x0_southwest
0,-1.65459,-0.683871,male,-1.697491,-1.011604,no,northwest,0.045128,0,0.0,1.0,0.0,0.0
1,-1.639514,1.268419,female,-1.120191,-1.011604,no,northwest,0.203687,0,0.0,1.0,0.0,0.0


In [27]:
final.to_csv('data/08-primas_categorizadas_normalizadas.csv')