In [18]:
# Tratamiento de datos
# -----------------------------------------------------------------------
import pandas as pd
import seaborn as sns

# Para la codificación de las variables categóricas
# -----------------------------------------------------------------------
from sklearn.preprocessing import LabelEncoder # para realizar el Label Encoding 
from sklearn.preprocessing import OneHotEncoder  # para realizar el One-Hot Encoding

# Para evitar que salgan los warnings en jupyter
# -----------------------------------------------------------------------
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the dataset saved by the normalization/standardization exercise;
# the first CSV column is used as the row index.
ruta_entrada = '../data/06-primas_norm_est_standard.csv'
df = pd.read_csv(ruta_entrada, index_col=0)
df.head(2)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges_Sklearn
0,-1.387916,female,-0.382685,-0.885991,yes,southwest,0.836552
1,-1.459488,male,0.601634,-0.069805,no,southeast,0.032037


# Pair Programming Encoding

## Ana C y Ana G

En el pair programming de hoy usaremos el set de datos que guardasteis en el pair programming de normalización y estandarización.

Vuestro set de datos debería tener al menos una variable categórica, el objetivo del pair programming de hoy:

- Hacer una codificación de la/las variables categóricas que tengáis en vuestro set de datos.
- Recordad que lo primero que deberéis hacer es decidir si vuestras variables tienen o no orden, para que en función de esto uséis una aproximación u otra.
- Guardad el dataframe, donde deberíais tener las variables estandarizadas, normalizadas y codificadas, en un csv para usarlo en el próximo pair programming.

In [3]:
# Peek at five random rows; a fixed seed keeps the displayed sample
# identical across "Restart & Run All" executions.
df.sample(5, random_state=42)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges_Sklearn
335,-1.459488,female,-0.170562,-0.885991,no,northeast,0.329136
744,-0.886913,male,2.741313,-0.069805,no,southeast,0.095802
554,0.616097,female,-0.106841,-0.069805,no,northwest,0.41443
811,-0.02805,female,-0.432991,-0.885991,no,southwest,0.226168
1049,0.043522,female,-0.648468,0.74638,no,northwest,0.322659


Tenemos dos variables con orden. En el caso de los hábitos tabáquicos, asumimos que nuestra variable tiene orden y le asignaremos más peso a la condición de fumador.
En la variable de sexo también asumimos orden: según el ANOVA, la condición masculina obtiene un coeficiente negativo y un p-valor mayor que 0.05; nos quedamos con mayor peso para la condición masculina. Lo haremos con get_dummies y seleccionaremos la columna que nos interesa. En el pair 1 vimos que las primas se relacionaban más con el tabaquismo que con el sexo; es decir, había más hombres fumadores y, por tanto, se les asocia una prima mayor por ello, no por el sexo en sí.

In [4]:
# Ordinal codes for the smoker flag: non-smoker -> 0, smoker -> 1.
mapa = dict(no=0, yes=1)

In [5]:
# Add the numeric "fumadores" column by translating each "smoker" label through `mapa`.
df = df.assign(fumadores=df["smoker"].map(mapa))

In [6]:
# Show the first five rows to confirm the new "fumadores" column was added.
df.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges_Sklearn,fumadores
0,-1.387916,female,-0.382685,-0.885991,yes,southwest,0.836552,1
1,-1.459488,male,0.601634,-0.069805,no,southeast,0.032037,0
2,-0.743769,male,0.472516,1.562566,no,southeast,0.176597,0
3,-0.457482,male,-0.218352,-0.885991,no,northwest,0.145677,0
4,-0.529054,female,-0.744888,-0.885991,no,southeast,0.139827,0


In [7]:
# Build integer indicator columns for "sex", one per category,
# named "sex_<category>".
dummies = pd.get_dummies(
    df["sex"],
    prefix="sex",
    prefix_sep="_",
    dtype=int,
)
dummies.head(2)

Unnamed: 0,sex_female,sex_male
0,1,0
1,0,1


In [8]:
# Append the "sex" indicator columns to the working frame, column-wise.
df_dummies = pd.concat([df, dummies], axis="columns")
df_dummies.head(2)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges_Sklearn,fumadores,sex_female,sex_male
0,-1.387916,female,-0.382685,-0.885991,yes,southwest,0.836552,1,1,0
1,-1.459488,male,0.601634,-0.069805,no,southeast,0.032037,0,0,1


In [9]:
# Count how many rows carry each value of the female indicator.
conteo_mujeres = df_dummies['sex_female'].value_counts()
conteo_mujeres

1    547
0    517
Name: sex_female, dtype: int64

Codificamos la variable region, que aún no hemos quitado del dataframe a pesar de que, según el ANOVA y los gráficos, aparentemente no influye en nuestra variable respuesta. Asignamos categorías de igual peso con One-Hot Encoding.

In [10]:
# Instantiate the one-hot encoder; categories are inferred from the data at fit time.
oh = OneHotEncoder(categories="auto")

In [11]:
# Fit the encoder on the "region" column (kept as a one-column DataFrame)
# and encode it in the same step.
columna_region = df[['region']]
transformados = oh.fit_transform(columna_region)

In [12]:
# Turn the encoded matrix into a dense DataFrame with one 0/1 column per
# region category (four columns here, not a yes/no pair).
# The frame reuses df's index: the later column-wise pd.concat aligns rows by
# index label, so a default 0..n-1 index would misalign rows (and introduce
# NaNs) whenever df's index, read from the CSV, is not a plain 0..n-1 range.
oh_df = pd.DataFrame(transformados.toarray(), index=df.index)
oh_df.head(10)

Unnamed: 0,0,1,2,3
0,0.0,0.0,0.0,1.0
1,0.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0
3,0.0,1.0,0.0,0.0
4,0.0,0.0,1.0,0.0
5,0.0,0.0,1.0,0.0
6,0.0,1.0,0.0,0.0
7,1.0,0.0,0.0,0.0
8,1.0,0.0,0.0,0.0
9,0.0,0.0,0.0,1.0


In [13]:
# get_feature_names_out() returns the generated column names
# ("region_<category>"); it replaces the removed get_feature_names().
nombres_region = oh.get_feature_names_out()
oh_df.columns = nombres_region
oh_df.columns

Index(['region_northeast', 'region_northwest', 'region_southeast',
       'region_southwest'],
      dtype='object')

In [14]:
# Join the frame that already holds the "sex" dummies with the one-hot
# encoded region columns, side by side.
final = pd.concat([df_dummies, oh_df], axis="columns")
final.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges_Sklearn,fumadores,sex_female,sex_male,region_northeast,region_northwest,region_southeast,region_southwest
0,-1.387916,female,-0.382685,-0.885991,yes,southwest,0.836552,1,1,0,0.0,0.0,0.0,1.0
1,-1.459488,male,0.601634,-0.069805,no,southeast,0.032037,0,0,1,0.0,0.0,1.0,0.0
2,-0.743769,male,0.472516,1.562566,no,southeast,0.176597,0,0,1,0.0,0.0,1.0,0.0
3,-0.457482,male,-0.218352,-0.885991,no,northwest,0.145677,0,0,1,0.0,1.0,0.0,0.0
4,-0.529054,female,-0.744888,-0.885991,no,southeast,0.139827,0,1,0,0.0,0.0,1.0,0.0


In [15]:
# List the column labels of the combined frame to check the encoded columns are present.
final.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges_Sklearn',
       'fumadores', 'sex_female', 'sex_male', 'region_northeast',
       'region_northwest', 'region_southeast', 'region_southwest'],
      dtype='object')

In [16]:
# Give the encoded columns shorter names. An explicit old -> new mapping is
# used instead of assigning a positional list of all column names: it cannot
# mislabel columns if their order changes, and re-running the cell (even after
# a column has been dropped) is harmless because missing keys are ignored.
nombres_nuevos = {
    'sex_female': 'mujeres',
    'sex_male': 'hombres',
    'region_northeast': 'northeast',
    'region_northwest': 'northwest',
    'region_southeast': 'southeast',
    'region_southwest': 'southwest',
}
final = final.rename(columns=nombres_nuevos)
final.head(2)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges_Sklearn,fumadores,mujeres,hombres,northeast,northwest,southeast,southwest
0,-1.387916,female,-0.382685,-0.885991,yes,southwest,0.836552,1,1,0,0.0,0.0,0.0,1.0
1,-1.459488,male,0.601634,-0.069805,no,southeast,0.032037,0,0,1,0.0,0.0,1.0,0.0


In [22]:
# Keep only the "hombres" indicator: with two sex categories, "mujeres" is its
# complement and carries no extra information.
# Non-inplace drop with errors='ignore' so re-running the cell does not raise
# a KeyError once the column is already gone.
final = final.drop(columns='mujeres', errors='ignore')

In [24]:
# Persist the standardized, normalized and encoded frame for the next exercise.
ruta_salida = '../data/08-primas_categorizadas_normalizadas.csv'
final.to_csv(ruta_salida)