In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder # para realizar el Label Encoding 
from sklearn.preprocessing import OneHotEncoder  # para realizar el One-Hot Encoding

import warnings
# Session-wide settings: silence every warning and let pandas show all the
# columns of wide frames instead of truncating them.
warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", None)

En el pair programming de hoy usaremos el set de datos que guardasteis en el pair programming de normalización y estandarización.

Vuestro set de datos debería tener al menos una variable categórica, el objetivo del pair programming de hoy:

Hacer una codificación de la/las variables categóricas que tengáis en vuestro set de datos.

Recordad que lo primero que deberéis hacer es decidir si vuestras variables tienen o no orden, para que en función de esto uséis una aproximación u otra.



In [2]:
# Load the possum dataset saved by the previous exercise, using the first CSV
# column as the row index, and preview the first two rows.
df = pd.read_csv("Datos/possum_esta.csv", index_col = 0)
df.head(2)

Unnamed: 0,case,site,Pop,sex,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly
0,1,1,Vic,m,2.218955,94.1,1.134883,0.445603,-0.517732,1.231614,1.557431,0.147177,0.491222,1.241874
1,2,1,Vic,f,1.157937,92.5,0.231197,1.028384,-0.261331,0.810757,0.750501,0.912499,0.736833,0.150424


In [3]:
# Summarise dtypes and non-null counts to locate the categorical columns
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 104 entries, 0 to 103
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   case      104 non-null    int64  
 1   site      104 non-null    int64  
 2   Pop       104 non-null    object 
 3   sex       104 non-null    object 
 4   age       104 non-null    float64
 5   hdlngth   104 non-null    float64
 6   skullw    104 non-null    float64
 7   totlngth  104 non-null    float64
 8   taill     104 non-null    float64
 9   footlgth  104 non-null    float64
 10  earconch  104 non-null    float64
 11  eye       104 non-null    float64
 12  chest     104 non-null    float64
 13  belly     104 non-null    float64
dtypes: float64(10), int64(2), object(2)
memory usage: 12.2+ KB


In [4]:
# Distinct categories present in "Pop"
df["Pop"].unique()

array(['Vic', 'other'], dtype=object)

In [5]:
# Distinct categories present in "sex"
df["sex"].unique()

array(['m', 'f'], dtype=object)

In [6]:
# Distinct values present in "site"
df["site"].unique()

array([1, 2, 3, 4, 5, 6, 7])

In [7]:
# "site" is stored as integers but is a label, so cast it to the category dtype
df["site"] = df["site"].astype("category")

Nuestras variables cualitativas:

Pop (population): o "Vic" (Victoria) o "other" (New South Wales or Queensland)

Sex (gender): o "m" (male) o "f" (female)

Site (lugar de atrapamiento): 1 al 7


Consideramos "sex" como una variable cualitativa binaria, "Pop" como una variable cualitativa nominal, y consideramos "site" como variable cualitativa "ordinal" porque creemos que las condiciones del lugar de atrapamiento pueden tener que ver en el desarrollo de la zarigüeya.

Para nuestras variables cualitativas que NO tienen orden podemos usar One-Hot Encoding o get_dummies

In [8]:
# Keep only the object-dtype (string) columns in a separate frame. "site" is
# not selected here because it was cast to the category dtype above.
df_possum_cat = df.select_dtypes(include=["object"])
df_possum_cat.head(10)

Unnamed: 0,Pop,sex
0,Vic,m
1,Vic,f
2,Vic,f
3,Vic,f
4,Vic,f
5,Vic,f
6,Vic,m
7,Vic,f
8,Vic,f
9,Vic,f


One-Hot Encoding

In [9]:
# Vamos a usar este método con la variable "Pop"

In [10]:
# Instantiate scikit-learn's OneHotEncoder with its default settings
oh = OneHotEncoder()

In [11]:
# Fit the encoder on "Pop" and encode it in the same call; the column is
# selected with a list so that a one-column DataFrame is passed.
transformados = oh.fit_transform(df_possum_cat[["Pop"]])

In [12]:
# Convert the encoded matrix to a dense array and wrap it in a DataFrame with
# one indicator column per "Pop" category ("Vic" and "other").
# The source frame's index is reused so the rows stay aligned with df when both
# are concatenated column-wise later, even if that index is not a 0..n-1 range.
oh_df = pd.DataFrame(transformados.toarray(), index=df_possum_cat.index)
oh_df.head(5)

Unnamed: 0,0,1
0,1.0,0.0
1,1.0,0.0
2,1.0,0.0
3,1.0,0.0
4,1.0,0.0


In [13]:
# Label the indicator columns as "x0_<category>".
# OneHotEncoder.get_feature_names() was removed in scikit-learn 1.2, so the
# labels are built from the fitted categories instead: this runs on any version
# and keeps the column names this notebook has always produced.
# (get_feature_names_out() would return "Pop_<category>", which would rename
# the columns written to the output CSV.)
oh_df.columns = [f"x0_{categoria}" for categoria in oh.categories_[0]]
oh_df.columns

Index(['x0_Vic', 'x0_other'], dtype='object')

In [14]:
# Append the "Pop" indicator columns to the right of the original frame
get_hot = pd.concat([df, oh_df], axis="columns")
get_hot.head()

Unnamed: 0,case,site,Pop,sex,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly,x0_Vic,x0_other
0,1,1,Vic,m,2.218955,94.1,1.134883,0.445603,-0.517732,1.231614,1.557431,0.147177,0.491222,1.241874,1.0,0.0
1,2,1,Vic,f,1.157937,92.5,0.231197,1.028384,-0.261331,0.810757,0.750501,0.912499,0.736833,0.150424,1.0,0.0
2,3,1,Vic,f,1.157937,94.0,1.005785,1.960834,1.020671,1.421,0.921668,0.434173,1.473667,0.514241,1.0,0.0
3,4,1,Vic,f,1.157937,93.2,0.069825,1.144941,0.50787,1.5683,0.995025,0.147177,0.491222,0.514241,1.0,0.0
4,5,1,Vic,f,-0.964098,91.5,-0.188371,-0.37029,-0.517732,0.495114,1.239549,0.051512,0.736833,0.150424,1.0,0.0


get_dummies

In [15]:
# Lo usaremos sobre la variable sex

In [16]:
# Encode "sex" with pandas: one integer indicator column per category,
# named with the "sex_" prefix.
dummies = pd.get_dummies(df_possum_cat["sex"], prefix="sex", prefix_sep="_", dtype=int)
dummies.head(2)

Unnamed: 0,sex_f,sex_m
0,0,1
1,1,0


In [17]:
# Append the "sex" indicator columns to the frame that already holds the "Pop" ones
df_final = pd.concat([get_hot, dummies], axis="columns")
df_final.head(2)

Unnamed: 0,case,site,Pop,sex,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly,x0_Vic,x0_other,sex_f,sex_m
0,1,1,Vic,m,2.218955,94.1,1.134883,0.445603,-0.517732,1.231614,1.557431,0.147177,0.491222,1.241874,1.0,0.0,0,1
1,2,1,Vic,f,1.157937,92.5,0.231197,1.028384,-0.261331,0.810757,0.750501,0.912499,0.736833,0.150424,1.0,0.0,1,0


In [18]:
# Lo usaremos sobre la variable site

In [19]:
# Encode "site" the same way: one integer indicator column per site value,
# named with the "site_" prefix.
dummies2 = pd.get_dummies(df["site"], prefix="site", prefix_sep="_", dtype=int)
dummies2.head(2)

Unnamed: 0,site_1,site_2,site_3,site_4,site_5,site_6,site_7
0,1,0,0,0,0,0,0
1,1,0,0,0,0,0,0


In [20]:
# Rebuild the final frame from scratch with all three sets of indicator
# columns ("Pop", "sex" and "site") placed after the original columns.
df_final = pd.concat([get_hot, dummies, dummies2], axis="columns")
df_final.head(2)

Unnamed: 0,case,site,Pop,sex,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly,x0_Vic,x0_other,sex_f,sex_m,site_1,site_2,site_3,site_4,site_5,site_6,site_7
0,1,1,Vic,m,2.218955,94.1,1.134883,0.445603,-0.517732,1.231614,1.557431,0.147177,0.491222,1.241874,1.0,0.0,0,1,1,0,0,0,0,0,0
1,2,1,Vic,f,1.157937,92.5,0.231197,1.028384,-0.261331,0.810757,0.750501,0.912499,0.736833,0.150424,1.0,0.0,1,0,1,0,0,0,0,0,0


In [21]:
# Remove "case" and the three raw categorical columns, which are now
# represented by their indicator columns.
df_final = df_final.drop(columns=["case", "site", "Pop", "sex"])

Guardad el dataframe, donde deberíais tener las variables estandarizadas, normalizadas y codificadas, en un csv para usarlo en el próximo pair programming

In [22]:
# Save the encoded frame for the next exercise (the index is written as the
# first column, which is pandas' default).
df_final.to_csv("Datos/possum_completo.csv")