In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder # para realizar el Label Encoding 
from sklearn.preprocessing import OneHotEncoder  # para realizar el One-Hot Encoding

import warnings
# Notebook-wide settings: silence every warning for the session and let pandas
# display all DataFrame columns instead of truncating wide frames.
warnings.simplefilter("ignore")
pd.set_option("display.max_columns", None)

En el pair programming de hoy usaremos el set de datos que guardasteis en el pair programming de normalización y estandarización.

Vuestro set de datos debería tener al menos una variable categórica, el objetivo del pair programming de hoy:

Hacer una codificación de la/las variables categóricas que tengáis en vuestro set de datos.

Recordad que lo primero que deberéis hacer es decidir si vuestras variables tienen o no orden, para que en función de esto uséis una aproximación u otra.



In [2]:
# Load the possum dataset saved in the previous exercise. The first CSV column
# is read as the index and then moved back into a regular column, which leaves
# the frame with a default integer index.
df = pd.read_csv("Datos/possum_esta.csv", index_col=0)
df = df.reset_index()
df.head(2)

Unnamed: 0,case,site,Pop,sex,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly,age_norm,age_log,age_raiz,age_Sklearn,belly_esta
0,1,1,Vic,m,8.0,-0.028932,60.4,89.0,36.0,74.5,54.5,15.2,0.736833,0.150424,0.522837,2.079442,2.828427,0.625,1.235889
1,2,1,Vic,f,6.0,0.392875,57.6,91.5,36.5,72.5,51.2,16.0,1.473667,0.514241,0.272837,1.791759,2.44949,0.625,0.149699


In [3]:
# Check the row index produced by reset_index().
df.index

RangeIndex(start=0, stop=104, step=1)

In [4]:
# Overview of column dtypes and non-null counts, used to spot the categorical columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 19 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   case         104 non-null    int64  
 1   site         104 non-null    int64  
 2   Pop          104 non-null    object 
 3   sex          104 non-null    object 
 4   age          104 non-null    float64
 5   hdlngth      103 non-null    float64
 6   skullw       104 non-null    float64
 7   totlngth     104 non-null    float64
 8   taill        104 non-null    float64
 9   footlgth     103 non-null    float64
 10  earconch     104 non-null    float64
 11  eye          104 non-null    float64
 12  chest        103 non-null    float64
 13  belly        103 non-null    float64
 14  age_norm     104 non-null    float64
 15  age_log      104 non-null    float64
 16  age_raiz     104 non-null    float64
 17  age_Sklearn  103 non-null    float64
 18  belly_esta   104 non-null    float64
dtypes: float

In [5]:
# Distinct values of the population column.
df["Pop"].unique()

array(['Vic', 'other'], dtype=object)

In [6]:
# Distinct values of the sex column.
df["sex"].unique()

array(['m', 'f'], dtype=object)

In [7]:
# Number of rows per population value.
df["Pop"].value_counts()

other    58
Vic      46
Name: Pop, dtype: int64

In [8]:
# Treat the trapping site as a categorical variable rather than an integer.
# NOTE(review): a plain "category" dtype is unordered; the narrative below
# describes "site" as ordinal -- confirm whether an ordered categorical is wanted.
df = df.astype({"site": "category"})

Nuestras variables cualitativas:

Pop (population): o "Vic" (Victoria) o "other" (New South Wales or Queensland)

Sex (gender): o "m" (male) o "f" (female)

Site (lugar de atrapamiento): 1 al 7


Consideramos "sex" como una variable cualitativa binaria, "Pop" como una variable cualitativa nominal y "site" como una variable cualitativa ordinal, porque creemos que las condiciones del lugar de atrapamiento pueden tener que ver con el desarrollo de la zarigüeya.

Para nuestras variables cualitativas que NO tienen orden podemos usar One-Hot Encoding o get_dummies.

In [9]:
# Build a frame with the string-typed (object) columns only. "site" is not
# included here because it was cast to the category dtype earlier.
df_possum_cat = df.select_dtypes(include=["object"])
df_possum_cat.head(10)

Unnamed: 0,Pop,sex
0,Vic,m
1,Vic,f
2,Vic,f
3,Vic,f
4,Vic,f
5,Vic,f
6,Vic,m
7,Vic,f
8,Vic,f
9,Vic,f


One-Hot Encoding

In [10]:
# Vamos a usar este método con la variable "Pop"

In [11]:
# Instantiate the one-hot encoder with its default settings.
oh = OneHotEncoder()

In [12]:
# Fit the encoder on "Pop" and encode it in one step. A one-column DataFrame
# (not a Series) is selected so the encoder receives two-dimensional input.
transformados = oh.fit_transform(df_possum_cat.loc[:, ["Pop"]])

In [13]:
# Convert the encoded output into a DataFrame with one column per "Pop"
# category ("Vic" and "other"); .toarray() turns the encoder output into a
# dense array first.
oh_df = pd.DataFrame(transformados.toarray())
oh_df.head(5)

Unnamed: 0,0,1
0,1.0,0.0
1,1.0,0.0
2,1.0,0.0
3,1.0,0.0
4,1.0,0.0


In [14]:
# Name the encoded columns "x0_<category>" using the categories learned by the
# encoder. These are the names the legacy oh.get_feature_names() call returned;
# that method was removed in scikit-learn 1.2, so the names are built from
# oh.categories_ instead, which keeps the same column names on any version.
oh_df.columns = [f"x0_{category}" for category in oh.categories_[0]]
oh_df.columns

Index(['x0_Vic', 'x0_other'], dtype='object')

In [15]:
# Place the one-hot columns next to the original frame; rows are matched on
# the index.
get_hot = pd.concat(objs=[df, oh_df], axis="columns")
get_hot.head()

Unnamed: 0,case,site,Pop,sex,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly,age_norm,age_log,age_raiz,age_Sklearn,belly_esta,x0_Vic,x0_other
0,1,1,Vic,m,8.0,-0.028932,60.4,89.0,36.0,74.5,54.5,15.2,0.736833,0.150424,0.522837,2.079442,2.828427,0.625,1.235889,1.0,0.0
1,2,1,Vic,f,6.0,0.392875,57.6,91.5,36.5,72.5,51.2,16.0,1.473667,0.514241,0.272837,1.791759,2.44949,0.625,0.149699,1.0,0.0
2,3,1,Vic,f,6.0,0.167912,60.0,95.5,39.0,75.4,51.9,15.5,0.491222,0.514241,0.272837,1.791759,2.44949,0.625,0.511762,1.0,0.0
3,4,1,Vic,f,6.0,-0.310136,57.1,92.0,38.0,76.1,52.2,15.2,0.736833,0.150424,0.272837,1.791759,2.44949,0.125,0.511762,1.0,0.0
4,5,1,Vic,f,2.0,0.139791,56.3,85.5,36.0,71.0,53.2,15.1,1.473667,-0.213392,-0.227163,0.693147,1.414214,0.0,0.149699,1.0,0.0


get_dummies

In [16]:
# Lo usaremos sobre la variable sex

In [17]:
# One indicator column per value of "sex", named sex_<value> and stored as
# integers.
dummies = pd.get_dummies(
    data=df_possum_cat["sex"],
    prefix="sex",
    prefix_sep="_",
    dtype=int,
)
dummies.head(2)

Unnamed: 0,sex_f,sex_m
0,0,1
1,1,0


In [18]:
# Add the "sex" indicator columns to the frame that already holds the one-hot
# "Pop" columns.
df_final = pd.concat(objs=[get_hot, dummies], axis="columns")
df_final.head(2)

Unnamed: 0,case,site,Pop,sex,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly,age_norm,age_log,age_raiz,age_Sklearn,belly_esta,x0_Vic,x0_other,sex_f,sex_m
0,1,1,Vic,m,8.0,-0.028932,60.4,89.0,36.0,74.5,54.5,15.2,0.736833,0.150424,0.522837,2.079442,2.828427,0.625,1.235889,1.0,0.0,0,1
1,2,1,Vic,f,6.0,0.392875,57.6,91.5,36.5,72.5,51.2,16.0,1.473667,0.514241,0.272837,1.791759,2.44949,0.625,0.149699,1.0,0.0,1,0


Guardad el dataframe, donde deberíais tener las variables estandarizadas, normalizadas y codificadas, en un csv para usarlo en el próximo pair programming.

In [19]:
# Persist the fully processed frame so the next exercise can load it.
df_final.to_csv("Datos/possum_completo.csv")