# Pair Programming Encoding

In [61]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder # para realizar el Label Encoding 
from sklearn.preprocessing import OneHotEncoder  # para realizar el One-Hot Encoding

import warnings
# Silence every warning for the rest of the session and let pandas
# display all columns of wide frames without truncation.
warnings.simplefilter('ignore')
pd.set_option('display.max_columns', None)


En el pair programming de hoy usaremos el set de datos que guardasteis en el pair programming de normalización y estandarización.


In [62]:
# Load the dataset produced in the normalisation/standardisation session;
# the first column of the CSV is used as the row index.
df = pd.read_csv(
    '../Datos/vinos_norm_estandar.csv',
    index_col=0,
)
df.head(n=2)

Unnamed: 0,id,year_birth,education,marital_status,income,kidhome,teenhome,dt_customer,recency,mntwines,mntfruits,mntmeatproducts,mntfishproducts,mntsweetproducts,mntgoldprods,numdealspurchases,numwebpurchases,numcatalogpurchases,numstorepurchases,numwebvisitsmonth,mnt_total_sin_vino,mntwines_sin_cero,mnt_wines_box
0,5524,1957,Graduation,Single,0.235696,0,0,04-09-2012,0.307039,635,1.551577,1.679702,2.462147,1.4765,0.843207,0.349414,1.409304,2.51089,-0.550785,0.693904,2.011116,635.0,14.02674
1,2174,1954,Graduation,Single,-0.235454,1,1,08-03-2014,-0.383664,11,-0.636301,-0.713225,-0.650449,-0.631503,-0.729006,-0.168236,-1.110409,-0.56872,-1.166125,-0.130463,-0.845274,11.0,3.141178


Vuestro set de datos debería tener al menos una variable categórica. El objetivo del pair programming de hoy es:
Hacer una codificación de la/las variables categóricas que tengáis en vuestro set de datos.


Hacer una codificación de la/las variables categóricas que tengáis en vuestro set de datos.


Recordad que lo primero que deberéis hacer es decidir si vuestras variables tienen o no orden, para que en función de esto uséis una aproximación u otra.


>Probaremos realizando el encoding nominal para las variables 'education' y 'marital_status'

In [63]:
# Keep only the object-dtype (text) columns; dt_customer is discarded
# because it is not needed for this analysis.
df_cat = df.select_dtypes(include=object).drop(columns=['dt_customer'])
df_cat

Unnamed: 0,education,marital_status
0,Graduation,Single
1,Graduation,Single
2,Graduation,Together
3,Graduation,Together
4,PhD,Married
...,...,...
2235,Graduation,Married
2236,PhD,Together
2237,Graduation,Divorced
2238,Master,Together


In [64]:
# List the distinct categories present in 'education'.
df_cat['education'].unique()

array(['Graduation', 'PhD', 'Master', 'Basic', '2n Cycle'], dtype=object)

In [65]:

# One-hot encode 'education': one integer 0/1 column per category,
# named education_<category>.
dummies_education = pd.get_dummies(
    df_cat["education"],
    prefix="education",
    prefix_sep="_",
    dtype=int,
)
dummies_education.head(5)

Unnamed: 0,education_2n Cycle,education_Basic,education_Graduation,education_Master,education_PhD
0,0,0,1,0,0
1,0,0,1,0,0
2,0,0,1,0,0
3,0,0,1,0,0
4,0,0,0,0,1


In [66]:
# Place the education dummy columns next to the categorical columns
# (column-wise concatenation, aligned on the row index).
df_dummies = pd.concat(
    [df_cat, dummies_education],
    axis="columns",
)
df_dummies.head(n=2)

Unnamed: 0,education,marital_status,education_2n Cycle,education_Basic,education_Graduation,education_Master,education_PhD
0,Graduation,Single,0,0,1,0,0
1,Graduation,Single,0,0,1,0,0


In [67]:
# Remove the original text column now that its dummy columns are in place.
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell idempotent: running it a second time no longer raises
# KeyError because 'education' has already been dropped.
df_dummies = df_dummies.drop(columns=['education'], errors='ignore')
df_dummies

Unnamed: 0,marital_status,education_2n Cycle,education_Basic,education_Graduation,education_Master,education_PhD
0,Single,0,0,1,0,0
1,Single,0,0,1,0,0
2,Together,0,0,1,0,0
3,Together,0,0,1,0,0
4,Married,0,0,0,0,1
...,...,...,...,...,...,...
2235,Married,0,0,1,0,0
2236,Together,0,0,0,0,1
2237,Divorced,0,0,1,0,0
2238,Together,0,0,0,1,0


> Hacemos el mismo proceso para 'marital_status'

In [68]:

# One-hot encode 'marital_status': one integer 0/1 column per category,
# named marital_<category>.
dummies_marital = pd.get_dummies(
    df_cat["marital_status"],
    prefix="marital",
    prefix_sep="_",
    dtype=int,
)
dummies_marital.head(5)

Unnamed: 0,marital_Alone,marital_Divorced,marital_Married,marital_Single,marital_Together,marital_Unknown,marital_Widow
0,0,0,0,1,0,0,0
1,0,0,0,1,0,0,0
2,0,0,0,0,1,0,0
3,0,0,0,0,1,0,0
4,0,0,1,0,0,0,0


In [69]:
# Append the marital-status dummy columns to the frame that already
# holds the education dummies (column-wise, aligned on the row index).
df_dummies = pd.concat(
    [df_dummies, dummies_marital],
    axis="columns",
)
df_dummies.head(n=2)

Unnamed: 0,marital_status,education_2n Cycle,education_Basic,education_Graduation,education_Master,education_PhD,marital_Alone,marital_Divorced,marital_Married,marital_Single,marital_Together,marital_Unknown,marital_Widow
0,Single,0,0,1,0,0,0,0,0,1,0,0,0
1,Single,0,0,1,0,0,0,0,0,1,0,0,0


In [70]:
# Remove the original text column now that its dummy columns are in place.
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell idempotent: running it a second time no longer raises
# KeyError because 'marital_status' has already been dropped.
df_dummies = df_dummies.drop(columns=['marital_status'], errors='ignore')
df_dummies.head(2)

Unnamed: 0,education_2n Cycle,education_Basic,education_Graduation,education_Master,education_PhD,marital_Alone,marital_Divorced,marital_Married,marital_Single,marital_Together,marital_Unknown,marital_Widow
0,0,0,1,0,0,0,0,0,1,0,0,0
1,0,0,1,0,0,0,0,0,1,0,0,0


Ahora procederemos a añadir nuestras variables codificadas al dataframe original para realizar una predicción.

In [71]:
# Nominal-encoding result: the loaded frame with every dummy column
# appended on the right (the original text columns are kept as well).
df_codificado_nom = pd.concat(
    [df, df_dummies],
    axis="columns",
)
df_codificado_nom.head(n=2)

Unnamed: 0,id,year_birth,education,marital_status,income,kidhome,teenhome,dt_customer,recency,mntwines,mntfruits,mntmeatproducts,mntfishproducts,mntsweetproducts,mntgoldprods,numdealspurchases,numwebpurchases,numcatalogpurchases,numstorepurchases,numwebvisitsmonth,mnt_total_sin_vino,mntwines_sin_cero,mnt_wines_box,education_2n Cycle,education_Basic,education_Graduation,education_Master,education_PhD,marital_Alone,marital_Divorced,marital_Married,marital_Single,marital_Together,marital_Unknown,marital_Widow
0,5524,1957,Graduation,Single,0.235696,0,0,04-09-2012,0.307039,635,1.551577,1.679702,2.462147,1.4765,0.843207,0.349414,1.409304,2.51089,-0.550785,0.693904,2.011116,635.0,14.02674,0,0,1,0,0,0,0,0,1,0,0,0
1,2174,1954,Graduation,Single,-0.235454,1,1,08-03-2014,-0.383664,11,-0.636301,-0.713225,-0.650449,-0.631503,-0.729006,-0.168236,-1.110409,-0.56872,-1.166125,-0.130463,-0.845274,11.0,3.141178,0,0,1,0,0,0,0,0,1,0,0,0


> Ahora vamos a codificar las variables categóricas con métodos ordinales

In [72]:
# Reminder of what the categorical-only frame looks like.
df_cat.head(n=2)

Unnamed: 0,education,marital_status
0,Graduation,Single
1,Graduation,Single


> Al no tener claro el peso de nuestras categorías, utilizaremos el método Label-Encoding

In [73]:
# Start with 'education': LabelEncoder replaces each category with an
# integer code. In the outputs shown in this notebook the codes follow
# the sorted order of the labels ('2n Cycle' -> 0 ... 'PhD' -> 4).
le = LabelEncoder()
codigos_education = le.fit_transform(df['education'])
df['education_le'] = codigos_education

In [76]:
# Check that the encoding worked: list the distinct integer codes
# now stored in 'education_le'.
df.loc[:, 'education_le'].unique()

array([2, 4, 3, 1, 0])

In [74]:
# Inspect a few random rows to compare 'education' with 'education_le'.
# A fixed random_state makes the displayed sample reproducible across
# kernel restarts instead of changing on every run.
df.sample(6, random_state=42)

Unnamed: 0,id,year_birth,education,marital_status,income,kidhome,teenhome,dt_customer,recency,mntwines,mntfruits,mntmeatproducts,mntfishproducts,mntsweetproducts,mntgoldprods,numdealspurchases,numwebpurchases,numcatalogpurchases,numstorepurchases,numwebvisitsmonth,mnt_total_sin_vino,mntwines_sin_cero,mnt_wines_box,education_le
1057,8104,1951,PhD,Married,0.042905,0,0,09-08-2013,-0.590875,241,-0.661449,-0.686637,-0.687068,-0.655733,-0.364713,-0.685887,0.329427,-0.56872,-0.243114,0.693904,-0.783179,241.0,10.504993,4
595,4992,1975,Master,Together,-1.38936,1,0,19-09-2013,-1.281579,6,-0.560857,-0.668912,-0.63214,-0.413434,-0.268846,0.349414,-0.390491,-0.56872,-1.166125,0.693904,-0.706299,6.0,2.187707,3
2159,10157,1965,Graduation,Single,0.297536,0,1,25-01-2014,0.997743,89,-0.284229,-0.345424,-0.064549,0.361924,-0.748179,-0.685887,-0.030532,-0.56872,0.064556,-0.542647,-0.345554,89.0,7.575721,2
1861,10241,1975,2n Cycle,Divorced,-1.629489,0,0,15-12-2013,-1.143438,0,-0.661449,-0.739813,-0.577212,-0.607273,-0.729006,-0.685887,-1.110409,-0.910898,-1.166125,0.28172,-0.851188,1e-08,-4.544805,0
1513,9264,1986,Graduation,Married,1.09023,0,0,27-04-2014,-1.661466,423,0.394768,2.388718,0.649517,4.117562,2.933099,-0.685887,-0.030532,1.826532,0.987567,-1.367014,2.700079,423.0,12.459876,2
1785,9648,1967,Graduation,Single,-0.213083,1,1,03-11-2012,-1.005297,153,-0.560857,-0.491658,-0.687068,-0.437664,-0.249673,0.867064,0.329427,-0.56872,-0.550785,1.106087,-0.596892,153.0,9.090604,2


> Continuamos con 'marital_status'

In [77]:
# Same procedure for 'marital_status': a fresh LabelEncoder is fitted on
# the column and its integer codes are stored in 'marital_le'.
le = LabelEncoder()
codigos_marital = le.fit_transform(df['marital_status'])
df['marital_le'] = codigos_marital

In [78]:
# List the distinct integer codes now stored in 'marital_le'.
df.loc[:, 'marital_le'].unique()

array([3, 4, 2, 1, 6, 0, 5])

In [79]:
# Inspect a few random rows to compare the text columns with their
# label-encoded counterparts. A fixed random_state makes the displayed
# sample reproducible across kernel restarts.
df.sample(6, random_state=42)

Unnamed: 0,id,year_birth,education,marital_status,income,kidhome,teenhome,dt_customer,recency,mntwines,mntfruits,mntmeatproducts,mntfishproducts,mntsweetproducts,mntgoldprods,numdealspurchases,numwebpurchases,numcatalogpurchases,numstorepurchases,numwebvisitsmonth,mnt_total_sin_vino,mntwines_sin_cero,mnt_wines_box,education_le,marital_le
296,2874,1988,2n Cycle,Divorced,-0.673128,1,0,07-03-2013,-1.005297,6,-0.560857,-0.708794,-0.61383,-0.583043,-0.690659,-0.685887,-1.110409,-0.910898,-0.858455,0.693904,-0.815705,6.0,2.187707,0,1
1831,9860,1959,Graduation,Together,-0.2927,0,1,15-03-2013,-1.316114,159,-0.661449,-0.642324,-0.650449,-0.631503,-0.249673,0.349414,-0.030532,-0.56872,-0.550785,0.693904,-0.726997,159.0,9.205045,2,4
325,7214,1957,Graduation,Married,0.397446,0,0,05-07-2013,-0.003777,792,-0.661449,0.478807,0.136855,0.434613,1.571793,-0.685887,-0.030532,1.826532,-0.243114,-0.954831,0.559265,792.0,14.93835,2,2
1899,8780,1950,PhD,Together,0.281796,0,1,13-06-2013,0.755996,378,-0.309377,-0.438482,-0.33919,-0.316514,-0.575619,-0.168236,-0.390491,0.799996,0.679896,-0.954831,-0.511142,378.0,12.04981,4,4
2176,1544,1967,Master,Divorced,1.164174,0,0,22-12-2012,0.065293,741,1.048616,2.313385,3.414235,0.991902,1.130807,-0.685887,-0.030532,-0.226541,1.295237,-1.367014,2.513793,741.0,14.659121,3,1
1455,4399,1969,Graduation,Together,0.65743,0,0,25-06-2014,-1.592395,458,1.375541,0.837746,1.253727,0.555763,-0.077113,-0.685887,-0.030532,0.457817,0.372226,-1.367014,0.979149,458.0,12.755767,2,4


Guardad el dataframe, donde deberíais tener las variables estandarizadas, normalizadas y codificadas, en un csv para usarlo en el próximo pair programming.

In [None]:
# Persist both results: the frame with the label-encoded columns and the
# frame with the one-hot (dummy) columns, each to its own CSV file.
for tabla, ruta in (
    (df, '../Datos/vinos_encoding_ordinal.csv'),
    (df_codificado_nom, '../Datos/vinos_encoding_nominal.csv'),
):
    tabla.to_csv(ruta)