# Contexto y Datos (categoricos)

In [2]:
import pandas as pd 

datos = {
    'nombre':['Mariana','Ana','Elsa','Gustavo','Pedro','Raul','Carlos','Jose','Luis'],
    'saldo':[10000.00,8000.00,9000.00,2000.00,2100.00,12000.00,5000.00,10000.00,200.00],
    'pais':['Argentina','Bolivia','Chile','Colombia','Costa Rica','Ecuador','Mexico','Peru','Peru']
}

data = pd.DataFrame(datos)
data

Unnamed: 0,nombre,saldo,pais
0,Mariana,10000.0,Argentina
1,Ana,8000.0,Bolivia
2,Elsa,9000.0,Chile
3,Gustavo,2000.0,Colombia
4,Pedro,2100.0,Costa Rica
5,Raul,12000.0,Ecuador
6,Carlos,5000.0,Mexico
7,Jose,10000.0,Peru
8,Luis,200.0,Peru


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   nombre  9 non-null      object 
 1   saldo   9 non-null      float64
 2   pais    9 non-null      object 
dtypes: float64(1), object(2)
memory usage: 348.0+ bytes


In [5]:
data['pais'] = data['pais'].astype('category')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   nombre  9 non-null      object  
 1   saldo   9 non-null      float64 
 2   pais    9 non-null      category
dtypes: category(1), float64(1), object(1)
memory usage: 649.0+ bytes


### Que no se debe hacer

In [7]:
datos_sesgados = data.copy()
remplazos = {
    'Argentina': 1,
    'Bolivia': 2,
    'Chile': 3,
    'Colombia': 4,
    'Costa Rica': 5,
    'Ecuador':6,
    'Mexico':7,
    'Peru':8
}
datos_sesgados['pais'].replace(remplazos,inplace=True)
datos_sesgados

Unnamed: 0,nombre,saldo,pais
0,Mariana,10000.0,1
1,Ana,8000.0,2
2,Elsa,9000.0,3
3,Gustavo,2000.0,4
4,Pedro,2100.0,5
5,Raul,12000.0,6
6,Carlos,5000.0,7
7,Jose,10000.0,8
8,Luis,200.0,8


### Lo que si se debe hacer

In [12]:
from sklearn.preprocessing import OneHotEncoder

codificador = OneHotEncoder()

codificacion = codificador.fit_transform(data[['pais']]) # El odificador trabaja con un Dataframe y no con una Serie

print(codificacion)
print(type(codificacion))
print(codificacion.toarray()) # PAra volverlo una matriz


  (0, 0)	1.0
  (1, 1)	1.0
  (2, 2)	1.0
  (3, 3)	1.0
  (4, 4)	1.0
  (5, 5)	1.0
  (6, 6)	1.0
  (7, 7)	1.0
  (8, 7)	1.0
<class 'scipy.sparse._csr.csr_matrix'>
[[1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1.]]


In [13]:
nuevas_cols = pd.DataFrame(codificacion.toarray(), 
                           columns = codificador.categories_)

print(nuevas_cols)

  Argentina Bolivia Chile Colombia Costa Rica Ecuador Mexico Peru
0       1.0     0.0   0.0      0.0        0.0     0.0    0.0  0.0
1       0.0     1.0   0.0      0.0        0.0     0.0    0.0  0.0
2       0.0     0.0   1.0      0.0        0.0     0.0    0.0  0.0
3       0.0     0.0   0.0      1.0        0.0     0.0    0.0  0.0
4       0.0     0.0   0.0      0.0        1.0     0.0    0.0  0.0
5       0.0     0.0   0.0      0.0        0.0     1.0    0.0  0.0
6       0.0     0.0   0.0      0.0        0.0     0.0    1.0  0.0
7       0.0     0.0   0.0      0.0        0.0     0.0    0.0  1.0
8       0.0     0.0   0.0      0.0        0.0     0.0    0.0  1.0


In [14]:
data = pd.concat([data, nuevas_cols], axis='columns')
data

Unnamed: 0,nombre,saldo,pais,"(Argentina,)","(Bolivia,)","(Chile,)","(Colombia,)","(Costa Rica,)","(Ecuador,)","(Mexico,)","(Peru,)"
0,Mariana,10000.0,Argentina,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Ana,8000.0,Bolivia,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Elsa,9000.0,Chile,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,Gustavo,2000.0,Colombia,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,Pedro,2100.0,Costa Rica,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5,Raul,12000.0,Ecuador,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6,Carlos,5000.0,Mexico,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7,Jose,10000.0,Peru,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
8,Luis,200.0,Peru,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [16]:
# Borrar la variable categorica
data.drop('pais',axis='columns',inplace = True)
data

Unnamed: 0,nombre,saldo,"(Argentina,)","(Bolivia,)","(Chile,)","(Colombia,)","(Costa Rica,)","(Ecuador,)","(Mexico,)","(Peru,)"
0,Mariana,10000.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Ana,8000.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Elsa,9000.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,Gustavo,2000.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,Pedro,2100.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5,Raul,12000.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6,Carlos,5000.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7,Jose,10000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
8,Luis,200.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
