## Transforming categorical (nominal and ordinal) data to numerical values using OneHotEncoder and OrdinalEncoder

In [26]:
# First, we learn how to encode nominal data (categories with no inherent order)

In [1]:
import pandas as pd

In [2]:
# Toy dataset: twelve sales records, each with a city (nominal) and a store
# size (ordinal) label.
d = {
    'sales': [
        100000, 222000, 1000000, 522000, 111111, 222222,
        1111111, 20000, 75000, 90000, 1000000, 10000,
    ],
    'city': [
        'Tampa', 'Tampa', 'Orlando', 'Jacksonville', 'Miami', 'Jacksonville',
        'Miami', 'Miami', 'Orlando', 'Orlando', 'Orlando', 'Orlando',
    ],
    'size': [
        'Small', 'Medium', 'Large', 'Large', 'Small', 'Medium',
        'Large', 'Small', 'Medium', 'Medium', 'Medium', 'Small',
    ],
}

In [3]:
# Build the working frame: each dict key becomes a column.
data = pd.DataFrame(data=d)

In [4]:
# Preview the first five rows of the raw frame.
data.head()

Unnamed: 0,sales,city,size
0,100000,Tampa,Small
1,222000,Tampa,Medium
2,1000000,Orlando,Large
3,522000,Jacksonville,Large
4,111111,Miami,Small


In [5]:
# Show the first ten rows for a broader look at the city and size labels.
data.head(10)

Unnamed: 0,sales,city,size
0,100000,Tampa,Small
1,222000,Tampa,Medium
2,1000000,Orlando,Large
3,522000,Jacksonville,Large
4,111111,Miami,Small
5,222222,Jacksonville,Medium
6,1111111,Miami,Large
7,20000,Miami,Small
8,75000,Orlando,Medium
9,90000,Orlando,Medium


In [6]:
# List the distinct city labels that will be one-hot encoded.
data['city'].unique()

array(['Tampa', 'Orlando', 'Jacksonville', 'Miami'], dtype=object)

In [7]:
from sklearn.preprocessing import OneHotEncoder

In [11]:
# Configure the encoder: dense (non-sparse) output, categories unseen during
# fit are ignored at transform time, and results come back as a pandas
# DataFrame instead of a NumPy array.
ohe = (
    OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    .set_output(transform='pandas')
)

In [12]:
# Fit on the city column and encode it in one step. The column is selected as
# a one-column DataFrame (list selector) because the encoder expects 2-D input.
ohetransform = ohe.fit_transform(data.loc[:, ['city']])

In [13]:
# Display the encoded frame: one indicator column per city.
ohetransform

Unnamed: 0,city_Jacksonville,city_Miami,city_Orlando,city_Tampa
0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,1.0
2,0.0,0.0,1.0,0.0
3,1.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0
5,1.0,0.0,0.0,0.0
6,0.0,1.0,0.0,0.0
7,0.0,1.0,0.0,0.0
8,0.0,0.0,1.0,0.0
9,0.0,0.0,1.0,0.0


In [14]:
# `data` is unchanged so far: the encoder returned a separate frame.
data.head()

Unnamed: 0,sales,city,size
0,100000,Tampa,Small
1,222000,Tampa,Medium
2,1000000,Orlando,Large
3,522000,Jacksonville,Large
4,111111,Miami,Small


In [15]:
# Swap the raw 'city' column for its one-hot indicator columns.
# The guard keeps this cell idempotent: once 'city' has been replaced, running
# the cell again leaves `data` untouched instead of raising KeyError in drop().
if 'city' in data.columns:
    data = pd.concat([data.drop(columns=['city']), ohetransform], axis=1)

In [16]:
# Confirm 'city' has been replaced by the city_* indicator columns.
data.head()

Unnamed: 0,sales,size,city_Jacksonville,city_Miami,city_Orlando,city_Tampa
0,100000,Small,0.0,0.0,0.0,1.0
1,222000,Medium,0.0,0.0,0.0,1.0
2,1000000,Large,0.0,0.0,1.0,0.0
3,522000,Large,1.0,0.0,0.0,0.0
4,111111,Small,0.0,1.0,0.0,0.0


In [27]:
# Next, we encode ordinal data, where the categories have a meaningful order

In [17]:
# List the distinct size labels before deciding on their order.
data['size'].unique()

array(['Small', 'Medium', 'Large'], dtype=object)

In [18]:
# Size labels listed from smallest to largest; the position in this list is
# the rank the ordinal encoder will assign.
sizes = [
    'Small',
    'Medium',
    'Large',
]

In [19]:
from sklearn.preprocessing import OrdinalEncoder

In [20]:
# One category list per encoded column; only 'size' is encoded, so the outer
# list holds a single entry with the explicit ordering.
enc = OrdinalEncoder(categories=[sizes])

In [22]:
# Preview the encoded sizes; this call fits `enc` but does not modify `data`.
enc.fit_transform(data[['size']])

array([[0.],
       [1.],
       [2.],
       [2.],
       [0.],
       [1.],
       [2.],
       [0.],
       [1.],
       [1.],
       [1.],
       [0.]])

In [23]:
# Encode the size labels and overwrite the original text column with the codes.
# NOTE(review): this cell looks non-re-runnable — after the first run 'size'
# holds floats that are not in `sizes`; confirm before re-executing.
size_codes = enc.fit_transform(data[['size']])
data['size'] = size_codes

In [24]:
# Confirm 'size' now holds the numeric codes.
data.head()

Unnamed: 0,sales,size,city_Jacksonville,city_Miami,city_Orlando,city_Tampa
0,100000,0.0,0.0,0.0,0.0,1.0
1,222000,1.0,0.0,0.0,0.0,1.0
2,1000000,2.0,0.0,0.0,1.0,0.0
3,522000,2.0,1.0,0.0,0.0,0.0
4,111111,0.0,0.0,1.0,0.0,0.0


In [25]:
# Check that every column is now numeric and has no missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sales              12 non-null     int64  
 1   size               12 non-null     float64
 2   city_Jacksonville  12 non-null     float64
 3   city_Miami         12 non-null     float64
 4   city_Orlando       12 non-null     float64
 5   city_Tampa         12 non-null     float64
dtypes: float64(5), int64(1)
memory usage: 708.0 bytes


Here we can see that our dataset has been transformed entirely into numerical values and can be used to fit a model.