## Dealing with non-numerical categorical data

In [1]:
import pandas as pd
import numpy as np

# show a dataset with nice formatting
def show_dataset(df):
    """Return a styled view of ``df`` for rich display in the notebook.

    The frame's styler gets a background gradient using the 'Greens'
    colormap, and null entries are then highlighted in orange.
    """
    styled = df.style
    styled = styled.background_gradient(cmap='Greens')
    return styled.highlight_null('orange')

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# toy dataset: one nominal feature (color), one ordinal feature (size),
# one numerical feature (price) and the class label
df = pd.DataFrame(
    data=[
        ['green', 'M', 10.1, 'class1'],
        ['red', 'L', 13.5, 'class2'],
        ['blue', 'XL', 15.3, 'class1'],
    ],
    columns=['color', 'size', 'price', 'class label'],
)
show_dataset(df)

Unnamed: 0,color,size,price,class label
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,XL,15.3,class1


## Encoding ordinal data

In [3]:
# sizes have a natural order, so encode them as increasing integers
size_mapping = dict(XL=3, L=2, M=1)
# NOTE: this overwrites df['size'] in place, so the cell is meant to be run
# once on a freshly created df
df['size'] = df['size'].map(size_mapping)
show_dataset(df)

Unnamed: 0,color,size,price,class label
0,green,1,10.1,class1
1,red,2,13.5,class2
2,blue,3,15.3,class1


In [4]:
# the encoding can be undone later by swapping keys and values of the mapping
inverse_size_mapping = {code: label for label, code in size_mapping.items()}
restored_sizes = df['size'].map(inverse_size_mapping)
# work on a copy so that df keeps its integer-encoded size column
df2 = df.copy()
df2['size'] = restored_sizes

In [5]:
# Threshold encoding of the size feature: one binary column per cut-off
size_labels = df2['size']
# 1 for every size except 'M'
df2['x > M'] = size_labels.map(lambda label: int(label != 'M'))
# 1 only for 'XL'
df2['x > L'] = size_labels.map(lambda label: int(label == 'XL'))
# the original size column is replaced by the two threshold columns
df2.drop(columns=['size'], inplace=True)

show_dataset(df2)

Unnamed: 0,color,price,class label,x > M,x > L
0,green,10.1,class1,0,0
1,red,13.5,class2,1,0
2,blue,15.3,class1,1,1


## Encoding class labels

In [6]:
# number the distinct class labels 0, 1, ... in the sorted order returned by np.unique
unique_labels = np.unique(df['class label'])
class_mapping = dict(zip(unique_labels, range(len(unique_labels))))
# replace the labels in df by their integer codes
df['class label'] = df['class label'].map(class_mapping)
show_dataset(df)

Unnamed: 0,color,size,price,class label
0,green,1,10.1,0
1,red,2,13.5,1
2,blue,3,15.3,0


In [7]:
# Alternative to the above using sklearn's LabelEncoder
# NOTE: df['class label'] was already mapped to integers in the previous cell,
# so on this df the encoder simply maps 0/1 onto 0/1.
from sklearn.preprocessing import LabelEncoder

class_le = LabelEncoder()
# ".fit_transform" is short for ".fit" and then ".transform"
y = class_le.fit_transform(df['class label'])

# The encoder returns a numpy array, which can be assigned to a column directly:
# values are placed by position. Wrapping it in pd.DataFrame(y) first would make
# pandas align on index labels instead, which silently produces NaN whenever
# df's index is not the default 0..n-1 range.
df3 = df.copy()
df3['class label'] = y
show_dataset(df3)


Unnamed: 0,color,size,price,class label
0,green,1,10.1,0
1,red,2,13.5,1
2,blue,3,15.3,0


## One-hot encoding for nominal data using pandas

In [8]:
# pandas calls one-hot encoding "get_dummies" (the dummy variables are the binary digit variables)
# drop_first=True drops one of the three new features to avoid multicollinearity
df_dum = pd.get_dummies(data=df, drop_first=True)
show_dataset(df_dum)

Unnamed: 0,size,price,class label,color_green,color_red
0,1,10.1,0,True,False
1,2,13.5,1,False,True
2,3,15.3,0,False,False


In [9]:
# use one-hot encoding using sklearn
from sklearn.preprocessing import OneHotEncoder

# sparse_output=False to return a regular numpy array instead of a sparse matrix
# the alternative is to use sparse_output=True and then convert to a dense array using ".toarray()" on the result
# drop='first' to drop one of the three new features to avoid multicollinearity
color_ohe = OneHotEncoder(drop='first', sparse_output=False)
out = color_ohe.fit_transform(df[['color']]) # expects a 2D array

# turn into data frame; reuse df's index so that the concat below (which aligns
# rows on index labels) pairs every encoded row with the row it came from, even
# when df does not have the default 0..n-1 index
df_ohe = pd.DataFrame(
    out,
    columns=color_ohe.get_feature_names_out(),
    index=df.index,
)

# combine back into the original data frame (minus the raw color column)
df_ohe = pd.concat([df.drop(columns=['color']), df_ohe], axis=1)
show_dataset(df_ohe)

Unnamed: 0,size,price,class label,color_green,color_red
0,1,10.1,0,1.0,0.0
1,2,13.5,1,0.0,1.0
2,3,15.3,0,0.0,0.0


In [10]:
# use one-hot encoding using sklearn's ColumnTransformer (alternative to the above)
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(
    [
        # sparse_output=False keeps the encoder output dense; with the default
        # sparse output the ColumnTransformer may return a scipy sparse matrix
        # for low-density data, which pd.DataFrame(out, columns=...) below
        # cannot wrap
        ('onehot', OneHotEncoder(drop='first', sparse_output=False), ['color']),
        # here we could add more transformers for other columns
    ],
    remainder='passthrough', # do nothing to the untransformed columns
    verbose_feature_names_out=False, # keep plain column names without a transformer prefix
)

# the result is a single numpy array, so the passthrough columns come back as floats
out = ct.fit_transform(df)

# turn into data frame, keeping df's row labels
df_ohe = pd.DataFrame(out, columns=ct.get_feature_names_out(), index=df.index)
show_dataset(df_ohe)

Unnamed: 0,color_green,color_red,size,price,class label
0,1.0,0.0,1.0,10.1,0.0
1,0.0,1.0,2.0,13.5,1.0
2,0.0,0.0,3.0,15.3,0.0
