# Categorical variables

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Fixed seed so that "Restart Kernel -> Run All" reproduces the same demo data
# (and therefore the same tables in every cell below).
SEED = 42
rng = np.random.default_rng(SEED)

l1 = rng.random(5)  # 5 floats drawn from [0, 1)
l2 = ["Server", "Desktop", "Netbook", "Notebook", "Desktop"]  # categorical feature
l3 = rng.uniform(0, 10, 5)  # 5 floats drawn from [0, 10)

# Small demo frame: two numeric features around one categorical feature.
df = pd.DataFrame({"feat1": l1, "feat2": l2, "feat3": l3})
df

Unnamed: 0,feat1,feat2,feat3
0,0.22067,Server,1.383618
1,0.356483,Desktop,8.679662
2,0.140885,Netbook,4.370514
3,0.764998,Notebook,9.747086
4,0.974588,Desktop,6.478867


## One-Hot encoding
Generates a binary representation of the categorical variable. It is the most versatile technique for encoding categorical labels, but it introduces additional dimensions into the dataset (one for each individual category/class *), which contributes to the "curse of dimensionality".

*You can also use one dimension fewer, because one of them is redundant: all class-related information can already be inferred from the remaining newly created dimensions (if all of those are 0, the dropped dimension would have been 1, etc.). In pandas this corresponds to `pd.get_dummies(..., drop_first=True)`.

In [3]:
# One indicator column per category of "feat2"; the source column itself is replaced.
# dtype=int pins 0/1 integer indicators: pandas >= 2.0 returns booleans (True/False)
# by default, which would not match the 0/1 table shown for this cell.
# To drop the redundant first indicator column, additionally pass drop_first=True.
df_enc = pd.get_dummies(df, columns=["feat2"], dtype=int)
df_enc

Unnamed: 0,feat1,feat3,feat2_Desktop,feat2_Netbook,feat2_Notebook,feat2_Server
0,0.22067,1.383618,0,0,0,1
1,0.356483,8.679662,1,0,0,0
2,0.140885,4.370514,0,1,0,0
3,0.764998,9.747086,0,0,1,0
4,0.974588,6.478867,1,0,0,0


In [None]:
from sklearn.preprocessing import OneHotEncoder

# Dense (non-sparse) encoder whose transform result is a pandas DataFrame.
enc = OneHotEncoder(sparse_output=False)
enc.set_output(transform="pandas")

# Double brackets: the encoder is given a one-column frame, not a Series.
one_hot_encoded = enc.fit_transform(df[["feat2"]])

# Swap the raw "feat2" column for its indicator columns.
df_oh = pd.concat([df.drop(columns=["feat2"]), one_hot_encoded], axis=1)
df_oh

Unnamed: 0,feat1,feat3,feat2_Desktop,feat2_Netbook,feat2_Notebook,feat2_Server
0,0.22067,1.383618,0.0,0.0,0.0,1.0
1,0.356483,8.679662,1.0,0.0,0.0,0.0
2,0.140885,4.370514,0.0,1.0,0.0,0.0
3,0.764998,9.747086,0.0,0.0,1.0,0.0
4,0.974588,6.478867,1.0,0.0,0.0,0.0


## LabelEncoder
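Replaces labels with integer numbers. The model might interpret these as an ordering/ranking/prioritization ("Server > Desktop"), so it is recommended to use this type of encoding mainly for categorical data that has an inherent order. Note that scikit-learn's `LabelEncoder` assigns the integers in sorted (alphabetical) label order, so check that the resulting codes actually match the intended order.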

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit on the column first, then map every label to its integer code.
# In the table shown for this cell the codes follow sorted label order
# (Desktop=0, Netbook=1, Notebook=2, Server=3), not any domain ranking.
encoder = LabelEncoder().fit(df["feat2"])

# NOTE: this adds the column to `df` in place, so earlier cells that read `df`
# will also see "feat2_enc" if they are re-run after this one.
df["feat2_enc"] = encoder.transform(df["feat2"])
df

Unnamed: 0,feat1,feat2,feat3,feat2_enc
0,0.666582,Server,2.864176,3
1,0.275522,Desktop,5.269818,0
2,0.842579,Netbook,9.181453,1
3,0.707444,Notebook,4.857045,2
4,0.746663,Desktop,8.595841,0
