### 

# <center> ENCODING </center>

---

 # References
 
 https://www.kaggle.com/code/residentmario/encoding-categorical-data-in-sklearn
 https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
 https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html
 https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

The treatment of categorical variables is part of the workflow in data science and requires some interpretation or knowledge about the data. The choice of the type of encoding depends mainly on whether the variable is nominal or ordinal categorical. Below, we can see a little about these two cases.

Some features from the GermanCreditRisk dataset will be used to explore the topic.

---

# Imports

In [1]:
# Imports
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

# Data

In [2]:
# Read the German credit-risk subset (CSV) into a DataFrame
file = 'data/SubSetGermanCreditRisk.csv'
df = pd.read_csv(filepath_or_buffer=file, encoding='UTF-8')

In [3]:
# View: the bare name on the last line makes the notebook render the DataFrame
df

Unnamed: 0,checking_balance,months_loan_duration,credit_history,default
0,< 0 DM,6,critical,no
1,1 - 200 DM,48,good,yes
2,unknown,12,critical,no
3,< 0 DM,42,good,no
4,< 0 DM,24,poor,yes
...,...,...,...,...
995,unknown,12,good,no
996,< 0 DM,30,good,no
997,unknown,12,good,no
998,< 0 DM,45,good,yes


---

# EDA

A brief exploration and treatment of the data.

In [4]:
# Check Types: show the dtype of every column
df.dtypes

checking_balance        object
months_loan_duration     int64
credit_history          object
default                 object
dtype: object

In [5]:
# Missing data: report real NaNs plus the string placeholders that stand in
# for missing values. isnull() and isna() are aliases, so the first two lines
# always agree; both are kept so the printed report stays the same.
print('Null:', df.isnull().values.any())
print('NaN:', df.isna().values.any())
# Count both spellings of the 'none' placeholder (the re-check after
# imputation does the same), using a vectorised isin() over the whole frame.
print('None:', int(df.isin(['none', 'None']).sum().sum()))
print('unknown:', int(df.isin(['unknown']).sum().sum()))

Null: False
NaN: False
None: 0
unknown: 394


In [6]:
# Types, unique counts and NaNs per column

# Missing data - turn the placeholder strings 'none', 'None' and 'unknown' into np.nan.
# DataFrame.replace handles every column in one vectorised call instead of
# rebuilding each column cell by cell in a Python loop.
df = df.replace(['none', 'None', 'unknown'], np.nan)

# View: one row per column with its dtype, number of distinct values and NaN count
pd.concat([df.dtypes, df.nunique(), df.isna().sum()], axis = 1).rename(columns = {0: 'Type',
                                                                                   1: 'Unique',
                                                                                   2: 'NaN'})

Unnamed: 0,Type,Unique,NaN
checking_balance,object,3,394
months_loan_duration,int64,33,0
credit_history,object,5,0
default,object,2,0


In [7]:
# fillna: impute missing checking_balance values with the most frequent category.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the selected column: that chained in-place form is
# deprecated in pandas 2.x and does not modify df under Copy-on-Write.
most_frequent = df['checking_balance'].mode()[0]
df['checking_balance'] = df['checking_balance'].fillna(most_frequent)

# Missing data: re-check NaNs and placeholder strings after imputation
print('Null:', df.isnull().values.any())
print('NaN:', df.isna().values.any())
print('None:', int(df.isin(['none', 'None']).sum().sum()))
print('unknown:', int(df.isin(['unknown']).sum().sum()))

Null: False
NaN: False
None: 0
unknown: 0


In [8]:
# Define numeric and categorical features.
# select_dtypes already returns the matching columns, so there is no need to
# run describe() just to read back the column names (describe() also raises
# when the selection is empty).
numeric = df.select_dtypes(include = 'number')
categorical = df.select_dtypes(include = [object]).astype('category')

# Check unique values: number of distinct categories per categorical column
categorical.nunique()

checking_balance    3
credit_history      5
default             2
dtype: int64

In [9]:
# View: display the frame holding only the object columns cast to 'category'
categorical

Unnamed: 0,checking_balance,credit_history,default
0,< 0 DM,critical,no
1,1 - 200 DM,good,yes
2,< 0 DM,critical,no
3,< 0 DM,good,no
4,< 0 DM,poor,yes
...,...,...,...
995,< 0 DM,good,no
996,< 0 DM,good,no
997,< 0 DM,good,no
998,< 0 DM,good,yes


According to the variables, we will need 3 distinct treatments.

* The variable 'default' is categorical and contains only two categories, so LabelEncoder treatment is sufficient.

* The variable 'checking_balance' is also categorical, but as it has 3 distinct categories (more than two) and is treated here as nominal, it is necessary to apply OneHotEncoding, transforming each category into a dummy variable.

* The variable 'credit_history' is an ordinal categorical type, meaning we can order them in ascending or descending order, so OrdinalEncoder will be applied.

---

# Encoding

## Label encoding

In [10]:
from sklearn.preprocessing import LabelEncoder

# LabelEncoder works on a 1-D array of labels, so select the column as a
# Series. A one-column DataFrame is a column vector: it raises a
# DataConversionWarning and is only flattened implicitly.
X = categorical['default']

# Fit and encode in one step; classes are sorted, so 'no' -> 0 and 'yes' -> 1
le = LabelEncoder()
le.fit_transform(X)

array([0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1,
       0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,

## Ordinal encoding

In [11]:
from sklearn.preprocessing import OrdinalEncoder

# Explicit category order: the position in this list becomes the encoded
# value ('critical' -> 0, ..., 'perfect' -> 4)
credit_history = ['critical', 'poor', 'good', 'very good', 'perfect']

# Feature to encode, kept as a one-column DataFrame (2-D input)
X = categorical[['credit_history']]

# Fit and encode in a single call
enc = OrdinalEncoder(categories = [credit_history])
enc.fit_transform(X)

array([[0.],
       [2.],
       [0.],
       [2.],
       [1.],
       [2.],
       [2.],
       [2.],
       [2.],
       [0.],
       [2.],
       [2.],
       [2.],
       [0.],
       [2.],
       [2.],
       [0.],
       [4.],
       [2.],
       [2.],
       [0.],
       [2.],
       [0.],
       [0.],
       [0.],
       [2.],
       [4.],
       [3.],
       [2.],
       [1.],
       [2.],
       [2.],
       [2.],
       [0.],
       [2.],
       [0.],
       [0.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [1.],
       [0.],
       [0.],
       [0.],
       [2.],
       [2.],
       [0.],
       [2.],
       [1.],
       [1.],
       [2.],
       [2.],
       [1.],
       [3.],
       [2.],
       [0.],
       [2.],
       [0.],
       [2.],
       [0.],
       [4.],
       [4.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [0.],
       [0.],
       [0.],
       [2.],
       [0.],
       [2.],

## One hot encoder

In [12]:
from sklearn.preprocessing import OneHotEncoder

# Feature to encode, kept as a one-column DataFrame (2-D input)
X = categorical[['checking_balance']]

# drop='if_binary' only drops a column for two-category features.
# Fit and encode in a single call, then densify the sparse result for display.
ohe = OneHotEncoder(drop='if_binary')
ohe.fit_transform(X).toarray()

array([[0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       ...,
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.]])

___

# Pipeline

Instead of doing the entire process manually, we can use sklearn to organize the work for us. In the following example, a ColumnTransformer imputes and scales the numeric variables, imputes and encodes the nominal and ordinal variables, and encodes the binary variable. Other transformations can be included, and the transformer can be used in a subsequent pipeline to train a machine learning model.

In [13]:
# Import
import category_encoders as ce

from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

In [14]:
# Column groups, one list per preprocessing strategy
num_cols = ['months_loan_duration']
cat_cols = ['checking_balance']
ord_cols = ['credit_history']
bin_cols = ['default']

# Cast the three categorical groups to 'category' and the numeric group to int
for group in (cat_cols, ord_cols, bin_cols):
    df[group] = df[group].astype('category')
df[num_cols] = df[num_cols].astype(int)

# Ordered category lists, one entry per ordinal column
categories = [credit_history]

In [15]:
# Build one preprocessing pipeline per feature type, then route each column
# group to its pipeline through a ColumnTransformer.

# Numeric: median imputation followed by standardisation
numeric_steps = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
)

# Nominal: most-frequent imputation followed by one-hot encoding
nominal_steps = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(drop='if_binary'),
)

# Ordinal: most-frequent imputation followed by encoding in the declared order
ordinal_steps = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OrdinalEncoder(categories = [credit_history]),
)

# Binary: plain ordinal encoding (two categories -> 0/1)
binary_steps = make_pipeline(OrdinalEncoder())

transformer = ColumnTransformer(
    transformers=[
        ("num", numeric_steps, num_cols),
        ("cat", nominal_steps, cat_cols),
        ("ord", ordinal_steps, ord_cols),
        ('binary_cat', binary_steps, bin_cols),
    ],
    remainder = 'passthrough',
)

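LabelEncoder does not work inside the ColumnTransformer because it is designed for a 1-D target: its `fit`/`transform` methods accept a single argument `y`, whereas the ColumnTransformer calls its transformers with a 2-D `X` (and `y`). For that reason the OrdinalEncoder is used here to encode the binary 'default' column (the target).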

In [16]:
# Fit the transformer on df, transform it, and wrap the resulting array in a
# DataFrame whose columns carry the transformer's generated feature names
encoded = transformer.fit_transform(df)
feature_names = transformer.get_feature_names_out()
df_new = pd.DataFrame(encoded, columns=list(feature_names))

In [17]:
# View: display the DataFrame built from the transformer output
df_new

Unnamed: 0,num__months_loan_duration,cat__checking_balance_1 - 200 DM,cat__checking_balance_< 0 DM,cat__checking_balance_> 200 DM,ord__credit_history,binary_cat__default
0,-1.236478,0.0,1.0,0.0,0.0,0.0
1,2.248194,1.0,0.0,0.0,2.0,1.0
2,-0.738668,0.0,1.0,0.0,0.0,0.0
3,1.750384,0.0,1.0,0.0,2.0,0.0
4,0.256953,0.0,1.0,0.0,1.0,1.0
...,...,...,...,...,...,...
995,-0.738668,0.0,1.0,0.0,2.0,0.0
996,0.754763,0.0,1.0,0.0,2.0,0.0
997,-0.738668,0.0,1.0,0.0,2.0,0.0
998,1.999289,0.0,1.0,0.0,2.0,1.0


___