# Demonstrating Feature Preprocessing

---

Testing different pre-processing techniques from the `Feature-Engine` package.

---

In [1]:
from feature_engine.imputation import CategoricalImputer, MeanMedianImputer
from feature_engine.encoding import DecisionTreeEncoder, MeanEncoder, OneHotEncoder, RareLabelEncoder
from feature_engine.outliers import OutlierTrimmer
from feature_engine.pipeline import Pipeline

import numpy as np
import pandas as pd

# Read Data

In [2]:
data = pd.read_parquet('../data/data_prepped.parquet')
data.head()

Unnamed: 0,IsCanceled,Adults,Children,Babies,Meal,Country,MarketSegment,DistributionChannel,IsRepeatedGuest,PreviousCancellations,...,DepartureDate_DaysBeforeHoliday,DepartureDate_DaysAfterHoliday,BookingDate_DaysBeforeHoliday,BookingDate_DaysAfterHoliday,ArrivalDate_WeekNumber,ArrivalDate_DayOfWeek,DepartureDate_WeekNumber,DepartureDate_DayOfWeek,BookingDate_WeekNumber,BookingDate_DayOfWeek
0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,...,45,21,22,44,27,3,27,3,30,4
1,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,...,45,21,52,14,27,3,27,3,26,1
2,0,1,0.0,0,BB,GBR,Direct,Direct,0,0,...,44,22,52,14,27,3,27,4,26,3
3,0,1,0.0,0,BB,GBR,Corporate,Corporate,0,0,...,44,22,58,8,27,3,27,4,25,4
4,0,2,0.0,0,BB,GBR,Online TA,TA/TO,0,0,...,43,23,59,7,27,3,27,5,25,3


In [3]:
initial_size = data.shape
initial_size

(119390, 35)

# Imputation

In [4]:
cat_imputer = CategoricalImputer(variables=['Country'], imputation_method = 'frequent')
data = cat_imputer.fit_transform(data)

num_imputer = MeanMedianImputer(imputation_method = 'median', variables = ['Children'])
data = num_imputer.fit_transform(data)


data.isna().sum().sum()

0

# Rare Label Encoding

In [5]:
data.describe(include = 'object')

Unnamed: 0,Meal,Country,MarketSegment,DistributionChannel,ReservedRoomType,AssignedRoomType,DepositType,Agent,Company,CustomerType
count,119390,119390,119390,119390,119390,119390,119390,119390,119390.0,119390
unique,5,177,8,5,10,12,3,334,353.0,4
top,BB,PRT,Online TA,TA/TO,A,A,No Deposit,9,,Transient
freq,92310,49078,56477,97870,85994,74053,104641,31961,112593.0,89613


In [6]:
print(data.select_dtypes('object').nunique().sum())
print(data.select_dtypes('object').nunique().mean())

911
91.1


In [7]:
rle = RareLabelEncoder(tol=0.05,n_categories=5,replace_with='Rare')

data = rle.fit_transform(data)



In [8]:
print(data.select_dtypes('object').nunique().sum())
print(data.select_dtypes('object').nunique().mean())

43
4.3


In [9]:
data.describe(include = 'object')

Unnamed: 0,Meal,Country,MarketSegment,DistributionChannel,ReservedRoomType,AssignedRoomType,DepositType,Agent,Company,CustomerType
count,119390,119390,119390,119390,119390,119390,119390,119390,119390.0,119390
unique,5,6,5,5,4,4,3,5,2.0,4
top,BB,PRT,Online TA,TA/TO,A,A,No Deposit,Rare,,Transient
freq,92310,49078,56477,97870,85994,74053,104641,49976,112593.0,89613


# Categorical Feature Encoding

In [10]:
# dte = DecisionTreeEncoder(regression = True, random_state=42)

# data = dte.fit_transform(data.drop(columns=['ADR']), data['ADR'])

In [11]:
# mean_enc = MeanEncoder(unseen = 'encode')

# data = mean_enc.fit_transform(X = data, y = None)

# data.head()

In [12]:
ohe = OneHotEncoder(drop_last = True)

data = ohe.fit_transform(data)

data.head()

Unnamed: 0,IsCanceled,Adults,Children,Babies,IsRepeatedGuest,PreviousCancellations,PreviousBookingsNotCanceled,BookingChanges,DaysInWaitingList,ADR,...,DepositType_No Deposit,DepositType_Refundable,Agent_ NULL,Agent_Rare,Agent_ 240,Agent_ 9,Company_ NULL,CustomerType_Transient,CustomerType_Contract,CustomerType_Transient-Party
0,0,2,0.0,0,0,0,0,3,0,0.0,...,1,0,1,0,0,0,1,1,0,0
1,0,2,0.0,0,0,0,0,4,0,0.0,...,1,0,1,0,0,0,1,1,0,0
2,0,1,0.0,0,0,0,0,0,0,75.0,...,1,0,1,0,0,0,1,1,0,0
3,0,1,0.0,0,0,0,0,0,0,75.0,...,1,0,0,1,0,0,1,1,0,0
4,0,2,0.0,0,0,0,0,0,0,98.0,...,1,0,0,0,1,0,1,1,0,0


# Outlier Removal

In [13]:
olt = OutlierTrimmer(capping_method='gaussian', tail = 'both')

data_no_outliers = olt.fit_transform(data)
data_no_outliers.head()

Unnamed: 0,IsCanceled,Adults,Children,Babies,IsRepeatedGuest,PreviousCancellations,PreviousBookingsNotCanceled,BookingChanges,DaysInWaitingList,ADR,...,DepositType_No Deposit,DepositType_Refundable,Agent_ NULL,Agent_Rare,Agent_ 240,Agent_ 9,Company_ NULL,CustomerType_Transient,CustomerType_Contract,CustomerType_Transient-Party
2,0,1,0.0,0,0,0,0,0,0,75.0,...,1,0,1,0,0,0,1,1,0,0
4,0,2,0.0,0,0,0,0,0,0,98.0,...,1,0,0,0,1,0,1,1,0,0
5,0,2,0.0,0,0,0,0,0,0,98.0,...,1,0,0,0,1,0,1,1,0,0
8,1,2,0.0,0,0,0,0,0,0,82.0,...,1,0,0,0,1,0,1,1,0,0
9,1,2,0.0,0,0,0,0,0,0,105.5,...,1,0,0,1,0,0,1,1,0,0


In [14]:
data_no_outliers.shape[0]

62812

In [15]:
initial_size[0] - data_no_outliers.shape[0]

56578

In [16]:
round(1- (data_no_outliers.shape[0]/initial_size[0]), 2)

0.47