# Feature Preprocessing

In [1]:
from feature_engine.imputation import CategoricalImputer, MeanMedianImputer
from feature_engine.encoding import DecisionTreeEncoder, MeanEncoder, OneHotEncoder, RareLabelEncoder
from feature_engine.outliers import OutlierTrimmer
from feature_engine.pipeline import Pipeline

import numpy as np
import pandas as pd

# Read Data

In [2]:
data = pd.read_parquet('../data/data_prepped.parquet')
data.head()

Unnamed: 0,IsCanceled,Adults,Children,Babies,Meal,Country,MarketSegment,DistributionChannel,IsRepeatedGuest,PreviousCancellations,...,DepartureDate_DaysBeforeHoliday,DepartureDate_DaysAfterHoliday,BookingDate_DaysBeforeHoliday,BookingDate_DaysAfterHoliday,ArrivalDate_WeekNumber,ArrivalDate_DayOfWeek,DepartureDate_WeekNumber,DepartureDate_DayOfWeek,BookingDate_WeekNumber,BookingDate_DayOfWeek
0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,...,45,21,22,44,27,3,27,3,30,4
1,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,...,45,21,52,14,27,3,27,3,26,1
2,0,1,0.0,0,BB,GBR,Direct,Direct,0,0,...,44,22,52,14,27,3,27,4,26,3
3,0,1,0.0,0,BB,GBR,Corporate,Corporate,0,0,...,44,22,58,8,27,3,27,4,25,4
4,0,2,0.0,0,BB,GBR,Online TA,TA/TO,0,0,...,43,23,59,7,27,3,27,5,25,3


In [3]:
initial_size = data.shape
initial_size

(119390, 35)

# Imputation

In [4]:
cat_imputer = CategoricalImputer(variables=['Country'], imputation_method = 'frequent')
data = cat_imputer.fit_transform(data)

num_imputer = MeanMedianImputer(imputation_method = 'median', variables = ['Children'])
data = num_imputer.fit_transform(data)


data.isna().sum().sum()

0

# Rare Label Encoding

In [5]:
rle = RareLabelEncoder(tol=0.05,n_categories=5,replace_with='Rare')

data = rle.fit_transform(data)



# Categorical Feature Encoding

In [6]:
# dte = DecisionTreeEncoder(regression = True, random_state=42)

# data = dte.fit_transform(data.drop(columns=['ADR']), data['ADR'])

In [7]:
# mean_enc = MeanEncoder(unseen = 'encode')

# data = mean_enc.fit_transform(X = data, y = None)

# data.head()

In [8]:
ohe = OneHotEncoder(drop_last = True)

data = ohe.fit_transform(data)

data.head()

Unnamed: 0,IsCanceled,Adults,Children,Babies,IsRepeatedGuest,PreviousCancellations,PreviousBookingsNotCanceled,BookingChanges,DaysInWaitingList,ADR,...,DepositType_No Deposit,DepositType_Refundable,Agent_ NULL,Agent_Rare,Agent_ 240,Agent_ 9,Company_ NULL,CustomerType_Transient,CustomerType_Contract,CustomerType_Transient-Party
0,0,2,0.0,0,0,0,0,3,0,0.0,...,1,0,1,0,0,0,1,1,0,0
1,0,2,0.0,0,0,0,0,4,0,0.0,...,1,0,1,0,0,0,1,1,0,0
2,0,1,0.0,0,0,0,0,0,0,75.0,...,1,0,1,0,0,0,1,1,0,0
3,0,1,0.0,0,0,0,0,0,0,75.0,...,1,0,0,1,0,0,1,1,0,0
4,0,2,0.0,0,0,0,0,0,0,98.0,...,1,0,0,0,1,0,1,1,0,0


# Outlier Removal

In [9]:
olt = OutlierTrimmer(capping_method='gaussian')

data = olt.fit_transform(data)
data.head()

Unnamed: 0,IsCanceled,Adults,Children,Babies,IsRepeatedGuest,PreviousCancellations,PreviousBookingsNotCanceled,BookingChanges,DaysInWaitingList,ADR,...,DepositType_No Deposit,DepositType_Refundable,Agent_ NULL,Agent_Rare,Agent_ 240,Agent_ 9,Company_ NULL,CustomerType_Transient,CustomerType_Contract,CustomerType_Transient-Party
2,0,1,0.0,0,0,0,0,0,0,75.0,...,1,0,1,0,0,0,1,1,0,0
4,0,2,0.0,0,0,0,0,0,0,98.0,...,1,0,0,0,1,0,1,1,0,0
5,0,2,0.0,0,0,0,0,0,0,98.0,...,1,0,0,0,1,0,1,1,0,0
8,1,2,0.0,0,0,0,0,0,0,82.0,...,1,0,0,0,1,0,1,1,0,0
9,1,2,0.0,0,0,0,0,0,0,105.5,...,1,0,0,1,0,0,1,1,0,0


In [10]:
data.shape[0]

63439

In [13]:
initial_size[0] - data.shape[0]

55951

In [16]:
round(1- (data.shape[0]/initial_size[0]), 2)

0.47