# Feature Engineering

---

1. Import packages
2. Load data
3. Feature engineering

---

## 1. Import packages

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np
from ydata_profiling import ProfileReport


---
## 2. Load data

In [2]:
data = pd.read_csv('customer_booking.csv', encoding="ISO-8859-1")
data.head()

Unnamed: 0,num_passengers,sales_channel,trip_type,purchase_lead,length_of_stay,flight_hour,flight_day,route,booking_origin,wants_extra_baggage,wants_preferred_seat,wants_in_flight_meals,flight_duration,booking_complete
0,2,Internet,RoundTrip,262,19,7,Sat,AKLDEL,New Zealand,1,0,0,5.52,0
1,1,Internet,RoundTrip,112,20,3,Sat,AKLDEL,New Zealand,0,0,0,5.52,0
2,2,Internet,RoundTrip,243,22,17,Wed,AKLDEL,India,1,1,0,5.52,0
3,1,Internet,RoundTrip,96,31,4,Sat,AKLDEL,New Zealand,0,0,1,5.52,0
4,2,Internet,RoundTrip,68,22,15,Wed,AKLDEL,India,1,0,1,5.52,0


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   num_passengers         50000 non-null  int64  
 1   sales_channel          50000 non-null  object 
 2   trip_type              50000 non-null  object 
 3   purchase_lead          50000 non-null  int64  
 4   length_of_stay         50000 non-null  int64  
 5   flight_hour            50000 non-null  int64  
 6   flight_day             50000 non-null  object 
 7   route                  50000 non-null  object 
 8   booking_origin         50000 non-null  object 
 9   wants_extra_baggage    50000 non-null  int64  
 10  wants_preferred_seat   50000 non-null  int64  
 11  wants_in_flight_meals  50000 non-null  int64  
 12  flight_duration        50000 non-null  float64
 13  booking_complete       50000 non-null  int64  
dtypes: float64(1), int64(8), object(5)
memory usage: 5.3+ 

In [4]:
profile = ProfileReport(data,minimal=True)
profile

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



# Booking Model

### Preparing data

In [5]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
data['booking_origin'] = le.fit_transform(data['booking_origin'])
data['route'] = le.fit_transform(data['route'])
data['flight_day'] = le.fit_transform(data['flight_day'])
data['trip_type'] = le.fit_transform(data['trip_type'])
data['sales_channel'] = le.fit_transform(data['sales_channel'])


In [6]:
from sklearn.model_selection import train_test_split

train , test = train_test_split(data,random_state=104, test_size=0.2, shuffle=True)
train.head()

Unnamed: 0,num_passengers,sales_channel,trip_type,purchase_lead,length_of_stay,flight_hour,flight_day,route,booking_origin,wants_extra_baggage,wants_preferred_seat,wants_in_flight_meals,flight_duration,booking_complete
18926,1,0,2,159,103,3,5,626,4,1,1,0,8.83,0
18703,1,0,2,16,89,7,3,621,4,1,0,0,8.83,0
46060,1,0,2,263,6,13,3,362,4,0,0,0,8.83,0
25397,1,0,1,36,17,3,2,782,91,1,0,0,8.58,0
4858,1,0,2,88,21,11,6,148,4,1,0,1,8.83,0


In [7]:
x = train.drop(columns=['booking_complete'])
y = train.booking_complete

x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=104, test_size=0.15, shuffle=True)

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import f1_score as f1
from sklearn.metrics import confusion_matrix
from xgboost import XGBClassifier
from sklearn.metrics import log_loss

## Modeling

#### 1. Start with a Dummy Model (np.rand) - Baseline Model

In [9]:
m = y_test.shape[0]
rand_y_test = np.random.randint(2,size= m)
rand_y_test

array([0, 0, 0, ..., 0, 0, 0])

In [10]:
random_accuarcy = acc(y_test,rand_y_test)
random_accuarcy

0.49983333333333335

In [11]:
random_f1 = f1(y_test,rand_y_test)
random_f1

0.22514846372321198

#### 2. Simple Model (linear)
- Linear model
- calculate score
- calculate feature imporance
- Simple model with top 10/20 features

In [12]:
model = LogisticRegression()
model.fit(x_train,y_train)
y_pred = model.predict(x_test)
print(acc(y_test,y_pred), f1(y_test, y_pred))

0.8553333333333333 0.0


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [14]:
x.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40000 entries, 18926 to 8261
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   num_passengers         40000 non-null  int64  
 1   sales_channel          40000 non-null  int32  
 2   trip_type              40000 non-null  int32  
 3   purchase_lead          40000 non-null  int64  
 4   length_of_stay         40000 non-null  int64  
 5   flight_hour            40000 non-null  int64  
 6   flight_day             40000 non-null  int32  
 7   route                  40000 non-null  int32  
 8   booking_origin         40000 non-null  int32  
 9   wants_extra_baggage    40000 non-null  int64  
 10  wants_preferred_seat   40000 non-null  int64  
 11  wants_in_flight_meals  40000 non-null  int64  
 12  flight_duration        40000 non-null  float64
dtypes: float64(1), int32(5), int64(7)
memory usage: 3.5 MB


#### 3. Simple Model with Balanced Dataset

First : Upsampling

In [15]:
from imblearn.over_sampling import SMOTE


print('Before UpSampling, the shape of train_X: {}'.format(x_train.shape))
print('Before UpSampling, the shape of train_y: {} \n'.format(y_train.shape))

print("Before UpSampling, counts of label '1': {}".format(sum(y_train==1)))
print("Before UpSampling, counts of label '0': {} \n".format(sum(y_train==0)))

sm = SMOTE(sampling_strategy = 1 ,k_neighbors = 5, random_state=1)   #Synthetic Minority Over Sampling Technique
x_train_res, y_train_res = sm.fit_resample(x_train, y_train.ravel())


print("After UpSampling, counts of label '1': {}".format(sum(y_train_res==1)))
print("After UpSampling, counts of label '0': {} \n".format(sum(y_train_res==0)))



print('After UpSampling, the shape of train_X: {}'.format(x_train_res.shape))
print('After UpSampling, the shape of train_y: {} \n'.format(y_train_res.shape))

Before UpSampling, the shape of train_X: (34000, 13)
Before UpSampling, the shape of train_y: (34000,) 

Before UpSampling, counts of label '1': 5180
Before UpSampling, counts of label '0': 28820 

After UpSampling, counts of label '1': 28820
After UpSampling, counts of label '0': 28820 

After UpSampling, the shape of train_X: (57640, 13)
After UpSampling, the shape of train_y: (57640,) 



In [16]:
from sklearn import metrics


model = LogisticRegression()
model.fit(x_train_res,y_train_res)
y_pred = model.predict(x_test)
print(acc(y_test,y_pred),
       f1(y_test, y_pred))

print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

0.5963333333333334 0.24595267745952681
[[3183 1950]
 [ 472  395]]
              precision    recall  f1-score   support

           0       0.87      0.62      0.72      5133
           1       0.17      0.46      0.25       867

    accuracy                           0.60      6000
   macro avg       0.52      0.54      0.49      6000
weighted avg       0.77      0.60      0.66      6000



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Second : Downsampling

In [17]:
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler()
x_rus, y_rus = rus.fit_resample(x_train, y_train)


print("After UpSampling, counts of label '1': {}".format(sum(y_rus==1)))
print("After UpSampling, counts of label '0': {} \n".format(sum(y_rus==0)))



print('After UpSampling, the shape of train_X: {}'.format(x_rus.shape))
print('After UpSampling, the shape of train_y: {} \n'.format(y_rus.shape))

After UpSampling, counts of label '1': 5180
After UpSampling, counts of label '0': 5180 

After UpSampling, the shape of train_X: (10360, 13)
After UpSampling, the shape of train_y: (10360,) 



In [18]:
from sklearn import metrics


model = LogisticRegression()
model.fit(x_train_res,y_train_res)
y_pred = model.predict(x_test)
print(acc(y_test,y_pred),
       f1(y_test, y_pred))

print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

0.5963333333333334 0.24595267745952681
[[3183 1950]
 [ 472  395]]
              precision    recall  f1-score   support

           0       0.87      0.62      0.72      5133
           1       0.17      0.46      0.25       867

    accuracy                           0.60      6000
   macro avg       0.52      0.54      0.49      6000
weighted avg       0.77      0.60      0.66      6000



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


#### 4. Complex and Explainable Model (Tree Based)


First : Decision Tree

In [19]:
# DecisionTreeClassifier With original data
model = DecisionTreeClassifier()
model.fit(x_train,y_train)
y_pred = model.predict(x_test)
print(acc(y_test,y_pred),
       f1(y_test, y_pred))

print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

0.778 0.26732673267326734
[[4425  708]
 [ 624  243]]
              precision    recall  f1-score   support

           0       0.88      0.86      0.87      5133
           1       0.26      0.28      0.27       867

    accuracy                           0.78      6000
   macro avg       0.57      0.57      0.57      6000
weighted avg       0.79      0.78      0.78      6000



In [20]:
# DecisionTreeClassifier With Upsampling data
model = DecisionTreeClassifier()
model.fit(x_train_res,y_train_res)
y_pred = model.predict(x_test)
print(acc(y_test,y_pred),
       f1(y_test, y_pred))

print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

0.747 0.30558096980786825
[[4148  985]
 [ 533  334]]
              precision    recall  f1-score   support

           0       0.89      0.81      0.85      5133
           1       0.25      0.39      0.31       867

    accuracy                           0.75      6000
   macro avg       0.57      0.60      0.58      6000
weighted avg       0.79      0.75      0.77      6000



In [21]:
# DecisionTreeClassifier With Downsampling data
model = DecisionTreeClassifier()
model.fit(x_rus,y_rus)
y_pred = model.predict(x_test)
print(acc(y_test,y_pred),
       f1(y_test, y_pred))

print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

0.6261666666666666 0.32460102378801564
[[3218 1915]
 [ 328  539]]
              precision    recall  f1-score   support

           0       0.91      0.63      0.74      5133
           1       0.22      0.62      0.32       867

    accuracy                           0.63      6000
   macro avg       0.56      0.62      0.53      6000
weighted avg       0.81      0.63      0.68      6000



Second : Random Forest

In [22]:
# RandomForestClassifier With Original data
model = RandomForestClassifier()
model.fit(x_train,y_train)
y_pred = model.predict(x_test)
print(acc(y_test,y_pred),
       f1(y_test, y_pred))

print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

0.8513333333333334 0.13060428849902536
[[5041   92]
 [ 800   67]]
              precision    recall  f1-score   support

           0       0.86      0.98      0.92      5133
           1       0.42      0.08      0.13       867

    accuracy                           0.85      6000
   macro avg       0.64      0.53      0.52      6000
weighted avg       0.80      0.85      0.80      6000



In [23]:
# RandomForestClassifier With Upsampling data
model = RandomForestClassifier()
model.fit(x_train_res,y_train_res)
y_pred = model.predict(x_test)
print(acc(y_test,y_pred),
       f1(y_test, y_pred))

print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

0.8056666666666666 0.33825198637911463
[[4536  597]
 [ 569  298]]
              precision    recall  f1-score   support

           0       0.89      0.88      0.89      5133
           1       0.33      0.34      0.34       867

    accuracy                           0.81      6000
   macro avg       0.61      0.61      0.61      6000
weighted avg       0.81      0.81      0.81      6000



In [24]:
# RandomForestClassifier With Downsampling data
model = RandomForestClassifier()
model.fit(x_rus,y_rus)
y_pred = model.predict(x_test)
print(acc(y_test,y_pred),
       f1(y_test, y_pred))

print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

0.6835 0.38997751365242533
[[3494 1639]
 [ 260  607]]
              precision    recall  f1-score   support

           0       0.93      0.68      0.79      5133
           1       0.27      0.70      0.39       867

    accuracy                           0.68      6000
   macro avg       0.60      0.69      0.59      6000
weighted avg       0.84      0.68      0.73      6000



#### 5. Deeper Model (XGBoost,catboost)

First : XGBoost

In [25]:
# XGBClassifier With original data
model = XGBClassifier()
model.fit(x_train,y_train)
y_pred = model.predict(x_test)
print(acc(y_test,y_pred),
       f1(y_test, y_pred))

print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

0.8506666666666667 0.17798165137614677
[[5007  126]
 [ 770   97]]
              precision    recall  f1-score   support

           0       0.87      0.98      0.92      5133
           1       0.43      0.11      0.18       867

    accuracy                           0.85      6000
   macro avg       0.65      0.54      0.55      6000
weighted avg       0.80      0.85      0.81      6000



In [26]:
# XGBClassifier With Upsampling data
model = XGBClassifier()
model.fit(x_train_res,y_train_res)
y_pred = model.predict(x_test)
print(acc(y_test,y_pred),
       f1(y_test, y_pred))

print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

0.7868333333333334 0.34578005115089516
[[4383  750]
 [ 529  338]]
              precision    recall  f1-score   support

           0       0.89      0.85      0.87      5133
           1       0.31      0.39      0.35       867

    accuracy                           0.79      6000
   macro avg       0.60      0.62      0.61      6000
weighted avg       0.81      0.79      0.80      6000



In [27]:
# XGBClassifier With Downsampling data
model = XGBClassifier()
model.fit(x_rus,y_rus)
y_pred = model.predict(x_test)
print(acc(y_test,y_pred),
       f1(y_test, y_pred))

print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

0.6838333333333333 0.3979688987622977
[[3476 1657]
 [ 240  627]]
              precision    recall  f1-score   support

           0       0.94      0.68      0.79      5133
           1       0.27      0.72      0.40       867

    accuracy                           0.68      6000
   macro avg       0.60      0.70      0.59      6000
weighted avg       0.84      0.68      0.73      6000



Second : CatBoost

In [28]:
# CatBoostClassifier With original data
model = CatBoostClassifier(verbose=0)
model.fit(x_train,y_train)
y_pred = model.predict(x_test)
print(acc(y_test,y_pred),
       f1(y_test, y_pred))

print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

0.8526666666666667 0.11066398390342051
[[5061   72]
 [ 812   55]]
              precision    recall  f1-score   support

           0       0.86      0.99      0.92      5133
           1       0.43      0.06      0.11       867

    accuracy                           0.85      6000
   macro avg       0.65      0.52      0.52      6000
weighted avg       0.80      0.85      0.80      6000



In [29]:
# CatBoostClassifier With Upsampling data
model = CatBoostClassifier(verbose=0)
model.fit(x_train_res,y_train_res)
y_pred = model.predict(x_test)
print(acc(y_test,y_pred),
       f1(y_test, y_pred))

print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

0.7923333333333333 0.35839340885684856
[[4406  727]
 [ 519  348]]
              precision    recall  f1-score   support

           0       0.89      0.86      0.88      5133
           1       0.32      0.40      0.36       867

    accuracy                           0.79      6000
   macro avg       0.61      0.63      0.62      6000
weighted avg       0.81      0.79      0.80      6000



In [30]:
# CatBoostClassifier With Downsampling data
model = CatBoostClassifier(verbose=0)
model.fit(x_rus,y_rus)
y_pred = model.predict(x_test)
print(acc(y_test,y_pred),
       f1(y_test, y_pred))

print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

0.6831666666666667 0.40538004379105413
[[3451 1682]
 [ 219  648]]
              precision    recall  f1-score   support

           0       0.94      0.67      0.78      5133
           1       0.28      0.75      0.41       867

    accuracy                           0.68      6000
   macro avg       0.61      0.71      0.59      6000
weighted avg       0.84      0.68      0.73      6000



### The Best Model is : CatBoostClassifier With Downsampling data