In [220]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import chardet
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

In [221]:
# Detect file encoding
with open('../Data/customer_booking.csv', 'rb') as f:
    result = chardet.detect(f.read())
result

{'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}

In [222]:
booking = pd.read_csv('../Data/customer_booking.csv', encoding='ISO-8859-1')

- `num_passengers` = number of passengers travelling
- `sales_channel` = sales channel booking was made on
- `trip_type` = trip Type (Round Trip, One Way, Circle Trip)
- `purchase_lead` = number of days between travel date and booking date
- `length_of_stay` = number of days spent at destination
- `flight_hour` = hour of flight departure
- `flight_day` = day of week of flight departure
- `route` = origin -> destination flight route
- `booking_origin` = country from where booking was made
- `wants_extra_baggage` = if the customer wanted extra baggage in the booking
- `wants_preferred_seat` = if the customer wanted a preferred seat in the booking
- `wants_in_flight_meals` = if the customer wanted in-flight meals in the booking
- `flight_duration` = total duration of flight (in hours)
- `booking_complete` = flag indicating if the customer completed the booking

In [223]:
booking.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   num_passengers         50000 non-null  int64  
 1   sales_channel          50000 non-null  object 
 2   trip_type              50000 non-null  object 
 3   purchase_lead          50000 non-null  int64  
 4   length_of_stay         50000 non-null  int64  
 5   flight_hour            50000 non-null  int64  
 6   flight_day             50000 non-null  object 
 7   route                  50000 non-null  object 
 8   booking_origin         50000 non-null  object 
 9   wants_extra_baggage    50000 non-null  int64  
 10  wants_preferred_seat   50000 non-null  int64  
 11  wants_in_flight_meals  50000 non-null  int64  
 12  flight_duration        50000 non-null  float64
 13  booking_complete       50000 non-null  int64  
dtypes: float64(1), int64(8), object(5)
memory usage: 5.3+ 

In [224]:
booking.head()

Unnamed: 0,num_passengers,sales_channel,trip_type,purchase_lead,length_of_stay,flight_hour,flight_day,route,booking_origin,wants_extra_baggage,wants_preferred_seat,wants_in_flight_meals,flight_duration,booking_complete
0,2,Internet,RoundTrip,262,19,7,Sat,AKLDEL,New Zealand,1,0,0,5.52,0
1,1,Internet,RoundTrip,112,20,3,Sat,AKLDEL,New Zealand,0,0,0,5.52,0
2,2,Internet,RoundTrip,243,22,17,Wed,AKLDEL,India,1,1,0,5.52,0
3,1,Internet,RoundTrip,96,31,4,Sat,AKLDEL,New Zealand,0,0,1,5.52,0
4,2,Internet,RoundTrip,68,22,15,Wed,AKLDEL,India,1,0,1,5.52,0


In [225]:
booking.describe()

Unnamed: 0,num_passengers,purchase_lead,length_of_stay,flight_hour,wants_extra_baggage,wants_preferred_seat,wants_in_flight_meals,flight_duration,booking_complete
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,1.59124,84.94048,23.04456,9.06634,0.66878,0.29696,0.42714,7.277561,0.14956
std,1.020165,90.451378,33.88767,5.41266,0.470657,0.456923,0.494668,1.496863,0.356643
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,4.67,0.0
25%,1.0,21.0,5.0,5.0,0.0,0.0,0.0,5.62,0.0
50%,1.0,51.0,17.0,9.0,1.0,0.0,0.0,7.57,0.0
75%,2.0,115.0,28.0,13.0,1.0,1.0,1.0,8.83,0.0
max,9.0,867.0,778.0,23.0,1.0,1.0,1.0,9.5,1.0


### Feature selection
#### Feature Engineering
1. `is_weekend_flight`: Convert flight_day into a binary indicator for weekends. Flights booked outside of weekends are often cheaper and may have higher completion rates.
2. `is_last_minute_booking`: Mark bookings as last-minute if purchase_lead is less than 5 days and flight_duration exceeds 6 hours. Last-minute bookings often indicate higher commitment due to urgency.
3. `trip_scale`: Calculate as num_passengers * length_of_stay * wants_extra_baggage. This can suggest a larger, more planned group trip, often associated with higher commitment, although it could also indicate the complexity of coordinating a group.
4. `early_flight`: Mark flights as early if flight_hour < 7:00 AM, representing flights taken early in the morning.
#### Dropping Feature
1. `booking_origin`: Drop this feature as it is unlikely to have a significance prediction on booking completion.
2. `flight_hour`: Drop this feature since, for non-urgent and very early flights, the specific hour may not impact the booking completion as much.

In [226]:
booking["is_weekend_flight"] = booking["flight_day"].apply(lambda x: 1 if x in ["Sat", "Sun"] else 0)
booking["is_last_minute_booking"] = booking.apply(lambda x: 1 if x["purchase_lead"] < 5 and x["flight_duration"] > 7 else 0, axis=1)
booking["trip_scale"] = booking["num_passengers"] * booking["length_of_stay"] * booking["wants_extra_baggage"]
booking["Early_flight"] = booking["flight_hour"].apply(lambda x: 1 if x < 7 else 0)

In [227]:
booking.head()

Unnamed: 0,num_passengers,sales_channel,trip_type,purchase_lead,length_of_stay,flight_hour,flight_day,route,booking_origin,wants_extra_baggage,wants_preferred_seat,wants_in_flight_meals,flight_duration,booking_complete,is_weekend_flight,is_last_minute_booking,trip_scale,Early_flight
0,2,Internet,RoundTrip,262,19,7,Sat,AKLDEL,New Zealand,1,0,0,5.52,0,1,0,38,0
1,1,Internet,RoundTrip,112,20,3,Sat,AKLDEL,New Zealand,0,0,0,5.52,0,1,0,0,1
2,2,Internet,RoundTrip,243,22,17,Wed,AKLDEL,India,1,1,0,5.52,0,0,0,44,0
3,1,Internet,RoundTrip,96,31,4,Sat,AKLDEL,New Zealand,0,0,1,5.52,0,1,0,0,1
4,2,Internet,RoundTrip,68,22,15,Wed,AKLDEL,India,1,0,1,5.52,0,0,0,44,0


In [228]:
# Apply one-hot encoding and remove features
booking = booking.drop(columns = ['booking_origin', 'route'], axis = 1)
booking = pd.get_dummies(booking, columns=['sales_channel','trip_type','flight_day'])

In [229]:
X = booking.drop('booking_complete', axis=1)  # Drop the target column from features
y = booking['booking_complete']  # Set the target variable

In [230]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2202)

### Model 1 (Baseline) : Random forest classifier (Unbalanced)
#### Finding relevant feautres

In [231]:
# # Param for rf
# param_grid = {
#     'n_estimators': [100, 200, 300],
#     'max_depth': [None, 10, 20, 30],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'max_features': ['auto', 'sqrt', 'log2'],
#     'bootstrap': [True, False]
# }
# # Random forest classifer
# rf = RandomForestClassifier(n_estimators = 100, random_state= 2202)

# # Initialize GridSearchCV
# grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# # Fit the model
# grid_search.fit(X_train, y_train)

# # Get the best parameters
# print("Best parameters found: ", grid_search.best_params_)

Best parameters found:  {'bootstrap': True, 'max_depth': 30, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}

In [232]:
# Random forest classifer
rf = RandomForestClassifier(
    n_estimators=100,            # Number of trees in the forest
    max_depth=30,                # Maximum depth of each tree
    min_samples_split=5,         # Minimum samples required to split an internal node
    min_samples_leaf=2,          # Minimum samples required at a leaf node
    max_features='sqrt',         # Number of features to consider for the best split
    bootstrap=True,              # Whether to use bootstrap samples or not
    random_state=2202            # Random seed for reproducibility
)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

In [233]:
# Perform 5-fold cross-validation
rf_cv_scores = cross_val_score(rf, X_train, y_train, cv=5, scoring='accuracy')
rf_cv_scores.mean()

0.8500249999999999

In [234]:
# Accuracy
rf_accuracy = accuracy_score(y_test, y_pred)
rf_accuracy

0.8544

In [235]:
rf_report = classification_report(y_test, y_pred, output_dict=True) # Return dict to transform into dataframe
rf_report = pd.DataFrame(rf_report).T
rf_report

Unnamed: 0,precision,recall,f1-score,support
0,0.856324,0.996721,0.921204,8539.0
1,0.540984,0.022587,0.043364,1461.0
accuracy,0.8544,0.8544,0.8544,0.8544
macro avg,0.698654,0.509654,0.482284,10000.0
weighted avg,0.810252,0.8544,0.792951,10000.0


The model predicted 0 value of booking_complete very well but performed poorly on 1 (minority class), with very low recall value and mediocre precision. No sign of overfitting. Next, I will try to account for class imbalance in 3 ways: adjust threshold level, undersampling majority class (some research papers pointed out that SMOTE/other sampling methods may not be beneficial on strong classifiers like Catboost and not as efficient as threshold adjustment), and use strong classfiers such as XGBoost and CatBoost.

### Model 2 : Random forest classifier (Balanced with Threshold)

In [236]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2202)

In [237]:
# Random forest classifer
rf_thrs = RandomForestClassifier(
    n_estimators=100,            # Number of trees in the forest
    max_depth=30,                # Maximum depth of each tree
    min_samples_split=5,         # Minimum samples required to split an internal node
    min_samples_leaf=2,          # Minimum samples required at a leaf node
    max_features='sqrt',         # Number of features to consider for the best split
    bootstrap=True,              # Whether to use bootstrap samples or not
    random_state=2202,            # Random seed for reproducibility
    class_weight="balanced")
rf_thrs.fit(X_train, y_train)
y_pred = rf_thrs.predict(X_test)

In [238]:
# Perform 5-fold cross-validation
rf_thrs_cv_scores = cross_val_score(rf_thrs, X_train, y_train, cv=5, scoring='accuracy')
rf_thrs_cv_scores.mean()

0.828625

In [239]:
rf_thrs_report = classification_report(y_test, y_pred, output_dict=True) # Return dict to transform into dataframe
rf_thrs_report = pd.DataFrame(rf_thrs_report).T
rf_thrs_report

Unnamed: 0,precision,recall,f1-score,support
0,0.86943,0.95058,0.908196,8539.0
1,0.364458,0.16564,0.227765,1461.0
accuracy,0.8359,0.8359,0.8359,0.8359
macro avg,0.616944,0.55811,0.56798,10000.0
weighted avg,0.795654,0.8359,0.808785,10000.0


In [240]:
# Accuracy
rf_thrs_accuracy = accuracy_score(y_test, y_pred)
rf_thrs_accuracy

0.8359

Adjusting for threshold, this model slightly improves the f1-score of 1 through recall, but the metrics overall are not good—no sign of overfitting.

### Model 3 : Random forest classifier (Balanced by Undersampling)

In [241]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2202)

from imblearn.under_sampling import RandomUnderSampler

# Scaling (same as before)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply Random UnderSampling to balance the classes
undersampler = RandomUnderSampler(random_state=2202)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train, y_train)



In [242]:
# Random forest classifer
rf_under = RandomForestClassifier(
    n_estimators=100,            # Number of trees in the forest
    max_depth=30,                # Maximum depth of each tree
    min_samples_split=5,         # Minimum samples required to split an internal node
    min_samples_leaf=2,          # Minimum samples required at a leaf node
    max_features='sqrt',         # Number of features to consider for the best split
    bootstrap=True,              # Whether to use bootstrap samples or not
    random_state=2202,            # Random seed for reproducibility
    class_weight="balanced")
rf_under.fit(X_train_resampled, y_train_resampled)
y_pred = rf_under.predict(X_test)

In [243]:
# Perform 5-fold cross-validation
rf_under_cv_scores = cross_val_score(rf_under, X_train_resampled, y_train_resampled, cv=5, scoring='accuracy')
rf_under_cv_scores.mean()

0.6181643937518067

In [244]:
# Accuracy
rf_under_accuracy = accuracy_score(y_test, y_pred)
rf_under_accuracy

0.63

In [245]:
rf_under_report = classification_report(y_test, y_pred, output_dict=True) # Return dict to transform into dataframe
rf_under_report = pd.DataFrame(rf_under_report).T
rf_under_report

Unnamed: 0,precision,recall,f1-score,support
0,0.906434,0.631924,0.744687,8539.0
1,0.223375,0.618754,0.32825,1461.0
accuracy,0.63,0.63,0.63,0.63
macro avg,0.564905,0.625339,0.536468,10000.0
weighted avg,0.806639,0.63,0.683845,10000.0


By undersampling the majority class, the model has learned to recognize class 1 more often, but the precision is very low compared to recall, so a lot of its 1-classification is wrong.

### Model 4 : XGBoost

In [246]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2202)

In [247]:
# Calculate scale_pos_weight
neg, pos = y_train.value_counts()
scale_pos_weight = neg / pos

In [248]:
xgb_model = xgb.XGBClassifier(
    objective='binary:logitraw',  # Using raw output (logits)
    max_delta_step=1,             # Helps stabilize training for imbalanced datasets
    scale_pos_weight=scale_pos_weight,          # Adjust the weight for the minority class (ratio of neg/pos)
    subsample=0.7,                # Use % of the data for each tree
    colsample_bytree=0.4,         # Use % of the features for each tree
    learning_rate=0.05,           # Small learning rate for more stable convergence
    n_estimators=300,             # Number of boosting rounds (trees)
    random_state=2202
)
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)

In [249]:
# Perform 5-fold cross-validation
xgb_cv_scores = cross_val_score(xgb_model, X_train, y_train, cv=5, scoring='accuracy')
xgb_cv_scores.mean()

0.78375

In [250]:
# Accuracy
xgb_accuracy = accuracy_score(y_test, y_pred)
xgb_accuracy

0.7844

In [251]:
xgb_report = classification_report(y_test, y_pred, output_dict=True) # Return dict to transform into dataframe
xgb_report = pd.DataFrame(xgb_report).T
xgb_report

Unnamed: 0,precision,recall,f1-score,support
0,0.886146,0.857712,0.871697,8539.0
1,0.299712,0.355921,0.325407,1461.0
accuracy,0.7844,0.7844,0.7844,0.7844
macro avg,0.592929,0.606816,0.598552,10000.0
weighted avg,0.800468,0.7844,0.791884,10000.0


I played around with the parameters, and the precision hover around 0.3 and recall 0.35. Could be better!

### Model 4 : Catboost

In [252]:
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2202)

# CatBoost model
cat_model = CatBoostClassifier(
    iterations=200,            # Number of boosting iterations (trees).
    depth=15,                   # The depth of each individual tree. 
    learning_rate=0.05,        # The learning rate (also known as eta).
    l2_leaf_reg=2,             # L2 regularization for leaf values.
    border_count=150,          # Number of splits when dealing with categorical features.
    class_weights=[1, 10]      # Class weights to handle imbalanced classes.
)
# Train model
cat_model.fit(X_train, y_train, verbose=100)

# Make predictions
y_pred = cat_model.predict(X_test)

# Perform 5-fold cross-validation
cv_scores = cross_val_score(cat_model, X_train, y_train, cv=5, scoring='accuracy')

0:	learn: 0.6858826	total: 20.3ms	remaining: 4.04s
100:	learn: 0.4183834	total: 36.9s	remaining: 36.2s
199:	learn: 0.3077071	total: 1m 15s	remaining: 0us
0:	learn: 0.6857485	total: 8.59ms	remaining: 1.71s
1:	learn: 0.6723953	total: 8.59ms	remaining: 1.71s
2:	learn: 0.6635702	total: 516ms	remaining: 50.8s
3:	learn: 0.6574492	total: 535ms	remaining: 35s
4:	learn: 0.6479071	total: 1.06s	remaining: 51.5s
5:	learn: 0.6400935	total: 1.52s	remaining: 58.9s
6:	learn: 0.6317250	total: 1.95s	remaining: 1m 2s
7:	learn: 0.6252799	total: 2.39s	remaining: 1m 5s
8:	learn: 0.6221643	total: 2.43s	remaining: 58.1s
9:	learn: 0.6103822	total: 2.87s	remaining: 1m
10:	learn: 0.6052989	total: 3.27s	remaining: 1m 1s
11:	learn: 0.6036831	total: 3.27s	remaining: 56s
12:	learn: 0.5988238	total: 3.68s	remaining: 57.4s
13:	learn: 0.5970626	total: 3.69s	remaining: 52.8s
14:	learn: 0.5918553	total: 4.14s	remaining: 54.7s
15:	learn: 0.5858720	total: 4.57s	remaining: 56.1s
16:	learn: 0.5806084	total: 4.99s	remaining: 

In [253]:
cv_scores.mean()

0.668575

In [254]:
# Compute accuracy
cat_accuracy = accuracy_score(y_test, y_pred)
cat_accuracy

0.6579

In [255]:
cat_report = classification_report(y_test, y_pred, output_dict=True)
cat_report = pd.DataFrame(cat_report)
cat_report.T

Unnamed: 0,precision,recall,f1-score,support
0,0.899095,0.675138,0.771186,8539.0
1,0.226867,0.557153,0.32244,1461.0
accuracy,0.6579,0.6579,0.6579,0.6579
macro avg,0.562981,0.616145,0.546813,10000.0
weighted avg,0.800883,0.6579,0.705624,10000.0


### Conclusion

Overall, Model 2 (using the undersampling method) and CatBoost performed better than other models for classifying Case 1. When it comes to classifying potential lost customers, both Model 2 and CatBoost are recommended. Going forward, further model tuning and feature engineering could be explored to enhance performance. Additionally, potential solutions for addressing class imbalance include neural networks and treating the problem as an anomaly detection task.