In [29]:
# Import libraries
import pandas as pd
import numpy as np
import pickle


In [30]:
# Read the hotel bookings CSV into a DataFrame and preview its first five rows
dataset_path = 'Dataset_B_hotel.csv'
df = pd.read_csv(dataset_path)
df.head(5)

Unnamed: 0,Booking_ID,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status
0,INN00001,2,0,1,2,Meal Plan 1,0.0,Room_Type 1,224,2017,10,2,Offline,0,0,0,65.0,0,Not_Canceled
1,INN00002,2,0,2,3,Not Selected,0.0,Room_Type 1,5,2018,11,6,Online,0,0,0,106.68,1,Not_Canceled
2,INN00003,1,0,2,1,Meal Plan 1,0.0,Room_Type 1,1,2018,2,28,Online,0,0,0,60.0,0,Canceled
3,INN00004,2,0,0,2,Meal Plan 1,0.0,Room_Type 1,211,2018,5,20,Online,0,0,0,100.0,0,Canceled
4,INN00005,2,0,1,1,Not Selected,0.0,Room_Type 1,48,2018,4,11,Online,0,0,0,94.5,0,Canceled


# Preprocessing

First, we drop `Booking_ID` because it is only a row identifier and is useless for predictions.

In [31]:
# Booking_ID is a row identifier, not a feature, so it is removed.
# `df` is rebound instead of mutated in place, and errors='ignore' makes the
# drop a no-op when the column is already gone, so this cell can be re-run
# without raising KeyError.
df = df.drop(columns='Booking_ID', errors='ignore')

Dropping nulls

In [32]:
# Print each column's dtype and non-null count to see which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36275 entries, 0 to 36274
Data columns (total 18 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   no_of_adults                          36275 non-null  int64  
 1   no_of_children                        36275 non-null  int64  
 2   no_of_weekend_nights                  36275 non-null  int64  
 3   no_of_week_nights                     36275 non-null  int64  
 4   type_of_meal_plan                     35368 non-null  object 
 5   required_car_parking_space            35005 non-null  float64
 6   room_type_reserved                    36275 non-null  object 
 7   lead_time                             36275 non-null  int64  
 8   arrival_year                          36275 non-null  int64  
 9   arrival_month                         36275 non-null  int64  
 10  arrival_date                          36275 non-null  int64  
 11  market_segment_

In [33]:
# Keep only rows with no missing value in any column, then re-check the non-null counts
df = df.dropna(axis=0, how='any')
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32607 entries, 0 to 36274
Data columns (total 18 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   no_of_adults                          32607 non-null  int64  
 1   no_of_children                        32607 non-null  int64  
 2   no_of_weekend_nights                  32607 non-null  int64  
 3   no_of_week_nights                     32607 non-null  int64  
 4   type_of_meal_plan                     32607 non-null  object 
 5   required_car_parking_space            32607 non-null  float64
 6   room_type_reserved                    32607 non-null  object 
 7   lead_time                             32607 non-null  int64  
 8   arrival_year                          32607 non-null  int64  
 9   arrival_month                         32607 non-null  int64  
 10  arrival_date                          32607 non-null  int64  
 11  market_segment_type 

Dropping duplicates

In [34]:
# Remove exact duplicate rows and renumber the index 0..n-1 in a single call
df = df.drop_duplicates(ignore_index=True)

Encoding the categorical columns

In [35]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Encode categorical variables.
# Every object-dtype column is replaced by integer codes. Each column gets its
# own LabelEncoder, kept in `label_encoders` under the column name. Refitting a
# single shared encoder would discard the category-to-integer mapping of every
# column except the last one, leaving no way to decode the values or to encode
# new data consistently.
categorical_cols = df.select_dtypes(include=['object']).columns
label_encoders = {}

for col in categorical_cols:
    # `le` stays bound to the most recently fitted encoder after the loop.
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

Splitting the data

In [36]:
# Separate the target column from the feature matrix
target_col = 'booking_status'
X = df.drop(columns=[target_col])
y = df[target_col]

In [37]:
# Train-test split
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Scaling the data

In [38]:
# Feature scaling: fit the scaler on the training split only,
# then apply that same fitted scaler to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Training the models

## Random Forest

In [39]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Fit a seeded random forest on the scaled training split,
# then predict labels for the scaled test split
rf_model = RandomForestClassifier(random_state=42).fit(X_train_scaled, y_train)
y_pred_rf = rf_model.predict(X_test_scaled)

## XGBoost

In [40]:
from xgboost import XGBClassifier

# Train XGBoost on the scaled training split and predict the scaled test split.
# `use_label_encoder` is deliberately not passed: the installed XGBoost does not
# recognise it and only emits a 'Parameters: { "use_label_encoder" } are not used.'
# warning when it is supplied.
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42)
xgb_model.fit(X_train_scaled, y_train)
y_pred_xgb = xgb_model.predict(X_test_scaled)

Parameters: { "use_label_encoder" } are not used.



## Evaluation

In [41]:
# Test-set accuracy for each model
acc_rf = accuracy_score(y_test, y_pred_rf)
acc_xgb = accuracy_score(y_test, y_pred_xgb)

# Print the same three-part report (accuracy, confusion matrix, per-class metrics)
# for each model, Random Forest first and XGBoost second
for heading, accuracy, predictions in (
    ("Random Forest Accuracy:", acc_rf, y_pred_rf),
    ("XGBoost Accuracy:", acc_xgb, y_pred_xgb),
):
    print(heading, accuracy)
    print(confusion_matrix(y_test, predictions))
    print(classification_report(y_test, predictions))

Random Forest Accuracy: 0.8580194667795176
[[ 934  409]
 [ 262 3121]]
              precision    recall  f1-score   support

           0       0.78      0.70      0.74      1343
           1       0.88      0.92      0.90      3383

    accuracy                           0.86      4726
   macro avg       0.83      0.81      0.82      4726
weighted avg       0.85      0.86      0.86      4726

XGBoost Accuracy: 0.8624629707998307
[[ 965  378]
 [ 272 3111]]
              precision    recall  f1-score   support

           0       0.78      0.72      0.75      1343
           1       0.89      0.92      0.91      3383

    accuracy                           0.86      4726
   macro avg       0.84      0.82      0.83      4726
weighted avg       0.86      0.86      0.86      4726



In [42]:

# Choose the model to persist from the measured test accuracies instead of
# hard-coding it, so the saved artifact stays correct if the scores change on
# a re-run. XGBoost is kept on a tie.
best_model = xgb_model if acc_xgb >= acc_rf else rf_model

with open('best_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

# Both models were fit on StandardScaler-transformed features, so the fitted
# scaler is saved as well; without it, new inputs cannot be preprocessed the
# same way before calling the saved model.
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)