In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from imblearn.over_sampling import SMOTE
import joblib

Data Loading

In [3]:
df = pd.read_csv("data/hotel_bookings.csv")

## Prediction of Cancellation

Task Type: Classification Problem Yes or No

Why choosing these models ?

- Logistic Regression - for binary outcomes
- RFC - handle complex patterns and interactions between features for categorical predictions.
- DTC - make decisions on categorical labels 

Model Output:

- The output is a label or category, such as "Yes" (1) or "No" (0). The classification models work to assign probabilities to each label and classify data points based on these probabilities.

Evaluation Metrics:

- Metrics such as accuracy, precision, recall, and F1-score are used to evaluate how well the model predicts the correct category.
- These metrics are relevant for measuring how many bookings are accurately predicted as "canceled" or "not canceled."

In [4]:

features = ['lead_time', 'hotel', 'market_segment', 'previous_cancellations', 
            'booking_changes', 'total_of_special_requests', 'arrival_date_month']
df_cancellation = df[features + ['is_canceled']].dropna()

label_encoders = {}
for col in ['hotel', 'market_segment', 'arrival_date_month']:
    le = LabelEncoder()
    df_cancellation[col] = le.fit_transform(df_cancellation[col])
    label_encoders[col] = le
df_cancellation['is_canceled'] = df_cancellation['is_canceled'].apply(lambda x: 1 if x == 'yes' else 0)
X_cancel = df_cancellation[features]
y_cancel = df_cancellation['is_canceled']
X_train_cancel, X_test_cancel, y_train_cancel, y_test_cancel = train_test_split(X_cancel, y_cancel, test_size=0.3, random_state=42)


In [5]:

# Function to train and evaluate a model
def train_and_evaluate_model(model, X_train_cancel, X_test_cancel, y_train_cancel, y_test_cancel):
    model.fit(X_train_cancel, y_train_cancel)
    y_pred_cancel = model.predict(X_test_cancel)
    accuracy = accuracy_score(y_test_cancel, y_pred_cancel)
    report = classification_report(y_test_cancel, y_pred_cancel)
    return accuracy, report


In [6]:

# Models to evaluate
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42)
}


# Train and evaluate each model
for model_name, model in models.items():
    accuracy, report = train_and_evaluate_model(model, X_train_cancel, X_test_cancel, y_train_cancel, y_test_cancel)
    print(f"Model: {model_name}")
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print("Classification Report:\n", report)
    print("-" * 60)


Model: Random Forest
Accuracy: 80.74%
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.87      0.85     22478
           1       0.76      0.70      0.73     13339

    accuracy                           0.81     35817
   macro avg       0.80      0.79      0.79     35817
weighted avg       0.81      0.81      0.81     35817

------------------------------------------------------------
Model: Logistic Regression
Accuracy: 72.67%
Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.88      0.80     22478
           1       0.70      0.47      0.56     13339

    accuracy                           0.73     35817
   macro avg       0.72      0.67      0.68     35817
weighted avg       0.72      0.73      0.71     35817

------------------------------------------------------------
Model: Decision Tree
Accuracy: 80.31%
Classification Report:
               precision    recal

### FineTuning

In [8]:
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 3, 5]
}
random_search_cancel = RandomizedSearchCV(estimator=RandomForestClassifier(random_state=42), param_distributions=param_dist,
                                          n_iter=50, cv=3, n_jobs=-1, scoring='recall', random_state=42)
random_search_cancel.fit(X_train_cancel, y_train_cancel)

In [9]:
best_model_cancel = random_search_cancel.best_estimator_
print("Best Hyperparameters:", random_search_cancel.best_params_)

Best Hyperparameters: {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_depth': 7}


Saving the Fine tuned model

In [10]:
joblib.dump(best_model_cancel, 'Models/tuned_cancellation_model.pkl')

['Models/tuned_cancellation_model.pkl']

Evaluating  the saved model

In [11]:
y_pred_cancel = best_model_cancel.predict(X_test_cancel)
accuracy_cancel = accuracy_score(y_test_cancel, y_pred_cancel)
report_cancel = classification_report(y_test_cancel, y_pred_cancel)

print(f"Accuracy: {accuracy_cancel * 100:.2f}%")
print("Classification Report:\n", report_cancel)

# Save the evaluation results (accuracy and classification report)
with open('cancellation_evaluation_report.txt', 'w') as f:
    f.write(f"Best Hyperparameters: {random_search_cancel.best_params_}\n")
    f.write(f"Accuracy: {accuracy_cancel * 100:.2f}%\n")
    f.write("Classification Report:\n")
    f.write(report_cancel)

Accuracy: 77.04%
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.87      0.83     22478
           1       0.73      0.60      0.66     13339

    accuracy                           0.77     35817
   macro avg       0.76      0.74      0.74     35817
weighted avg       0.77      0.77      0.77     35817



## Predicting Average Daily Rate (ADR) or Revenue

Task Type: Regression problem, where the goal is to predict a continuous numerical value

Why choosing these models ?
- Linear Regression: The simplest regression model that tries to fit a line to explain the relationship between input features and the output.
- Random Forest Regressor and Gradient Boosting Regressor: These models handle complex relationships and non-linear patterns well, which is useful for predicting a continuous value like ADR.
- Decision Tree Regressor: A tree-based approach to predict continuous values.

Model Outputs:

- The output is a continuous numerical value, such as the predicted ADR. Regression models are designed to minimize the error between predicted values and the true continuous values.

Evaluation Metrics:
- Metrics such as Mean Absolute Error (MAE), Mean Squared Error (MSE), and R-squared (R²) are used to evaluate how close the predicted numerical values are to the actual ADR values.
- These metrics help in understanding the model's accuracy in predicting the continuous target variable.


In [12]:
adr_features = ['hotel', 'lead_time', 'market_segment', 'arrival_date_month',
                'previous_cancellations', 'booking_changes', 'total_of_special_requests', 'is_repeated_guest']

adr_df = df[adr_features + ['adr']].dropna()
for col in ['hotel', 'market_segment', 'arrival_date_month']:
    adr_df[col] = label_encoders[col].transform(adr_df[col])

In [13]:

X_adr = adr_df[adr_features]
y_adr = adr_df['adr']
X_train_adr, X_test_adr, y_train_adr, y_test_adr = train_test_split(X_adr, y_adr, test_size=0.3, random_state=42)

# Function to train and evaluate regression models
def train_and_evaluate_regression(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = root_mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    return mae, mse, r2

# Regression models to evaluate
regression_models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree Regressor': DecisionTreeRegressor(random_state=42),
    'Random Forest Regressor': RandomForestRegressor(random_state=42),
    'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=42)
}


In [14]:
# Train and evaluate each regression model
print("Average Daily Rate (ADR) Prediction Results:\n")
for model_name, model in regression_models.items():
    mae, mse, r2 = train_and_evaluate_regression(model, X_train_adr, X_test_adr, y_train_adr, y_test_adr)
    print(f"Model: {model_name}")
    print(f"Mean Absolute Error (MAE): {mae:.2f}")
    print(f"Mean Squared Error (MSE): {mse:.2f}")
    print(f"R-squared (R²): {r2:.2f}")
    print("-" * 60)

Average Daily Rate (ADR) Prediction Results:

Model: Linear Regression
Mean Absolute Error (MAE): 33.76
Mean Squared Error (MSE): 45.61
R-squared (R²): 0.10
------------------------------------------------------------
Model: Decision Tree Regressor
Mean Absolute Error (MAE): 21.23
Mean Squared Error (MSE): 40.47
R-squared (R²): 0.29
------------------------------------------------------------
Model: Random Forest Regressor
Mean Absolute Error (MAE): 19.77
Mean Squared Error (MSE): 34.37
R-squared (R²): 0.49
------------------------------------------------------------
Model: Gradient Boosting Regressor
Mean Absolute Error (MAE): 24.42
Mean Squared Error (MSE): 34.18
R-squared (R²): 0.50
------------------------------------------------------------


In [15]:
gbr = GradientBoostingRegressor(random_state=42)

# Defining the parameter space to search
param_dist = {
    'n_estimators': np.arange(100, 500, 100),
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': np.arange(3, 10),
    'min_samples_split': np.arange(2, 15, 2),
    'min_samples_leaf': np.arange(1, 10),
    'subsample': [0.6, 0.8, 1.0]
}

# Setting up the RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=gbr, param_distributions=param_dist, 
                                   n_iter=50, cv=3, n_jobs=-1, scoring='neg_mean_squared_error', verbose=2, random_state=42)


random_search.fit(X_train_adr, y_train_adr)
print("Best Parameters:", random_search.best_params_)
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test_adr)

mae = mean_absolute_error(y_test_adr, y_pred)
mse = root_mean_squared_error(y_test_adr, y_pred)
r2 = r2_score(y_test_adr, y_pred)

print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared (R²): {r2:.2f}")


Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best Parameters: {'subsample': 0.8, 'n_estimators': 300, 'min_samples_split': 10, 'min_samples_leaf': 9, 'max_depth': 9, 'learning_rate': 0.05}
Mean Absolute Error (MAE): 20.12
Mean Squared Error (MSE): 31.31
R-squared (R²): 0.58


In [16]:
best_adr_model_cancel = random_search.best_params_
print("Best Hyperparameters:", random_search.best_params_)

Best Hyperparameters: {'subsample': 0.8, 'n_estimators': 300, 'min_samples_split': 10, 'min_samples_leaf': 9, 'max_depth': 9, 'learning_rate': 0.05}


In [17]:
joblib.dump(best_adr_model_cancel, 'Models/tuned_adr_model.pkl')

['Models/tuned_adr_model.pkl']

## Predicting Booking Modifications

In [18]:
mod_features = ['lead_time', 'hotel', 'market_segment', 'previous_cancellations', 
                'total_of_special_requests', 'arrival_date_month', 'is_repeated_guest']
mod_df = df[mod_features + ['booking_changes']].dropna()
for col in ['hotel', 'market_segment', 'arrival_date_month']:
    mod_df[col] = label_encoders[col].transform(mod_df[col])
mod_df['booking_changes_binary'] = mod_df['booking_changes'].apply(lambda x: 1 if x > 0 else 0)

In [19]:

X_mod = mod_df[mod_features]
y_mod = mod_df['booking_changes_binary']
X_train_mod, X_test_mod, y_train_mod, y_test_mod = train_test_split(X_mod, y_mod, test_size=0.3, random_state=42)

# Define and train classification models
classifiers = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42)
}

# Train and evaluate each model
for model_name, model in classifiers.items():
    model.fit(X_train_mod, y_train_mod)
    y_pred_mod = model.predict(X_test_mod)
    
    accuracy = accuracy_score(y_test_mod, y_pred_mod)
    report = classification_report(y_test_mod, y_pred_mod)
    
    print(f"Model: {model_name}")
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print("Classification Report:\n", report)
    print("-" * 60)

Model: Logistic Regression
Accuracy: 84.75%
Classification Report:
               precision    recall  f1-score   support

           0       0.85      1.00      0.92     30359
           1       0.20      0.00      0.00      5458

    accuracy                           0.85     35817
   macro avg       0.52      0.50      0.46     35817
weighted avg       0.75      0.85      0.78     35817

------------------------------------------------------------
Model: Random Forest
Accuracy: 83.86%
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.94      0.91     30359
           1       0.45      0.26      0.33      5458

    accuracy                           0.84     35817
   macro avg       0.66      0.60      0.62     35817
weighted avg       0.81      0.84      0.82     35817

------------------------------------------------------------
Model: Gradient Boosting
Accuracy: 84.86%
Classification Report:
               precision    r

Need to Adjust Class Imbalance
- Resampling 
    - Oversampling the minority class (modifications) using techniques like SMOTE (Synthetic Minority Over-sampling Technique) or random oversampling.

In [20]:
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_mod, y_train_mod)
# Define and train classification models
classifiers = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42)
}

# Train and evaluate each model
for model_name, model in classifiers.items():
    model.fit(X_resampled, y_resampled)
    y_pred_mod = model.predict(X_test_mod)
    
    accuracy = accuracy_score(y_test_mod, y_pred_mod)
    report = classification_report(y_test_mod, y_pred_mod)
    
    print(f"Model: {model_name}")
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print("Classification Report:\n", report)
    print("-" * 60)

Model: Logistic Regression
Accuracy: 61.93%
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.64      0.74     30359
           1       0.20      0.49      0.28      5458

    accuracy                           0.62     35817
   macro avg       0.54      0.56      0.51     35817
weighted avg       0.77      0.62      0.67     35817

------------------------------------------------------------
Model: Random Forest
Accuracy: 70.70%
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.74      0.81     30359
           1       0.27      0.52      0.35      5458

    accuracy                           0.71     35817
   macro avg       0.58      0.63      0.58     35817
weighted avg       0.80      0.71      0.74     35817

------------------------------------------------------------
Model: Gradient Boosting
Accuracy: 64.08%
Classification Report:
               precision    r

In [23]:
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 3, 5]
}

random_search = RandomizedSearchCV(estimator=RandomForestClassifier(random_state=42), param_distributions=param_dist,
                                   n_iter=50, cv=3, n_jobs=-1, scoring='recall', random_state=42)

random_search.fit(X_resampled, y_resampled)
best_mod_model = random_search.best_estimator_
y_pred_mod = best_mod_model.predict(X_test_mod)
accuracy = accuracy_score(y_test_mod, y_pred_mod)
report = classification_report(y_test_mod, y_pred_mod)
print(f"Best Parameters: {random_search.best_params_}")
print(f"Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:\n", report)

Best Parameters: {'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 3, 'max_depth': 7}
Accuracy: 68.36%
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.72      0.79     30359
           1       0.24      0.48      0.32      5458

    accuracy                           0.68     35817
   macro avg       0.56      0.60      0.56     35817
weighted avg       0.79      0.68      0.72     35817



In [26]:
joblib.dump(best_mod_model, 'Models/tuned_booking_modifications_model.pkl')

['Models/tuned_booking_modifications_model.pkl']