# Hospitality Heuristics: Reviewing Revenues

---

- Data: hotel datasets from capstone
- Goal: Design/develop on-demand report generation for common metrics
- Breakdown:
    - Calculate daily revenue(s)
    - Calculate daily occupancy
        - By room type
        - By rate segmentation
        - By rate amount (e.g., quartiles, etc.)

---

# Package Imports

In [None]:
import datetime as dt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.over_sampling import SMOTE

import seaborn as sns

import shap

from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split

import xgboost as xgb

# Notebook-wide pandas display settings: show up to 100 rows/columns and
# render floats with thousands separators and two decimal places.
display_settings = {
    'display.max_columns': 100,
    'display.float_format': '{:,.2f}'.format,
    'display.max_rows': 100,
}
for option_name, option_value in display_settings.items():
    pd.set_option(option_name, option_value)

%matplotlib inline

# Reading Datasets

* Need to import data separately
* Add a unique identifier for each source (the `IsResort` flag)
* Concatenate into single dataframe

In [None]:
# Locations of the two raw hotel workbooks, relative to the notebook.
data_path_city = './data/City_Data.xlsx'
data_path_resort = './data/Resort_Data.xlsx'

In [None]:
# Load each property's workbook and tag its rows so the source hotel
# stays identifiable once the two frames are stacked.
resort_df = pd.read_excel(data_path_resort)
resort_df['IsResort'] = 1

city_df = pd.read_excel(data_path_city)
city_df['IsResort'] = 0

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# each frame keeps its own row labels, so labels repeat in full_data and
# label-based lookups/alignment become ambiguous.
full_data = pd.concat([resort_df, city_df], axis=0, ignore_index=True)
full_data

# Review Full Dataset

**What will I reliably know?**

* Need to filter for features guaranteed to be on reservations
* Could build other features using other models (e.g., regression/classifications)

In [None]:
full_data.info()

## Feature Engineering


Some features are good as-is, while others have options for feature engineering

simple_features: 'LeadTime', 'ArrivalDateWeekNumber', 'ArrivalDateDayOfMonth', 'StaysInWeekNights',
                   'StaysInWeekendNights','ADR', 'CustomerType','MarketSegment', 'ReservedRoomType',
                   'IsRepeatedGuest', 'PreviousCancellations', 'PreviousBookingsNotCanceled',
                   'BookingChanges', 'TotalOfSpecialRequests', 'IsResort'

dow_arrival: 'ArrivalDateYear', 'ArrivalDateMonth', 'ArrivalDateDayOfMonth'

dow_departure: 'ArrivalDateYear', 'ArrivalDateMonth', 'ArrivalDateDayOfMonth', 'StaysInWeekNights',
                   'StaysInWeekendNights'

In [None]:
# full_data['StaysInWeekNights'].value_counts(ascending = False, normalize = True, dropna = False)
full_data['StaysInWeekNights'].describe()

In [None]:
# Summary of the cancellation flag for reservations whose week-night and
# weekend-night counts add up to zero.
zero_night_mask = (full_data['StaysInWeekNights'] + full_data['StaysInWeekendNights']) == 0
full_data[zero_night_mask]['IsCanceled'].describe()

In [None]:
full_data['ADR'].describe()

In [None]:
# Relative frequency of each IsCanceled value among reservations with a
# non-zero ADR.
nonzero_adr_mask = full_data['ADR'] != 0
full_data[nonzero_adr_mask]['IsCanceled'].value_counts(normalize=True)

# Feature Notes

* The ADR and StaysInWeekNights/StaysInWeekendNights features are too strongly indicative of whether or not a reservation is canceled.
* Remove these features for now; may be useful in later modeling

In [None]:
# full_data[dow_arrival]

In [None]:
# Build a "year-month-day" string per reservation and let pandas parse it.
# ArrivalDateMonth is concatenated as-is, so it must already be a string column.
# NOTE(review): no explicit `format=` is given because the month encoding
# (name vs. number) is not visible here -- confirm and pin the format if possible.
arrival_date_text = (
    full_data['ArrivalDateYear'].astype(str)
    + "-" + full_data['ArrivalDateMonth']
    + "-" + full_data['ArrivalDateDayOfMonth'].astype(str)
)

# pandas numbers weekdays Monday=0 through Sunday=6.
full_data['ArrivalDayofWeek'] = pd.to_datetime(arrival_date_text).dt.dayofweek
full_data['ArrivalDayofWeek']

In [None]:
full_data.head()

# Selecting Features

I want to select a subset of reservation features that are independent of any temporal impact.

Several features (e.g., BookingChanges) will be too variable depending on the age of the reservation. I want to understand the likelihood of a cancellation separate from any variable features.

In [None]:
# Predictor columns carried into modeling; the order here becomes the column
# order of the modeling frame.
# NOTE(review): the feature notes earlier in this notebook flag ADR as too
# indicative of cancellation, yet it is still listed here -- confirm intent.
selected_features = [
    # Booking / arrival timing
    'LeadTime',
    'ArrivalDateWeekNumber',
    'ArrivalDateDayOfMonth',
    'ArrivalDayofWeek',
    # Rate and reservation attributes
    'ADR',
    'CustomerType',
    'MarketSegment',
    'ReservedRoomType',
    # Guest history
    'IsRepeatedGuest',
    'PreviousCancellations',
    'PreviousBookingsNotCanceled',
    # Requests and property flag
    'TotalOfSpecialRequests',
    'IsResort',
]

selected_features

In [None]:
# Modeling frame: the label column first, followed by the chosen predictors.
subset_columns = ['IsCanceled', *selected_features]
data_subset = full_data[subset_columns]
data_subset

In [None]:
data_subset.info()

In [None]:
data_subset.describe(include = "number")

In [None]:
data_subset.describe(exclude = "number")

# Starting Modeling

In [None]:
def classy_report(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training data and report its test-set performance.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fitted
        in place.
    X_train, y_train
        Features and labels used to fit the estimator.
    X_test, y_test
        Features and labels the fitted estimator is scored against.

    Returns
    -------
    None
        The classification report is printed (padded with blank lines) and a
        confusion matrix, normalized over the true labels, is drawn.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    report_text = classification_report(y_test, predictions)
    print("\n\n", report_text, "\n\n")

    ConfusionMatrixDisplay.from_predictions(
        y_test,
        predictions,
        normalize="true",
        cmap='Blues',
    )

## Train-Test Split

In [None]:
target = 'IsCanceled'

# Label vector
y = data_subset[target]

# Feature matrix: everything except the label, restricted to numeric columns.
# NOTE(review): select_dtypes('number') silently drops any non-numeric
# predictors in data_subset -- confirm that is intended rather than encoding them.
predictors = data_subset.drop(columns=[target])
X = predictors.select_dtypes('number')

In [None]:
X.info()

In [None]:
y.info()

In [None]:
# Hold out 25% of the rows for evaluation; stratifying on y keeps the class
# proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.25,
    random_state=42,
)

## XGBoost

In [None]:
# XGBoost classifier with log loss as its evaluation metric
model = xgb.XGBClassifier(eval_metric='logloss')

# Fit on the training split, then print the report and draw the confusion
# matrix for the test split
classy_report(model=model, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)

## HistGradientBoostingClassifier

In [None]:
# scikit-learn's histogram-based gradient boosting with default settings
classifier = HistGradientBoostingClassifier()

# Fit on the training split, then print the report and draw the confusion
# matrix for the test split
classy_report(model=classifier, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)

# Imbalanced-Learn

In [None]:
# imbalanced-learn's balanced random forest, seeded for reproducibility
brf_classifier = BalancedRandomForestClassifier(random_state=42)

# Fit on the training split, then print the report and draw the confusion
# matrix for the test split
classy_report(model=brf_classifier, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)

In [None]:
# Applying SMOTE to generate synthetic samples for the minority class
# smote = SMOTE(random_state=42)
# X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# smote_enn = SMOTETomek(random_state=42)
# X_resampled, y_resampled = smote_enn.fit_resample(X, y)


## RandomForestClassifier

In [None]:
# Plain random forest (no class weighting), using every available core
rfc_imb = RandomForestClassifier(n_jobs=-1, random_state=42)

# Fit on the training split, then print the report and draw the confusion
# matrix for the test split
classy_report(model=rfc_imb, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)

In [None]:
# Random forest with class_weight='balanced', using every available core
rfc_bal = RandomForestClassifier(class_weight='balanced', n_jobs=-1, random_state=42)

# Fit on the training split, then print the report and draw the confusion
# matrix for the test split
classy_report(model=rfc_bal, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)

## SMOTE-ing

In [None]:
# Oversample with synthetic SMOTE rows. Only the training split is passed to
# the sampler, so the test split is left untouched for evaluation.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X=X_train, y=y_train)

In [None]:
# Plain random forest, this time fitted on the resampled training data
rfc_imb = RandomForestClassifier(n_jobs=-1, random_state=42)

# Fit on the resampled rows; scoring still uses the original test split
classy_report(model=rfc_imb, X_train=X_resampled, y_train=y_resampled, X_test=X_test, y_test=y_test)

In [None]:
# Random forest with class_weight='balanced', fitted on the resampled training data
rfc_bal = RandomForestClassifier(random_state=42, n_jobs=-1, class_weight='balanced')

# Evaluate rfc_bal itself. Passing rfc_imb here would only repeat the
# unweighted model's results and leave the balanced forest unfitted.
classy_report(rfc_bal, X_resampled, y_resampled, X_test, y_test)

In [None]:
## Testing alternate SMOTE - SMOTEENN
# Resample the training split only. Fitting the sampler on the full X, y would
# put test rows (and synthetic rows derived from them) into the training data
# and inflate the scores reported on X_test.
smote_enn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)

# Create a classifier
rfc_imb = RandomForestClassifier(random_state=42, n_jobs=-1)

## Generate predictions and classification reports
classy_report(rfc_imb, X_resampled, y_resampled, X_test, y_test)

In [None]:
## Testing alternate SMOTE - SMOTETomek
# Resample the training split only. Fitting the sampler on the full X, y would
# put test rows (and synthetic rows derived from them) into the training data
# and inflate the scores reported on X_test.
smote_tomek = SMOTETomek(random_state=42)
X_resampled, y_resampled = smote_tomek.fit_resample(X_train, y_train)

# Create a classifier
rfc_tomek = RandomForestClassifier(random_state=42, n_jobs=-1)

## Generate predictions and classification reports
classy_report(rfc_tomek, X_resampled, y_resampled, X_test, y_test)

In [None]:
# Permutation importance of rfc_tomek on the held-out split: each feature is
# shuffled 10 times, in parallel across all cores.
result = permutation_importance(
    rfc_tomek,
    X_test,
    y_test,
    n_repeats=10,
    random_state=42,
    n_jobs=-1,
)

# Column labels and the per-feature mean importance across the repeats
feature_names = X_test.columns
importances = result.importances_mean

In [None]:
# Rank features from most to least important and keep at most the first top_n
top_n = 10
indices = np.argsort(importances)[::-1]
top_indices = indices[:top_n]

# Create a horizontal bar plot
plt.figure(figsize=(10, 5))
# Derive the count from what is actually plotted so the title cannot drift
# from the slice above (it previously read "Top 5" while 10 bars were drawn).
plt.title(f"Top {len(top_indices)} Permutation Importances")
bar_positions = range(len(top_indices))
plt.barh(bar_positions, importances[top_indices], color='blue', align='center')
plt.yticks(bar_positions, [feature_names[i] for i in top_indices])
plt.gca().invert_yaxis()  # Invert y-axis to have the highest value on top
plt.xlabel("Mean Decrease in Accuracy")
plt.ylabel("Feature")
plt.show()

In [None]:
# # Create a SHAP TreeExplainer and compute SHAP values
# explainer = shap.TreeExplainer(rfc_tomek)
# shap_values = explainer.shap_values(X_test)

# # Generate a violin plot for the first class predictions (for binary classification)
# shap.summary_plot(shap_values[1], X_test, plot_type="violin")

In [None]:
# Select a random subset of rows for SHAP analysis; the sample size is capped
# at the number of available rows so sampling cannot fail on a small frame.
subset = X_resampled.sample(n=min(1000, len(X_resampled)), random_state=42)

# Create a SHAP explainer, using the resampled training data as background
# NOTE(review): confirm that shap.Explainer accepts `n_jobs` for this model
# type; the argument is kept exactly as originally written.
explainer = shap.Explainer(rfc_tomek, X_resampled, n_jobs=-1)

# Compute SHAP values for the sampled rows only. The subset was previously
# created but never used, so every resampled row was being explained.
shap_values = explainer(subset)

# Generate a violin plot
# NOTE(review): for a classifier the explanation may carry a per-class
# dimension -- confirm whether one class must be selected before plotting.
shap.plots.violin(shap_values)

In [None]:
city_df['LeadTime'].describe()

In [None]:
city_df['LeadTime']

In [None]:
# Scatter of lead time against the cancellation flag for the city hotel,
# with seaborn's fitted regression line overlaid.
sns.regplot(data=city_df, x='LeadTime', y='IsCanceled')