In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from icecream import ic
import sys
sys.path.append("./../projectname")
from projectname.config import raw_data_dir
from projectname.config import clean_data_dir
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report


In [2]:
# Load the dataset
file_path = clean_data_dir / 'hotel_bookings_clean.csv'
bookings_df = pd.read_csv(file_path)
bookings_df

Unnamed: 0.1,Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,...,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status
0,0,Resort Hotel,0,342,2015,July,27,1,0,0,...,3,No Deposit,-1.0,-1.0,0,Transient,0.00,0,0,Check-Out
1,1,Resort Hotel,0,737,2015,July,27,1,0,0,...,4,No Deposit,-1.0,-1.0,0,Transient,0.00,0,0,Check-Out
2,2,Resort Hotel,0,7,2015,July,27,1,0,1,...,0,No Deposit,-1.0,-1.0,0,Transient,75.00,0,0,Check-Out
3,3,Resort Hotel,0,13,2015,July,27,1,0,1,...,0,No Deposit,304.0,-1.0,0,Transient,75.00,0,0,Check-Out
4,4,Resort Hotel,0,14,2015,July,27,1,0,2,...,0,No Deposit,240.0,-1.0,0,Transient,98.00,0,1,Check-Out
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119385,119385,City Hotel,0,23,2017,August,35,30,2,5,...,0,No Deposit,394.0,-1.0,0,Transient,96.14,0,0,Check-Out
119386,119386,City Hotel,0,102,2017,August,35,31,2,5,...,0,No Deposit,9.0,-1.0,0,Transient,225.43,0,2,Check-Out
119387,119387,City Hotel,0,34,2017,August,35,31,2,5,...,0,No Deposit,9.0,-1.0,0,Transient,157.71,0,4,Check-Out
119388,119388,City Hotel,0,109,2017,August,35,31,2,5,...,0,No Deposit,89.0,-1.0,0,Transient,104.40,0,0,Check-Out


## Columns to Discard:
 - hotel: The hotel type (City Hotel or Resort Hotel). While interesting, it might not be as predictive and can be indirectly inferred from other features.
 - arrival_date_month: The month of arrival. Less precise compared to week number and day of the month.
 - reservation_status: Indicates if the booking is canceled, checked-out, or no-show. This is directly related to the target variable and should be discarded to avoid data leakage.
 - reservation_status_date: The date when the reservation status was last updated. This can also lead to data leakage.
 - is_repeated_guest: Indicates if the guest has stayed before. While it can be relevant, it might be less impactful compared to other features. (This column wasn't listed initially, but should be considered if present.)

In [3]:

# Drop the unnamed column
bookings_df.drop(columns=["Unnamed: 0"], inplace=True)

# Separate features and target variable
X = bookings_df.drop(columns=["is_canceled"])
y = bookings_df["is_canceled"]

# Identify categorical and numerical features
categorical_cols = ['meal', 'country', 'market_segment', 'distribution_channel', 'reserved_room_type',
                    'assigned_room_type', 'deposit_type', 'customer_type', 'agent', 'company', 'is_repeated_guest']

numerical_cols = ['lead_time', 'arrival_date_year', 'arrival_date_week_number', 
                  'arrival_date_day_of_month', 'stays_in_weekend_nights', 'stays_in_week_nights', 
                  'adults', 'children', 'babies', 'adr', 'days_in_waiting_list', 
                  'required_car_parking_spaces', 'total_of_special_requests']




In [4]:
# Preprocessing for numerical data: impute missing values and scale
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Preprocessing for categorical data: impute missing values and one-hot encode
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, numerical_cols),
        ('cat', cat_transformer, categorical_cols)
    ])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=42)
}

# Train and evaluate each model
results = {}

for model_name, model in models.items():
    # Create a pipeline with the preprocessor and the model
    clf = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])
    
    # Train the model
    clf.fit(X_train, y_train)
    
    # Make predictions
    y_pred = clf.predict(X_test)
    
    # Evaluate the model
    report = classification_report(y_test, y_pred, output_dict=True)
    results[model_name] = report

results

{'Logistic Regression': {'0': {'precision': 0.8346716464363523,
   'recall': 0.8995102971758234,
   'f1-score': 0.8658788583236472,
   'support': 14907.0},
  '1': {'precision': 0.8082682708306669,
   'recall': 0.7039349013487906,
   'f1-score': 0.7525023832221163,
   'support': 8971.0},
  'accuracy': 0.826032331015998,
  'macro avg': {'precision': 0.8214699586335096,
   'recall': 0.8017225992623069,
   'f1-score': 0.8091906207728817,
   'support': 23878.0},
  'weighted avg': {'precision': 0.8247518590773355,
   'recall': 0.826032331015998,
   'f1-score': 0.8232831485432706,
   'support': 23878.0}},
 'Random Forest': {'0': {'precision': 0.8947569843092231,
   'recall': 0.9410344133628497,
   'f1-score': 0.9173124080431584,
   'support': 14907.0},
  '1': {'precision': 0.8928048780487805,
   'recall': 0.8160740162746628,
   'f1-score': 0.8527167899365209,
   'support': 8971.0},
  'accuracy': 0.8940866069185024,
  'macro avg': {'precision': 0.8937809311790018,
   'recall': 0.87855421481875

In [5]:
results_df = pd.DataFrame(results).transpose()
results_df

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
Logistic Regression,"{'precision': 0.8346716464363523, 'recall': 0....","{'precision': 0.8082682708306669, 'recall': 0....",0.826032,"{'precision': 0.8214699586335096, 'recall': 0....","{'precision': 0.8247518590773355, 'recall': 0...."
Random Forest,"{'precision': 0.8947569843092231, 'recall': 0....","{'precision': 0.8928048780487805, 'recall': 0....",0.894087,"{'precision': 0.8937809311790018, 'recall': 0....","{'precision': 0.8940235750931066, 'recall': 0...."
Gradient Boosting,"{'precision': 0.8543829840790385, 'recall': 0....","{'precision': 0.8334794040315513, 'recall': 0....",0.847391,"{'precision': 0.8439311940552949, 'recall': 0....","{'precision': 0.8465294780648829, 'recall': 0...."
