In [None]:
# Gavin Casper and Kai Hiratani

In [1]:
# imports
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

import argparse

from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer

from xgboost import XGBClassifier

In [2]:
from google.colab import drive
drive.mount('/content/drive')

file_path_Gavin = '/content/drive/MyDrive/hotel_booking.csv'
# replace this with where your csv file is located in google drive, I tested if it worked with different accounts
file_path_Kai = '/content/drive/MyDrive/hotel_booking.csv'

# Load the CSV into a DataFrame
data = pd.read_csv(file_path_Gavin)

# print(data.head())

Mounted at /content/drive


In [3]:
# preprocessing
columns_to_drop = [
    "name", "email", "phone-number", "credit_card", "reservation_status_date",
    "reservation_status", "company", "agent"
]

processed_data = data.drop(columns=columns_to_drop)

processed_data["children"].fillna(0, inplace=True)
processed_data["country"].fillna("Unknown", inplace=True)

categorical_columns = [
    "hotel", "arrival_date_month", "meal", "country", "market_segment",
    "distribution_channel", "reserved_room_type", "assigned_room_type",
    "deposit_type", "customer_type"
]

numerical_columns = [
    "lead_time", "arrival_date_week_number", "arrival_date_day_of_month",
    "stays_in_weekend_nights", "stays_in_week_nights", "adults", "children",
    "babies", "previous_cancellations", "previous_bookings_not_canceled",
    "booking_changes", "days_in_waiting_list", "adr",
    "required_car_parking_spaces", "total_of_special_requests"
]

X = processed_data.drop(columns=["is_canceled"])
y = processed_data["is_canceled"]

# for logistic regression
preprocessorLog = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_columns),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_columns)
    ]
)

# for decision trees and random forests, no scale
preprocessorDT = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('passthrough', 'passthrough')
        ]), numerical_columns),
        ("cat", Pipeline([
            ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
            ('onehot', OneHotEncoder(handle_unknown="ignore"))
        ]), categorical_columns)
    ]
)

# for neural networks, don't know if it will work or not
preprocessor_nn = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_columns),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_columns)
    ]
)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipeline_Log = Pipeline(steps=[("preprocessor", preprocessorLog)])
X_train_Log = pipeline_Log.fit_transform(X_train)
X_test_Log = pipeline_Log.transform(X_test)

pipeline_DT = Pipeline(steps=[("preprocessor", preprocessorDT)])
X_train_DT = pipeline_DT.fit_transform(X_train)
X_test_DT = pipeline_DT.transform(X_test)

pipeline_nn = Pipeline(steps=[("preprocessor", preprocessor_nn)])
X_train_nn = pipeline_nn.fit_transform(X_train)
X_test_nn = pipeline_nn.transform(X_test)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  processed_data["children"].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  processed_data["country"].fillna("Unknown", inplace=True)


In [4]:
# Logistic Regression
class MyLogisticRegression:
    def __init__(self):
        self.model_logistic = None

    def model_fit_logistic(self):
        self.model_logistic = LogisticRegression(max_iter=500)

        self.model_logistic.fit(X_train_Log, y_train)

    def model_predict_logistic(self):
        self.model_fit_logistic()

        accuracy = 0.0
        precision, recall, f1, support = np.array([0,0]), np.array([0,0]), np.array([0,0]), np.array([0,0])

        if X_test_Log is not None:
            y_pred = self.model_logistic.predict(X_test_Log)
            accuracy = accuracy_score(y_test, y_pred)
            precision, recall, f1, support = precision_recall_fscore_support(y_test, y_pred, average=None)

        assert precision.shape == recall.shape == f1.shape == support.shape == (2,), "precision, recall, f1, support should be an array of shape (2,)"

        return accuracy, precision, recall, f1, support

In [5]:
logModel = MyLogisticRegression()

accuracyLog, precisionLog, recallLog, f1Log, supportLog = logModel.model_predict_logistic()
print(accuracyLog, precisionLog, recallLog, f1Log, supportLog)
print('Accuracy: {}'.format(accuracyLog))

0.8202948320629868 [0.81910545 0.82302595] [0.91400013 0.66458589] [0.86395485 0.73536849] [14907  8971]
Accuracy: 0.8202948320629868


For Logistic Regression: recall for is_cancelled = 1 is low (66%), f1 support is low (73%), everything else is above 80%.

In [6]:
# Random Forest Classifier
class MyRandomForest:
  def __init__(self):
        self.model_rf = None
  def model_fit_rf(self):
    self.model_rf = RandomForestClassifier(n_estimators=100, random_state=42, max_depth=30, min_samples_split=2)

    self.model_rf.fit(X_train_DT, y_train)

  def model_predict_rf(self):
        # self.model_fit_rf()

        accuracy = 0.0
        precision, recall, f1, support = np.array([0,0]), np.array([0,0]), np.array([0,0]), np.array([0,0])

        if X_test_DT is not None:
            y_pred = self.model_rf.predict(X_test_DT)
            accuracy = accuracy_score(y_test, y_pred)
            precision, recall, f1, support = precision_recall_fscore_support(y_test, y_pred, average=None)

        assert precision.shape == recall.shape == f1.shape == support.shape == (2,), "precision, recall, f1, support should be an array of shape (2,)"

        return accuracy, precision, recall, f1, support

  def predict_single_rf(self, X_test, index):
    if self.model_rf is None:
      raise ValueError("The model has not been trained. Call model_fit_rf first.")

    if index < 0 or index >= X_test.shape[0]:
      raise IndexError(f"Index out of bounds. Please select an index between 0 and {X_test.shape[0] - 1}.")

    single_sample = X_test[index].reshape(1, -1)  # Reshape for a single prediction
    prediction = self.model_rf.predict(single_sample)
    return prediction[0]

In [7]:
rfModel = MyRandomForest()
rfModel.model_fit_rf()
accuracyRF, precisionRF, recallRF, f1RF, supportRF = rfModel.model_predict_rf()
print(accuracyRF, precisionRF, recallRF, f1RF, supportRF)
print('Accuracy: {}'.format(accuracyRF))

0.8847474662869587 [0.87920384 0.8960642 ] [0.94526062 0.78419351] [0.9110364  0.83640471] [14907  8971]
Accuracy: 0.8847474662869587


In [8]:
# XGBoost
class MyXGBoost:
  def __init__(self):
        self.model_xg = None
  def model_fit_xg(self):
    self.model_xg = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')

    self.model_xg.fit(X_train_DT, y_train)

  def model_predict_xg(self):
        # self.model_fit_xg()

        accuracy = 0.0
        precision, recall, f1, support = np.array([0,0]), np.array([0,0]), np.array([0,0]), np.array([0,0])

        if X_test_DT is not None:
            y_pred = self.model_xg.predict(X_test_DT)
            accuracy = accuracy_score(y_test, y_pred)
            precision, recall, f1, support = precision_recall_fscore_support(y_test, y_pred, average=None)

        assert precision.shape == recall.shape == f1.shape == support.shape == (2,), "precision, recall, f1, support should be an array of shape (2,)"

        return accuracy, precision, recall, f1, support

In [9]:
xgModel = MyXGBoost()
xgModel.model_fit_xg()
accuracyXG, precisionXG, recallXG, f1XG, supportXG = xgModel.model_predict_xg()
print(accuracyXG, precisionXG, recallXG, f1XG, supportXG)
print('Accuracy: {}'.format(accuracyXG))

Parameters: { "use_label_encoder" } are not used.



0.8721417204120948 [0.88101054 0.85556357] [0.91936674 0.79366849] [0.89978006 0.82345458] [14907  8971]
Accuracy: 0.8721417204120948


For Random Forest: better than logistic regression, 78% is the lowest (cancelled=1 recall)

In [10]:
# Decision Tree Classifier
class MyDecisionTree:
    def __init__(self):
        self.model_dt = None

    def model_fit_dt(self):
        # Initialize
        self.model_dt = DecisionTreeClassifier(random_state=42, max_depth=30, min_samples_split=2)
        # Fit the model
        self.model_dt.fit(X_train_DT, y_train)

    def model_predict_dt(self):
        # Train the model
        self.model_fit_dt()

        accuracy = 0.0
        precision, recall, f1, support = np.array([0, 0]), np.array([0, 0]), np.array([0, 0]), np.array([0, 0])

        if X_test_DT is not None:
            # Predict on the test set
            y_pred = self.model_dt.predict(X_test_DT)
            # Evaluate the model
            accuracy = accuracy_score(y_test, y_pred)
            precision, recall, f1, support = precision_recall_fscore_support(y_test, y_pred, average=None)

        assert precision.shape == recall.shape == f1.shape == support.shape == (2,), "precision, recall, f1, support should be an array of shape (2,)"

        return accuracy, precision, recall, f1, support


In [11]:
dtModel = MyDecisionTree()

# Train and evaluate the Decision Tree model
accuracyDT, precisionDT, recallDT, f1DT, supportDT = dtModel.model_predict_dt()

# Print results
print(accuracyDT, precisionDT, recallDT, f1DT, supportDT)
print('Accuracy: {}'.format(accuracyDT))


0.855054862216266 [0.88383635 0.80720339] [0.88401422 0.80693345] [0.88392528 0.8070684 ] [14907  8971]
Accuracy: 0.855054862216266


In [12]:
# Neural Network Classifier
class MyNeuralNetwork:
    def __init__(self):
        self.model_nn = None

    def model_fit_nn(self):
        # Initialize
        self.model_nn = MLPClassifier(
            hidden_layer_sizes=(100,),
            #hidden_layer_sizes=(128, 64),  # Two hidden layers
            activation='relu',
            solver='adam',
            max_iter=50,#change
            random_state=42
        )
        # Fit
        self.model_nn.fit(X_train_nn, y_train)

    def model_predict_nn(self):
        # Train
        self.model_fit_nn()

        accuracy = 0.0
        precision, recall, f1, support = np.array([0, 0]), np.array([0, 0]), np.array([0, 0]), np.array([0, 0])

        if X_test_nn is not None:
            # Predict
            y_pred = self.model_nn.predict(X_test_nn)
            # Evaluate
            accuracy = accuracy_score(y_test, y_pred)
            precision, recall, f1, support = precision_recall_fscore_support(y_test, y_pred, average=None)

        assert precision.shape == recall.shape == f1.shape == support.shape == (2,), "precision, recall, f1, support should be an array of shape (2,)"

        return accuracy, precision, recall, f1, support


In [13]:
nnModel = MyNeuralNetwork()

# Train and evaluate
accuracyNN, precisionNN, recallNN, f1NN, supportNN = nnModel.model_predict_nn()

print(accuracyNN, precisionNN, recallNN, f1NN, supportNN)
print('Accuracy: {}'.format(accuracyNN))

0.8671161738839098 [0.87081279 0.85985601] [0.92426377 0.77215472] [0.89674249 0.81364891] [14907  8971]
Accuracy: 0.8671161738839098




In [14]:
def main():
    # Initialize and train the model
    # rfModel = MyRandomForest()
    # rfModel.model_fit_rf(X_train_DT, y_train)

    # Ask the user for input
    try:
        index = int(input(f"Enter the index of the test data to predict (0 to {X_test_DT.shape[0] - 1}): "))
        prediction = rfModel.predict_single_rf(X_test_DT, index)
        actual = y_test.iloc[index]  # Assuming y_test is a Pandas Series or similar structure

        # Convert prediction and actual values to descriptive labels
        predLabel = "booking cancelled" if prediction == 1 else "booking kept"
        actualLabel = "booking cancelled" if actual == 1 else "booking kept"

        print(f"Prediction for test data at index {index}: {predLabel}")
        print(f"Actual value: {actualLabel}")

        if prediction == actual:
            print("The prediction is spot on! AI will soon take over the world!")
        else:
            print("The prediction is incorrect.")
    except ValueError:
        print("Please enter a usable integer.")
    except Exception as e:
        print(str(e))

# Run the main function
if __name__ == "__main__":
    main()

Enter the index of the test data to predict (0 to 23877): 2
Prediction for test data at index 2: booking kept
Actual value: booking kept
The prediction is spot on! AI will soon take over the world!
