# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import HistGradientBoostingRegressor

# Load the data
df = pd.read_csv("hotel_bookings.csv")

# Rename 'adr' to 'price'; 'price' is the regression target used below
df.rename(columns={"adr": "price"}, inplace=True)

# Drop unnecessary columns
df.drop(columns=["company", "agent", "country"], inplace=True)

# Handle missing values
# Mean imputation is only defined for numeric data: SimpleImputer(strategy='mean')
# raises ValueError when it is handed string columns (such as 'hotel', which is
# label-encoded below). Numeric and non-numeric columns are therefore handled
# separately.
# NOTE: imputation statistics and encodings are fitted on the full dataset,
# i.e. before the train/test split further down.
numeric_cols = df.select_dtypes(include="number").columns
categorical_cols = df.select_dtypes(exclude="number").columns

# Using SimpleImputer to replace missing numeric values with the column mean
imputer = SimpleImputer(strategy='mean')
df_imputed = df.copy()
df_imputed[numeric_cols] = imputer.fit_transform(df[numeric_cols])

# Encode categorical variables
# Every non-numeric column must become numeric before it reaches the
# regressors; missing entries are first replaced by the column's most
# frequent value. The fitted encoders are kept per column so the integer
# codes can be mapped back to the original labels.
label_encoders = {}
for col in categorical_cols:
    column = df_imputed[col]
    modes = column.mode(dropna=True)
    if not modes.empty:
        column = column.fillna(modes.iloc[0])
    encoder = LabelEncoder()
    # astype(str) gives the encoder a uniformly typed column to sort
    df_imputed[col] = encoder.fit_transform(column.astype(str))
    label_encoders[col] = encoder

# Kept for backward compatibility: the encoder fitted on the 'hotel' column
label_encoder = label_encoders['hotel']

# Prepare data for training
X = df_imputed.drop("price", axis=1)
y = df_imputed["price"]

# Train-Test Split (60% train / 40% test, fixed seed for reproducibility)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=23)

# Train Linear Regression model
lm = LinearRegression()
lm.fit(x_train, y_train)

# Train a HistGradientBoostingRegressor on the same split for comparison
model = HistGradientBoostingRegressor()
model.fit(x_train, y_train)

# Evaluate model performance (optional); score() reports R^2 for regressors
print(f"Linear Regression Training score: {lm.score(x_train, y_train)}")
print(f"Linear Regression Testing score: {lm.score(x_test, y_test)}")

print(f"HistGradientBoostingRegressor Training score: {model.score(x_train, y_train)}")
print(f"HistGradientBoostingRegressor Testing score: {model.score(x_test, y_test)}")
