In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
import joblib

In [2]:
# Read the hotel price dataset from its relative path under ../data
dataset_path = '../data/hotel_price_prediction_dataset.csv'
df = pd.read_csv(dataset_path)

# Preview the first rows as a quick sanity check that the load worked
df.head()

Unnamed: 0,Room_ID,Year,Season,Month,Weekday,No. of Guests,Room_Type,Base_Price,Predicted Price
0,R01,2024,Winter,1,Thursday,3,Deluxe,6761,7437
1,R13,2025,Summer,8,Sunday,1,Standard,5187,5187
2,R07,2025,Winter,1,Sunday,5,Suite,7666,8432
3,R09,2023,Fall,11,Tuesday,5,Deluxe,6021,6623
4,R15,2023,Spring,4,Monday,4,Standard,7608,10651


In [None]:
# 3. Encode categorical features
#
# Each column listed below is cast to string and replaced by the integer
# codes produced by its own LabelEncoder. The fitted encoders are kept in
# `label_encoders`, keyed by column name.
# NOTE(review): 'Month' goes through the string cast as well, so its codes
# presumably follow string order ('10' before '2') rather than calendar
# order -- confirm this is intended.
# NOTE(review): the columns of `df` are overwritten in place, so running
# this cell a second time would encode the already-encoded values -- run it
# once per fresh load.

categorical_cols = ['Season', 'Month', 'Weekday', 'Room_Type']
label_encoders = {column: LabelEncoder() for column in categorical_cols}

for col, encoder in label_encoders.items():
    df[col] = encoder.fit_transform(df[col].astype(str))

In [None]:
# 4. Prepare features (X) and target (y)
# The target is the 'Predicted Price' column; the features are all remaining
# columns except 'Room_ID', which is not used as a feature.
y = df['Predicted Price']
X = df.drop(['Room_ID', 'Predicted Price'], axis=1)


In [None]:
# 5. Train/Test split
# Hold out 20% of the rows for testing; random_state is fixed so the split
# is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [6]:
# 6. Train model
# Random forest regressor with n_estimators=100; random_state is fixed so
# repeated runs fit the same model.
rf_params = {'n_estimators': 100, 'random_state': 42}
model = RandomForestRegressor(**rf_params)
model.fit(X_train, y_train)


In [7]:
# 7. Evaluate
# Predict on the held-out split and report both metrics to two decimals.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse:.2f}")
print(f"R² Score: {r2:.2f}")


Mean Squared Error: 211432.24
R² Score: 0.91


In [8]:
# 8. Save model and encoders
# Write the fitted model and the dict of label encoders to ../models.
model_path = '../models/model.pkl'
encoders_path = '../models/encoders.pkl'

joblib.dump(model, model_path)
joblib.dump(label_encoders, encoders_path)

['../models/encoders.pkl']

In [9]:
# 9. Predicting a sample from test data
# Print the first held-out row as a Series, then predict on that same row.
sample_input = X_test.iloc[0]
print("Sample Input:\n", sample_input)

# Predict from a one-row DataFrame slice instead of a bare list of values:
# the model was fitted on a DataFrame, so passing the row together with its
# column names keeps the feature names attached and avoids scikit-learn's
# "X does not have valid feature names" warning.
sample_frame = X_test.iloc[[0]]
prediction = model.predict(sample_frame)
print(f"Predicted Room Price (LKR): {round(prediction[0], 2)}")


Sample Input:
 Year             2025
Season              2
Month               9
Weekday             0
No. of Guests       5
Room_Type           0
Base_Price       5107
Name: 521, dtype: int64
Predicted Room Price (LKR): 6426.77


