In [3]:
# Load the cleaned dataset
import pandas as pd

df = pd.read_csv("../ML-Exam/data/cleaned_airbnb_data.csv")

# Show the first few rows and check column names
df.head()


Unnamed: 0,ID,realSum,room_type,room_shared_bool,room_private_bool,person_capacity,host_is_superhost_bool,multi_bool,biz_bool,cleanliness_rating,guest_satisfaction_overall,bedrooms,dist,metro_dist,City,Is_weekend_bool
0,0,194.03,Private room,0,1,2,0,1,0,10,93,1,5.0,2.5,Amsterdam,0
1,1,344.25,Private room,0,1,4,0,0,0,8,85,1,0.5,0.2,Amsterdam,0
2,2,264.1,Private room,0,1,2,0,0,1,9,87,1,5.7,3.7,Amsterdam,0
3,3,433.53,Private room,0,1,4,0,0,1,9,90,2,0.4,0.4,Amsterdam,0
4,4,485.55,Private room,0,1,2,1,0,0,10,98,1,0.5,0.3,Amsterdam,0


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Step 1: Encode categorical features
df_encoded = pd.get_dummies(df, columns=['room_type', 'City'], drop_first=True)

# Step 2: Define features (X) and target (y)
X = df_encoded.drop(columns=['realSum', 'ID'])
y = df_encoded['realSum']

# Step 3: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Train Random Forest
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Step 5: Predict and evaluate
y_pred = rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

mse, r2


(44386.217999545435, 0.6063595181516739)

**Interpretation:**

R² = 0.605 means the model explains about 60.5% of the variance in Airbnb prices.

In [7]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid to search
param_grid = {
    'n_estimators': [100, 150],
    'max_depth': [None, 10],
}


# Create the model
rf = RandomForestRegressor(random_state=42)

# Setup GridSearchCV
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    scoring='r2',
    verbose=1
)

# Fit to training data
grid_search.fit(X_train, y_train)

# Best model and score
best_model = grid_search.best_estimator_
best_r2 = grid_search.best_score_

best_model, best_r2


Fitting 3 folds for each of 4 candidates, totalling 12 fits


(RandomForestRegressor(random_state=42), 0.3240563847348098)