In [1]:
# import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from math import radians
from category_encoders import BinaryEncoder
from datetime import datetime
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error 
import warnings
warnings.filterwarnings("ignore")

In [2]:
# read the cleaned listings CSV from the working directory and preview the first rows
csv_path = 'airbnb_dataset_clean.csv'
df = pd.read_csv(csv_path)
df.head(n=5)

Unnamed: 0,neighbourhood,host_listings_count,host_identity_verified,property_type,room_type,accommodates,beds,price,host_duration_days,log_price,location_cluster
0,North Beach,1,1,Entire condo,Entire home/apt,4,2,417.0,3026,6.035481,0
1,South of Market,32,1,Entire serviced apartment,Entire home/apt,2,1,280.0,854,5.638355,0
2,Noe Valley,1,1,Entire condo,Entire home/apt,4,2,195.0,3645,5.278115,3
3,South of Market,1,1,Private room in home,Private room,2,1,60.0,3468,4.110874,0
4,Castro/Upper Market,1,0,Private room in condo,Private room,1,1,120.0,4290,4.795791,1


In [3]:
# count the null entries in each column
df.isnull().sum()

neighbourhood             0
host_listings_count       0
host_identity_verified    0
property_type             0
room_type                 0
accommodates              0
beds                      0
price                     0
host_duration_days        0
log_price                 0
location_cluster          0
dtype: int64

## Encoding

In [4]:
# swap 'property_type' for its binary-coded columns (property_type_0, property_type_1, ...)
binary_encoded_cols = ['property_type']
binary_encoder = BinaryEncoder(cols=binary_encoded_cols)
df = binary_encoder.fit_transform(df)

In [5]:
# expand 'room_type' into one indicator column per category, then preview the result
df = pd.get_dummies(data=df, columns=['room_type'])
df.head(n=5)

Unnamed: 0,neighbourhood,host_listings_count,host_identity_verified,property_type_0,property_type_1,property_type_2,property_type_3,property_type_4,property_type_5,accommodates,beds,price,host_duration_days,log_price,location_cluster,room_type_Entire home/apt,room_type_Hotel room,room_type_Private room,room_type_Shared room
0,North Beach,1,1,0,0,0,0,0,1,4,2,417.0,3026,6.035481,0,1,0,0,0
1,South of Market,32,1,0,0,0,0,1,0,2,1,280.0,854,5.638355,0,1,0,0,0
2,Noe Valley,1,1,0,0,0,0,0,1,4,2,195.0,3645,5.278115,3,1,0,0,0
3,South of Market,1,1,0,0,0,0,1,1,2,1,60.0,3468,4.110874,0,0,0,1,0
4,Castro/Upper Market,1,0,0,0,0,1,0,0,1,1,120.0,4290,4.795791,1,0,0,1,0


## Splitting Dataset

In [6]:
# separate the target from the predictors, then hold out 25% of the rows for testing
target_col = 'log_price'
# 'neighbourhood' is left out of the feature set; 'price' is dropped together with the
# target because log_price appears to be a log transform of it (see the sample rows),
# so keeping it would leak the answer -- TODO confirm against the cleaning notebook
excluded_cols = ['neighbourhood', 'price', target_col]

y = df[target_col]
X = df.drop(columns=excluded_cols)

# fixed random_state keeps the train/test partition reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=123
)

In [7]:
# print the shape of each split, in the order X_train, y_train, X_test, y_test
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

(5406, 16)
(5406,)
(1802, 16)
(1802,)


## Model Selection

### Model 1: Linear Regression

In [8]:
# baseline model: linear regression with default settings
linear_reg = LinearRegression()

# 5-fold cross-validation on the training split only; the scorer reports
# negated MSE, so the sign is flipped back before summarising
cv_scores = cross_val_score(
    estimator=linear_reg,
    X=X_train,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=5,
)
mse_scores = np.negative(cv_scores)

print("Cross-Validation Mean MSE:", mse_scores.mean())
print("Cross-Validation Standard Deviation of MSE:", mse_scores.std())

Cross-Validation Mean MSE: 0.420559449988046
Cross-Validation Standard Deviation of MSE: 0.053511341741974486


### Model 2: Random Forest Regressor

In [9]:
# candidate model: random forest with default hyperparameters and a fixed seed
random_forest_reg = RandomForestRegressor(random_state=123)

# 5-fold cross-validation on the training split only; the scorer reports
# negated MSE, so the sign is flipped back before summarising
cv_scores = cross_val_score(
    estimator=random_forest_reg,
    X=X_train,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=5,
)
mse_scores = np.negative(cv_scores)

print("Cross-Validation Mean MSE:", mse_scores.mean())
print("Cross-Validation Standard Deviation of MSE:", mse_scores.std())

Cross-Validation Mean MSE: 0.2808595628992869
Cross-Validation Standard Deviation of MSE: 0.029199315077639947


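With a lower mean cross-validation MSE (about 0.28 versus 0.42 for Linear Regression) and a smaller spread across folds, the **Random Forest Regressor** is the better-performing model for our dataset.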

## Hyperparameter tuning

In [10]:
# candidate values for each random-forest hyperparameter (3 x 3 x 3 = 27 combinations)
param_grid = dict(
    n_estimators=[100, 150, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
)

# exhaustive search over the grid, scoring each combination by 5-fold CV on the training split
grid_search = GridSearchCV(
    random_forest_reg,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid_search.fit(X_train, y_train)

# report the combination with the best (least negative) mean score
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

Best Hyperparameters: {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 200}


### Final Modeling

In [11]:
# Final model: reuse the estimator that GridSearchCV already refit on the whole
# training split with the best hyperparameters (refit=True is its default).
# The search was built from RandomForestRegressor(random_state=123), so this is
# the same model that a fresh RandomForestRegressor(random_state=123, **best_params)
# fit on (X_train, y_train) would produce -- without training the forest a second time.
best_rf_model = grid_search.best_estimator_

# predict on the held-out test set, which has not been used for fitting or tuning
preds = best_rf_model.predict(X_test)

# evaluate the model; y_test is log_price, so the MSE is on the log scale
mse = mean_squared_error(y_test, preds)
print("Random Forest Mean Squared Error:", mse)

Random Forest Mean Squared Error: 0.24938580925357015


## Conclusion

* The final Random Forest model achieved a Mean Squared Error (MSE) of approx. **0.25** on the test set. Because the target is `log_price`, this error is on the log scale (an RMSE of roughly 0.5 log units), which makes the model a practical tool for estimating Airbnb rental prices in the city of San Francisco.
* Future work could explore enhancements, such as incorporating temporal trends or experimenting with advanced modeling techniques, to further refine the predictive capabilities of the model.