In [7]:
# import packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from math import radians
from category_encoders import BinaryEncoder
from datetime import datetime
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings("ignore")

In [59]:
# Load the listings CSV (per its filename, an already-cleaned Airbnb dataset)
# into the working frame and preview the first five rows.

df = pd.read_csv(filepath_or_buffer='airbnb_dataset_clean.csv')
df.head(n=5)

Unnamed: 0,neighbourhood,latitude,longitude,host_listings_count,host_identity_verified,property_type,room_type,accommodates,beds,price,host_duration_days,log_price,location_cluster
0,North Beach,0.659774,-2.136426,1,1,Entire condo,Entire home/apt,4,2,417.0,3025,6.035481,0
1,South of Market,0.659316,-2.136561,32,1,Entire serviced apartment,Entire home/apt,2,1,280.0,853,5.638355,0
2,Noe Valley,0.658869,-2.136738,1,1,Entire condo,Entire home/apt,4,2,195.0,3644,5.278115,3
3,South of Market,0.659319,-2.136569,1,1,Private room in home,Private room,2,1,60.0,3467,4.110874,0
4,Castro/Upper Market,0.659017,-2.13689,1,0,Private room in condo,Private room,1,1,120.0,4289,4.795791,1


In [3]:
# Per-column count of missing entries in the working frame.

df.isnull().sum(axis=0)

neighbourhood             0
latitude                  0
longitude                 0
host_since                0
host_listings_count       0
host_identity_verified    0
property_type             0
room_type                 0
accommodates              0
beds                      0
price                     0
log_price                 0
dtype: int64

## Encoding

In [60]:
# Binary-encode 'property_type': learn the category mapping from the frame,
# then swap the column for its binary-coded columns (property_type_0, ...).

binary_encoder = BinaryEncoder(cols=['property_type'])
binary_encoder.fit(df)
df = binary_encoder.transform(df)

In [61]:
# Expand 'room_type' into one indicator column per category (one-hot),
# then preview the first five rows of the encoded frame.

df = pd.get_dummies(data=df, columns=['room_type'])
df.head(n=5)

Unnamed: 0,neighbourhood,latitude,longitude,host_listings_count,host_identity_verified,property_type_0,property_type_1,property_type_2,property_type_3,property_type_4,...,accommodates,beds,price,host_duration_days,log_price,location_cluster,room_type_Entire home/apt,room_type_Hotel room,room_type_Private room,room_type_Shared room
0,North Beach,0.659774,-2.136426,1,1,0,0,0,0,0,...,4,2,417.0,3025,6.035481,0,1,0,0,0
1,South of Market,0.659316,-2.136561,32,1,0,0,0,0,1,...,2,1,280.0,853,5.638355,0,1,0,0,0
2,Noe Valley,0.658869,-2.136738,1,1,0,0,0,0,0,...,4,2,195.0,3644,5.278115,3,1,0,0,0
3,South of Market,0.659319,-2.136569,1,1,0,0,0,0,1,...,2,1,60.0,3467,4.110874,0,0,0,1,0
4,Castro/Upper Market,0.659017,-2.13689,1,0,0,0,0,1,0,...,1,1,120.0,4289,4.795791,1,0,0,1,0


## Splitting Dataset

In [62]:
# Build the target and feature matrix, then hold out 25% of the rows for testing.
# Excluded from X: 'neighbourhood' (treated as not relevant for predictions),
# 'log_price' (the target) and 'price' (its raw-scale counterpart, per the
# column names).

y = df['log_price']
X = df.drop(columns=['neighbourhood', 'price', 'log_price'])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=123,  # fixed seed so the split is repeatable
)

In [63]:
# Print the shape of each split, one per line, in the order:
# X_train, y_train, X_test, y_test.

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep='\n')

(5406, 18)
(5406,)
(1802, 18)
(1802,)


## Model 1: Linear Regression

In [64]:
# Baseline model: ordinary least squares fitted on the training split.
linear_reg_model = LinearRegression().fit(X_train, y_train)

# Score on the held-out rows (the target is log_price, so the MSE is on the
# log scale) and report it rounded to two decimals.
linear_reg_predictions = linear_reg_model.predict(X_test)
linear_reg_mse = mean_squared_error(y_true=y_test, y_pred=linear_reg_predictions)

print("Linear Regression Mean Squared Error:", round(linear_reg_mse, 2))

Linear Regression Mean Squared Error: 0.4240277191208681


## Model 2: Random Forest Regressor

In [65]:
# Random forest with default hyperparameters; the seed is fixed so the fitted
# trees are reproducible.
random_forest_model = RandomForestRegressor(random_state=123).fit(X_train, y_train)

# Score on the held-out rows (the target is log_price, so the MSE is on the
# log scale) and report it rounded to two decimals.
random_forest_predictions = random_forest_model.predict(X_test)
random_forest_mse = mean_squared_error(y_true=y_test, y_pred=random_forest_predictions)

print("Random Forest Mean Squared Error:", round(random_forest_mse, 2))

Random Forest Mean Squared Error: 0.2260576652717793


## Model Selection

On the held-out test set, the **Random Forest Regressor**, with an MSE of **0.23**, performs better than Linear Regression, which has an MSE of **0.42** (both measured on `log_price`).

## Hyperparameter tuning

In [None]:
# Candidate hyperparameters for the random forest: 3 x 3 x 3 x 3 = 81 combinations.
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Exhaustive grid search with 5-fold cross-validation on the training split only.
# GridSearchCV clones `random_forest_model`, so every candidate inherits its
# random_state=123 and the already-fitted baseline object is left untouched.
# n_jobs=-1 spreads the 81 x 5 = 405 fits across all CPU cores; it affects
# wall-clock time only, not the scores.
grid_search = GridSearchCV(
    estimator=random_forest_model,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Best hyperparameter combination (lowest cross-validated MSE).
best_params = grid_search.best_params_

# With the default refit=True, GridSearchCV has already refitted the best
# configuration on the full training split. Reuse that estimator rather than
# training an identical RandomForestRegressor(random_state=123, **best_params)
# a second time.
best_random_forest_model = grid_search.best_estimator_

# Predict on the held-out test split with the tuned model.
# NOTE: `random_forest_predictions` and `random_forest_mse` deliberately keep
# the names used by the baseline cell, so they now hold the *tuned* results;
# re-run the baseline cell to recover the untuned values.
random_forest_predictions = best_random_forest_model.predict(X_test)

# Evaluate the tuned model (MSE on log_price).
random_forest_mse = mean_squared_error(y_test, random_forest_predictions)
print("Random Forest Mean Squared Error:", random_forest_mse)
print("Best Hyperparameters:", best_params)

n_estimators: 100
max_depth: None
min_samples_split: 2
min_samples_leaf: 1

### Conclusion