In [1]:
# import packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from category_encoders import BinaryEncoder
from datetime import datetime
from geopy.distance import geodesic
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings("ignore")

In [2]:
# read the listings CSV into a DataFrame and preview the first rows

dataset_path = 'airbnb_dataset_clean.csv'
df = pd.read_csv(dataset_path)
df.head()

Unnamed: 0,neighbourhood,latitude,longitude,host_since,host_listings_count,host_identity_verified,property_type,room_type,accommodates,beds,price,log_price
0,North Beach,37.80226,-122.40818,2015-08-18,1,1,Entire condo,Entire home/apt,4,2,417.0,6.035481
1,South of Market,37.77605,-122.41593,2021-07-29,32,1,Entire serviced apartment,Entire home/apt,2,1,280.0,5.638355
2,Noe Valley,37.75043,-122.42609,2013-12-07,1,1,Entire condo,Entire home/apt,4,2,195.0,5.278115
3,South of Market,37.776214,-122.416405,2014-06-02,1,1,Private room in home,Private room,2,1,60.0,4.110874
4,Castro/Upper Market,37.758919,-122.434776,2012-03-02,1,0,Private room in condo,Private room,1,1,120.0,4.795791


In [3]:
# count missing values per column (isnull is pandas' alias for isna)

df.isnull().sum()

neighbourhood             0
latitude                  0
longitude                 0
host_since                0
host_listings_count       0
host_identity_verified    0
property_type             0
room_type                 0
accommodates              0
beds                      0
price                     0
log_price                 0
dtype: int64

## Feature Engineering

In [4]:
# replace 'host_since' (a date) with the host's tenure in days

# Tenure is measured against a fixed snapshot date instead of datetime.now(),
# so the feature -- and every result derived from it -- is identical on each
# re-run. 2023-11-15 is the date that reproduces the host_duration_days values
# in the saved output (e.g. host_since 2015-08-18 -> 3011 days).
SNAPSHOT_DATE = pd.Timestamp('2023-11-15')

df['host_since'] = pd.to_datetime(df['host_since'])
df['host_duration_days'] = (SNAPSHOT_DATE - df['host_since']).dt.days

# drop the original 'host_since' column
df = df.drop(columns=['host_since'])

In [5]:
# collapse latitude & longitude into a single 'distance to reference' feature

reference_point = (37.7749, -122.4194) # San Francisco city's center


def _miles_to_reference(listing):
    """Return the geodesic distance, in miles, from one listing row to reference_point."""
    listing_coords = (listing['latitude'], listing['longitude'])
    return geodesic(listing_coords, reference_point).miles


# evaluated row by row (axis=1), one geodesic computation per listing
df['distance_to_reference'] = df.apply(_miles_to_reference, axis=1)

# the raw coordinates are no longer needed once the distance is stored
df = df.drop(columns=['latitude', 'longitude'])

## Encoding

In [6]:
# binary-encode 'property_type' (replaces it with property_type_<n> columns)

binary_encoder = BinaryEncoder(cols=['property_type'])
# fit on the full frame, then transform that same frame
binary_encoder.fit(df)
df = binary_encoder.transform(df)

In [7]:
# expand 'room_type' into one indicator column per category

df = pd.get_dummies(data=df, columns=['room_type'])
df.head(n=5)

Unnamed: 0,neighbourhood,host_listings_count,host_identity_verified,property_type_0,property_type_1,property_type_2,property_type_3,property_type_4,property_type_5,accommodates,beds,price,log_price,host_duration_days,distance_to_reference,room_type_Entire home/apt,room_type_Hotel room,room_type_Private room,room_type_Shared room
0,North Beach,1,1,0,0,0,0,0,1,4,2,417.0,6.035481,3011,1.984366,1,0,0,0
1,South of Market,32,1,0,0,0,0,1,0,2,1,280.0,5.638355,839,0.205849,1,0,0,0
2,Noe Valley,1,1,0,0,0,0,0,1,4,2,195.0,5.278115,3630,1.726921,1,0,0,0
3,South of Market,1,1,0,0,0,0,1,1,2,1,60.0,4.110874,3453,0.187323,0,0,1,0
4,Castro/Upper Market,1,0,0,0,0,1,0,0,1,1,120.0,4.795791,4275,1.386877,0,0,1,0


## Splitting Dataset

In [8]:
# build the feature matrix / target and hold out 25% of the rows for testing

# 'neighbourhood' is excluded from the features by choice; 'log_price' is the
# target, and 'price' is the raw value behind it, so keeping either in X
# would leak the answer
excluded_columns = ['neighbourhood', 'price', 'log_price']
X = df.drop(columns=excluded_columns)
y = df['log_price']

# fixed random_state keeps the split the same on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=123,
)

In [9]:
# print the shape of each split, in the order: X_train, y_train, X_test, y_test

for split_part in (X_train, y_train, X_test, y_test):
    print(split_part.shape)

(5406, 16)
(5406,)
(1802, 16)
(1802,)


## Model 1: Linear Regression

In [10]:
# Linear Regression: create the estimator and fit it on the training split
# (fit returns the estimator itself, so construction and fitting can be chained)
linear_reg_model = LinearRegression().fit(X_train, y_train)

# score the held-out rows
linear_reg_predictions = linear_reg_model.predict(X_test)

# mean squared error between held-out targets and predictions
linear_reg_mse = mean_squared_error(
    y_true=y_test,
    y_pred=linear_reg_predictions,
)
print("Linear Regression Mean Squared Error:", linear_reg_mse)

Linear Regression Mean Squared Error: 0.43363410387940554


## Model 2: Random Forest Regressor

In [11]:
# Random Forest: create the estimator with a fixed seed and fit it on the
# training split (fit returns the estimator itself, so the calls can be chained)
random_forest_model = RandomForestRegressor(random_state=123).fit(X_train, y_train)

# score the held-out rows
random_forest_predictions = random_forest_model.predict(X_test)

# mean squared error between held-out targets and predictions
random_forest_mse = mean_squared_error(
    y_true=y_test,
    y_pred=random_forest_predictions,
)
print("Random Forest Mean Squared Error:", random_forest_mse)

Random Forest Mean Squared Error: 0.2412663265497383


### Conclusion