In [19]:
# import packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings("ignore")

In [69]:
# load dataset

df = pd.read_csv('airbnb_dataset_clean.csv')
df.head()

Unnamed: 0,neighbourhood,latitude,longitude,property_type,room_type,accommodates,beds,price
0,North Beach,37.80226,-122.40818,Entire condo,Entire home/apt,4,2,417.0
1,South of Market,37.77605,-122.41593,Entire serviced apartment,Entire home/apt,2,1,280.0
2,Noe Valley,37.75043,-122.42609,Entire condo,Entire home/apt,4,2,195.0
3,South of Market,37.776214,-122.416405,Private room in home,Private room,2,1,60.0
4,Castro/Upper Market,37.758919,-122.434776,Private room in condo,Private room,1,1,120.0


In [70]:
# check missing values

df.isna().sum()

neighbourhood    0
latitude         0
longitude        0
property_type    0
room_type        0
accommodates     0
beds             0
price            0
dtype: int64

## One-hot encoding

In [71]:
df_encoded = pd.get_dummies(df, columns=['neighbourhood','property_type', 'room_type'])

df_encoded.head()

Unnamed: 0,latitude,longitude,accommodates,beds,price,neighbourhood_Bayview,neighbourhood_Bernal Heights,neighbourhood_Castro/Upper Market,neighbourhood_Chinatown,neighbourhood_Crocker Amazon,...,property_type_Shared room in home,property_type_Shared room in hostel,property_type_Shared room in hotel,property_type_Shared room in loft,property_type_Shared room in rental unit,property_type_Tiny home,room_type_Entire home/apt,room_type_Hotel room,room_type_Private room,room_type_Shared room
0,37.80226,-122.40818,4,2,417.0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,37.77605,-122.41593,2,1,280.0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,37.75043,-122.42609,4,2,195.0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,37.776214,-122.416405,2,1,60.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,37.758919,-122.434776,1,1,120.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0


In [72]:
# check collinearity

## Splitting Dataset

In [77]:
# split the dataset

X = df_encoded.drop(['latitude','longitude','price'], axis=1) # latitude, longitude are not relevant for predictions
y = df_encoded['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=123)

In [78]:
# check shape of the train and test data

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(5404, 93)
(5404,)
(1802, 93)
(1802,)


## Model 1: Linear Regression

In [79]:
# initiate model: Linear Regression
linear_reg_model = LinearRegression()
linear_reg_model.fit(X_train, y_train)

# Predict on the test set
linear_reg_predictions = linear_reg_model.predict(X_test)

# Evaluate Linear Regression model
linear_reg_mse = mean_squared_error(y_test, linear_reg_predictions)
print("Linear Regression Mean Squared Error:", linear_reg_mse)

Linear Regression Mean Squared Error: 1.2898954842001648e+24


## Model 2: Random Forest Regressor

In [76]:
random_forest_model = RandomForestRegressor(random_state=123)
random_forest_model.fit(X_train, y_train)

# Predict on the test set
random_forest_predictions = random_forest_model.predict(X_test)

# Evaluate Random Forest model
random_forest_mse = mean_squared_error(y_test, random_forest_predictions)
print("Random Forest Mean Squared Error:", random_forest_mse)

Random Forest Mean Squared Error: 3419149.0609838143


### Conclusion