In [None]:
# import packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from category_encoders import BinaryEncoder
from datetime import datetime
from geopy.distance import geodesic
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the listings CSV into the working DataFrame and preview its first rows.

dataset_path = 'airbnb_dataset_clean.csv'
df = pd.read_csv(dataset_path)
df.head()

In [None]:
# Count the missing entries in every column; the per-column totals are the
# cell's displayed output.

missing_per_column = df.isna().sum()
missing_per_column

## Feature Engineering

In [None]:
# Replace the 'host_since' date with a numeric tenure feature,
# 'host_duration_days'.
#
# The duration is measured against the most recent 'host_since' date found in
# the data instead of the wall clock. Measuring against "now" made the feature
# (and every result derived from it) depend on the day the notebook happened
# to be run. Moving the reference date only shifts every row by the same
# constant, so the ordering and spacing of hosts are unchanged; the newest
# host simply gets 0 days.

host_since = pd.to_datetime(df['host_since'])
reference_date = host_since.max()  # fixed by the dataset, not by the run date
df['host_duration_days'] = (reference_date - host_since).dt.days

# drop the original 'host_since' column
df = df.drop(columns=['host_since'])

In [None]:
# Replace latitude & longitude with a single 'distance_to_reference' feature:
# the geodesic distance, in miles, from each listing to the reference point.

reference_point = (37.7749, -122.4194) # San Francisco city's center

# Walk the two coordinate columns in lockstep rather than using a row-wise
# df.apply(axis=1): apply materialises a whole Series for every row, whereas
# only these two values are needed. The distances come out in the same row
# order, so the new column lines up with the existing rows.
df['distance_to_reference'] = [
    geodesic((lat, lon), reference_point).miles
    for lat, lon in zip(df['latitude'], df['longitude'])
]

# drop the original latitude and longitude columns
df = df.drop(columns=['latitude', 'longitude'])

## Encoding

In [None]:
# Binary-encode the 'property_type' column. The encoder is fitted and applied
# in a single step, and df is rebound to the encoder's output.

binary_encoded_columns = ['property_type']
binary_encoder = BinaryEncoder(cols=binary_encoded_columns)
df = binary_encoder.fit_transform(df)

In [None]:
# One-hot encode 'room_type' via pandas dummies, then preview the result.

one_hot_columns = ['room_type']
df = pd.get_dummies(df, columns=one_hot_columns)
df.head()

## Splitting Dataset

In [None]:
# Build the feature matrix and target, then hold out 25% of the rows for testing.

# Columns excluded from the features: 'log_price' is the target itself,
# 'price' is dropped alongside it, and 'neighbourhood' is treated as not
# relevant for predictions.
non_feature_columns = ['neighbourhood', 'price', 'log_price']

X = df.drop(columns=non_feature_columns)
y = df['log_price']

# Fixed random_state so the same rows land in each split on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=123,
)

In [None]:
# Print the shape of each split, in the order:
# X_train, y_train, X_test, y_test.

for split_part in (X_train, y_train, X_test, y_test):
    print(split_part.shape)

## Model 1: Linear Regression

In [None]:
# Linear Regression: fit on the training split, predict the held-out split,
# and report the mean squared error of those predictions against y_test
# (the 'log_price' target).

# fit() returns the fitted estimator, so construction and training chain.
linear_reg_model = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out rows
linear_reg_predictions = linear_reg_model.predict(X_test)

# Test-set error
linear_reg_mse = mean_squared_error(
    y_true=y_test,
    y_pred=linear_reg_predictions,
)
print("Linear Regression Mean Squared Error:", linear_reg_mse)

## Model 2: Random Forest Regressor

In [None]:
# Random Forest: fit on the training split, predict the held-out split, and
# report the mean squared error of those predictions against y_test
# (the 'log_price' target).

# random_state is pinned so the fitted forest is the same on every run;
# fit() returns the fitted estimator, so construction and training chain.
random_forest_model = RandomForestRegressor(random_state=123).fit(X_train, y_train)

# Predictions for the held-out rows
random_forest_predictions = random_forest_model.predict(X_test)

# Test-set error
random_forest_mse = mean_squared_error(
    y_true=y_test,
    y_pred=random_forest_predictions,
)
print("Random Forest Mean Squared Error:", random_forest_mse)

### Conclusion