In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

In [2]:
df = pd.read_csv("datasets/airbnb_new_york_listings_2024.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20758 entries, 0 to 20757
Data columns (total 22 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              20758 non-null  int64  
 1   name                            20758 non-null  object 
 2   host_id                         20758 non-null  int64  
 3   host_name                       20758 non-null  object 
 4   neighbourhood_group             20758 non-null  object 
 5   neighbourhood                   20758 non-null  object 
 6   latitude                        20758 non-null  float64
 7   longitude                       20758 non-null  float64
 8   room_type                       20758 non-null  object 
 9   price                           20758 non-null  float64
 10  minimum_nights                  20758 non-null  int64  
 11  number_of_reviews               20758 non-null  int64  
 12  last_review                     

In [4]:
df.head(3)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,...,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,license,rating,bedrooms,beds,baths
0,1312228,Rental unit in Brooklyn · ★5.0 · 1 bedroom,7130382,Walter,Brooklyn,Clinton Hill,40.68371,-73.96461,Private room,55.0,...,2015-12-20,0.03,1,0,0,No License,5.0,1,1,Not specified
1,45277537,Rental unit in New York · ★4.67 · 2 bedrooms ·...,51501835,Jeniffer,Manhattan,Hell's Kitchen,40.76661,-73.9881,Entire home/apt,144.0,...,2023-05-01,0.24,139,364,2,No License,4.67,2,1,1
2,971353993633883038,Rental unit in New York · ★4.17 · 1 bedroom · ...,528871354,Joshua,Manhattan,Chelsea,40.750764,-73.994605,Entire home/apt,187.0,...,2023-12-18,1.67,1,343,6,Exempt,4.17,1,2,1


In [5]:
# Changing some of the datatypes in to unsigned integer
df.id = df.id.astype("uint")
df.host_id = df.host_id.astype("uint")
df.price = df.price.astype("uint16")
df.calculated_host_listings_count = df.calculated_host_listings_count.astype("uint16")
df.availability_365 = df.availability_365.astype("uint16")
df.number_of_reviews_ltm = df.number_of_reviews_ltm.astype("uint8")

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20758 entries, 0 to 20757
Data columns (total 22 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              20758 non-null  uint32 
 1   name                            20758 non-null  object 
 2   host_id                         20758 non-null  uint32 
 3   host_name                       20758 non-null  object 
 4   neighbourhood_group             20758 non-null  object 
 5   neighbourhood                   20758 non-null  object 
 6   latitude                        20758 non-null  float64
 7   longitude                       20758 non-null  float64
 8   room_type                       20758 non-null  object 
 9   price                           20758 non-null  uint16 
 10  minimum_nights                  20758 non-null  int64  
 11  number_of_reviews               20758 non-null  int64  
 12  last_review                     

In [7]:
# Chaning some of the data in to 'category' data type
columns = ['neighbourhood_group', 'room_type', 'bedrooms', 'baths']
df[columns] = df[columns].astype('category')

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20758 entries, 0 to 20757
Data columns (total 22 columns):
 #   Column                          Non-Null Count  Dtype   
---  ------                          --------------  -----   
 0   id                              20758 non-null  uint32  
 1   name                            20758 non-null  object  
 2   host_id                         20758 non-null  uint32  
 3   host_name                       20758 non-null  object  
 4   neighbourhood_group             20758 non-null  category
 5   neighbourhood                   20758 non-null  object  
 6   latitude                        20758 non-null  float64 
 7   longitude                       20758 non-null  float64 
 8   room_type                       20758 non-null  category
 9   price                           20758 non-null  uint16  
 10  minimum_nights                  20758 non-null  int64   
 11  number_of_reviews               20758 non-null  int64   
 12  last_review       

In [9]:
# OneHotEncoding
ohe = OneHotEncoder(drop='first', sparse_output=False)
encoded_array = ohe.fit_transform(df[columns])

In [10]:
encoded_array

array([[1., 0., 0., ..., 0., 0., 1.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.]])

In [11]:
encoded_df = pd.DataFrame(encoded_array, columns = ohe.get_feature_names_out(columns))

In [12]:
encoded_df

Unnamed: 0,neighbourhood_group_Brooklyn,neighbourhood_group_Manhattan,neighbourhood_group_Queens,neighbourhood_group_Staten Island,room_type_Hotel room,room_type_Private room,room_type_Shared room,bedrooms_14,bedrooms_15,bedrooms_2,...,baths_3,baths_3.5,baths_4,baths_4.5,baths_5,baths_5.5,baths_6,baths_6.5,baths_7,baths_Not specified
0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20753,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20754,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20755,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20756,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20758 entries, 0 to 20757
Data columns (total 22 columns):
 #   Column                          Non-Null Count  Dtype   
---  ------                          --------------  -----   
 0   id                              20758 non-null  uint32  
 1   name                            20758 non-null  object  
 2   host_id                         20758 non-null  uint32  
 3   host_name                       20758 non-null  object  
 4   neighbourhood_group             20758 non-null  category
 5   neighbourhood                   20758 non-null  object  
 6   latitude                        20758 non-null  float64 
 7   longitude                       20758 non-null  float64 
 8   room_type                       20758 non-null  category
 9   price                           20758 non-null  uint16  
 10  minimum_nights                  20758 non-null  int64   
 11  number_of_reviews               20758 non-null  int64   
 12  last_review       

In [14]:
drop_columns = ['id', 'name', 'host_name', 'neighbourhood', 'last_review', 'license']
df = df.drop(drop_columns, axis=1)
df = df.drop(columns, axis=1)

# concat the data frame and the encoded data frame with the OHE
df = pd.concat([df, encoded_df], axis=1)

In [15]:
df

Unnamed: 0,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,...,baths_3,baths_3.5,baths_4,baths_4.5,baths_5,baths_5.5,baths_6,baths_6.5,baths_7,baths_Not specified
0,7130382,40.683710,-73.964610,55,30,3,0.03,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,51501835,40.766610,-73.988100,144,30,9,0.24,139,364,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,528871354,40.750764,-73.994605,187,2,6,1.67,1,343,6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,19902271,40.835600,-73.942500,120,30,156,1.38,2,363,12,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,61391963,40.751120,-73.978600,85,30,11,0.24,133,335,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20753,186680487,40.711380,-73.991560,45,30,124,1.81,1,157,12,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20754,3237504,40.730580,-74.000700,105,30,56,0.48,1,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20755,304317395,40.757350,-73.993430,299,30,60,2.09,1,0,27,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20756,163083101,40.713750,-73.991470,115,30,7,0.91,1,363,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
df['rating'].replace(['No rating', 'New '], np.nan).astype('float')

0        5.00
1        4.67
2        4.17
3        4.64
4        4.91
         ... 
20753    4.75
20754    4.46
20755    4.93
20756    5.00
20757    4.89
Name: rating, Length: 20758, dtype: float64

In [17]:
df

Unnamed: 0,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,...,baths_3,baths_3.5,baths_4,baths_4.5,baths_5,baths_5.5,baths_6,baths_6.5,baths_7,baths_Not specified
0,7130382,40.683710,-73.964610,55,30,3,0.03,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,51501835,40.766610,-73.988100,144,30,9,0.24,139,364,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,528871354,40.750764,-73.994605,187,2,6,1.67,1,343,6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,19902271,40.835600,-73.942500,120,30,156,1.38,2,363,12,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,61391963,40.751120,-73.978600,85,30,11,0.24,133,335,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20753,186680487,40.711380,-73.991560,45,30,124,1.81,1,157,12,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20754,3237504,40.730580,-74.000700,105,30,56,0.48,1,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20755,304317395,40.757350,-73.993430,299,30,60,2.09,1,0,27,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20756,163083101,40.713750,-73.991470,115,30,7,0.91,1,363,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
df['rating'] = pd.to_numeric(df['rating'], errors='coerce')

In [19]:
df_median = df.copy()

In [20]:
df_median['rating'] = df_median['rating'].fillna(df_median['rating'].median())