In [5]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.impute import SimpleImputer

In [6]:
# Seed NumPy's legacy global random generator so that any later step drawing
# from it gives the same result on every clean top-to-bottom run.
SEED = 1
np.random.seed(SEED)

In [7]:
# Load the raw listings table; the path is relative to the notebook's working directory.
airbnb = pd.read_csv("airbnb.csv")

In [8]:
# Inspect column dtypes and non-null counts to see which columns are
# categorical (object) and which contain missing values.
airbnb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3555 entries, 0 to 3554
Data columns (total 23 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   host_is_superhost                  3555 non-null   int64  
 1   host_identity_verified             3555 non-null   int64  
 2   neighbourhood_cleansed             3555 non-null   object 
 3   latitude                           3555 non-null   float64
 4   longitude                          3555 non-null   float64
 5   property_type                      3552 non-null   object 
 6   room_type                          3555 non-null   object 
 7   accommodates                       3555 non-null   int64  
 8   bathrooms                          3541 non-null   float64
 9   bedrooms                           3545 non-null   float64
 10  beds                               3546 non-null   float64
 11  bed_type                           3555 non-null   objec

In [9]:
# Summary statistics for the numeric columns (count, mean, std, quartiles, min/max).
airbnb.describe()

Unnamed: 0,host_is_superhost,host_identity_verified,latitude,longitude,accommodates,bathrooms,bedrooms,beds,Number of amenities,guests_included,price_per_extra_person,minimum_nights,number_of_reviews,number_days_btw_first_last_review,review_scores_rating,price,price_gte_150
count,3555.0,3555.0,3555.0,3555.0,3555.0,3541.0,3545.0,3546.0,3555.0,3555.0,3555.0,3555.0,3555.0,3555.0,2755.0,3555.0,3555.0
mean,0.11308,0.727989,42.339973,-71.084874,3.023629,1.215899,1.246544,1.597293,14.85879,1.427004,10.886639,3.116737,19.126582,279.052602,91.89147,166.060478,0.500422
std,0.316735,0.445058,0.024464,0.031614,1.754808,0.492656,0.73844,0.995467,4.82126,1.050204,19.092755,8.273949,35.666178,408.686952,9.548381,103.378456,0.50007
min,0.0,0.0,42.235942,-71.171789,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,20.0,10.0,0.0
25%,0.0,0.0,42.329875,-71.105183,2.0,1.0,1.0,1.0,12.0,1.0,0.0,1.0,1.0,0.0,89.0,85.0,0.0
50%,0.0,1.0,42.345191,-71.078487,2.0,1.0,1.0,1.0,15.0,1.0,0.0,2.0,5.0,92.0,94.0,150.0,1.0
75%,0.0,1.0,42.354672,-71.062142,4.0,1.0,2.0,2.0,18.0,1.0,20.0,3.0,21.0,402.0,98.0,219.0,1.0
max,1.0,1.0,42.389982,-71.0001,16.0,6.0,5.0,16.0,30.0,14.0,200.0,300.0,404.0,2680.0,100.0,650.0,1.0


In [10]:
# Number of missing values per column.
airbnb.isna().sum()

host_is_superhost                      0
host_identity_verified                 0
neighbourhood_cleansed                 0
latitude                               0
longitude                              0
property_type                          3
room_type                              0
accommodates                           0
bathrooms                             14
bedrooms                              10
beds                                   9
bed_type                               0
Number of amenities                    0
guests_included                        0
price_per_extra_person                 0
minimum_nights                         0
number_of_reviews                      0
number_days_btw_first_last_review      0
review_scores_rating                 800
cancellation_policy                    0
price                                  0
price_gte_150                          0
price_category                         0
dtype: int64

In [11]:
# Names of every object-dtype column; these are the categorical variables
# that must be encoded before modelling.
object_columns = airbnb.select_dtypes(include='object').columns
category_var_list = object_columns.tolist()
category_var_list

['neighbourhood_cleansed',
 'property_type',
 'room_type',
 'bed_type',
 'cancellation_policy',
 'price_category']

In [12]:
# Print the distinct values of each categorical column to judge its cardinality
# and spot missing entries (NaN shows up as one of the values).
for column_name in category_var_list:
    distinct_values = airbnb[column_name].unique()
    print(f"Category: {column_name} Values: {distinct_values}")

Category: neighbourhood_cleansed Values: ['Roslindale' 'Jamaica Plain' 'Mission Hill' 'Longwood Medical Area'
 'Bay Village' 'Leather District' 'Chinatown' 'North End' 'Roxbury'
 'South End' 'Back Bay' 'East Boston' 'Charlestown' 'West End'
 'Beacon Hill' 'Downtown' 'Fenway' 'Brighton' 'West Roxbury' 'Hyde Park'
 'Mattapan' 'Dorchester' 'South Boston Waterfront' 'South Boston'
 'Allston']
Category: property_type Values: ['House' 'Apartment' 'Condominium' 'Villa' 'Bed & Breakfast' 'Townhouse'
 'Entire Floor' 'Loft' 'Guesthouse' 'Boat' 'Dorm' 'Other' nan 'Camper/RV']
Category: room_type Values: ['Entire home/apt' 'Private room' 'Shared room']
Category: bed_type Values: ['Real Bed' 'Pull-out Sofa' 'Futon' 'Airbed' 'Couch']
Category: cancellation_policy Values: ['moderate' 'flexible' 'strict' 'super_strict_30']
Category: price_category Values: ['gte_226' 'lte_$75' 'btw_$75-$150' 'btw_$151-$225']


In [13]:
# Remove the two alternative price encodings so only the numeric `price` column remains.
# NOTE(review): both look like they are derived from `price` (which would leak
# the target into the predictors) - confirm against the data dictionary.
airbnb = airbnb.drop(columns=['price_category', 'price_gte_150'])

In [14]:
# Count missing values in property_type (the output below shows 3).
airbnb['property_type'].isna().sum() 

3

In [15]:
# Replace the missing property types with an explicit placeholder category so
# they get their own indicator column when the column is one-hot encoded.
# The result is assigned back instead of calling fillna(inplace=True) on the
# column selection: the in-place form is a chained assignment, which pandas
# warns about and which does not update the frame under Copy-on-Write.
# NOTE(review): the placeholder is misspelled ("unkown"); it is kept unchanged
# because it becomes a column name in the exported CSV files.
airbnb["property_type"] = airbnb["property_type"].fillna("unkown")

In [16]:
# Count missing values in neighbourhood_cleansed (the output below shows 0).
airbnb['neighbourhood_cleansed'].isna().sum() 

0

In [17]:
airbnb['room_type'].isna().sum() # count of missing room_type values; the output below is 0, so nothing to fill

0

In [18]:
airbnb['bed_type'].isna().sum() # count of missing bed_type values; the output below is 0, so nothing to fill

0

In [19]:
airbnb['cancellation_policy'].isna().sum() # count of missing cancellation_policy values; the output below is 0, so nothing to fill

0

In [21]:
# One-hot encode the nominal columns: every distinct value becomes its own
# indicator column named "<column>_<value>".
nominal_cols = ['neighbourhood_cleansed', 'property_type', 'cancellation_policy']
one_hot_encoding = pd.get_dummies(airbnb[nominal_cols])

In [22]:
# Append the indicator columns to the main frame, aligned on the row index.
# Not idempotent: running this cell a second time fails because the
# indicator columns would already exist in `airbnb`.
airbnb = airbnb.join(one_hot_encoding)

In [23]:
# The original text columns are now represented by their indicator columns,
# so remove them from the frame.
airbnb = airbnb.drop(columns=['neighbourhood_cleansed', 'property_type', 'cancellation_policy'])

In [24]:
# Replace the text values of room_type and bed_type with integer codes.
# The same encoder object is refit for each column, so after this cell it
# holds only the mapping of the last column encoded (bed_type).
# NOTE(review): integer codes imply an ordering between categories; confirm
# that is acceptable for the downstream model, otherwise one-hot encode these too.
labelencoder = LabelEncoder()
for encoded_col in ('room_type', 'bed_type'):
    airbnb[encoded_col] = labelencoder.fit_transform(airbnb[encoded_col])

In [25]:
# Re-inspect the frame after encoding: no object columns should remain and
# the column count grows with the added indicator columns.
airbnb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3555 entries, 0 to 3554
Data columns (total 61 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   host_is_superhost                               3555 non-null   int64  
 1   host_identity_verified                          3555 non-null   int64  
 2   latitude                                        3555 non-null   float64
 3   longitude                                       3555 non-null   float64
 4   room_type                                       3555 non-null   int32  
 5   accommodates                                    3555 non-null   int64  
 6   bathrooms                                       3541 non-null   float64
 7   bedrooms                                        3545 non-null   float64
 8   beds                                            3546 non-null   float64
 9   bed_type                                 

In [26]:
# Hold out 30% of the rows for testing.
# random_state is passed explicitly so the split no longer depends on the
# state of NumPy's global generator at the moment this cell runs; re-running
# the cell, or adding an earlier cell that draws random numbers, now yields
# the same partition. The value matches the seed set at the top of the
# notebook, so a clean top-to-bottom run should reproduce the earlier split
# (NOTE(review): confirm if previously exported files must match exactly).
train_df, test_df = train_test_split(airbnb, test_size=0.3, random_state=1)

# `price` is the regression target; every other column is a predictor.
target = 'price'
predictors = [column for column in airbnb.columns if column != target]

In [27]:
# Columns of the training split that still contain missing values.
# NOTE(review): this list is not used by any later cell shown here, and no
# imputation step is visible - confirm whether these NaNs should be filled.
train_na_counts = train_df.isna().sum()
numeric_cols_with_nas = train_na_counts[train_na_counts > 0].index.tolist()
numeric_cols_with_nas

['bathrooms', 'bedrooms', 'beds', 'review_scores_rating']

In [28]:
# Standardize the continuous predictors. The scaler is fitted on the training
# split only and then applied unchanged to the test split, so no information
# from the test rows influences the scaling parameters.
scaler = preprocessing.StandardScaler()
cols_to_stdize = ['latitude', 'longitude', 'accommodates',
                  'bathrooms', 'bedrooms', 'beds', 'Number of amenities',
                  'guests_included', 'price_per_extra_person', 'minimum_nights',
                  'number_of_reviews', 'number_days_btw_first_last_review',
                  'review_scores_rating']

# The split frames are row selections of `airbnb`; take explicit copies before
# assigning into them so pandas does not treat the writes as chained
# assignment (which may raise SettingWithCopyWarning).
train_df = train_df.copy()
test_df = test_df.copy()

# NOTE(review): bathrooms, bedrooms, beds and review_scores_rating still hold
# NaNs at this point (no imputation step is visible above). StandardScaler is
# documented to ignore NaNs when fitting and keep them on transform, so the
# NaNs carry through to the exported files - confirm this is intended.
train_df[cols_to_stdize] = scaler.fit_transform(train_df[cols_to_stdize])
test_df[cols_to_stdize] = scaler.transform(test_df[cols_to_stdize])


In [29]:
# Separate predictors and target for each split.
train_X = train_df[predictors]
train_y = train_df[target]
# Bug fix: the test predictors were previously taken from train_df, so the
# exported test features were the training rows and did not match test_y.
test_X = test_df[predictors]
test_y = test_df[target]

# Guard against features and targets coming from different splits.
assert len(train_X) == len(train_y)
assert len(test_X) == len(test_y)

# Persist the full splits and their X/y parts for the modelling notebooks.
train_df.to_csv('airbnb_train_df.csv', index=False)
train_X.to_csv('airbnb_train_X.csv', index=False)
train_y.to_csv('airbnb_train_y.csv', index=False)
test_df.to_csv('airbnb_test_df.csv', index=False)
test_X.to_csv('airbnb_test_X.csv', index=False)
test_y.to_csv('airbnb_test_y.csv', index=False)