In [1]:
cd ..

/Users/sgemma.sun/Documents/data101/airbnb-ml


In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import src.transform as trans

pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [3]:
import s3fs
import pyarrow.parquet as pq

s3 = s3fs.S3FileSystem()

filePath = 's3://airbnb-barcelona/valid/currentDate=2020-03-11'
airbnb_df = pq.ParquetDataset(filePath, filesystem=s3).read_pandas().to_pandas()

airbnb = airbnb_df.drop(columns=[
    'rowId',
    'id',
    'host_location',
    'host_neighbourhood',
    'street',
    'neighbourhood',
    'neighbourhood_cleansed',
    'market',
    'license',
    'zipcode',
    'calendar_updated'
])

print(airbnb.shape)

(20428, 55)


# Object values

In [5]:
cat_df = airbnb.select_dtypes(include=['object']).copy()

In [4]:
# drop outliers
airbnb = trans.drop_rows_occurs_less_than(airbnb, "cancellation_policy", 2)
airbnb = trans.drop_rows_occurs_less_than(airbnb, "host_response_time", 1)
# boolean to float
airbnb = trans.encode_boolean_to_float(airbnb, "host_is_superhost")
airbnb = trans.encode_boolean_to_float(airbnb, "host_has_profile_pic")
airbnb = trans.encode_boolean_to_float(airbnb, "host_identity_verified")
airbnb = trans.encode_boolean_to_float(airbnb, "is_location_exact")
airbnb = trans.encode_boolean_to_float(airbnb, "has_availability")
airbnb = trans.encode_boolean_to_float(airbnb, "instant_bookable")
airbnb = trans.encode_boolean_to_float(airbnb, "require_guest_profile_picture")
airbnb = trans.encode_boolean_to_float(airbnb, "require_guest_phone_verification")
# fillna
airbnb = trans.fillna_with_lowest_occurance(airbnb, "host_since")
airbnb = trans.fillna_with_lowest_occurance(airbnb, "host_response_time")
airbnb = trans.fillna_with_lowest_occurance(airbnb, "host_is_superhost")
airbnb = trans.fillna_with_lowest_occurance(airbnb, "host_has_profile_pic")
airbnb = trans.fillna_with_lowest_occurance(airbnb, "host_identity_verified")
airbnb = trans.fillna_with_lowest_occurance(airbnb, "first_review")
airbnb = trans.fillna_with_lowest_occurance(airbnb, "last_review")
# element count
airbnb = trans.extract_num_of_items_for_column(airbnb, "host_verifications")
airbnb = trans.extract_num_of_items_for_column(airbnb, "amenities")
# category encode
category_encoder = trans.encode_category_dic(airbnb)
category_columns = [
    "neighbourhood_group_cleansed",
    "property_type",
    "room_type",
    "bed_type",
    "cancellation_policy"
]
category_dic = trans.foldleft(category_encoder, {}, category_columns)
dic_host_response_time = {'host_response_time': {'N/A': 1, 'a few days or more': 2, 'within a day': 3, 'within a few hours': 4, 'within an hour': 5}}
category_dic = dict(dic_host_response_time, **category_dic)
airbnb = airbnb.replace(category_dic)

In [6]:
# date columns
ymd_to_time = trans.string_to_timestamp('%Y-%m-%d')
airbnb["host_since_dt"] = airbnb["host_since"].apply(ymd_to_time)
airbnb["first_review_dt"] = airbnb["first_review"].apply(ymd_to_time)
airbnb["last_review_dt"] = airbnb["last_review"].apply(ymd_to_time)
days_from_2020_03_11 = trans.days_from_date(compare_date=pd.to_datetime('2020-03-11', format='%Y-%m-%d'))
airbnb["host_since_2020_03_11"] = airbnb["host_since_dt"].apply(days_from_2020_03_11)
airbnb["first_review_2020_03_11"] = airbnb["host_since_dt"].apply(days_from_2020_03_11)
airbnb["last_review_2020_03_11"] = airbnb["host_since_dt"].apply(days_from_2020_03_11)
# drop unused columns
airbnb = airbnb.drop(columns=[
    'host_since',
    'first_review',
    'last_review',
    'host_since_dt',
    'first_review_dt',
    'last_review_dt',
])

# Numeric values

In [7]:
numeric_df = airbnb_df.select_dtypes(include=['float64', 'int32']).copy()
numeric_df.head(1)

Unnamed: 0,host_response_rate,host_listings_count,host_total_listings_count,latitude,longitude,accommodates,bathrooms,bedrooms,beds,price,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm,availability_30,availability_60,availability_90,availability_365,number_of_reviews,number_of_reviews_ltm,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,0.99,46.0,46.0,41.40889,2.18555,6.0,1.0,2.0,4.0,130.0,150.0,42.0,2.0,25.0,3.0,730.0,3.0,730.0,0.0,0.0,29.0,304.0,1.0,0.0,80.0,10.0,10.0,2.0,10.0,10.0,8.0,30.0,30.0,0.0,0.0,0.02


In [8]:
airbnb = airbnb.fillna(airbnb.mean())
airbnb.head(10)

Unnamed: 0,host_response_time,host_response_rate,host_is_superhost,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,neighbourhood_group_cleansed,latitude,longitude,is_location_exact,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,amenities,price,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm,has_availability,availability_30,availability_60,availability_90,availability_365,number_of_reviews,number_of_reviews_ltm,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,host_since_2020_03_11,first_review_2020_03_11,last_review_2020_03_11
0,5,0.99,0.0,46.0,46.0,5,1.0,1.0,8,41.40889,2.18555,1.0,2,1,6.0,1.0,2.0,4.0,5,18,130.0,150.0,42.0,2.0,25.0,3.0,730.0,3.0,730.0,1.0,0.0,0.0,29.0,304.0,1.0,0.0,80.0,10.0,10.0,2.0,10.0,10.0,8.0,0.0,1,0.0,0.0,30.0,30.0,0.0,0.0,0.02,3704,3704,3704
1,5,0.99,0.0,46.0,46.0,5,1.0,1.0,2,41.4042,2.17306,1.0,2,1,8.0,2.0,3.0,6.0,5,22,60.0,150.0,50.0,2.0,30.0,1.0,1125.0,3.9,1125.0,1.0,15.0,24.0,50.0,312.0,20.0,15.0,87.0,9.0,9.0,10.0,10.0,9.0,9.0,1.0,3,0.0,0.0,30.0,30.0,0.0,0.0,0.25,3704,3704,3704
2,5,1.0,1.0,5.0,5.0,8,1.0,1.0,8,41.41203,2.22114,0.0,2,1,6.0,2.0,3.0,5.0,5,46,210.0,300.0,80.0,3.0,10.0,3.0,30.0,3.1,1125.0,1.0,29.0,59.0,89.0,359.0,51.0,17.0,95.0,10.0,10.0,10.0,10.0,9.0,9.0,1.0,3,0.0,1.0,2.0,2.0,0.0,0.0,0.48,3655,3655,3655
3,5,1.0,1.0,1.0,1.0,8,1.0,1.0,3,41.40145,2.15645,1.0,2,3,2.0,1.0,1.0,1.0,5,17,32.0,166.427606,43.103105,1.0,25.0,1.0,730.0,1.0,730.0,1.0,13.0,13.0,13.0,46.0,268.0,44.0,95.0,10.0,9.0,10.0,10.0,10.0,10.0,1.0,3,1.0,1.0,1.0,0.0,1.0,0.0,2.38,3619,3619,3619
4,5,0.92,0.0,39.0,39.0,8,1.0,0.0,3,41.4095,2.15938,1.0,2,1,4.0,1.0,1.0,1.0,5,10,60.0,200.0,58.0,4.0,0.0,1.0,27.0,2.1,27.0,1.0,16.0,33.0,58.0,324.0,182.0,32.0,92.0,9.0,9.0,8.0,9.0,9.0,9.0,1.0,3,0.0,0.0,39.0,39.0,0.0,0.0,1.71,3570,3570,3570
5,5,0.92,0.0,39.0,39.0,8,1.0,0.0,3,41.40928,2.16112,1.0,2,1,5.0,1.5,3.0,3.0,5,10,70.0,200.0,108.0,5.0,0.0,1.0,27.0,2.1,27.0,1.0,12.0,28.0,58.0,326.0,90.0,23.0,88.0,9.0,9.0,9.0,9.0,9.0,9.0,1.0,3,0.0,0.0,39.0,39.0,0.0,0.0,0.84,3570,3570,3570
6,1,0.941213,0.0,1.0,1.0,4,1.0,0.0,5,41.3872,2.14088,1.0,2,3,1.0,1.0,1.0,1.0,5,25,30.0,0.0,0.0,1.0,15.0,29.0,60.0,29.0,60.0,1.0,0.0,24.0,54.0,144.0,19.0,0.0,99.0,10.0,10.0,10.0,10.0,9.0,9.0,0.0,2,0.0,0.0,1.0,0.0,1.0,0.0,0.17,3563,3563,3563
7,5,1.0,0.0,13.0,13.0,8,1.0,1.0,3,41.40464,2.16954,1.0,2,1,6.0,1.5,2.0,3.0,5,36,140.0,200.0,75.0,4.0,20.0,2.0,365.0,2.0,31.0,1.0,28.0,56.0,86.0,350.0,59.0,12.0,86.0,8.0,9.0,10.0,9.0,10.0,9.0,1.0,1,0.0,0.0,14.0,14.0,0.0,0.0,0.58,3572,3572,3572
8,4,1.0,0.0,3.0,3.0,8,1.0,1.0,1,41.37916,2.17535,1.0,2,3,2.0,1.0,1.0,1.0,5,17,100.0,150.0,40.0,1.0,0.0,5.0,730.0,5.0,730.0,1.0,30.0,60.0,90.0,180.0,8.0,0.0,68.0,8.0,8.0,7.0,9.0,8.0,7.0,0.0,2,0.0,0.0,2.0,1.0,1.0,0.0,0.07,3699,3699,3699
9,4,1.0,0.0,3.0,3.0,8,1.0,1.0,1,41.37859,2.1773,1.0,2,1,8.0,3.0,4.0,6.0,5,38,250.0,240.0,78.0,6.0,50.0,4.0,365.0,3.8,365.0,1.0,15.0,45.0,75.0,165.0,142.0,16.0,90.0,10.0,9.0,10.0,10.0,10.0,9.0,1.0,3,0.0,0.0,2.0,1.0,1.0,0.0,1.28,3699,3699,3699


# Price prediction

In [9]:
y = airbnb["price"]
X = airbnb.drop(columns=['price'])

In [14]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X)
scaler.transform(X)

array([[ 0.6673665 ,  0.3464109 , -0.47385002, ...,  2.25404242,
         2.25404242,  2.25404242],
       [ 0.6673665 ,  0.3464109 , -0.47385002, ...,  2.25404242,
         2.25404242,  2.25404242],
       [ 0.6673665 ,  0.41741589,  2.11037238, ...,  2.20208554,
         2.20208554,  2.20208554],
       ...,
       [-0.04380398, -0.15062408, -0.47385002, ..., -1.38399938,
        -1.38399938, -1.38399938],
       [ 0.6673665 ,  0.41741589,  2.11037238, ..., -1.30553389,
        -1.30553389, -1.30553389],
       [-0.75497446, -1.49971904, -0.47385002, ..., -1.52184415,
        -1.52184415, -1.52184415]])

In [15]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
params = {}
model = LinearRegression(fit_intercept=False)
grid_search = GridSearchCV(model, params, cv=5, scoring=('r2'))
grid_search.fit(x_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=LinearRegression(copy_X=True, fit_intercept=False,
                                        n_jobs=None, normalize=False),
             iid='deprecated', n_jobs=None, param_grid={},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='r2', verbose=0)

In [17]:
best_model = grid_search.best_estimator_

In [18]:
from sklearn.metrics import r2_score
y_predict = best_model.predict(x_test)
r2_score(y_test, y_predict)

0.09379032163946688

In [19]:
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, y_predict)

225972.71853650658