In [1]:
#import the required libaries
import pandas as pd
import numpy as np
import plotly.offline as pyo
import plotly.graph_objs as go
import geopy.distance
from geopy.geocoders import Nominatim
from ast import literal_eval
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoCV
from sklearn.metrics import r2_score
import json
import plotly.io as pio

pyo.init_notebook_mode(connected=True)

### 1. Business Understanding

- On which factors does the rating of a listing depend the most?
- How accurately can we predict the actual rating?
- How do different cities compare?

### 2. Data Understanding

In [2]:
#define the path to the database
data_path = '.\\data\\'

In [3]:
#read the listings
listings = pd.read_csv(data_path + 'boston_listings.csv')

#show the shape and end of the dataframe
print(listings.shape)
listings.tail(n = 5)

(6247, 106)


Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,...,instant_bookable,is_business_travel_ready,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
6242,31614370,https://www.airbnb.com/rooms/31614370,20190117205200,2019-01-17,(40-2) Simple Studio in Beacon Hill,"""One of Boston’s most picturesque areas, tony ...","Studio apartment, all about location. Brand ne...","""One of Boston’s most picturesque areas, tony ...",none,Beacon Hill is an old neighbor with brick side...,...,t,f,strict_14_with_grace_period,f,f,42,41,1,0,
6243,31614385,https://www.airbnb.com/rooms/31614385,20190117205200,2019-01-17,(M5D)Downtown Renovated 2bed 2 Bath,"""The Financial District is downtown’s business...",Fantastic two bedroom in the heart of the Fina...,"""The Financial District is downtown’s business...",none,"Primarily a pedestrian shopping zone, Downtown...",...,t,f,strict_14_with_grace_period,f,f,42,41,1,0,
6244,31615171,https://www.airbnb.com/rooms/31615171,20190117205200,2019-01-17,89 minot,Beautifull nice rooms quiet close to highway ...,,Beautifull nice rooms quiet close to highway ...,none,,...,t,f,flexible,f,f,1,0,1,0,
6245,31615291,https://www.airbnb.com/rooms/31615291,20190117205200,2019-01-17,"Spacious Room w/ Balcony near BU, Harvard,Down...","Spacious room with private balcony, comfy bed,...",I live in a large but cozy unit with 6 private...,"Spacious room with private balcony, comfy bed,...",none,Allston's neighborhood is quite vibrant! With ...,...,t,f,flexible,f,f,2,0,2,0,
6246,31624092,https://www.airbnb.com/rooms/31624092,20190117205200,2019-01-17,MINUTES TO BCEC/DOWNTOWN/SEAPORT/PRIVATE BATH,"Welcome to South Boston, one of Boston’s premi...",You will enjoy the privacy of your private ent...,"Welcome to South Boston, one of Boston’s premi...",none,This residence is conveniently located less th...,...,t,f,flexible,f,f,1,1,0,0,


In [4]:
#define the columns that may hold important informations
cols_of_interest = ['host_id', 'host_since', 'host_response_time', 'host_response_rate', 'host_is_superhost',
'host_listings_count', 'host_total_listings_count', 'host_verifications', 'host_has_profile_pic', 'host_identity_verified',
'latitude', 'longitude', 'is_location_exact', 'property_type', 'room_type', 'bathrooms', 'accommodates', 'bedrooms', 'beds',
'bed_type', 'amenities', 'square_feet', 'price', 'weekly_price', 'monthly_price', 'security_deposit', 'cleaning_fee', 'extra_people',
'minimum_nights', 'maximum_nights', 'number_of_reviews', 'review_scores_rating', 'instant_bookable', 'is_business_travel_ready',
'cancellation_policy', 'require_guest_profile_picture', 'require_guest_phone_verification', 
]

### 3. Data Preparation

In [8]:
def clean_data(listings, columns, city):
    #extract only a subset of columns
    clean_listings = listings[columns]
    
    #remove columns that exclusively contain nans
    clean_listings = clean_listings.dropna(how = 'all', axis = 1)

    #recreate list from string
    clean_listings['host_verifications'] = clean_listings['host_verifications'].apply(literal_eval)

    #change f and t to false and true
    bool_dict = {'f': 0,
                't': 1}
    clean_listings.replace(bool_dict, inplace = True)  

    #remove unnecessary characters from amenities column
    clean_listings['amenities'] = clean_listings['amenities'].apply(lambda col: re.sub(r'[{}" ]', '', col))
    #split the strings
    clean_listings['amenities'] = clean_listings['amenities'].apply(lambda col: col.split(','))

    #define columns that contain prices
    price_cols = ['price', 'weekly_price', 'monthly_price', 'security_deposit', 'cleaning_fee', 'extra_people']
    #replace nans by -1
    clean_listings[price_cols]=clean_listings[price_cols].fillna('$-1')
    #remove dollar sign and convert to float
    for pcol in price_cols:
        clean_listings[pcol] = clean_listings[pcol].apply(lambda col: float(re.sub('[$,]', '', col)))

    #convert date string to timestamp in seconds
    clean_listings['host_since'] = pd.to_datetime(clean_listings['host_since'][0],  format='%Y-%m-%d')
    clean_listings['host_since']= clean_listings['host_since'].apply(lambda col: float(col.to_datetime64())) / 1000 #ms to s

    #replace all nans with zeros
    clean_listings['review_scores_rating'].fillna(0, inplace = True)
    #create a column for the average ratings of other listings of the same host
    clean_listings['avg_rating'] = 0

    #loop through all listings
    for ii in range(clean_listings.shape[0]):
        #get the amenities of the current listing
        amenities = clean_listings.loc[ii, 'amenities']
        verifications = clean_listings.loc[ii, 'host_verifications']
        host_id = clean_listings.loc[ii, 'host_id']

        #check whether there are any amenities
        if amenities:
            #loop through each amenity
            for amen in amenities:
                #check whether the amenity is in the dataframe
                if amen in clean_listings.columns:
                    clean_listings.loc[ii, 'amenities_' + amen] = 1
                else:
                    clean_listings['amenities_' + amen] = 0
                    clean_listings.loc[ii, 'amenities_' + amen] = 1
        #check whether there are any verifications
        if verifications:
            #loop through each verification type
            for verific in verifications:
                #check whether the verification type is in the dataframe
                if verific in clean_listings.columns:
                    clean_listings.loc[ii, 'host_verifications_' + verific] = 1
                else:
                    clean_listings['host_verifications_' + verific] = 0
                    clean_listings.loc[ii, 'host_verifications_' + verific] = 1

        #add up all ratings of this host without the current listing
        all_ratings = [rating for rating in clean_listings.drop(labels = ii).loc[(clean_listings['host_id'] == host_id) & (clean_listings['number_of_reviews'] > 5), 'review_scores_rating'] if rating != 0]
        #average the ratings
        if all_ratings:
            clean_listings.loc[ii, 'avg_rating'] = (sum(all_ratings)) / len(all_ratings)
    
    #drop the old columns
    clean_listings.drop(labels = ['amenities', 'host_verifications', 'host_id'], inplace = True, axis = 1)

    #convert percent to floats
    clean_listings['host_response_rate'].fillna('0%', inplace = True)
    clean_listings['host_response_rate'] = clean_listings['host_response_rate'].apply(lambda col: float(re.sub('[%]', '', col)))

    #get the distances from the city center
    geolocator = Nominatim(user_agent='rating_predictor')
    location = geolocator.geocode(city)
    city_center = (location.longitude, location.latitude)
    clean_listings['dist_to_center'] = [geopy.distance.vincenty(city_center, (row_lon, row_lat)).km for row_lon, row_lat in zip(clean_listings['longitude'].values, clean_listings['latitude'].values)]
    
    #split numerical and categorical
    listings_cat = clean_listings.select_dtypes(include = ['object'])
    listings_num = clean_listings.select_dtypes(include = ['number'])

    #impute numerical columns with the median
    fill_median = lambda col: col.fillna(col.median())
    listings_num.apply(fill_median, axis = 0)

    #one hot encode the categorical variables
    listings_cat = pd.get_dummies(listings_cat, drop_first = True, dummy_na = True)
    
    #concatenate categorical and numerical data
    clean_listings = pd.concat([listings_cat, listings_num], axis = 1)
    
    return clean_listings

In [410]:
#define the city which are in the database
cities = ['boston', 'amsterdam', 'stockholm']

#loop through all cities and export the cleaned data to speed up the process afterwards
for city in cities:
    #read the listings
    listings = pd.read_csv(data_path + city + '_listings.csv')
    #clean the data
    clean_listings = clean_data(listings, cols_of_interest, city)
    #save the cleaned and reengineered dataframe to save time
    clean_listings.to_csv(data_path + city + '_clean_data.csv', sep = ';', index = False)


Columns (94) have mixed types. Specify dtype option on import or set low_memory=False.



In [149]:
#get the city of interest
city = 'amsterdam' #boston, stockholm, amsterdam

#read the cleaned data
clean_listings = pd.read_csv(data_path + city + '_clean_data.csv', sep = ';')
clean_listings = clean_listings.loc[clean_listings['avg_rating'] != 0, :]
# clean_listings.drop(labels = 'avg_rating', inplace = True, axis = 1)
clean_listings

Unnamed: 0,host_response_time_within a day,host_response_time_within a few hours,host_response_time_within an hour,host_response_time_nan,property_type_Apartment,property_type_Barn,property_type_Bed and breakfast,property_type_Boat,property_type_Boutique hotel,property_type_Bungalow,...,amenities_,amenities_Ski-in/Ski-out,amenities_Showerchair,amenities_Airpurifier,host_verifications_weibo,amenities_Privatebathroom,host_verifications_zhima_selfie,host_verifications_sesame,host_verifications_sesame_offline,dist_to_center
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.157525
4,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4.500048
5,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.128873
6,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.040320
9,0,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1.154129
11,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.480466
15,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.755035
20,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5.437133
22,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2.364001
24,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2.352059


In [150]:
#define the acces token for mapbox maps
with open("token.JSON") as file:
    token = json.load(file)
mapbox_access_token = token['mapbox_access_token']

#estimate the city center by the average geographical location of the listings
geolocator = Nominatim(user_agent='rating_predictor')
location = geolocator.geocode(city)
city_center = (location.longitude, location.latitude)

#define the dict with all traces
trace=go.Scattermapbox(
                        lon=clean_listings['longitude'],
                        lat=clean_listings['latitude'],
                        mode='markers',
                        text=clean_listings['review_scores_rating'],
                        marker=dict(
                                    size=10,
                                    opacity=0.5,
                                    color=clean_listings['review_scores_rating'],
                                    colorscale='Viridis',
                                    colorbar = dict(
                                                    title = 'Rating',
                                                    titleside = 'right',
                                                    tickmode = 'array',
                                                    tickvals = [0,20,40,60,80,100],
                                                    ticks = 'outside',
                                                    titlefont=dict(size=18),
                                                    tickfont=dict(size=14)
                                                    )
                                    ),
                        hoverinfo='text'
                        )

#define the layout
layout=go.Layout(
                title=city.upper(),
                titlefont=dict(
                            size=24,
                            ),
                width=900,
                height=600,
                hovermode='closest',
                mapbox=dict(
                            accesstoken=mapbox_access_token,
                            style='light',
                            bearing=0,
                            center=dict(
                                        lat=city_center[1],
                                        lon=city_center[0]
                                        ),
                            pitch=0,
                            zoom=12
                            ),
                )

#create the figure
fig_map = go.Figure(data=[trace], layout=layout)

#show the figure
pyo.iplot(fig_map)

In [151]:
#split rated and unrated listings
rated = clean_listings[(clean_listings['number_of_reviews'] > 5) & (clean_listings['review_scores_rating'] > 0)]
unrated = clean_listings[clean_listings['number_of_reviews'] == 0]

#drop the number of reviews
rated = rated.drop(labels = 'number_of_reviews', axis = 1)
unrated = unrated.drop(labels = 'number_of_reviews', axis = 1)

#check the price differences between rated and unrated offers
unrated_prices = unrated['price']
rated_prices = rated['price']

#show the prediction vs the truth
trace_rated = go.Histogram(
                histnorm='percent',
                x = unrated_prices,
                opacity = 0.5,
                xbins=dict(
                    start=0,
                    end=300,
                    size=10
                ),
                name = 'Unrated'
                )
trace_unrated = go.Histogram(
                histnorm='percent',
                x = rated_prices,
                opacity = 0.5,
                xbins=dict(
                    start=0,
                    end=300,
                    size=10
                ),
                name = 'Rated'
                )

layout =  go.Layout(
                title=city.upper(),
                titlefont=dict(
                            size = 24
                            ),
                barmode='overlay',
                xaxis=dict(
                    title='Price in ($)',
                    titlefont=dict(
                        size=18,
                        color='rgb(0, 0, 0)'
                    )
                ),
                yaxis=dict(
                    title='Frequency in (%)',
                    titlefont=dict(
                        size=18,
                        color='rgb(0, 0, 0)'
                    )
                )
                    )
fig_price_hist = go.Figure(data = [trace_rated, trace_unrated], layout = layout)
    
pyo.iplot(fig_price_hist)

In [152]:
#show the mean prices
print(rated.shape[0])
print(rated['price'].mean())

print(unrated.shape[0])
print(unrated['price'].mean())

2069
139.3450942484292
275
158.0690909090909


In [153]:
#min-max scale the dataframes
rated = (rated-clean_listings.min())/(clean_listings.max()-clean_listings.min())
rated.fillna(0, inplace = True)
unrated = (unrated-clean_listings.min())/(clean_listings.max()-clean_listings.min())
unrated.fillna(0, inplace = True)

In [154]:
#show the entries that strongly correlate positively with the rating
top_corr_pos = rated.corrwith(rated['review_scores_rating']).nlargest(n = 10)

trace = go.Bar(
            x = top_corr_pos.values,
            y = [idx.lower().replace('_', ' ') for idx in top_corr_pos.index],
            text = np.round(1000 * top_corr_pos.values) / 1000,
            textposition = 'auto',
            orientation = 'h'
            )
layout = go.Layout(
                title=city.upper(),
                titlefont=dict(
                            size = 24
                            ),
                barmode='overlay',
                xaxis=dict(
                    title='Correlation',
                    titlefont=dict(
                        size=18,
                        color='rgb(0, 0, 0)'
                                ),
                        ),
                yaxis=dict(
                            automargin = True
                            )
                        )
                    
fig_top_corr_pos = go.Figure(data = [trace], layout = layout)

pyo.iplot(fig_top_corr_pos)

In [155]:
#show the entries that strongly correlate negatively with the rating
top_corr_neg = rated.corrwith(rated['review_scores_rating']).nsmallest(n = 10)

trace = go.Bar(
            x = top_corr_neg.values,
            y = [idx.lower().replace('_', ' ') for idx in top_corr_neg.index],
            text = np.round(1000 * top_corr_neg.values) / 1000,
            textposition = 'auto',
            orientation = 'h'
            )
layout = go.Layout(
                title=city.upper(),
                titlefont=dict(
                            size = 24
                            ),
                barmode='overlay',
                xaxis=dict(
                    title='Correlation',
                    titlefont=dict(
                        size=18,
                        color='rgb(0, 0, 0)'
                                ),
                        ),
                yaxis=dict(
                            automargin = True
                            )
                        )
                    
fig_top_corr_neg = go.Figure(data = [trace], layout = layout)

pyo.iplot(fig_top_corr_neg)

In [156]:
#show the entries that strongly correlate with the rating
top_corr_pos = rated.corrwith(rated['review_scores_rating']).nlargest(n = 6)
top_corr_neg = rated.corrwith(rated['review_scores_rating']).nsmallest(n = 5)
top_corr = pd.concat((top_corr_pos, top_corr_neg))

trace = go.Bar(
            x = top_corr.values,
            y = [idx.lower().replace('_', ' ') for idx in top_corr.index],
            text = np.round(100 * top_corr.values) / 100,
            textposition = 'auto',
            orientation = 'h'
            )
layout = go.Layout(
                title=city.upper(),
                titlefont=dict(
                            size = 24
                            ),
                barmode='overlay',
                xaxis=dict(
                    title='Correlation',
                    titlefont=dict(
                        size=18,
                        color='rgb(0, 0, 0)'
                                ),
                        ),
                yaxis=dict(
                            automargin = True
                            )
                        )
                    
fig_top_corr = go.Figure(data = [trace], layout = layout)

pyo.iplot(fig_top_corr)

### 4. Modeling

In [157]:
#split predictor and response data
X = rated.drop(labels = 'review_scores_rating', axis = 1)
y = rated['review_scores_rating']

#split the data in test and train data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 102)

In [158]:
#instantiate the model
model = LassoCV(cv = 100, random_state = 1021)

#fit the model
model.fit(X_train, y_train)
#predict the test and train labels and round to two decimals
pred_train = np.round(100*model.predict(X_train))/100
pred_test = np.round(100*model.predict(X_test))/100

#clip predictions between 0 and 1
pred_train[pred_train < 0] = 0
pred_train[pred_train > 1] = 1
pred_test[pred_test < 0] = 0
pred_test[pred_test > 1] = 1

#merge all rated predicitons and responses
pred_rated = np.concatenate((pred_train, pred_test))
y_rated = np.concatenate((y_train, y_test))

In [159]:
#show the model alpha
model.alpha_

0.0002938902957718028

### 5. Evaluation

In [160]:
#measure the model performance
train_score = r2_score(y_true = y_train, y_pred = pred_train) 
test_score = r2_score(y_true = y_test, y_pred = pred_test)

print(train_score)
print(test_score)

0.3979730717252252
0.3680004486442532


In [161]:
#show the prediction vs the truth
data = go.Scatter(
                x = 100 * y_test,
                y = 100 * pred_test,
                mode = 'markers',
                marker = dict(
                            size = 12,
                            color = 'rgba(255, 127, 0, .3)',
                    )
                )
layout =  go.Layout(
                title=city.upper(),
                titlefont=dict(
                            size=24,
                            ),
                xaxis=dict(
                    title='True rating',
                    titlefont=dict(
                        size=18,
                        color='rgb(0, 0, 0)'
                    )
                ),
                yaxis=dict(
                    title='Predicted rating',
                    titlefont=dict(
                        size=18,
                        color='rgb(0, 0, 0)'
                    )
                )
                    )
fig_prediction = go.Figure(data = [data], layout = layout)
    
pyo.iplot(fig_prediction)

In [162]:
#split unrated data to response and predictor
X_unrated = unrated.drop(labels = 'review_scores_rating', axis = 1)
y_unrated = unrated['review_scores_rating']

#predict the unrated listings
pred_unrated = np.round(100*model.predict(X_unrated))/100

#clip the predictions
pred_unrated[pred_unrated > 1] = 1
pred_unrated[pred_unrated < 0] = 0

#remove the predictions smaller than 0.33
# pred_unrated = pred_unrated[pred_unrated > 0.33]

print("Unrated listings:\n   - Mean: {:0.2f}\n   - Std. dev.: {:1.4f}".format(np.mean(pred_unrated), np.std(pred_unrated)))
print("Rated listings:\n   - Mean: {:0.2f}\n   - Std. dev.: {:1.4f}".format(np.mean(y_rated), np.std(y_rated)))
print("Relative diff.:\n   - Mean: {:0.1f}%\n   - Std. dev.: {:1.1f}%".format(100*abs((np.mean(y_rated) - np.mean(pred_unrated)) / np.mean(y_rated)),
                                                                            100*abs((np.std(y_rated) - np.std(pred_unrated)) / np.std(y_rated))))

Unrated listings:
   - Mean: 0.92
   - Std. dev.: 0.0385
Rated listings:
   - Mean: 0.93
   - Std. dev.: 0.0573
Relative diff.:
   - Mean: 0.9%
   - Std. dev.: 32.9%


In [163]:
#show the prediction vs the truth
trace_rated = go.Histogram(
                histnorm='percent',
                x = y_rated * 100,
                opacity = 0.5,
                xbins=dict(
                    start=0,
                    end=100,
                    size=1
                ),
                name = 'Rated'
                )
trace_unrated = go.Histogram(
                histnorm='percent',
                x = pred_unrated * 100,
                opacity = 0.5,
                xbins=dict(
                    start=0,
                    end=100,
                    size=1
                ),
                name = 'Unrated'
                )

layout =  go.Layout(
                title=city.upper(),
                titlefont=dict(
                            size=24,
                            ),
                barmode='overlay',
                xaxis=dict(
                    title='Rating',
                    titlefont=dict(
                        size=18,
                        color='rgb(0, 0, 0)'
                    )
                ),
                yaxis=dict(
                    title='Frequency in (%)',
                    titlefont=dict(
                        size=18,
                        color='rgb(0, 0, 0)'
                    )
                )
                    )
fig_rating_hist = go.Figure(data = [trace_unrated, trace_rated], layout = layout)
    
pyo.iplot(fig_rating_hist)

In [164]:
# pio.write_image(fig_map, 'exports/' + city + '_map.pdf')
pio.write_image(fig_top_corr, 'exports/' + city + '_top_corr.pdf')
pio.write_image(fig_top_corr_pos, 'exports/' + city + '_top_corr_pos.pdf')
pio.write_image(fig_top_corr_neg, 'exports/' + city + '_top_corr_neg.pdf')
pio.write_image(fig_prediction, 'exports/' + city + '_prediction.pdf')
pio.write_image(fig_price_hist, 'exports/' + city + '_price_histogram.pdf')
pio.write_image(fig_rating_hist, 'exports/' + city + '_rating_histogram.pdf')

### 6. Deployment

- [Medium](https://medium.com/@edizherkert/how-can-you-predict-airbnb-ratings-125a5491db3e)
- [GitHub](https://github.com/EdKaHe/airbnb_rating_prediction/tree/master)