In [1]:
import pandas as pd
import numpy as np
import re

from collections import Counter
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

Since the Listings has a lot of columns, we remove some columns that might be not useful when predicting price, such as url, scrape id. Again, this is from my perspective, of course you can try to fetch those images through the url and treate it as a feature when predicting and you will need some technique to cope with these images.


##### Some reasons why we remove these columns
- 'location': it's the Boston Airbnb information, therefore we don't need this feature.
- 'country': same as location, Boston is in U.S.
- 'review_scores': we only keep the rating scores, since we think the rating scores is the overall information of the lisintg.
- 'square_feet': there are too many NaN in the feature and it's hard to find a method to fill these NaN values.

In [2]:
listings = pd.read_csv('listings.csv')
remove_list = ['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'experiences_offered', 'thumbnail_url', 'medium_url', 'picture_url', 'xl_picture_url',
               'host_id', 'host_url', 'host_name', 'host_thumbnail_url', 'host_picture_url', 'smart_location', 'city', 'state',
               'country_code', 'host_location', 'host_about', 'zipcode', 'host_response_time', 'market', 'country', 'latitude', 'longitude', 'weekly_price', 'extra_people', 'minimum_nights',
               'maximum_nights', 'calendar_updated', 'has_availability', 'availability_30', 'availability_60', 'availability_90',
               'availability_365', 'calendar_last_scraped', 'license', 'jurisdiction_names', 'require_guest_profile_picture',
               'reviews_per_month', 'calculated_host_listings_count', 'require_guest_phone_verification', 'first_review', 'last_review',
               'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location',
               'review_scores_value', 'host_response_rate', 'host_acceptance_rate', 'host_neighbourhood', 'host_verifications',
               'host_listings_count', 'security_deposit', 'requires_license', 'guests_included', 'neighbourhood_group_cleansed',
               'host_total_listings_count', 'monthly_price', 'square_feet', 'host_since', 'street', 'neighbourhood',
              ]
listings = listings.drop(remove_list, axis=1)
print(listings.columns)
print(len(listings.columns))

Index(['summary', 'space', 'description', 'neighborhood_overview', 'notes',
       'transit', 'access', 'interaction', 'house_rules', 'host_is_superhost',
       'host_has_profile_pic', 'host_identity_verified',
       'neighbourhood_cleansed', 'is_location_exact', 'property_type',
       'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds',
       'bed_type', 'amenities', 'price', 'cleaning_fee', 'number_of_reviews',
       'review_scores_rating', 'instant_bookable', 'cancellation_policy'],
      dtype='object')
28


## Preprocess text data

In [3]:
def tokenize(text):
    """
    input: text, string, a single comment from a user
    output: tokens, list, a list contained preprocessed text
    
    """
    stop_words = stopwords.words("english")
    lemmatizer = WordNetLemmatizer()
    
    if type(text) != str:
        return '0'
    
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    tokens = word_tokenize(text)
    
    stemmer = PorterStemmer()
    stemmed = [stemmer.stem(word) for word in tokens if word not in stop_words]
    
    lem = [lemmatizer.lemmatize(word) for word in stemmed]
    
    return lem

def transform_text(lst):
    """
    Compute the intersection between top20 words and original text
    """
    if len(set(lst)) != 0 and lst != '0':
        return len(set(lst).intersection(set(top_20))) / len(set(lst))
    else:
        return 0

In [4]:
text_columns = ['summary', 'space', 'description', 'neighborhood_overview', 'notes', 'transit', 'access', 'interaction', 'house_rules',]
for col in text_columns:
    listings[col] = listings[col].apply(lambda w: tokenize(w))
    counter = Counter()
    for words in listings[col].values:
        for word in words:
            counter[word] += 1

    top_20 = []
    for word_pair in counter.most_common(20):
        top_20.append(word_pair[0])
    listings[col] = listings[col].apply(lambda w: transform_text(w))

## Turn the 'f' & 't' into 1, 0

In [5]:
cols = ['host_is_superhost', 'host_has_profile_pic', 'host_identity_verified', 'is_location_exact', 'instant_bookable']
for col in cols:
    listings[col] = listings[col].apply(lambda val: 1 if val == 't' else 0)

## Extract price
Since the 'price' and 'cleaning_fee' is in string form like '$##.##', we have to extract the correct price.

In [6]:
cols = ['price', 'cleaning_fee']
# fill the NaN in cleaning_fee with '0'
listings['cleaning_fee'] = listings['cleaning_fee'].fillna('0')

for col in cols:
    listings[col] = listings[col].apply(lambda val: float(re.sub(r'[$,]', '', val)) if val != '0' else 0)

# replace the 0 in cleaning_fee with mean
mean = listings['cleaning_fee'].mean()//1
listings['cleaning_fee'] = listings['cleaning_fee'].apply(lambda val: mean if val == 0 else val)

## Fill NaN
Fill the NaN in 'review_scores_rating', 'bathrooms', 'bedrooms', 'beds'

In [7]:
cols = ['review_scores_rating', 'bathrooms', 'bedrooms', 'beds']

for col in cols:
    mean = listings[col].mean()//1
    listings[col] = listings[col].fillna(mean)

## Create dummy variables
Let's deal with the categorical data

In [8]:
def create_dummy_df(df, cols, dummy_na):
    '''
    INPUT:
    df - pandas dataframe with categorical variables you want to dummy
    cat_cols - list of strings that are associated with names of the categorical columns
    dummy_na - Bool holding whether you want to dummy NA vals of categorical columns or not
    
    OUTPUT:
    df - a new dataframe that has the following characteristics:
    '''
    for col in cols:
        try:
            # for each cat add dummy var, drop original column
            df = pd.concat([df.drop(col, axis=1), pd.get_dummies(df[col], prefix=col, prefix_sep='_', drop_first=True, dummy_na=dummy_na)], axis=1)
        except:
            continue
    return df

In [9]:
cols = ['neighbourhood_cleansed', 'property_type', 'room_type', 'bed_type', 'cancellation_policy']

# handle the NaN in 'property_type' with mode
listings['property_type'] = listings['property_type'].fillna('Apartment')

listings = create_dummy_df(listings, cols, dummy_na=False)

## Amenities column
Extract the content in amenities column

In [10]:
listings['amenities'] = listings['amenities'].map(
    lambda amns: "|".join([amn.replace("}", "").replace("{", "").replace('"', "")\
                           for amn in amns.split(",")])
)
amenities = np.unique(np.concatenate(listings['amenities'].map(lambda amns: amns.split("|"))))[1:]
amenity_arr = np.array([listings['amenities'].map(lambda amns: amn in amns) for amn in amenities])
listings = pd.concat([listings, pd.DataFrame(data=amenity_arr.T, columns=amenities)], axis=1)

In [11]:
listings = listings.drop(['amenities'], axis=1)

## Modeling

In [12]:
listings = listings.query('price <= 300')
X = listings.drop(['price'], axis=1)
y = listings['price']

X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, random_state=1)

lm_model = LinearRegression(normalize=True)
lm_model.fit(X_train, Y_train)
    
y_test_preds = lm_model.predict(X_test)
y_train_preds = lm_model.predict(X_train)
    
test_score = r2_score(Y_test, y_test_preds)
train_score = r2_score(Y_train, y_train_preds)

In [13]:
train_score

0.6776038095413444

In [14]:
test_score

0.6760545311982498

## Plotting

In [15]:
coefs = list(zip(lm_model.coef_, listings.drop('price', axis='columns')))
coefs.sort(key=lambda x: x[0], reverse=True)

In [16]:
from bokeh.plotting import figure, output_file, show
from bokeh.transform import factor_cmap
from bokeh.models import ColumnDataSource
from bokeh.palettes import Inferno256
from bokeh.io import output_notebook, export_png

### Neighbourhood

In [17]:
# get the var name and value
neighbourhood_effects = [v for v in coefs if v[1].split('_')[0] == 'neighbourhood']
neighbourhood_effects.sort(key=lambda x: x[0])
neighbourhood_name = [n[1].split('_')[-1] for n in neighbourhood_effects]
neighbourhood_value = [n[0] for n in neighbourhood_effects]

# color palette
idx = np.linspace(0, 255, num=len(neighbourhood_effects), dtype=int)
palette = [Inferno256[idx] for idx in idx]

# build plot
source = ColumnDataSource(data=dict(name=neighbourhood_name, value=neighbourhood_value))

output_notebook()
p = figure(plot_height=300, plot_width=800, x_range=neighbourhood_name, y_range=(-50, 100), title="Neighbourhoods' Coef")
p.vbar(x='name', top='value', width=0.9, source=source, line_color='white', fill_color=factor_cmap('name', palette=palette, factors=neighbourhood_name))

p.y_range.start = -50
p.xgrid.grid_line_color = None
p.xaxis.major_label_orientation = 3.14/5
p.outline_line_color = None

show(p)

### Text data

In [19]:
# get the var name and value
text_effects = [v for v in coefs if v[1] in text_columns]
text_effects.sort(key=lambda x: x[0])
text_name = text_columns
text_value = [n[0] for n in text_effects]

# color palette
idx = np.linspace(0, 255, num=len(text_effects), dtype=int)
palette = [Inferno256[idx] for idx in idx]

# build plot
source = ColumnDataSource(data=dict(name=text_name, value=text_value))

output_notebook()
p = figure(plot_height=300, plot_width=800, x_range=text_name, y_range=(-30, 30), title="Text data Coef")
p.vbar(x='name', top='value', width=0.9, source=source, line_color='white', fill_color=factor_cmap('name', palette=palette, factors=text_name))

p.y_range.start = -30
p.xgrid.grid_line_color = None
p.xaxis.major_label_orientation = 3.14/5
p.outline_line_color = None

show(p)


### Amenities

In [20]:
# get the var name and value
amenities_effects = [v for v in coefs if v[1] in amenities]
amenities_effects.sort(key=lambda x: x[0])
amenities_name = [n[1] for n in amenities_effects]
amenities_value = [n[0] for n in amenities_effects]

# color palette
idx = np.linspace(0, 255, num=len(amenities_effects), dtype=int)
palette = [Inferno256[idx] for idx in idx]

# build plot
source = ColumnDataSource(data=dict(name=amenities_name, value=amenities_value))

output_notebook()
p = figure(plot_height=300, plot_width=800, x_range=amenities_name, y_range=(-30, 30), title="Amenities Coef")
p.vbar(x='name', top='value', width=0.9, source=source, line_color='white', fill_color=factor_cmap('name', palette=palette, factors=amenities_name))

p.y_range.start = -30
p.xgrid.grid_line_color = None
p.xaxis.major_label_orientation = 3.14/5
p.outline_line_color = None

show(p)

In [21]:
# get the var name and value
top20_effects = [v for v in coefs[:20]]
top20_effects.sort(key=lambda x: x[0])
top20_name = [n[1] for n in top20_effects]
top20_value = [n[0] for n in top20_effects]

# color palette
idx = np.linspace(0, 255, num=len(top20_effects), dtype=int)
palette = [Inferno256[idx] for idx in idx]

# build plot
source = ColumnDataSource(data=dict(name=top20_name, value=top20_value))

output_notebook()
p = figure(plot_height=300, plot_width=800, x_range=top20_name, y_range=(-30, 100), title="Top20 Coef")
p.vbar(x='name', top='value', width=0.9, source=source, line_color='white', fill_color=factor_cmap('name', palette=palette, factors=top20_name))

p.y_range.start = 0
p.xgrid.grid_line_color = None
p.xaxis.major_label_orientation = 3.14/5
p.outline_line_color = None

show(p)