In [None]:
from protodata.display import *
from protodata.data_ops import *
from protodata.utils import *
from protodata.datasets import Datasets
from protodata.datasets.airbnb import get_data_path, get_amenities_path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pylab import *

from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler  

import warnings

%matplotlib inline
# Show up to 100 columns when a frame is displayed. The explicitly imported
# alias `pd` is used because the bare name `pandas` is never imported here.
pd.set_option("display.max_columns", 100)
# Silence every warning for the rest of the notebook session
warnings.filterwarnings('ignore')

# Reading data and finding duplicates

First we read the created dataset and look for possible duplicates. Since we did not collect the data ourselves, we cannot be sure that it is free of duplicates.

We discard those entries which have repeated listing identifiers and, for the sake of understanding the data, we also want to know how many listings have duplicated images in their profiles. Sometimes users offer more than one lodging with similar characteristics (not identical) and post the same pictures for all of them. Nevertheless, the number of repeated image links is very low (~0.03%) and preserving them will not do any harm in future steps.

In [None]:
# Load the listings table and the pickled metadata from the dataset folder
airbnb_root = get_tmp_data_location(Datasets.AIRBNB_PRICE)
data = pd.read_csv(get_data_path(airbnb_root))
metadata = load_pickle(get_amenities_path(airbnb_root))

# Listing ids: keep=False flags every member of a duplicated group,
# so none of the repeated rows survives the filter below
duplicated_id = data.duplicated('id', keep=False)
print('Found %d duplicated ids. Removing them ...' % duplicated_id.sum())
data = data[~duplicated_id]

# Picture urls: same treatment as the ids
# NOTE(review): the accompanying text says repeated pictures are preserved,
# but the rows are dropped here - confirm which behaviour is intended
duplicated_pic = data.duplicated('picture_url', keep=False)
print('Found %d duplicated picture urls. Removing them...' % duplicated_pic.sum())
data = data[~duplicated_pic]

In [None]:
# Plain Python list with the name of every column in the frame
col_names = data.columns.values.tolist()
print('List of columns: {}'.format(col_names))

# Column overview

Let's have a look at the columns to see if data is in the expected range. First let's observe the numerical columns.

In [None]:
# Summary statistics for every column (include='all' also covers non-numeric ones)
data.describe(include='all')

## Binary columns

We have some columns which are intended to be binary but have a string representation ('f'/'t'). Let's binarize them.

In [None]:
def str_to_binary(x):
    """Return the boolean encoded by a 't'/'f' flag (case-insensitive).

    Args:
        x: Flag to convert; expected to be the string 't' or 'f' in any case.

    Returns:
        True for 't', False for 'f'.

    Raises:
        ValueError: If x is anything else, including non-string values such
            as a missing entry (NaN/None).
    """
    # Non-strings have no .lower(); report them with the same ValueError
    # instead of leaking an AttributeError
    if isinstance(x, str):
        flag = x.lower()
        if flag == 't':
            return True
        if flag == 'f':
            return False
    raise ValueError('Unexpected feature value {}'.format(x))

to_binarize = ['host_has_profile_pic', 'host_identity_verified', 'host_is_superhost', 'instant_bookable']
for c in to_binarize:
    # Replace the whole column so it takes the dtype of the converted values.
    # `data.loc[:, c] = ...` writes into the existing column in place on
    # pandas >= 2.0, which can leave it with its old object dtype.
    data[c] = data[c].apply(str_to_binary)

Those binary columns will be treated as numeric values in the future. Therefore, we can convert them into floats. From now on, these columns will be considered numerical features. Amenities, however, will be treated as sparse columns in the future, so we prefer to keep them as booleans.

In [None]:
# Apply the project helper with func=float, leaving out the amenity columns
# listed in the metadata (presumably it casts boolean columns - see protodata)
data = convert_boolean(data, excluded_columns=list(metadata['amenities']), func=float)

## Numeric columns

Let's take a look at the histogram of the numeric columns. First we separate columns by their nature.

In [None]:
# Separate between numeric and categorical columns
# NOTE(review): _get_numeric_data is a private pandas helper; presumably it
# also keeps boolean columns - confirm before swapping it for select_dtypes
num_cols = data._get_numeric_data().columns
# Everything that is not numeric is treated as categorical. The frame's own
# column order is kept so listings are reproducible between runs (a set
# difference yields an arbitrary order).
cat_cols = [c for c in data.columns.values if c not in num_cols]

Now we plot histograms of numerical data:

In [None]:
numeric_data = data[num_cols]
# Identifier-like columns carry no distribution worth plotting
excluded_num = ['id', 'scrape_id']
columns_to_plot = [c for c in num_cols if c not in excluded_num]
plot_histograms(
    data=numeric_data,
    var=columns_to_plot,
    path=None,
    prefix=None,
    nrows=2,
    grid=4,
    bins=25,
    fonts=8,
    fig_size=(10, 5),
)

Taking a quick glance at the column descriptions, we can observe:

* Most of the hosts have a profile picture while just two thirds are verified. Just a few of them are superhosts.
* Accommodates, beds, bedrooms and bathrooms follow a reasonable distribution. We may find options with no bedrooms (e.g. studio, cabin) or no bathrooms, but a listing is expected to have at least one bed (even if the bed is not a traditional one).
* Cleaning fee seems to have outliers. We observe the same for the security deposit.
* Prices also seem to include very high values that we will consider as outliers.
* The maximum number of guests (accommodates) is mainly distributed between 0 and 6, but there are also lodgings offering up to 18 guests, which is a reasonable amount for large apartments.
* Host verifications range from 1 to 10, which is ok.
* The minimum number of nights also happens to contain outliers.
* Though there are some listings with a very high number of reviews per month (up to 25), we regard this as a rare but feasible situation.
* Amenities seem to be reasonably distributed.
* Review columns have negative values when missing (non-rated). We see that listings are, on average, rated high (above 8 for specific scores and above 60 for overall rating).
* The price for extra people also contains high values (outliers).
* We see there is a large number of listings that are either fully booked or barely busy.
* There are some listings which include no guests, which does not make sense.

Let's further observe those columns which seemed to contain outliers in the visualization.

#### Beds

We discard those instances that do not have any beds.

In [None]:
# Keep only listings with at least one bed and report how many rows were dropped
size_before = len(data)
has_bed = data['beds'] >= 1
data = data.loc[has_bed]
size_after = len(data)
print('Eliminated %d instances out of %d after removing rows without beds'
     % (size_before - size_after, size_before))

#### Cleaning fee

We just consider lodgings that have cleaning fees below 750 dollars.

In [None]:
# Single source of truth for the bound (in dollars): the filter previously used
# 500 while the accompanying text and the printed report both said 750
max_cleaning_fee = 750
size_before = data.shape[0]
data = data[data['cleaning_fee'] <= max_cleaning_fee]
size_after = data.shape[0]
print('Eliminated %d instances out of %d after removing cleaning fees above %d dollars'
     % (size_before - size_after, size_before, max_cleaning_fee))

#### Price

We saw that the 75th percentile of the prices is around 170 dollars while the maximum price recorded is above 300k dollars. It is also noticeable that there are prices around 0 that must be omitted. We decide to discard those prices below 20 dollars and above 1500.

In [None]:
# Keep prices inside the closed interval [20, 1500] dollars
size_before = len(data)
price_in_range = data['final_price'].between(20, 1500)
data = data.loc[price_in_range]
size_after = len(data)
print('Eliminated %d instances out of %d after removing prices below 20 dollars and above 1500'
     % (size_before - size_after, size_before))

#### Minimum nights

Since our project will focus on short-term lodging price prediction, we will discard lodgings that require a minimum stay of more than 30 days.

In [None]:
# Keep listings whose minimum stay is at most 30 nights
size_before = len(data)
short_stay = data['minimum_nights'] <= 30
data = data.loc[short_stay]
size_after = len(data)
print('Eliminated %d instances out of %d after removing minimum stays above 30 days'
     % (size_before - size_after, size_before))

#### Security deposit

Let's consider a maximum of 2000 dollars as security deposit.

In [None]:
# Keep listings asking for a security deposit of at most 2000 dollars
size_before = len(data)
deposit_ok = data['security_deposit'] <= 2000
data = data.loc[deposit_ok]
size_after = len(data)
print('Eliminated %d instances out of %d after removing security deposits above 2000 dollars'
     % (size_before - size_after, size_before))

#### Guests included

Guests are the number of people that are included in the price. From that number to the maximum number (accommodates), an extra price (extra_people) is paid per person and night.

First, let's ensure that the number of guests never surpasses the number of accommodates:

In [None]:
# A listing cannot include more guests in the price than it can accommodate
size_before = len(data)
coherent_guests = data['guests_included'].le(data['accommodates'])
data = data.loc[coherent_guests]
size_after = len(data)
print('Eliminated %d instances out of %d after removing non-coherent guests feature'
     % (size_before - size_after, size_before))

The number of guests is the number of people the host includes in the price. Therefore, let's discard those entries containing no guests.

In [None]:
# Drop listings whose price includes no guest at all
size_before = len(data)
includes_guests = data['guests_included'] > 0
data = data.loc[includes_guests]
size_after = len(data)
print('Eliminated %d instances out of %d after removing entries without guests included'
     % (size_before - size_after, size_before))

#### Extra people

The extra_people column contains the number of dollars to be paid for extra guests not included in the price.
Extra people price is not expected to be above the base price:

In [None]:
# The per-person surcharge must not exceed the base price of the listing
size_before = len(data)
fair_surcharge = data['extra_people'].le(data['final_price'])
data = data.loc[fair_surcharge]
size_after = len(data)
print('Eliminated %d instances out of %d after removing high extra people prices'
     % (size_before - size_after, size_before))

### Scores

We have seen that most of the scores are distributed in a similar way: most of the values are either top scores or non-rated, a score of 9 has considerable support, and the remaining options have very limited support. For all scores ranging between 0 and 10, we decide to define 4 categories: very good (score of 10), good (score of 9), regular (less than 9) and non-rated.

In [None]:
def categorize_score(score):
    """ Maps a 0-10 review score to a label:
            - -1 (missing): "non-rated"
            - 10: "very good"
            - 9: "good"
            - anything below 9: "regular"
        Any other value (e.g. between 9 and 10, above 10 or NaN) yields None.
    """
    # Exact matches are tested in this order before the "below 9" fallback
    exact_labels = ((-1, "non-rated"), (10, "very good"), (9, "good"))
    for value, label in exact_labels:
        if score == value:
            return label
    return "regular" if score < 9 else None

# Categorize scores in interval 0-10
review_10_scores = ['review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin',
    'review_scores_communication', 'review_scores_location', 'review_scores_value']
for c in review_10_scores:
    # The numeric column becomes a column of labels, so replace it outright.
    # `data.loc[:, c] = ...` writes into the existing numeric column in place
    # on pandas >= 2.0 and depends on implicit upcasting to hold the strings.
    data[c] = data[c].apply(categorize_score)

### Score rating

Let's now categorize the general score of the listing.

In [None]:
def categorize_rating(score):
    """ Maps the overall 0-100 rating to a label:
            - -1 (missing): "non-rated"
            - x > 95: "very good"
            - 90 <= x <= 95: "good"
            - x < 90: "regular"
        A value matching none of the above (e.g. NaN) yields None.
    """
    if score == -1:
        return "non-rated"
    if score > 95:
        return "very good"
    if 90 <= score <= 95:
        return "good"
    if score < 90:
        return "regular"
    return None

col = 'review_scores_rating'
# Replace the numeric column with its labels. `data.loc[:, col] = ...` writes
# in place on pandas >= 2.0 and depends on implicit upcasting to hold strings.
data[col] = data[col].apply(categorize_rating)

## Categorical columns

Now it is time to analyse the categorical columns

In [None]:
def show_categorical(dataframe, cols, excluded_cols, normalized=False):
    """ Prints the value counts of each selected column.

    dataframe: frame holding the data
    cols: columns to report on
    excluded_cols: columns from cols that must be skipped
    normalized: print relative frequencies instead of absolute counts
    """
    subset = dataframe[cols]
    for name in subset:
        if name in excluded_cols:
            continue
        print('Counts for %s' % name)
        frequencies = subset[name].value_counts(normalize=normalized)
        print(frequencies)
        print('\n')

# Columns left out of the per-category report
excluded_cat = [
    'listing_url',
    'picture_url',
    'last_scraped',
]
show_categorical(data, cat_cols, excluded_cat)

### Removing small cities

We observe that there are many cities with just a bunch of listings, which add sparsity to the resulting dataset without adding much information. Let's just use cities with more than 5000 listings.

In [None]:
# Absolute number of listings per area
city_counts = data['area'].value_counts()
# Areas with strictly more than 5000 listings are the only ones kept
big_cities = city_counts.loc[city_counts.gt(5000)].index
in_big_city = data['area'].isin(big_cities)
data = data.loc[in_big_city]
print('List of resulting cities({}): {}'.format(len(big_cities), data['area'].unique().tolist()))

Let's now see how the categorical data is distributed.

In [None]:
# Print the per-category counts again for the current state of the data
show_categorical(data, cat_cols, excluded_cat)

### Reducing sparsity

There are many categories with very little support in the data. As an example, we can see how the no_refunds, long_term or super strict cancellation policies have very few instances. We are going to merge those categories into a single new category that is not already represented by the other ones.

In [None]:
def aggregate_categories(data, column, minimum_support, new_category):
    """ Replaces rarely used categories of a column with a single label.

    data: frame to update; it is modified in place and also returned
    column: name of the categorical column
    minimum_support: relative frequency (0-1) strictly below which a
        category is considered rare
    new_category: label written in place of every rare category
    """
    frequencies = data[column].value_counts(normalize=True)
    rare = [label for label, share in frequencies.items() if share < minimum_support]
    is_rare = data[column].isin(rare)
    data.loc[is_rare, column] = new_category
    return data

# Cancellation policies with a relative frequency below 0.001 (0.1%) become 'other'
data = aggregate_categories(data, 'cancellation_policy', 0.001, 'other')

Let's do the same for the property types, merging those supported by less than 0.1% of the instances into "Other".

In [None]:
# Property types with a relative frequency below 0.001 (0.1%) become 'Other'
data = aggregate_categories(data, 'property_type', 0.001, 'Other')

Finally, we should apply the same logic to the subareas (neighbourhoods). Let's keep those which have at least 0.01% of the instances and set the rest to "Other".

In [None]:
# Count distinct neighbourhoods (a missing value counts as one) around the merge
subareas_before = data['subarea'].nunique(dropna=False)
data = aggregate_categories(data, 'subarea', 0.0001, 'Other')
subareas_after = data['subarea'].nunique(dropna=False)
print('Before we had %d neighbourhoods and now we have %d' % (subareas_before, subareas_after))

Before saving the final version of the dataset, let's have another general look at the data.

In [None]:
# Summary statistics for every column of the cleaned data
data.describe(include='all')

In [None]:
# Per-category counts after the rare categories were merged
show_categorical(data, cat_cols, excluded_cat)

Finally let's visualize the distribution of the prices

In [None]:
# Histogram of the final prices using 20 bins
data['final_price'].hist(bins=20)

And the annual availability of the listings:

In [None]:
# Histogram of the availability_365 column using 20 bins
data['availability_365'].hist(bins=20)

In [None]:
# Peek at the first 15 values of the price column
data['final_price'].head(n=15)

Now we save the final version of the dataset into a csv file.

In [None]:
# data.shape is (rows, columns), which is exactly what the message expects
print('Final dataset contains %d instances and %d columns' % data.shape)
# NOTE: this writes to the same path the data was originally read from
data.to_csv(get_data_path(airbnb_root), index=False)