### Import Libraries:

In [75]:
import numpy as np
import pandas as pd
import scipy as sp
from scipy.stats import mode
from sklearn.linear_model import LinearRegression as LinReg
import matplotlib
import matplotlib.pyplot as plt
from sklearn import discriminant_analysis
from sklearn.decomposition import PCA
from sklearn import preprocessing
from collections import Counter
%matplotlib inline

### Import listings, clean data, extract features

In [2]:
# Load the scraped listings from the CSV file
listings = pd.read_csv('listings.csv', sep=',')

# Show the names of all available columns
print(listings.columns.values)

Fig 1

['id' 'scrape_id' 'last_scraped' 'name' 'picture_url' 'host_id' 'host_name'
 'host_since' 'host_picture_url' 'street' 'neighbourhood'
 'neighbourhood_cleansed' 'city' 'state' 'zipcode' 'market' 'country'
 'latitude' 'longitude' 'is_location_exact' 'property_type' 'room_type'
 'accommodates' 'bathrooms' 'bedrooms' 'beds' 'bed_type' 'square_feet'
 'price' 'weekly_price' 'monthly_price' 'guests_included' 'extra_people'
 'minimum_nights' 'maximum_nights' 'calendar_updated' 'availability_30'
 'availability_60' 'availability_90' 'availability_365'
 'calendar_last_scraped' 'number_of_reviews' 'first_review' 'last_review'
 'review_scores_rating' 'review_scores_accuracy'
 'review_scores_cleanliness' 'review_scores_checkin'
 'review_scores_communication' 'review_scores_location'
 'review_scores_value' 'host_listing_count']


In [46]:
# Raise the column display limit so the wide frame below is shown in full
pd.set_option('display.max_columns', 77)

print('%s %s' % ('Listings Data Shape: ', listings.shape))

# Preview the first three rows
listings.head(3)

Listings Data Shape:  (27392, 52)


Unnamed: 0,id,scrape_id,last_scraped,name,picture_url,host_id,host_name,host_since,host_picture_url,street,neighbourhood,neighbourhood_cleansed,city,state,zipcode,market,country,latitude,longitude,is_location_exact,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,square_feet,price,weekly_price,monthly_price,guests_included,extra_people,minimum_nights,maximum_nights,calendar_updated,availability_30,availability_60,availability_90,availability_365,calendar_last_scraped,number_of_reviews,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,host_listing_count
0,1069266,20150100000000.0,1/2/15,Stay like a real New Yorker!,https://a0.muscache.com/pictures/50276484/larg...,5867023,Michael,4/10/13,https://a2.muscache.com/ic/users/5867023/profi...,"East 53rd Street, New York, NY 10022, United S...",Midtown East,Midtown East,New York,NY,10022-4175,New York,United States,40.756852,-73.964754,t,Apartment,Entire home/apt,2,1.0,1.0,1.0,Real Bed,,$160.00,"$1,000.00",,2,$0.00,3,14,3 weeks ago,21,51,72,322,1/2/15,62,4/28/13,12/17/14,86.0,9.0,7.0,9.0,9.0,10.0,9.0,1
1,1846722,20150100000000.0,1/2/15,Apartment 20 Minutes Times Square,https://a1.muscache.com/pictures/35865039/larg...,2631556,Denise,6/13/12,https://a2.muscache.com/ic/users/2631556/profi...,"West 155th Street, New York, NY, United States",Hamilton Heights,Hamilton Heights,New York,NY,,New York,United States,40.830599,-73.941014,f,Apartment,Entire home/apt,10,1.0,3.0,3.0,Real Bed,,$105.00,,,1,$50.00,1,180,4 days ago,28,58,88,348,1/2/15,22,1/5/14,12/29/14,85.0,8.0,8.0,9.0,8.0,7.0,8.0,2
2,2061725,20150100000000.0,1/2/15,Option of 2 Beds w Private Bathroom,https://a2.muscache.com/pictures/50650147/larg...,4601412,Miao,1/5/13,https://a0.muscache.com/ic/users/4601412/profi...,"Van Buren Street, Brooklyn, NY 11221, United S...",Bushwick,Bushwick,Brooklyn,NY,11221,New York,United States,40.692189,-73.92412,t,Apartment,Private room,2,1.0,1.0,2.0,Real Bed,,$58.00,,,1,$12.00,3,30,today,4,13,26,227,1/2/15,35,2/4/14,12/29/14,98.0,10.0,10.0,10.0,10.0,9.0,10.0,4


For our baseline model, we can start with features that we intuitively expect to impact a listing's price. Here are some good features to start with:
- price
- neighbourhood_cleansed
- property_type
- room_type
- accommodates
- bathrooms
- bedrooms
- beds
- guests_included
- extra_people
- number_of_reviews
- review_scores_rating
- host_listing_count

This brings our feature list down from 52 to 13.



In [64]:
# Keep only the thirteen columns used by the baseline model
listings_slim = listings[[
    'property_type', 'price', 'neighbourhood_cleansed', 'room_type',
    'accommodates', 'bathrooms', 'bedrooms', 'beds',
    'guests_included', 'extra_people', 'number_of_reviews',
    'review_scores_rating', 'host_listing_count',
]]

print('%s %s' % ('Size of trimmed data: ', listings_slim.shape))

Size of trimmed data:  (27392, 13)


Let's remove entries (rows) that contain faulty data, such as when:
- There are 0 bedrooms
- There are 0 bathrooms
- There are 0 beds
- The price is $0

In [65]:
# Delete bad entries: rows reporting 0 bedrooms, 0 bathrooms, 0 beds or a $0 price.
#
# 'price' still holds currency strings such as '$160.00' at this point, so
# comparing the raw column with 0 would never match. Parse it into a temporary
# numeric Series for the comparison only; the column itself is left untouched
# and is converted in the next step.
price_as_number = (listings_slim['price']
                   .replace(r'[\$,)]', '', regex=True)
                   .replace(r'[(]', '-', regex=True)
                   .astype(float))

# NaN != 0 evaluates to True, so rows with missing values pass this mask and
# are removed by dropna() below instead.
has_plausible_values = ((listings_slim['bedrooms'] != 0) &
                        (listings_slim['bathrooms'] != 0) &
                        (listings_slim['beds'] != 0) &
                        (price_as_number != 0))
listings_slim = listings_slim[has_plausible_values]

# Delete additional entries with NaN values
listings_slim = listings_slim.dropna(axis=0)

print('%s %s' % ('Size of trimmed data: ', listings_slim.shape))

Size of trimmed data:  (16694, 13)


We also need to strip the dollar sign (and thousands separators) from `price` and `extra_people` and convert both columns to floats.

In [66]:
# Convert the currency strings in 'price' and 'extra_people' to floats.
# '$', ',' and ')' are stripped and '(' becomes '-', so a parenthesised amount
# such as '(5.00)' is read as -5.0.
# The patterns are raw strings so the regex backslash is not treated as an
# (invalid) string escape sequence.
for currency_column in ['price', 'extra_people']:
    listings_slim[currency_column] = (
        listings_slim[currency_column]
        .replace(r'[\$,)]', '', regex=True)
        .replace(r'[(]', '-', regex=True)
        .astype(float)
    )

Because we are doing OLS for our baseline regression, we must have only numerical predictors and so we must also one-hot encode our categorical variables.

In [67]:
def one_hot(x_df):
    """Return a new DataFrame in which categorical columns are one-hot encoded.

    Columns are processed in their original order. A column whose dtype kind
    is object, bytes or unicode ('O', 'S', 'U') is replaced by the indicator
    columns produced by ``pd.get_dummies``, named ``<column>_<value>``. Every
    other column is carried over unchanged.

    Parameters
    ----------
    x_df : pandas.DataFrame
        Frame to encode. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The expanded frame; an empty DataFrame when ``x_df`` has no columns.
    """
    pieces = []
    for column in x_df.columns:
        values = x_df[column]
        # Inspect dtype.kind rather than comparing with np.dtype('string'):
        # that alias is not defined by every NumPy build and raises TypeError
        # where it is missing.
        if values.dtype.kind in ('O', 'S', 'U'):
            # categorical: expand into one indicator column per distinct value
            pieces.append(pd.get_dummies(values, prefix=column))
        else:
            # non-categorical: keep the column as it is
            pieces.append(values)

    if not pieces:
        # pd.concat refuses an empty list; return an empty frame instead
        return pd.DataFrame({})

    # Concatenate once instead of growing a frame on every loop iteration
    return pd.concat(pieces, axis=1)

# Report the frame's dimensions before and after expanding the categorical columns
print('%s %s' % ('Size of trimmed data: ', listings_slim.shape))
listings_slim = one_hot(listings_slim)
print('%s %s' % ('Size of trimmed data: ', listings_slim.shape))

# Preview the first four encoded rows
listings_slim.head(4)

Size of trimmed data:  (16694, 13)
Size of trimmed data:  (16694, 197)


Unnamed: 0,property_type_Apartment,property_type_Bed & Breakfast,property_type_Boat,property_type_Cabin,property_type_Castle,property_type_Chalet,property_type_Dorm,property_type_Earth House,property_type_House,property_type_Lighthouse,property_type_Loft,property_type_Other,property_type_Tent,property_type_Treehouse,property_type_Villa,price,neighbourhood_cleansed_Allerton,neighbourhood_cleansed_Alphabet City,neighbourhood_cleansed_Astoria,neighbourhood_cleansed_Bath Beach,neighbourhood_cleansed_Battery Park City,neighbourhood_cleansed_Bay Ridge,neighbourhood_cleansed_Baychester,neighbourhood_cleansed_Bayside,neighbourhood_cleansed_Bedford Park,neighbourhood_cleansed_Bedford-Stuyvesant,neighbourhood_cleansed_Bensonhurst,neighbourhood_cleansed_Bergen Beach,neighbourhood_cleansed_Boerum Hill,neighbourhood_cleansed_Borough Park,neighbourhood_cleansed_Brighton Beach,neighbourhood_cleansed_Bronxdale,neighbourhood_cleansed_Brooklyn,neighbourhood_cleansed_Brooklyn Heights,neighbourhood_cleansed_Brooklyn Navy Yard,neighbourhood_cleansed_Bushwick,neighbourhood_cleansed_Canarsie,neighbourhood_cleansed_Carroll Gardens,...,neighbourhood_cleansed_Throgs Neck,neighbourhood_cleansed_Times Square/Theatre District,neighbourhood_cleansed_Todt Hill,neighbourhood_cleansed_Tompkinsville,neighbourhood_cleansed_Tottenville,neighbourhood_cleansed_Tremont,neighbourhood_cleansed_Tribeca,neighbourhood_cleansed_Union Square,neighbourhood_cleansed_University Heights,neighbourhood_cleansed_Upper East Side,neighbourhood_cleansed_Upper West Side,neighbourhood_cleansed_Utopia,neighbourhood_cleansed_Van Nest,neighbourhood_cleansed_Vinegar Hill,neighbourhood_cleansed_Washington Heights,neighbourhood_cleansed_West Brighton,neighbourhood_cleansed_West Village,neighbourhood_cleansed_Westchester Village,neighbourhood_cleansed_Westerleigh,neighbourhood_cleansed_Whitestone,neighbourhood_cleansed_Williamsbridge,neighbourhood_cleansed_Williamsburg,neighbourhood_cleansed_Windsor 
Terrace,neighbourhood_cleansed_Woodhaven,neighbourhood_cleansed_Woodlawn,neighbourhood_cleansed_Woodside,room_type_Entire home/apt,room_type_Private room,room_type_Shared room,accommodates,bathrooms,bedrooms,beds,guests_included,extra_people,number_of_reviews,review_scores_rating,host_listing_count
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,160.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2,1.0,1.0,1.0,2,0.0,62,86.0,1
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,105.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,10,1.0,3.0,3.0,1,50.0,22,85.0,2
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,58.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2,1.0,1.0,2.0,1,12.0,35,98.0,4
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,185.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2,1.0,1.0,1.0,1,0.0,26,96.0,1


We can see that one-hot encoding brought our number of columns from 13 to 197. As a quick sanity check, the only categorical variables are 'property_type', 'room_type', and 'neighbourhood_cleansed'; in the cleaned data they expand into 15, 3, and 169 indicator columns respectively, which together with the 10 numeric columns gives 15 + 3 + 169 + 10 = 197, so this makes sense.

### Baseline OLS Model

In [102]:
# Response (dependent variable): the listing price
Y = listings_slim['price']

# Predictors (independent variables): every remaining column
X = listings_slim.drop('price', axis=1)

# Fit ordinary least squares and score it on the same data it was fitted on
linreg = LinReg().fit(X, Y)
training_set_score = linreg.score(X, Y)
print('The R^2 score on our training data is: ' + str(round(training_set_score, 3)))

The R^2 score on our training data is: 0.489


In [111]:
# Names of the predictors, in the column order used to fit the model
variable_names = X.columns.values

# Fitted coefficient of each predictor
coefficient_values = np.array(linreg.coef_)

# One row per predictor; AbsCoef holds the coefficient's magnitude
coef_matrix = pd.DataFrame({
    'CoefValues': coefficient_values,
    'VarName': variable_names,
    'AbsCoef': np.abs(coefficient_values),
})

Below is a table of the fitted coefficient for each variable that we decided to include, sorted by absolute value in ascending order.

In [115]:
# Order the coefficients from smallest to largest magnitude, then drop the
# helper column used for sorting.
# sort_values(by=...) replaces DataFrame.sort(columns=...), which pandas
# deprecated and later removed.
sorted_coef_matrix = coef_matrix.sort_values(by='AbsCoef').drop('AbsCoef', axis=1)
sorted_coef_matrix

  if __name__ == '__main__':


Unnamed: 0,CoefValues,VarName
192,0.06015169,extra_people
193,-0.3239639,number_of_reviews
194,0.9691101,review_scores_rating
195,-1.146117,host_listing_count
191,4.755259,guests_included
190,-10.45568,beds
187,13.88578,accommodates
189,57.46099,bedrooms
188,67.74525,bathrooms
2,-19571390.0,property_type_Boat


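As we can see, the indicator columns of our three categorical features all receive essentially the same (very large) weight, regardless of which category they encode. This is likely an artifact of collinearity rather than a real effect: the indicator columns of each categorical feature sum to one and therefore duplicate the intercept. It would likely not be the case in a non-linear model and will be interesting to explore.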