### Import Libraries:

In [None]:
import numpy as np
import pandas as pd
import scipy as sp
from scipy.stats import mode
from sklearn.linear_model import LinearRegression as LinReg
import matplotlib
import matplotlib.pyplot as plt
from sklearn import discriminant_analysis
from sklearn.decomposition import PCA
from sklearn import preprocessing
from collections import Counter
%matplotlib inline

### Import listings, clean data, extract features

In [None]:
# Load the raw listings table from the comma-separated export
listings = pd.read_csv('listings.csv', sep=',')

# Show the name of every available column
print(listings.columns.values)

In [None]:
print(' '.join(['Listings Data Shape: ', str(listings.shape)]))

# Let pandas render up to 77 columns so the preview below is not truncated
pd.set_option('display.max_columns', 77)

listings.head(3)

For our baseline model, we can start by using features that we intuitively sense will impact a listing's price. Here are some good features to start with.
- price
- neighbourhood_cleansed
- property_type
- room_type
- accommodates
- bathrooms
- bedrooms
- beds
- guests_included
- extra_people
- number_of_reviews
- review_scores_rating
- host_listing_count

This brings our feature list down from 52 to 13.



In [None]:
# Columns kept for the baseline model: the target ('price') plus its predictors
baseline_columns = [
    'property_type',
    'price',
    'neighbourhood_cleansed',
    'room_type',
    'accommodates',
    'bathrooms',
    'bedrooms',
    'beds',
    'guests_included',
    'extra_people',
    'number_of_reviews',
    'review_scores_rating',
    'host_listing_count',
]
listings_slim = listings[baseline_columns]

print(' '.join(['Size of trimmed data: ', str(listings_slim.shape)]))

Let's remove entries (rows) that contain faulty data, such as when:
- There are 0 bedrooms
- There are 0 bathrooms
- There are 0 beds
- The price is $0

In [None]:
# Delete bad entries: listings that report zero bedrooms, bathrooms or beds.
# (The bathrooms filter was missing even though 'bathrooms' is a kept column.)
listings_slim = listings_slim[listings_slim.bedrooms != 0]
listings_slim = listings_slim[listings_slim.bathrooms != 0]
listings_slim = listings_slim[listings_slim.beds != 0]

# 'price' is still currency text at this point (the '$' is only stripped in a
# later cell), so comparing the raw column with the integer 0 keeps every row.
# Parse a numeric copy purely for the comparison; the column itself is left
# untouched so the later conversion step keeps working unchanged.
price_as_number = listings_slim['price'].replace(r'[\$,)]', '', regex=True) \
        .replace('[(]', '-', regex=True).astype(float)
listings_slim = listings_slim[price_as_number != 0]

# Delete additional entries with NaN values in any remaining column
listings_slim = listings_slim.dropna(axis=0)

print(' '.join(['Size of trimmed data: ', str(listings_slim.shape)]))

We also need to drop the dollar sign from `price` and `extra_people` and convert both columns to floats.

In [None]:
# Turn the currency text in 'price' and 'extra_people' into floats:
# drop '$', ',' and ')' first, then read a leading '(' as a minus sign.
for money_column in ['price', 'extra_people']:
    without_symbols = listings_slim[money_column].replace('[\$,)]', '', regex=True)
    with_sign = without_symbols.replace('[(]', '-', regex=True)
    listings_slim[money_column] = with_sign.astype(float)

Because we are doing OLS for our baseline regression, we must have only numerical predictors and so we must also one-hot encode our categorical variables.

In [None]:
def one_hot(x_df):
    """Return a new DataFrame in which categorical columns are one-hot encoded.

    A column is treated as categorical when its dtype kind is object, bytes or
    unicode text. Each such column is replaced by the indicator columns that
    ``pd.get_dummies`` produces, named ``<column>_<value>``. Every other column
    is passed through unchanged (no scaling is applied). The original column
    order and row index are preserved.

    Parameters
    ----------
    x_df : pandas.DataFrame
        Frame holding a mix of numeric and categorical columns.

    Returns
    -------
    pandas.DataFrame
        The expanded frame; an empty DataFrame when ``x_df`` has no columns.
    """
    pieces = []

    for column in x_df.columns:
        series = x_df[column]
        # Test the dtype *kind* rather than comparing against np.dtype('string'):
        # that alias is not a valid dtype name on Python 3 builds of numpy and
        # would raise for every non-object column.
        if getattr(series.dtype, 'kind', None) in ('O', 'S', 'U'):
            # Categorical: expand into one indicator column per distinct value
            pieces.append(pd.get_dummies(series, prefix=column))
        else:
            # Numeric: keep as is
            pieces.append(series)

    # pd.concat rejects an empty list, so handle a column-less frame explicitly
    if not pieces:
        return pd.DataFrame({})

    # Concatenate once at the end instead of re-copying the frame per column
    return pd.concat(pieces, axis=1)

# Report the frame's shape before and after the categorical columns are expanded
size_label = 'Size of trimmed data: '
print(' '.join([size_label, str(listings_slim.shape)]))
listings_slim = one_hot(listings_slim)
print(' '.join([size_label, str(listings_slim.shape)]))

listings_slim.head(4)

We can see that one-hot encoding brought our number of features from 13 to 197. In actuality and as a quick sanity check, the only categorical variables are 'property_type', 'room_type', and 'neighbourhood_cleansed' and there are 186 neighbourhoods so this makes sense.

### Baseline OLS Model

In [None]:
# Dependent variable (y): the listing price
Y = listings_slim['price']

# Independent variables (x): every column except the price itself
X = listings_slim.drop(['price'], axis=1)

# Fit ordinary least squares and report the fit on the same data it was trained on
linreg = LinReg().fit(X, Y)
training_set_score = linreg.score(X, Y)
print('The R^2 score on our training data is: ' + str(round(training_set_score, 3)))

In [None]:
# Fitted coefficient for each predictor, copied out of the model
coefficient_values = np.array(linreg.coef_)

# Predictor names, taken from the columns the model was fitted on
variable_names = X.columns.values

# One row per predictor: signed coefficient, name, and coefficient magnitude
coef_matrix = pd.DataFrame({
    'CoefValues': coefficient_values,
    'VarName': variable_names,
    'AbsCoef': np.abs(coefficient_values),
})

Below is a table that contains the sorted coefficient values for each variable that we decided to include.

In [None]:
# Order predictors by coefficient magnitude (ascending), then drop the helper
# column used only for sorting.
# DataFrame.sort(columns=...) was removed from pandas in favour of sort_values;
# prefer the current method and fall back only on installs that predate it.
if hasattr(coef_matrix, 'sort_values'):
    ranked_coefs = coef_matrix.sort_values(by='AbsCoef')
else:
    ranked_coefs = coef_matrix.sort(columns='AbsCoef')
sorted_coef_matrix = ranked_coefs.drop('AbsCoef', axis=1)
sorted_coef_matrix

As we can see, our three categorical features have the same weight despite their encodings – this would likely not be the case in a non-linear model and will be interesting to explore.