In [577]:
# loading packages
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import sklearn.linear_model as slm
from sklearn.model_selection import train_test_split

## 1 Loading data

In [578]:
# Read the tab-separated training listings into `data`.
data = pd.read_csv('airbnb-seattle-listings-train.csv', sep='\t')

# 1.1 Describing the dataset
# There are 7540 rows and 106 columns in the dataset. Most of the columns are non-numerical, such as
# the description of the facility, a short summary of the apartment, and boolean flags such as whether
# smoking is allowed in the room.
# We consider most of them not useful for a predictive model, so we are selective about which
# variables we use. However, some of the qualitative variables are useful, like house type and neighborhood.
# To take them into our model we need to transform the dataset, turning some non-numerical values into a
# 0/1 representation or into a grouped mean using a group-by.

# 1.2 Missing data
# Missing data in the set comes in different forms, and dropna() is not always useful.
# In training our model we draw a threshold on the missing percentage: if more than 20% of a variable's
# data is missing, we avoid using that variable in our regression. For the remaining missing values, we
# remove the rows in which a variable selected for the regression is missing.

# 1.3
# For example, we use security_deposit in the full model. We think it carries information on the housing price.
# We will say more about it when working on the regression.
# NOTE(review): the full-model cell below does not list security_deposit among its columns - confirm
# which statement is intended.

## 2 Modeling

In [579]:
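# 2.1
# We split our dataset into train/test subsets, but only after the full dataset has been cleaned
# (see the end of section 2.2), so that both subsets share the same preprocessing.
# The draft split below is kept for reference only; note that the split actually used in 2.2 holds
# out 30% of the rows (test_size=0.3), not 25%.
# np.random.seed(1)
# train, test = train_test_split(data, test_size=0.25)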

In [580]:
# 2.2
# Clean the dataset, then split it into train/test subsets.

# The money columns are stored as text such as "$1,200.00". Strip the dollar sign and the
# thousands separators and convert the result to float. Missing or unparseable entries become
# NaN (instead of the literal string 'nan'), so isna()/dropna() see them as missing.
for money_column in ['price', 'security_deposit', 'cleaning_fee', 'weekly_price', 'monthly_price']:
    stripped = [re.sub('[$,]', '', str(value)) for value in data[money_column]]
    data[money_column] = pd.to_numeric(stripped, errors='coerce')

# Encode the 't'/'f' flag as 1/0. The result is assigned back to the frame rather than using
# replace(..., inplace=True) on a column selection, which does not update `data` when pandas
# Copy-on-Write is active.
data['requires_license'] = data['requires_license'].replace({'f': 0, 't': 1})

# Split the cleaned dataset into two subsets. train_test_split draws from NumPy's global
# random state here, so it is seeded immediately beforehand to make the split repeatable.
np.random.seed(1)
train, test = train_test_split(data, test_size=0.3)

### simple regression

In [581]:
# Two-variable model: regress the nightly price on weekly_price and monthly_price.
# The column list is written once and reused for the train and the test subset.
simple_columns = ['price', 'weekly_price', 'monthly_price']

# Keep only the rows in which every selected column parses as a number.
df = train[simple_columns].apply(pd.to_numeric, errors='coerce').dropna()
x = sm.add_constant(df.drop(columns='price'))
y = df.price

# Fit with statsmodels to get the coefficient table.
mod = sm.OLS(y, x)
res = mod.fit()
print(res.summary())

# Fit the same design matrix with sklearn and compute the rmse on the held-out subset.
m = slm.LinearRegression().fit(x, y)
df_2 = test[simple_columns].apply(pd.to_numeric, errors='coerce').dropna()
y_test = df_2.price
# By default add_constant skips the intercept column when the frame already contains a
# constant column, so calling it independently on train and test can yield different
# columns. Force the constant on the test side, then keep exactly the columns (and the
# column order) the model was fit on.
x_test = sm.add_constant(df_2.drop(columns='price'), has_constant='add')[x.columns]
yhat = m.predict(x_test)
rmse = np.sqrt(np.mean((yhat - y_test)**2))

# Print the sklearn rmse.
print(rmse)

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.439
Model:                            OLS   Adj. R-squared:                  0.437
Method:                 Least Squares   F-statistic:                     168.2
Date:                Thu, 05 Dec 2019   Prob (F-statistic):           1.17e-54
Time:                        19:08:57   Log-Likelihood:                -2536.1
No. Observations:                 432   AIC:                             5078.
Df Residuals:                     429   BIC:                             5090.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const            43.7677      6.081      7.198

  return ptp(axis=axis, out=out, **kwargs)
  return ptp(axis=axis, out=out, **kwargs)


### Full regression

In [587]:
# Full model: regress price on every variable we think is important.
# The column list is written once and reused for the train and the test subset.
full_columns = ['price','accommodates','bathrooms','bedrooms','beds', 'weekly_price', 'monthly_price',
                'reviews_per_month', 'requires_license', 'review_scores_location', 'review_scores_accuracy',
                'review_scores_rating', 'cleaning_fee']

# Keep only the rows in which every selected column parses as a number.
df = train[full_columns].apply(pd.to_numeric, errors='coerce').dropna()

# Design matrix (all predictors plus an intercept column) and response.
# NOTE(review): the recorded summary for this cell starts at `accommodates` and shows no
# `const` row. add_constant skips the intercept by default when a column is already constant,
# so one of the predictors (possibly requires_license) appears to be constant after dropna()
# and its coefficient then plays the role of the intercept - confirm. The call is left
# unchanged so that m_full keeps the inputs the "Additional Task" cell feeds it.
x = sm.add_constant(df.drop(columns='price'))
y = df.price

# Fit with statsmodels to get the coefficient table.
mod = sm.OLS(y, x)
res = mod.fit()
print(res.summary())

# Fit the same design matrix with sklearn and compute the rmse on the held-out subset.
m_full= slm.LinearRegression().fit(x, y)
df_3 = test[full_columns].apply(pd.to_numeric, errors='coerce').dropna()
y_test = df_3.price
# Calling add_constant independently on train and test can yield different columns (see the
# note above). Force the constant on the test side, then keep exactly the columns (and the
# column order) the model was fit on.
x_test = sm.add_constant(df_3.drop(columns='price'), has_constant='add')[x.columns]
yhat = m_full.predict(x_test)

rmse = np.sqrt(np.mean((yhat - y_test)**2))
print(rmse)

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.556
Model:                            OLS   Adj. R-squared:                  0.543
Method:                 Least Squares   F-statistic:                     44.19
Date:                Thu, 05 Dec 2019   Prob (F-statistic):           1.10e-61
Time:                        19:13:22   Log-Likelihood:                -2298.4
No. Observations:                 400   AIC:                             4621.
Df Residuals:                     388   BIC:                             4669.
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
accommodates               6

  return ptp(axis=axis, out=out, **kwargs)
  return ptp(axis=axis, out=out, **kwargs)


### In between regression

In [583]:
# In-between model: regress price on a smaller set of predictors.
# The column list is written once and reused for the train and the test subset.
mid_columns = ['price','accommodates','bedrooms', 'weekly_price', 'review_scores_location',
               'cleaning_fee', 'number_of_reviews']

# Keep only the rows in which every selected column parses as a number.
df = train[mid_columns].apply(pd.to_numeric, errors='coerce').dropna()

# Design matrix (predictors plus intercept column) and response.
x = sm.add_constant(df.drop(columns='price'))
y = df.price

# Fit with statsmodels to get the coefficient table.
mod = sm.OLS(y, x)
res = mod.fit()
print(res.summary())

# Fit the same design matrix with sklearn and compute the rmse on the held-out subset.
m = slm.LinearRegression().fit(x, y)
df_3 = test[mid_columns].apply(pd.to_numeric, errors='coerce').dropna()
y_test = df_3.price
# By default add_constant skips the intercept column when the frame already contains a
# constant column, so calling it independently on train and test can yield different
# columns. Force the constant on the test side, then keep exactly the columns (and the
# column order) the model was fit on.
x_test = sm.add_constant(df_3.drop(columns='price'), has_constant='add')[x.columns]
yhat = m.predict(x_test)

rmse = np.sqrt(np.mean((yhat - y_test)**2))
print(rmse)

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.246
Model:                            OLS   Adj. R-squared:                  0.238
Method:                 Least Squares   F-statistic:                     29.16
Date:                Thu, 05 Dec 2019   Prob (F-statistic):           3.01e-30
Time:                        19:08:58   Log-Likelihood:                -3456.4
No. Observations:                 542   AIC:                             6927.
Df Residuals:                     535   BIC:                             6957.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
const                   -105

  return ptp(axis=axis, out=out, **kwargs)
  return ptp(axis=axis, out=out, **kwargs)


### Interpret the result

#### simple regression 
>df = train[['price','weekly_price', 'monthly_price']]

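For our simple regression model, the coefficients were relatively tiny, at 0.1705 and -0.0194 for weekly_price and monthly_price, respectively. This indicated that a one-dollar change in either variable had a very small effect on the final predicted price, and that this model was perhaps not very accurate. Note that the size of a coefficient depends on the units of its variable (weekly and monthly prices are much larger numbers than nightly prices), so the accuracy of the model is better judged by its R-squared of 0.439 and its test RMSE.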

#### full regression
> df = train[['price','accommodates','bathrooms','bedrooms','beds', 'weekly_price', 'monthly_price',
            'reviews_per_month', 'requires_license', 'review_scores_location', 'review_scores_accuracy',
            'review_scores_rating', 'cleaning_fee']]
            
In our full regression model, the largest coefficient was for the bedrooms variable (the number of bedrooms available in the Airbnb), with a value of 26.075. This means that it had the largest positive impact on price in our model. The lowest value belonged to the requires_license variable, with a coefficient of -115. This was the largest coefficient in absolute value, and it would make the predicted value drastically lower. Some possible reasons for this could be that Airbnbs that attract lower-income buyers may be more self-conscious about their customers, and would want to screen them for safety and legal reasons. One caveat: the regression summary for this model does not list a separate const row, so if requires_license takes a single value in the rows used for the fit, its coefficient is acting as the model's intercept rather than measuring an effect of licensing. When comparing our full and in-between regression models, there was definitely a discrepancy between the coefficient values. This could be explained by the fact that as a model takes more variables into account, it adjusts the impact that individual variables have on the final predicted price. Another interesting observation in our full regression model was that bedrooms and bathrooms had the highest coefficients, which could possibly be explained by how larger properties tend to have more bedrooms and bathrooms, and the larger the property, the more it costs to rent.

#### in-between regression 
In the third model we use some of the variables from the full model, plus number_of_reviews. These are the variables we think are strong
indicators of the daily housing price.

> test[['price','accommodates','bedrooms', 'weekly_price', 'review_scores_location', 
            'cleaning_fee', 'number_of_reviews']]
            
The y-intercept is -105.1464, which means that when all independent variables are zero, the y value (the housing price) would be -105.1464. Although this seems unrealistic, some of the variables are qualitative data and we have to transform them into numerical values. This transformation can affect the intercept value. The strongest variable we have is 'bedrooms', which has a coefficient of 25.8278. review_scores_location has a coefficient of 13.1981. These observations match common sense: we think the location and the number of rooms a property has can strongly influence the price.

## 3 Think

1.Does your model do a good job in predicting the prices?

Since no model can be absolutely perfect, it is quite difficult to create a model that captures every single attribute of any dataset. Regardless, we notice that our model does a good job in predicting the prices. With a pretty high confidence, our model shows a consistent prediction capability.  
 
2. How will your model be useful to:

(a)	AirBnB hosts – it can help hosts set up their houses in such a way that they solve a particular issue in their neighborhood. If hosts see a gap in their area, they can use the model to work efficiently on creating value for their customers.  

(b) AirBnB customers – customers can use our model to predict prices and plan their budgets; it can be an excellent tool to help customers understand how to get the most for their money.

3.Did you include any other price-related variables, such as _weekly price_ or _security deposit_ in your model? Do you think it is a good idea to use these attributes while trying to predict price?

Through our models we realized that _security deposit_ is not a good indicator in our model, because it disrupts the other values and their consistency. The weekly price, according to our model, is also not a good indicator of the price.
  
4.Do you think this model can be used by Airbnb itself or the government?

Since Airbnb is a for-profit organization, it would be much more valuable for them to take over this model. The government, on the other hand, can use this model in its social programs to tackle the biggest challenges it faces.   

5. Do you see any ethical issues with this work?

Although this model utilizes personal information from a lot of people, when it is used ethically the positives of using this model outweigh the negatives. Anyone using this model has to be trained in ethics so that they understand the value of the information being used in the model and handle it responsibly. 

## 4 Additional Task

In [592]:
# Evaluate the full model (m_full, fit in the "Full regression" cell) on the separate test file.
# Note that this rebinds `data`, which held the training listings until now.
data = pd.read_csv('airbnb-seattle-listings-test.csv', sep='\t')

# Clean the dataset the same way as the training file.
# The money columns are stored as text such as "$1,200.00". Strip the dollar sign and the
# thousands separators and convert the result to float. Missing or unparseable entries become
# NaN (instead of the literal string 'nan').
for money_column in ['price', 'security_deposit', 'cleaning_fee', 'weekly_price', 'monthly_price']:
    stripped = [re.sub('[$,]', '', str(value)) for value in data[money_column]]
    data[money_column] = pd.to_numeric(stripped, errors='coerce')

# Encode the 't'/'f' flag as 1/0. The result is assigned back to the frame rather than using
# replace(..., inplace=True) on a column selection, which does not update `data` when pandas
# Copy-on-Write is active.
data['requires_license'] = data['requires_license'].replace({'f': 0, 't': 1})

# Select the full-model columns and keep only the rows in which all of them are numeric.
full_model_columns = ['price','accommodates','bathrooms','bedrooms','beds', 'weekly_price', 'monthly_price',
                      'reviews_per_month', 'requires_license', 'review_scores_location', 'review_scores_accuracy',
                      'review_scores_rating', 'cleaning_fee']
df_4 = data[full_model_columns].apply(pd.to_numeric, errors='coerce').dropna()

# Define x and y.
y_test = df_4.price
x_test = df_4.drop(columns='price')

# m_full was fit on whatever sm.add_constant produced for the training rows, and add_constant
# skips the intercept column by default when a predictor is already constant. Calling it again
# here could therefore give a different number of columns than the model expects. Instead, look
# at how many inputs the fitted model has and add the `const` column only if it was fit with one.
n_model_inputs = m_full.coef_.shape[-1]
if n_model_inputs == x_test.shape[1] + 1:
    x_test = sm.add_constant(x_test, has_constant='add')
elif n_model_inputs != x_test.shape[1]:
    raise ValueError(
        'm_full expects %d input columns but the test design matrix has %d predictors'
        % (n_model_inputs, x_test.shape[1])
    )

# Compute yhat.
yhat = m_full.predict(x_test)

# Compute the rmse.
rmse = np.sqrt(np.mean((yhat - y_test)**2))
print(rmse)
# The recorded run printed an rmse of 46.32714310548417.

46.32714310548417


  return ptp(axis=axis, out=out, **kwargs)
