## Prepared for the textbook:
-------------------------------------------------------------------
## Data Analysis for Business, Economics, and Policy
#### by Gabor BEKES and Gabor KEZDI
----------------------------------
#### Cambridge University Press 2021
-----------------------------------------------------------------------------------------------
#### License: Free to share, modify and use for educational purposes. Not to be used for business purposes.


In [1]:
import pandas as pd
import os
from pathlib import Path
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
from patsy import dmatrices
import sys

In [2]:
# Directory the notebook is executed from; the project paths are derived from it.
path = Path.cwd()

In [3]:
# Base directory: two levels above the current working directory.
base_dir = path.parent.parent

In [4]:
# Input folder with the cleaned hotels-vienna data (string path, trailing slash kept).
# os.path.join accepts the Path object directly and still returns a str.
data_in = os.path.join(base_dir, "da_data_repo/hotels-vienna/clean/")

In [5]:
# Folder of this case study (string path, trailing slash kept).
# os.path.join accepts the Path object directly and still returns a str.
data_out = os.path.join(base_dir, "da_case_studies/ch10-hotels-multiple-reg/")

In [6]:
# Folder that is added to the import path in the next step (string path).
# os.path.join accepts the Path object directly and still returns a str.
func = os.path.join(base_dir, "da_case_studies/ch00-tech-prep/")

In [7]:
# Add the folder stored in `func` to the module search path so that
# modules located there can be imported.
sys.path.append(func)

In [8]:
from py_helper_functions import *

In [9]:
# Load the hotels-vienna CSV from the input folder into a DataFrame.
hotels_csv = os.path.join(data_in, "hotels-vienna.csv")
hotels = pd.read_csv(hotels_csv)

In [10]:
# Sample selection, applied one filter after another:
# hotels only, actually located in Vienna, 3 to 4 stars, price at most 600.
row_filters = (
    'accommodation_type=="Hotel"',
    'city_actual=="Vienna"',
    'stars>=3 & stars<=4',
    'price<=600',
)
for row_filter in row_filters:
    hotels = hotels.query(row_filter)

In [11]:
# Keep only rows where the star rating is present.
has_stars = hotels['stars'].notna()
hotels = hotels.loc[has_stars]

In [12]:
# Number of hotels left in the sample after filtering.
hotels.shape[0]

207

In [13]:
# Natural log of price. np.log is applied to the whole column at once
# rather than element by element through a Python lambda.
hotels['lnprice'] = np.log(hotels['price'])

In [14]:
# Working copy of distance; it is floored at 0.05 in the next step so that
# its log can be taken (the raw distance has a minimum of 0).
hotels['distance2']=hotels['distance']

In [15]:
# Floor distance2 at 0.05: values below the threshold are raised to it,
# everything else (including missing values) is left unchanged.
hotels['distance2'] = hotels['distance2'].clip(lower=0.05)

In [16]:
# Natural log of the floored distance. np.log is applied to the whole column
# at once rather than element by element through a Python lambda.
hotels['lndistance'] = np.log(hotels['distance2'])

In [17]:
# 0/1 indicator for hotels with exactly 3.5 stars.
is_star35 = hotels['stars'] == 3.5
hotels['star35'] = is_star35.astype('int64')

In [18]:
# 0/1 indicator for hotels with exactly 4 stars.
is_star4 = hotels['stars'] == 4
hotels['star4'] = is_star4.astype('int64')

In [19]:
# Summary statistics (count, mean, std, min, quartiles, max) of price.
hotels['price'].describe()

count    207.000000
mean     109.975845
std       42.221381
min       50.000000
25%       82.000000
50%      100.000000
75%      129.500000
max      383.000000
Name: price, dtype: float64

In [20]:
# Summary statistics (count, mean, std, min, quartiles, max) of distance.
hotels['distance'].describe()

count    207.000000
mean       1.529952
std        1.161507
min        0.000000
25%        0.800000
50%        1.300000
75%        1.900000
max        6.600000
Name: distance, dtype: float64

In [21]:
# Summary statistics (count, mean, std, min, quartiles, max) of log price.
hotels['lnprice'].describe()

count    207.000000
mean       4.640219
std        0.336751
min        3.912023
25%        4.406719
50%        4.605170
75%        4.863673
max        5.948035
Name: lnprice, dtype: float64

In [22]:
# Three OLS regressions of log price: on rating only, on distance only,
# and on distance and rating together.
reg0, reg1, reg2 = (
    smf.ols(formula, data=hotels).fit()
    for formula in (
        'lnprice ~ rating',
        'lnprice ~ distance',
        'lnprice ~ distance + rating',
    )
)

In [23]:
# Regression table for reg0 with the default (non-robust) covariance.
print(reg0.summary())

                            OLS Regression Results                            
Dep. Variable:                lnprice   R-squared:                       0.252
Model:                            OLS   Adj. R-squared:                  0.248
Method:                 Least Squares   F-statistic:                     69.11
Date:                Tue, 14 Jul 2020   Prob (F-statistic):           1.28e-14
Time:                        11:44:47   Log-Likelihood:                -37.850
No. Observations:                 207   AIC:                             79.70
Df Residuals:                     205   BIC:                             86.37
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      2.8460      0.217     13.128      0.0

In [24]:
# Regression tables for the three models, in order, with HC1
# heteroskedasticity-robust standard errors.
for fitted in (reg0, reg1, reg2):
    robust = fitted.get_robustcov_results(cov_type='HC1')
    print(robust.summary())

                            OLS Regression Results                            
Dep. Variable:                lnprice   R-squared:                       0.252
Model:                            OLS   Adj. R-squared:                  0.248
Method:                 Least Squares   F-statistic:                     46.78
Date:                Tue, 14 Jul 2020   Prob (F-statistic):           8.96e-11
Time:                        11:44:48   Log-Likelihood:                -37.850
No. Observations:                 207   AIC:                             79.70
Df Residuals:                     205   BIC:                             86.37
Df Model:                           1                                         
Covariance Type:                  HC1                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      2.8460      0.265     10.731      0.0

In [25]:
# Design matrices for the full model: piecewise linear spline in distance
# (knots at 1 and 4), piecewise linear spline in rating (knot at 3.5),
# plus the two star-category indicators.
spline_formula = "lnprice ~ lspline(distance,[1,4]) + lspline(rating, 3.5) + star35 + star4"
y, X = dmatrices(spline_formula, data=hotels)

In [26]:
# OLS on the design matrices: y is the outcome, X the regressors.
model = sm.OLS(endog=y, exog=X)
results = model.fit()

In [27]:
# Regression table for the model fitted on the design matrices (non-robust covariance).
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                lnprice   R-squared:                       0.550
Model:                            OLS   Adj. R-squared:                  0.534
Method:                 Least Squares   F-statistic:                     34.76
Date:                Tue, 14 Jul 2020   Prob (F-statistic):           2.19e-31
Time:                        11:44:49   Log-Likelihood:                 14.744
No. Observations:                 207   AIC:                            -13.49
Df Residuals:                     199   BIC:                             13.17
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                                   coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------
Intercept       

In [28]:
# Predicted log price from the fitted model, evaluated on the same design matrix.
hotels['lnprice_hat'] = results.predict(exog=X)

In [29]:
# Residual: actual log price minus predicted log price.
hotels['lnprice_resid'] = hotels['lnprice'].sub(hotels['lnprice_hat'])

In [30]:
# Sort residuals from largest to smallest; the tail (last five rows) then holds
# the most negative residuals, i.e. the hotels priced furthest below prediction.
resid_desc = hotels['lnprice_resid'].sort_values(ascending=False)
best_idx = resid_desc.tail().index.values
# Flag those hotels in the full data frame.
hotels['bestdeals'] = hotels.index.isin(best_idx)

In [31]:
# Display the data frame, now including predictions, residuals and the bestdeals flag.
hotels

Unnamed: 0,country,city_actual,rating_count,center1label,center2label,neighbourhood,price,city,stars,ratingta,...,nnights,rating,lnprice,distance2,lndistance,star35,star4,lnprice_hat,lnprice_resid,bestdeals
1,Austria,Vienna,189.0,City centre,Donauturm,17. Hernals,81,Vienna,4.0,3.5,...,1,3.9,4.394449,1.7,0.530628,0,1,4.556784,-0.162335,False
2,Austria,Vienna,53.0,City centre,Donauturm,Alsergrund,85,Vienna,4.0,3.5,...,1,3.7,4.442651,1.4,0.336472,0,1,4.499887,-0.057236,False
3,Austria,Vienna,55.0,City centre,Donauturm,Alsergrund,83,Vienna,3.0,4.0,...,1,4.0,4.418841,1.7,0.530628,0,0,4.443176,-0.024335,False
4,Austria,Vienna,33.0,City centre,Donauturm,Alsergrund,82,Vienna,4.0,3.5,...,1,3.9,4.406719,1.2,0.182322,0,1,4.581920,-0.175201,False
6,Austria,Vienna,57.0,City centre,Donauturm,Alsergrund,103,Vienna,4.0,3.5,...,1,3.9,4.634729,0.9,-0.105361,0,1,4.637984,-0.003255,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
420,Austria,Vienna,77.0,City centre,Donauturm,Wieden,100,Vienna,3.0,4.0,...,1,4.0,4.605170,1.2,0.182322,0,0,4.468312,0.136859,False
421,Austria,Vienna,572.0,City centre,Donauturm,Wieden,95,Vienna,4.0,4.0,...,1,4.1,4.553877,1.5,0.405465,0,1,4.638817,-0.084940,False
422,Austria,Vienna,53.0,City centre,Donauturm,Wieden,73,Vienna,3.0,3.0,...,1,3.4,4.290459,1.5,0.405465,0,0,4.284615,0.005845,False
425,Austria,Vienna,112.0,City centre,Donauturm,Wieden,100,Vienna,4.0,4.5,...,1,4.4,4.605170,1.0,0.000000,0,1,4.771920,-0.166750,False


In [32]:
# Design matrices for a model with only the piecewise linear spline in
# distance (knots at 1 and 4). This overwrites the earlier y and X.
distance_formula = "lnprice ~ lspline(distance,[1,4])"
y, X = dmatrices(distance_formula, data=hotels)

In [33]:
# OLS on the distance-spline design matrices: y is the outcome, X the regressors.
model = sm.OLS(endog=y, exog=X)
reg4 = model.fit()

In [34]:
# Regression table for the distance-spline model (non-robust covariance).
print(reg4.summary())

                            OLS Regression Results                            
Dep. Variable:                lnprice   R-squared:                       0.371
Model:                            OLS   Adj. R-squared:                  0.362
Method:                 Least Squares   F-statistic:                     39.97
Date:                Tue, 14 Jul 2020   Prob (F-statistic):           2.43e-20
Time:                        11:44:53   Log-Likelihood:                -19.879
No. Observations:                 207   AIC:                             47.76
Df Residuals:                     203   BIC:                             61.09
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                                   coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------
Intercept       

In [35]:
# y - yhat graph

In [64]:
# residual - yhat graph