## Prepared for the textbook:
-------------------------------------------------------------------
## Data Analysis for Business, Economics, and Policy
#### by Gabor BEKES and Gabor KEZDI
----------------------------------
#### Cambridge University Press 2021
-----------------------------------------------------------------------------------------------
#### License: Free to share, modify and use for educational purposes. Not to be used for business purposes.


In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import seaborn as sns
from mizani.formatters import percent_format
import os
from plotnine import *
import numpy as np
import sys
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Repository root: the working directory with its last two path components removed
current_path = os.getcwd()
path_parts = current_path.split("/")
dirname = "/".join(path_parts[:-2]) + "/"

# Input data, case-study folder, its output folder, and the shared helper folder
case_study_dir = f"{dirname}da_case_studies/ch08-hotels-measurement-error/"
data_in = f"{dirname}da_data_repo/hotels-vienna/clean/"
data_out = case_study_dir
output = f"{case_study_dir}output/"
func = f"{dirname}da_case_studies/ch00-tech-prep/"

# Make the helper folder importable
sys.path.append(func)
# Import the prewritten helper functions
from py_helper_functions import *

In [2]:
# Read the hotels-vienna CSV from the input data folder
hotels_csv = data_in + "hotels-vienna.csv"
hotels = pd.read_csv(hotels_csv)

#### SAMPLE SELECTION

In [3]:
# Sample selection: keep rows whose accommodation_type is "Hotel", whose
# city_actual is "Vienna", with a non-missing star rating between 3 and 4
# (inclusive) and a price of at most 600.
hotels = (
    hotels.query('accommodation_type=="Hotel"')
    .query('city_actual=="Vienna"')
    .query("stars>=3 & stars<=4")
    .query("stars.notna()")
    .query("price<=600")
    # Explicit copy: later cells add columns (lnprice, yhat, ...) to this frame,
    # and it should be an independent DataFrame rather than a slice of the raw
    # data. The resulting chained-assignment warning would otherwise be hidden
    # by the global warnings filter.
    .copy()
)

#######################################
### Look at measurement error by rating count
#######################################

In [4]:
# Natural log of price, used as the dependent variable in the regressions
hotels["lnprice"] = np.log(hotels["price"])

In [5]:
# Rating-count cutoffs splitting hotels into three groups:
# fewer than k1 ratings, k1 up to (but excluding) k2, and k2 or more
k1, k2 = 100, 200

In [6]:
# OLS of log price on rating, estimated on hotels with fewer than k1 ratings
few_ratings = hotels[hotels["rating_count"] < k1]
reg_me = smf.ols("lnprice ~ rating", data=few_ratings)
reg_me_fit = reg_me.fit()

In [7]:
# Print the condensed regression table for the fewer-than-k1-ratings group
summary_me = reg_me_fit.summary(slim=True)
print(summary_me)

                            OLS Regression Results                            
Dep. Variable:                lnprice   R-squared:                       0.244
Model:                            OLS   Adj. R-squared:                  0.234
No. Observations:                  77   F-statistic:                     24.20
Covariance Type:            nonrobust   Prob (F-statistic):           4.99e-06
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      3.1375      0.285     11.025      0.000       2.571       3.704
rating         0.3540      0.072      4.920      0.000       0.211       0.497

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.


In [None]:
# Fitted log price for every hotel, using the fewer-than-k1-ratings model
hotels["yhat"] = reg_me_fit.predict(exog=hotels)

In [None]:
# Same regression for the middle group: at least k1 but fewer than k2 ratings
in_middle_group = (hotels["rating_count"] >= k1) & (hotels["rating_count"] < k2)
reg_me2 = smf.ols("lnprice ~ rating", data=hotels[in_middle_group])
reg_me_fit2 = reg_me2.fit()
# Last expression of the cell: displays the condensed regression table
reg_me_fit2.summary(slim=True)

In [None]:
# Fitted log price for every hotel, using the middle-group (k1 to k2) model
hotels["yhat2"] = reg_me_fit2.predict(exog=hotels)

In [None]:
# Same regression for hotels with k2 or more ratings
many_ratings = hotels[hotels["rating_count"] >= k2]
reg_me3 = smf.ols("lnprice ~ rating", data=many_ratings)
reg_me_fit3 = reg_me3.fit()
# Last expression of the cell: displays the condensed regression table
reg_me_fit3.summary(slim=True)

In [None]:
# Fitted log price for every hotel, using the k2-or-more-ratings model
hotels["yhat3"] = reg_me_fit3.predict(exog=hotels)

In [None]:
# Compare the fitted regression lines of the two extreme groups:
#   "yhat"  - model estimated on hotels with rating_count < k1
#   "yhat3" - model estimated on hotels with rating_count >= k2
# The annotation labels are built from k1 and k2 so they always agree with the
# cutoffs (and the >= comparison) used to estimate the regressions.
# NOTE(review): `color` and `seq` are not defined in this notebook; presumably
# they come from the star-import of py_helper_functions - confirm.
(
    ggplot(hotels)
    + geom_line(aes(x="rating", y="yhat"), color=color[1], size=1)
    + geom_line(aes(x="rating", y="yhat3"), color=color[0], size=1)
    + coord_cartesian(xlim=[2, 5], ylim=[3.5, 5])
    + expand_limits(x=0.01, y=0.01)
    + scale_y_continuous(expand=[0.01, 0.01])
    + scale_x_continuous(
        expand=[0.01, 0.01], limits=[2, 5], breaks=seq(2, 5.1, 0.5)
    )
    + labs(x="Average rating", y="ln(Hotel price, US dollars)")
    + theme_bw()
    + annotate(
        "text",
        x=2.6,
        y=4.4,
        label=f"More noisy: # of ratings<{k1}",
        size=10,
        color=color[1],
    )
    + annotate(
        "text",
        x=3.1,
        y=3.6,
        label=f"Less noisy: # of ratings>={k2}",
        size=10,
        color=color[0],
    )
)