In [1]:
import pandas as pd
import os
from pathlib import Path
import numpy as np
from patsy import bs,dmatrix,dmatrices
import sys
import statsmodels.api as sm

In [2]:
# Working directory of the running kernel, as a pathlib.Path.
path = Path.cwd()

In [3]:
# Two directory levels above the working directory; the data, output and
# helper-module folders are all resolved relative to this location.
base_dir = path.parent.parent

In [4]:
# Input folder: the price and features CSVs are read from here.
data_in = os.path.join(os.fspath(base_dir), "da_data_repo/hotels-europe/clean/")

In [5]:
# Output folder: the work file produced by this notebook is written here.
data_out = os.path.join(os.fspath(base_dir), "da_case_studies/ch09-hotels-europe-stability/")

In [6]:
# Folder that contains the shared helper module (added to sys.path afterwards).
func = os.path.join(os.fspath(base_dir), "da_case_studies/ch00-tech-prep/")

In [7]:
# Make the helper-module folder importable. The membership check keeps the
# cell idempotent: re-running it no longer appends the same entry again.
if func not in sys.path:
    sys.path.append(func)

In [8]:
# Star import from the helper module located in `func`.
# NOTE(review): presumably this is where `lspline`, referenced inside the patsy
# formula further down, comes from -- confirm, and prefer importing it by name.
from py_helper_functions import *

In [9]:
# Load the price table from the input folder.
hotels_europe_price = pd.read_csv(
    filepath_or_buffer=os.path.join(data_in, "hotels-europe_price.csv")
)

In [10]:
# Load the features table from the input folder.
hotels_europe_features = pd.read_csv(
    filepath_or_buffer=os.path.join(data_in, "hotels-europe_features.csv")
)

In [11]:
# Left join: keep every price row and attach the features sharing its hotel_id.
data = hotels_europe_price.merge(hotels_europe_features, how='left', on='hotel_id')

In [12]:
# Keep only rows whose city_actual is one of the three cities of interest.
data = data.loc[data['city_actual'].isin(['Vienna', 'Amsterdam', 'Barcelona'])]

In [13]:
# Keep only hotels and apartments.
data = data.loc[lambda df: df['accommodation_type'].isin(['Hotel', 'Apartment'])]

In [14]:
# Drop the rows where nnights equals 4.
data = data[data['nnights'].ne(4)]

In [15]:
# Keep only rows with a price strictly below 1000.
data = data[data['price'].lt(1000)]

In [16]:
# Remove rows that are identical across all columns, keeping the first of each.
data = data.drop_duplicates(keep='first')

In [17]:
# Label each row with a date category derived from month / weekend / holiday.
# Rows that match none of the four rules keep a missing 'date'.
#
# `data` is the result of a chain of boolean filters, and assigning into such a
# frame through .loc can raise SettingWithCopyWarning, so take an explicit copy
# before adding the column.
# NOTE(review): the rules match on month only; the year embedded in each label
# is never checked against the data -- confirm it holds for the input files.
data = data.copy()
data.loc[(data['month']==11) & (data['weekend']==0),'date']='2017-NOV-weekday'
data.loc[(data['month']==11) & (data['weekend']==1),'date']='2017-NOV-weekend'
data.loc[(data['month']==12) & (data['holiday']==1),'date']='2017-DEC-holiday'
data.loc[(data['month']==6) & (data['weekend']==1),'date']='2018-JUNE-weekend'

In [18]:
# Discard rows that did not receive a date label.
data = data.dropna(subset=['date'])

In [19]:
# Number of observations per city.
data.loc[:, 'city'].value_counts()

Barcelona    1564
Vienna       1326
Amsterdam     830
Name: city, dtype: int64

In [20]:
# Observation counts by accommodation type (rows) and city (columns).
pd.crosstab(data['accommodation_type'], data['city'])

city,Amsterdam,Barcelona,Vienna
accommodation_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Apartment,31,300,457
Hotel,799,1264,869


In [21]:
# Observation counts by date category (rows) and city (columns).
pd.crosstab(data['date'], data['city'])

city,Amsterdam,Barcelona,Vienna
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-DEC-holiday,290,420,338
2017-NOV-weekday,315,452,377
2017-NOV-weekend,125,393,256
2018-JUNE-weekend,100,299,355


In [22]:
# Natural log of price, computed with the vectorised NumPy ufunc on the whole
# column instead of calling a Python lambda once per row.
data['lnprice'] = np.log(data['price'])

In [23]:
# Restrict the frame to the columns kept in the work file, in this order.
data = data.loc[
    :,
    ["hotel_id", "date", "city", "accommodation_type", "stars",
     "rating", "distance", "price", "lnprice"],
]

In [24]:
# Save the work file to the output folder, without the index column.
data.to_csv(path_or_buf=os.path.join(data_out, "hotels_work.csv"), index=False)

In [25]:
# Keep rows with stars between 3 and 4, both ends included.
data = data[data['stars'].between(3, 4)]

In [26]:
# Keep hotels only.
data = data[data['accommodation_type'].eq('Hotel')]

In [27]:
# Keep Vienna only.
data = data.loc[lambda df: df['city'] == 'Vienna']

In [28]:
# Number of observations in each date category, largest first.
data['date'].value_counts(sort=True, ascending=False)

2017-NOV-weekday     207
2017-DEC-holiday     189
2018-JUNE-weekend    181
2017-NOV-weekend     125
Name: date, dtype: int64

In [29]:
# Summary statistics for distance, price and log price over the whole sample.
data.loc[:, ['distance', 'price', 'lnprice']].describe()

Unnamed: 0,distance,price,lnprice
count,702.0,702.0,702.0
mean,1.566382,122.752137,4.737121
std,1.154614,53.30483,0.366648
min,0.0,50.0,3.912023
25%,0.8,86.0,4.454347
50%,1.4,109.0,4.691348
75%,1.9,144.0,4.969813
max,6.6,491.0,6.196444


In [30]:
# Summary statistics of distance within each date category.
data['distance'].groupby(data['date']).describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-DEC-holiday,189.0,1.503175,1.059903,0.0,0.8,1.3,1.9,5.2
2017-NOV-weekday,207.0,1.529952,1.161507,0.0,0.8,1.3,1.9,6.6
2017-NOV-weekend,125.0,1.7728,1.298161,0.0,0.9,1.6,2.1,6.6
2018-JUNE-weekend,181.0,1.531492,1.13007,0.0,0.8,1.3,1.9,6.6


In [31]:
# Summary statistics of price within each date category.
data['price'].groupby(data['date']).describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-DEC-holiday,189.0,116.492063,46.308358,57.0,85.0,103.0,138.0,386.0
2017-NOV-weekday,207.0,109.975845,42.221381,50.0,82.0,100.0,129.5,383.0
2017-NOV-weekend,125.0,149.144,76.530903,60.0,92.0,132.0,180.0,491.0
2018-JUNE-weekend,181.0,125.674033,45.053534,59.0,94.0,111.0,154.0,297.0


In [32]:
# Summary statistics of log price within each date category.
data['lnprice'].groupby(data['date']).describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-DEC-holiday,189.0,4.69671,0.334883,4.043051,4.442651,4.634729,4.927254,5.955837
2017-NOV-weekday,207.0,4.640219,0.336751,3.912023,4.406719,4.60517,4.863673,5.948035
2017-NOV-weekend,125.0,4.902204,0.437582,4.094345,4.521789,4.882802,5.192957,6.196444
2018-JUNE-weekend,181.0,4.776133,0.334283,4.077537,4.543295,4.70953,5.036953,5.693732


In [33]:
## Regression with splines: lnprice on lspline(distance, 2), fitted on the 2017-NOV-weekday sample

In [34]:
# Build the outcome vector and design matrix for the 2017-NOV-weekday rows.
y, X = dmatrices(
    "lnprice ~ lspline(distance,2)",
    data=data.loc[data['date'] == '2017-NOV-weekday'],
)

In [35]:
# Ordinary least squares of y on X; the summary printed here uses the default
# (nonrobust) covariance.
model = sm.OLS(endog=y, exog=X)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                lnprice   R-squared:                       0.314
Model:                            OLS   Adj. R-squared:                  0.308
Method:                 Least Squares   F-statistic:                     46.79
Date:                Tue, 15 Sep 2020   Prob (F-statistic):           1.89e-17
Time:                        20:17:51   Log-Likelihood:                -28.843
No. Observations:                 207   AIC:                             63.69
Df Residuals:                     204   BIC:                             73.68
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
Intercept                 

In [36]:
# Same fitted model, with the summary recomputed under the HC1 covariance type.
print(results.get_robustcov_results('HC1').summary())

                            OLS Regression Results                            
Dep. Variable:                lnprice   R-squared:                       0.314
Model:                            OLS   Adj. R-squared:                  0.308
Method:                 Least Squares   F-statistic:                     46.04
Date:                Tue, 15 Sep 2020   Prob (F-statistic):           3.16e-17
Time:                        20:18:09   Log-Likelihood:                -28.843
No. Observations:                 207   AIC:                             63.69
Df Residuals:                     204   BIC:                             73.68
Df Model:                           2                                         
Covariance Type:                  HC1                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
Intercept                 