# Solutions for chapter 10 exercises

## Setup

In [3]:
# Common libraries
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from statsmodels.formula.api import ols
import seaborn as sns

# To rescale numeric variables
from sklearn.preprocessing import MinMaxScaler
# To one-hot encode cat. variables
from sklearn.preprocessing import OneHotEncoder

## Exercise 2 – Hierarchical regression, guided. 

### 1) Traditional regression.

a)	Load the data in `chap4-hotel_booking_case_study.csv` and create a copy of it in which every country that individually represents less than 1% of the data is lumped under “Other”.

In [15]:
# Read the hotel-booking case-study data and keep only the rows where every
# variable used by the regressions below (Country, ADR, MarketSegment) is present
dat_df = (
    pd.read_csv("chap4-hotel_booking_case_study.csv")
    .dropna(subset=['Country', 'ADR', 'MarketSegment'])
)
dat_df.head()

Unnamed: 0,NRDeposit,IsCanceled,DistributionChannel,MarketSegment,CustomerType,Children,ADR,PreviousCancellations,IsRepeatedGuest,Country,Quarter,Year
0,0,0,Direct,Direct,Transient,0,0.0,0,0,PRT,Q3,2015
1,0,0,Direct,Direct,Transient,0,0.0,0,0,PRT,Q3,2015
2,0,0,Direct,Direct,Transient,0,75.0,0,0,GBR,Q3,2015
3,0,0,Corporate,Corporate,Transient,0,75.0,0,0,GBR,Q3,2015
4,0,0,TA/TO,Online TA,Transient,0,98.0,0,0,GBR,Q3,2015


In [16]:
# Share of rows per country of origin, largest share first
countries_df = (
    dat_df.groupby('Country')
    .size()
    .div(len(dat_df))
    .rename('pct')
    .to_frame()
    .sort_values(by=['pct'], ascending=False)
)
# Countries that each account for at least 1% of the rows are kept under their own code
top_countries_df = countries_df[countries_df['pct'] >= 0.01].reset_index()
top_countries_lst = list(top_countries_df['Country'])
print(top_countries_lst)
# Build a separate frame (dat_df itself is left untouched) in which every
# remaining country is aggregated under "Other"
dat_df_agg = dat_df.assign(
    Country=np.where(dat_df['Country'].isin(top_countries_lst), dat_df['Country'], 'Other')
)

['PRT', 'GBR', 'FRA', 'ESP', 'DEU', 'ITA', 'IRL', 'BEL', 'BRA', 'NLD', 'USA', 'CHE', 'CN', 'AUT']


b) Run a linear regression of ADR on Country and MarketSegment. Save the predicted values for all the rows in the data and calculate the mean absolute difference (MAD1) between the predicted values and the true values.

In [17]:
# Traditional (non-hierarchical) linear regression of ADR on country of origin
# and market segment, fitted on the frame where rare countries are lumped as "Other"
model = ols("ADR~Country+MarketSegment", data=dat_df_agg)
# No optimizer options are passed: `disp` controls the convergence printout of
# iteratively fitted models and is not used by the least-squares fit
res = model.fit()
res.summary()

0,1,2,3
Dep. Variable:,ADR,R-squared:,0.157
Model:,OLS,Adj. R-squared:,0.157
Method:,Least Squares,F-statistic:,851.1
Date:,"Wed, 23 Jun 2021",Prob (F-statistic):,0.0
Time:,10:03:53,Log-Likelihood:,-459390.0
No. Observations:,86943,AIC:,918800.0
Df Residuals:,86923,BIC:,919000.0
Df Model:,19,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,76.9789,1.727,44.583,0.000,73.595,80.363
Country[T.BEL],-0.4602,1.871,-0.246,0.806,-4.127,3.207
Country[T.BRA],-5.5452,1.884,-2.943,0.003,-9.238,-1.852
Country[T.CHE],5.3397,1.964,2.718,0.007,1.490,9.190
Country[T.CN],1.7780,2.119,0.839,0.401,-2.375,5.931
Country[T.DEU],-6.2456,1.682,-3.713,0.000,-9.542,-2.949
Country[T.ESP],7.0743,1.649,4.289,0.000,3.842,10.307
Country[T.FRA],-1.0628,1.632,-0.651,0.515,-4.262,2.137
Country[T.GBR],-10.8968,1.620,-6.724,0.000,-14.073,-7.721

0,1,2,3
Omnibus:,13152.42,Durbin-Watson:,0.927
Prob(Omnibus):,0.0,Jarque-Bera (JB):,26887.407
Skew:,0.925,Prob(JB):,0.0
Kurtosis:,5.0,Cond. No.,47.3


In [22]:
# In-sample predictions of the traditional regression, one value per row of dat_df_agg
predicted_values1 = res.predict(exog=dat_df_agg)
predicted_values1.head()

0    112.819243
1    112.819243
2    111.445144
3     66.082078
4    110.824898
dtype: float64

In [23]:
# Mean absolute difference between predicted and observed ADR.
# The observed values are read from dat_df_agg, the frame the model was fitted
# and scored on, so predictions and targets come from the same rows.
MAD1 = (predicted_values1 - dat_df_agg['ADR']).abs().mean()
MAD1

35.98010182376623

### 2) Hierarchical regression.

Run a hierarchical linear regression of ADR on Country and MarketSegment, with Country as the clustering variable. Save the predicted values for all the rows in the data and calculate the mean absolute difference (MAD2) between the predicted values and the true values.

In [20]:
# Hierarchical (mixed-effects) regression of ADR, grouped by country of origin.
# It is specified on dat_df, i.e. with every country kept separate rather than
# lumped under "Other".
# NOTE(review): Country appears both as a fixed effect in the formula and as the
# grouping variable -- confirm this is intended; "ADR~MarketSegment" with
# Country as the group may be what the exercise is after.
mixed = smf.mixedlm("ADR~Country+MarketSegment", data = dat_df, 
                   groups = dat_df["Country"])
# Fit once and keep the result so it can be inspected again without refitting
mixed_res = mixed.fit()
mixed_res.summary()



                     Mixed Linear Model Regression Results
Model:                   MixedLM        Dependent Variable:        ADR         
No. Observations:        86943          Method:                    REML        
No. Groups:              177            Scale:                     2268.9227   
Min. group size:         1              Log-Likelihood:            -458581.7347
Max. group size:         27452          Converged:                 Yes         
Mean group size:         491.2                                                 
-------------------------------------------------------------------------------
                                Coef.   Std.Err.    z    P>|z|  [0.025   0.975]
-------------------------------------------------------------------------------
Intercept                        83.671   58.344   1.434 0.152  -30.681 198.024
Country[T.AGO]                   -7.818   75.359  -0.104 0.917 -155.519 139.883
Country[T.AIA]                  136.660   89.114   1.534 0.12

In [27]:
# In-sample predictions of the hierarchical model.
# fittedvalues is used instead of predict(): for a mixed model, predict() only
# reflects the fixed-effects part, whereas fittedvalues also adds the predicted
# random effect of each row's group (here, its country).
mixed_fit = mixed.fit()
# Re-wrapped as a Series on dat_df's index so the values line up row by row
# with dat_df['ADR'] whether fittedvalues comes back as an array or a Series
predicted_values2 = pd.Series(np.asarray(mixed_fit.fittedvalues), index=dat_df.index)
predicted_values2.head(5)



0    112.862694
1    112.862694
2    111.497726
3     66.124030
4    110.792836
dtype: float64

In [25]:
# Mean absolute difference between the hierarchical model's predictions and observed ADR
MAD2 = (predicted_values2 - dat_df['ADR']).abs().mean()
MAD2

35.892134550353326

### 3) Comparison

In [26]:
# Gap between the two mean absolute differences; a positive value means
# MAD2 (hierarchical regression) is smaller than MAD1 (traditional regression)
diff = MAD1 - MAD2
diff

0.08796727341290733

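This number means that the hierarchical regression is slightly more accurate: its predictions are, on average, about \$0.09 closer to the true ADR than those of the traditional regression.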