# Baseline Modeling - Regression

---

* Goal: to develop baseline models prior to feature engineering to compare performance vs. post-engineered models.

---

In [17]:
import numpy as np
import pandas as pd


In [41]:
## SKLearn and Modeling Tools

from sklearn import metrics
from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LinearRegression, RidgeCV, SGDRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

set_config(transform_output='pandas')

## Load Data

In [19]:
df_data_h1 = pd.read_csv('../../data/source/H1.csv')
df_data_h2 = pd.read_csv('../../data/source/H2.csv')

df_data_full = pd.concat([df_data_h1,df_data_h2])

df_data_full

Unnamed: 0,IsCanceled,LeadTime,ArrivalDateYear,ArrivalDateMonth,ArrivalDateWeekNumber,ArrivalDateDayOfMonth,StaysInWeekendNights,StaysInWeekNights,Adults,Children,...,DepositType,Agent,Company,DaysInWaitingList,CustomerType,ADR,RequiredCarParkingSpaces,TotalOfSpecialRequests,ReservationStatus,ReservationStatusDate
0,0,342,2015,July,27,1,0,0,2,0.0,...,No Deposit,,,0,Transient,0.00,0,0,Check-Out,2015-07-01
1,0,737,2015,July,27,1,0,0,2,0.0,...,No Deposit,,,0,Transient,0.00,0,0,Check-Out,2015-07-01
2,0,7,2015,July,27,1,0,1,1,0.0,...,No Deposit,,,0,Transient,75.00,0,0,Check-Out,2015-07-02
3,0,13,2015,July,27,1,0,1,1,0.0,...,No Deposit,304,,0,Transient,75.00,0,0,Check-Out,2015-07-02
4,0,14,2015,July,27,1,0,2,2,0.0,...,No Deposit,240,,0,Transient,98.00,0,1,Check-Out,2015-07-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79325,0,23,2017,August,35,30,2,5,2,0.0,...,No Deposit,394,,0,Transient,96.14,0,0,Check-Out,2017-09-06
79326,0,102,2017,August,35,31,2,5,3,0.0,...,No Deposit,9,,0,Transient,225.43,0,2,Check-Out,2017-09-07
79327,0,34,2017,August,35,31,2,5,2,0.0,...,No Deposit,9,,0,Transient,157.71,0,4,Check-Out,2017-09-07
79328,0,109,2017,August,35,31,2,5,2,0.0,...,No Deposit,89,,0,Transient,104.40,0,0,Check-Out,2017-09-07


## Set Target Feature

In [20]:
target_feature = 'ADR'

## Quick Overview

In [21]:
df_data_full.info()

<class 'pandas.core.frame.DataFrame'>
Index: 119390 entries, 0 to 79329
Data columns (total 31 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   IsCanceled                   119390 non-null  int64  
 1   LeadTime                     119390 non-null  int64  
 2   ArrivalDateYear              119390 non-null  int64  
 3   ArrivalDateMonth             119390 non-null  object 
 4   ArrivalDateWeekNumber        119390 non-null  int64  
 5   ArrivalDateDayOfMonth        119390 non-null  int64  
 6   StaysInWeekendNights         119390 non-null  int64  
 7   StaysInWeekNights            119390 non-null  int64  
 8   Adults                       119390 non-null  int64  
 9   Children                     119386 non-null  float64
 10  Babies                       119390 non-null  int64  
 11  Meal                         119390 non-null  object 
 12  Country                      118902 non-null  object 
 13  Marke

In [22]:
df_data_full.describe(include = 'number').T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
IsCanceled,119390.0,0.370416,0.482918,0.0,0.0,0.0,1.0,1.0
LeadTime,119390.0,104.011416,106.863097,0.0,18.0,69.0,160.0,737.0
ArrivalDateYear,119390.0,2016.156554,0.707476,2015.0,2016.0,2016.0,2017.0,2017.0
ArrivalDateWeekNumber,119390.0,27.165173,13.605138,1.0,16.0,28.0,38.0,53.0
ArrivalDateDayOfMonth,119390.0,15.798241,8.780829,1.0,8.0,16.0,23.0,31.0
StaysInWeekendNights,119390.0,0.927599,0.998613,0.0,0.0,1.0,2.0,19.0
StaysInWeekNights,119390.0,2.500302,1.908286,0.0,1.0,2.0,3.0,50.0
Adults,119390.0,1.856403,0.579261,0.0,2.0,2.0,2.0,55.0
Children,119386.0,0.10389,0.398561,0.0,0.0,0.0,0.0,10.0
Babies,119390.0,0.007949,0.097436,0.0,0.0,0.0,0.0,10.0


In [23]:
df_data_full.describe(exclude = 'number').T

Unnamed: 0,count,unique,top,freq
ArrivalDateMonth,119390,12,August,13877
Meal,119390,5,BB,92310
Country,118902,177,PRT,48590
MarketSegment,119390,8,Online TA,56477
DistributionChannel,119390,5,TA/TO,97870
ReservedRoomType,119390,10,A,85994
AssignedRoomType,119390,12,A,74053
DepositType,119390,3,No Deposit,104641
Agent,119390,334,9,31961
Company,119390,353,,112593


In [24]:
target_feature = 'ADR'

In [25]:
df_data_full[target_feature].describe()

count    119390.000000
mean        101.831122
std          50.535790
min          -6.380000
25%          69.290000
50%          94.575000
75%         126.000000
max        5400.000000
Name: ADR, dtype: float64

# Hotel 1

In [26]:
df_data_h1.head()

Unnamed: 0,IsCanceled,LeadTime,ArrivalDateYear,ArrivalDateMonth,ArrivalDateWeekNumber,ArrivalDateDayOfMonth,StaysInWeekendNights,StaysInWeekNights,Adults,Children,...,DepositType,Agent,Company,DaysInWaitingList,CustomerType,ADR,RequiredCarParkingSpaces,TotalOfSpecialRequests,ReservationStatus,ReservationStatusDate
0,0,342,2015,July,27,1,0,0,2,0,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,0,737,2015,July,27,1,0,0,2,0,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,0,7,2015,July,27,1,0,1,1,0,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,0,13,2015,July,27,1,0,1,1,0,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,0,14,2015,July,27,1,0,2,2,0,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [27]:
X = df_data_h1.drop(columns = target_feature)
y = df_data_h1[target_feature]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 903)

In [28]:
### ---  Creating ColumnTransformer and sub-transformers for imputation and encoding --- ###
num_cols = X.select_dtypes('number').columns
cat_cols = X.select_dtypes('object').columns

cat_pipe = Pipeline(steps=[('ohe',
                            OneHotEncoder(drop = 'if_binary',
                                          handle_unknown='ignore',
                                          sparse_output=False))])

num_pipe = Pipeline(steps=[('scaler', StandardScaler())])

preprocessor = ColumnTransformer(transformers=[('num', num_pipe, num_cols),
                                               ('cat', cat_pipe, cat_cols)])


## DummyRegressor

In [29]:
# Integrating the preprocessor with the SGDRegressor into a pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', DummyRegressor()
                            )])

pipeline.fit(X_train, y_train)


preds = pipeline.predict(X_test)
mae = metrics.mean_absolute_error(y_test, preds)
rmse = metrics.root_mean_squared_error(y_test, preds)
r2 = metrics.r2_score(y_test, preds)

print(f'\nThe MAE is: {mae:.2f}',
      f'\nThe RMSE is: {rmse:.2f}'
      f'\nThe R2 is: {r2:.2f}')


The MAE is: 47.92 
The RMSE is: 60.89
The R2 is: -0.00




## SGDRegressor

In [30]:
# Integrating the preprocessor with the SGDRegressor into a pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', SGDRegressor(loss='huber',
                                                      penalty='elasticnet',
                                                      random_state=903)
                            )])

pipeline.fit(X_train, y_train)


preds = pipeline.predict(X_test)
mae = metrics.mean_absolute_error(y_test, preds)
rmse = metrics.root_mean_squared_error(y_test, preds)
r2 = metrics.r2_score(y_test, preds)

print(f'\nThe MAE is: {mae:.2f}',
      f'\nThe RMSE is: {rmse:.2f}'
      f'\nThe R2 is: {r2:.2f}')




The MAE is: 26.80 
The RMSE is: 40.86
The R2 is: 0.55


## HGBRegressor

In [42]:
# Integrating the preprocessor with the SGDRegressor into a pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', HistGradientBoostingRegressor(
                                 random_state=903)
                            )])

pipeline.fit(X_train, y_train)


preds = pipeline.predict(X_test)
mae = metrics.mean_absolute_error(y_test, preds)
rmse = metrics.root_mean_squared_error(y_test, preds)
r2 = metrics.r2_score(y_test, preds)

print(f'\nThe MAE is: {mae:.2f}',
      f'\nThe RMSE is: {rmse:.2f}'
      f'\nThe R2 is: {r2:.2f}')




The MAE is: 9.89 
The RMSE is: 40.79
The R2 is: 0.44


# Hotel 2

In [45]:
X = df_data_h2.drop(columns = target_feature)
y = df_data_h2[target_feature]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 903)

In [46]:
### ---  Creating ColumnTransformer and sub-transformers for imputation and encoding --- ###
num_cols = X.select_dtypes('number').columns
cat_cols = X.select_dtypes('object').columns

cat_pipe = Pipeline(steps=[('ohe',OneHotEncoder(drop = 'if_binary',handle_unknown='ignore',sparse_output=False)),
                           ('cat_imp', SimpleImputer(strategy = 'most_frequent'))])

num_pipe = Pipeline(steps=[('num_imp', SimpleImputer(strategy = 'mean')),
                            ('scaler', StandardScaler())])

preprocessor = ColumnTransformer(transformers=[('num', num_pipe, num_cols),
                                               ('cat', cat_pipe, cat_cols)])


## DummyRegressor

In [48]:
# Integrating the preprocessor with the SGDRegressor into a pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', DummyRegressor()
                            )])

pipeline.fit(X_train, y_train)


preds = pipeline.predict(X_test)
mae = metrics.mean_absolute_error(y_test, preds)
rmse = metrics.root_mean_squared_error(y_test, preds)
r2 = metrics.r2_score(y_test, preds)

print(f'\nThe DummyRegressor MAE is: {mae:.2f}',
      f'\nThe DummyRegressor RMSE is: {rmse:.2f}'
      f'\nThe DummyRegressor R2 is: {r2:.2f}')




The DummyRegressor MAE is: 29.95 
The DummyRegressor RMSE is: 54.55
The DummyRegressor R2 is: -0.00


## SGDRegressor

In [None]:
# Integrating the preprocessor with the SGDRegressor into a pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', SGDRegressor(loss='huber',
                                                      penalty='elasticnet',
                                                      random_state=903)
                            )])

pipeline.fit(X_train, y_train)


preds = pipeline.predict(X_test)
mae = metrics.mean_absolute_error(y_test, preds)
rmse = metrics.root_mean_squared_error(y_test, preds)
r2 = metrics.r2_score(y_test, preds)

print(f'\nThe SGDRegressor MAE is: {mae:.2f}',
      f'\nThe SGDRegressor RMSE is: {rmse:.2f}'
      f'\nThe SGDRegressor R2 is: {r2:.2f}')




The SGDRegressor MAE is: 18.80 
The SGDRegressor RMSE is: 46.67
The SGDRegressor R2 is: 0.27


## HGBRegressor

In [49]:
# Integrating the preprocessor with the SGDRegressor into a pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', HistGradientBoostingRegressor(
                                 random_state=903)
                            )])

pipeline.fit(X_train, y_train)


preds = pipeline.predict(X_test)
mae = metrics.mean_absolute_error(y_test, preds)
rmse = metrics.root_mean_squared_error(y_test, preds)
r2 = metrics.r2_score(y_test, preds)

print(f'\nThe MAE is: {mae:.2f}',
      f'\nThe RMSE is: {rmse:.2f}'
      f'\nThe R2 is: {r2:.2f}')




The MAE is: 9.89 
The RMSE is: 40.79
The R2 is: 0.44


## RandomForestRegressor

In [50]:
# Integrating the preprocessor with the SGDRegressor into a pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', RandomForestRegressor( n_jobs = -1,
                                 random_state=903)
                            )])

pipeline.fit(X_train, y_train)


preds = pipeline.predict(X_test)
mae = metrics.mean_absolute_error(y_test, preds)
rmse = metrics.root_mean_squared_error(y_test, preds)
r2 = metrics.r2_score(y_test, preds)

print(f'\nThe MAE is: {mae:.2f}',
      f'\nThe RMSE is: {rmse:.2f}'
      f'\nThe R2 is: {r2:.2f}')




The MAE is: 6.12 
The RMSE is: 40.16
The R2 is: 0.46
