# Baseline Modeling - Regression

---

* Goal: develop baseline regression models before any feature engineering, so their performance can be compared against models trained on the engineered features.

---

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns


In [2]:
## SKLearn and Modeling Tools

from sklearn import metrics
from sklearn import set_config
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler

set_config(transform_output='pandas')

from xgboost import XGBRegressor

## Load Data

In [3]:
# Read each hotel's bookings and tag every row with the hotel it came from.
df_data_h1 = pd.read_csv('../../data/source/H1.csv').assign(HotelNumber='H1')
df_data_h2 = pd.read_csv('../../data/source/H2.csv').assign(HotelNumber='H2')

# Stack both hotels into one frame. Each file's own row labels are kept,
# so index values repeat across the two hotels.
df_data_full = pd.concat([df_data_h1, df_data_h2])

df_data_full

Unnamed: 0,IsCanceled,LeadTime,ArrivalDateYear,ArrivalDateMonth,ArrivalDateWeekNumber,ArrivalDateDayOfMonth,StaysInWeekendNights,StaysInWeekNights,Adults,Children,...,Agent,Company,DaysInWaitingList,CustomerType,ADR,RequiredCarParkingSpaces,TotalOfSpecialRequests,ReservationStatus,ReservationStatusDate,HotelNumber
0,0,342,2015,July,27,1,0,0,2,0.0,...,,,0,Transient,0.00,0,0,Check-Out,2015-07-01,H1
1,0,737,2015,July,27,1,0,0,2,0.0,...,,,0,Transient,0.00,0,0,Check-Out,2015-07-01,H1
2,0,7,2015,July,27,1,0,1,1,0.0,...,,,0,Transient,75.00,0,0,Check-Out,2015-07-02,H1
3,0,13,2015,July,27,1,0,1,1,0.0,...,304,,0,Transient,75.00,0,0,Check-Out,2015-07-02,H1
4,0,14,2015,July,27,1,0,2,2,0.0,...,240,,0,Transient,98.00,0,1,Check-Out,2015-07-03,H1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79325,0,23,2017,August,35,30,2,5,2,0.0,...,394,,0,Transient,96.14,0,0,Check-Out,2017-09-06,H2
79326,0,102,2017,August,35,31,2,5,3,0.0,...,9,,0,Transient,225.43,0,2,Check-Out,2017-09-07,H2
79327,0,34,2017,August,35,31,2,5,2,0.0,...,9,,0,Transient,157.71,0,4,Check-Out,2017-09-07,H2
79328,0,109,2017,August,35,31,2,5,2,0.0,...,89,,0,Transient,104.40,0,0,Check-Out,2017-09-07,H2


## Set Target Feature

In [4]:
# Name of the column the regression models will predict.
target_feature = 'ADR'

## Quick Overview

In [5]:
# Per-column non-null counts and dtypes for the combined frame.
df_data_full.info()

<class 'pandas.core.frame.DataFrame'>
Index: 119390 entries, 0 to 79329
Data columns (total 32 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   IsCanceled                   119390 non-null  int64  
 1   LeadTime                     119390 non-null  int64  
 2   ArrivalDateYear              119390 non-null  int64  
 3   ArrivalDateMonth             119390 non-null  object 
 4   ArrivalDateWeekNumber        119390 non-null  int64  
 5   ArrivalDateDayOfMonth        119390 non-null  int64  
 6   StaysInWeekendNights         119390 non-null  int64  
 7   StaysInWeekNights            119390 non-null  int64  
 8   Adults                       119390 non-null  int64  
 9   Children                     119386 non-null  float64
 10  Babies                       119390 non-null  int64  
 11  Meal                         119390 non-null  object 
 12  Country                      118902 non-null  object 
 13  Marke

In [6]:
# Summary statistics for the numeric columns, transposed so each column is a row.
df_data_full.describe(include = 'number').T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
IsCanceled,119390.0,0.370416,0.482918,0.0,0.0,0.0,1.0,1.0
LeadTime,119390.0,104.011416,106.863097,0.0,18.0,69.0,160.0,737.0
ArrivalDateYear,119390.0,2016.156554,0.707476,2015.0,2016.0,2016.0,2017.0,2017.0
ArrivalDateWeekNumber,119390.0,27.165173,13.605138,1.0,16.0,28.0,38.0,53.0
ArrivalDateDayOfMonth,119390.0,15.798241,8.780829,1.0,8.0,16.0,23.0,31.0
StaysInWeekendNights,119390.0,0.927599,0.998613,0.0,0.0,1.0,2.0,19.0
StaysInWeekNights,119390.0,2.500302,1.908286,0.0,1.0,2.0,3.0,50.0
Adults,119390.0,1.856403,0.579261,0.0,2.0,2.0,2.0,55.0
Children,119386.0,0.10389,0.398561,0.0,0.0,0.0,0.0,10.0
Babies,119390.0,0.007949,0.097436,0.0,0.0,0.0,0.0,10.0


In [7]:
# Count / unique / top / freq for the non-numeric columns, transposed so each column is a row.
df_data_full.describe(exclude = 'number').T

Unnamed: 0,count,unique,top,freq
ArrivalDateMonth,119390,12,August,13877
Meal,119390,5,BB,92310
Country,118902,177,PRT,48590
MarketSegment,119390,8,Online TA,56477
DistributionChannel,119390,5,TA/TO,97870
ReservedRoomType,119390,10,A,85994
AssignedRoomType,119390,12,A,74053
DepositType,119390,3,No Deposit,104641
Agent,119390,334,9,31961
Company,119390,353,,112593


In [8]:
# Re-declares the target column name (same value as in the "Set Target Feature" cell).
target_feature = 'ADR'

In [9]:
# Distribution of the target column, rounded to two decimals.
df_data_full[target_feature].describe().round(2)

count    119390.00
mean        101.83
std          50.54
min          -6.38
25%          69.29
50%          94.58
75%         126.00
max        5400.00
Name: ADR, dtype: float64

## Convert Zero and Negative ADRs to .0001

In [10]:
# Floor the target at a small positive value: every ADR that is zero or
# negative becomes .0001; all other values are kept unchanged.
df_data_full[target_feature] = np.where(
    df_data_full[target_feature].le(0),
    .0001,
    df_data_full[target_feature],
)
df_data_full[target_feature]

0          0.0001
1          0.0001
2         75.0000
3         75.0000
4         98.0000
           ...   
79325     96.1400
79326    225.4300
79327    157.7100
79328    104.4000
79329    151.2000
Name: ADR, Length: 119390, dtype: float64

In [11]:
# Target distribution after the replacement above; the minimum is now .0001.
df_data_full[target_feature].describe()

count    119390.000000
mean        101.831177
std          50.535676
min           0.000100
25%          69.290000
50%          94.575000
75%         126.000000
max        5400.000000
Name: ADR, dtype: float64

# Drop Questionable Features

In [12]:
# Remove the columns flagged as questionable before modeling.
df_data_full = df_data_full.drop(
    columns=[
        'IsCanceled',
        'ReservationStatus',
        'ReservationStatusDate',
        'ArrivalDateYear',
    ]
)
df_data_full

Unnamed: 0,LeadTime,ArrivalDateMonth,ArrivalDateWeekNumber,ArrivalDateDayOfMonth,StaysInWeekendNights,StaysInWeekNights,Adults,Children,Babies,Meal,...,BookingChanges,DepositType,Agent,Company,DaysInWaitingList,CustomerType,ADR,RequiredCarParkingSpaces,TotalOfSpecialRequests,HotelNumber
0,342,July,27,1,0,0,2,0.0,0,BB,...,3,No Deposit,,,0,Transient,0.0001,0,0,H1
1,737,July,27,1,0,0,2,0.0,0,BB,...,4,No Deposit,,,0,Transient,0.0001,0,0,H1
2,7,July,27,1,0,1,1,0.0,0,BB,...,0,No Deposit,,,0,Transient,75.0000,0,0,H1
3,13,July,27,1,0,1,1,0.0,0,BB,...,0,No Deposit,304,,0,Transient,75.0000,0,0,H1
4,14,July,27,1,0,2,2,0.0,0,BB,...,0,No Deposit,240,,0,Transient,98.0000,0,1,H1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79325,23,August,35,30,2,5,2,0.0,0,BB,...,0,No Deposit,394,,0,Transient,96.1400,0,0,H2
79326,102,August,35,31,2,5,3,0.0,0,BB,...,0,No Deposit,9,,0,Transient,225.4300,0,2,H2
79327,34,August,35,31,2,5,2,0.0,0,BB,...,0,No Deposit,9,,0,Transient,157.7100,0,4,H2
79328,109,August,35,31,2,5,2,0.0,0,BB,...,0,No Deposit,89,,0,Transient,104.4000,0,0,H2


# Train-Test Split and Preprocessor

In [13]:
# Preview the first rows of the frame that is split into features and target below.
df_data_full.head()

Unnamed: 0,LeadTime,ArrivalDateMonth,ArrivalDateWeekNumber,ArrivalDateDayOfMonth,StaysInWeekendNights,StaysInWeekNights,Adults,Children,Babies,Meal,...,BookingChanges,DepositType,Agent,Company,DaysInWaitingList,CustomerType,ADR,RequiredCarParkingSpaces,TotalOfSpecialRequests,HotelNumber
0,342,July,27,1,0,0,2,0.0,0,BB,...,3,No Deposit,,,0,Transient,0.0001,0,0,H1
1,737,July,27,1,0,0,2,0.0,0,BB,...,4,No Deposit,,,0,Transient,0.0001,0,0,H1
2,7,July,27,1,0,1,1,0.0,0,BB,...,0,No Deposit,,,0,Transient,75.0,0,0,H1
3,13,July,27,1,0,1,1,0.0,0,BB,...,0,No Deposit,304.0,,0,Transient,75.0,0,0,H1
4,14,July,27,1,0,2,2,0.0,0,BB,...,0,No Deposit,240.0,,0,Transient,98.0,0,1,H1


In [14]:
# Separate the target column from the predictors.
y = df_data_full[target_feature]
X = df_data_full.drop(columns=target_feature)

# Seeded split for reproducibility; no test_size is given, so scikit-learn's
# default split proportion applies.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=903,
)

# Pipeline

In [15]:
def create_and_test_bl_model(X_train, y_train,
                             X_test, y_test,
                             regressor,
                             show_metrics = True):
    """Fit a baseline preprocessing + regression pipeline and optionally score it.

    Numeric columns of ``X_train`` are mean-imputed and standardized; object
    columns are imputed with their most frequent value and one-hot encoded.
    The preprocessor and ``regressor`` are chained into a single ``Pipeline``
    that is fit on the training data.

    Parameters
    ----------
    X_train, y_train
        Training features (a DataFrame, since columns are selected by dtype)
        and training target.
    X_test, y_test
        Hold-out features and target. Only used when ``show_metrics`` is truthy.
    regressor
        Scikit-learn compatible regressor used as the final pipeline step.
        The object itself is placed in the pipeline; no copy is made here.
    show_metrics : bool, default True
        When truthy, print MAE, RMSE and R2 computed on the hold-out data.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline with steps ``'preprocessor'`` and ``'regressor'``.
    """
    ### ---  Creating ColumnTransformer and sub-transformers for imputation and encoding --- ###
    num_cols = X_train.select_dtypes('number').columns
    cat_cols = X_train.select_dtypes('object').columns

    # Categories seen in fewer than 10% of rows are grouped as "infrequent";
    # unknown categories at predict time fall into that group when it exists.
    cat_pipe = Pipeline(steps=[('cat_imp', SimpleImputer(strategy = 'most_frequent')),
                               ('ohe',OneHotEncoder(drop = 'first',
                                                    handle_unknown='infrequent_if_exist',
                                                    sparse_output=False,
                                                    min_frequency = .1))])

    # NOTE: the imputer step is named 'cat_imp' although it handles numeric
    # columns; the name is kept so existing parameter paths keep working.
    num_pipe = Pipeline(steps=[('cat_imp', SimpleImputer(strategy = 'mean')),
                               ('scaler', StandardScaler())])

    preprocessor = ColumnTransformer(transformers=[('num', num_pipe, num_cols),
                                                   ('cat', cat_pipe, cat_cols)])

    # Integrating the preprocessor with the regressor into a pipeline
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('regressor', regressor)])

    pipeline.fit(X_train, y_train)

    if show_metrics:
        preds = pipeline.predict(X_test)
        mae = metrics.mean_absolute_error(y_test, preds)
        rmse = metrics.root_mean_squared_error(y_test, preds)
        r2 = metrics.r2_score(y_test, preds)

        # One concatenated string, so no separator space is inserted between lines.
        print(f'\nThe MAE is: {mae:.2f}'
              f'\nThe RMSE is: {rmse:.2f}'
              f'\nThe R2 is: {r2:.2f}')

    return pipeline

## DummyRegressor

In [16]:
# Reference point: a DummyRegressor with default settings, run through the same pipeline.
create_and_test_bl_model(
    X_train, y_train,
    X_test, y_test,
    regressor=DummyRegressor(),
)


The MAE is: 36.36 
The RMSE is: 48.23
The R2 is: -0.00




In [17]:
# Histogram-based gradient boosting baseline, seeded for reproducibility.
create_and_test_bl_model(
    X_train, y_train,
    X_test, y_test,
    regressor=HistGradientBoostingRegressor(random_state=903),
)


The MAE is: 14.41 
The RMSE is: 21.36
The R2 is: 0.80




In [24]:
# Random forest baseline. n_jobs=-1 lets scikit-learn use all available cores;
# the fitted pipeline is kept in rfr_model for the inspection cells that follow.
rfr_model = create_and_test_bl_model(X_train, y_train, X_test, y_test,
                                     RandomForestRegressor(random_state = 903,
                                                           n_jobs = -1,
                                                           min_samples_split = 2,
                                                           min_samples_leaf = 2))




The MAE is: 9.51 
The RMSE is: 19.18
The R2 is: 0.84


In [30]:
# Score the fitted pipeline on both splits to compare train vs. test fit.
rfr_score_train, rfr_score_test = (
    rfr_model.score(X_train, y_train),
    rfr_model.score(X_test, y_test),
)

print(f'The training score is: {rfr_score_train:,.2f} and the test score is: {rfr_score_test:,.2f}.')

The training score is: 0.90 and the test score is: 0.84.




In [27]:
# Predictions from the fitted pipeline for the training and hold-out features.
preds_train, preds_test = (
    rfr_model.predict(features) for features in (X_train, X_test)
)



In [None]:
# Depth of every fitted tree in the final pipeline step, shown as a histogram.
depths = [
    estimator.get_depth()
    for estimator in rfr_model[-1].estimators_
]

sns.histplot(data=depths);

In [22]:
# Display the final pipeline step (the fitted regressor).
rfr_model[-1]

In [20]:
# Importance of each transformed feature in the fitted forest (values sum to 1).
rfr_model[-1].feature_importances_

array([1.26363851e-01, 1.25038145e-01, 4.08497083e-02, 1.65899366e-02,
       2.49324661e-02, 4.02504973e-02, 2.53374154e-02, 5.47802862e-04,
       4.55970061e-03, 2.96421644e-03, 1.84315955e-03, 1.57731754e-02,
       1.87332684e-03, 3.58554706e-03, 9.91072004e-03, 4.10958981e-03,
       1.14857856e-01, 2.74186687e-02, 9.00961671e-03, 5.83443443e-03,
       3.89800555e-03, 1.44379228e-02, 2.09719799e-02, 2.09681409e-02,
       2.12886402e-02, 8.05253228e-03, 6.92591055e-03, 3.34776095e-02,
       7.99888916e-02, 2.46426224e-03, 3.50579509e-03, 5.18126913e-03,
       2.76248070e-05, 6.78714139e-03, 3.30406758e-03, 1.52724314e-02,
       1.24831084e-03, 8.68772429e-03, 2.76163644e-03, 1.39100269e-01])

## SGDRegressor

In [21]:
# Linear SGD baseline. Stored under its own name so the fitted random forest
# in rfr_model is not overwritten, and seeded like the other models so the
# reported metrics are reproducible.
sgd_model = create_and_test_bl_model(X_train, y_train, X_test, y_test,
                                     SGDRegressor(random_state = 903))


The MAE is: 26.52 
The RMSE is: 35.52
The R2 is: 0.46




In [14]:
# ### ---  Creating ColumnTransformer and sub-transformers for imputation and encoding --- ###
# num_cols = X.select_dtypes('number').columns
# cat_cols = X.select_dtypes('object').columns

# cat_pipe = Pipeline(steps=[('cat_imp', SimpleImputer(strategy = 'most_frequent')),
#                            ('ohe',
#                             OneHotEncoder(drop = 'if_binary',
#                                           handle_unknown='ignore',
#                                           sparse_output=False))])

# num_pipe = Pipeline(steps=[('cat_imp', SimpleImputer(strategy = 'most_frequent')),
#                            ('scaler', StandardScaler())])

# preprocessor = ColumnTransformer(transformers=[('num', num_pipe, num_cols),
#                                                ('cat', cat_pipe, cat_cols)])

# # Integrating the preprocessor with the SGDRegressor into a pipeline
# pipeline = Pipeline(steps=[('preprocessor', preprocessor),
#                            ('regressor', SGDRegressor(loss='huber',
#                                                       penalty='elasticnet',
#                                                       random_state=903))])

# pipeline.fit(X_train, y_train)


# preds = pipeline.predict(X_test)
# mae = metrics.mean_absolute_error(y_test, preds)
# rmse = metrics.root_mean_squared_error(y_test, preds)
# r2 = metrics.r2_score(y_test, preds)

# print(f'\nThe MAE is: {mae:.2f}',
#       f'\nThe RMSE is: {rmse:.2f}'
#       f'\nThe R2 is: {r2:.2f}')

## XGBRegressor

In [None]:
# pipeline = Pipeline(steps=[('preprocessor', preprocessor),
#                            ('regressor', XGBRegressor(objective='reg:squarederror', random_state=42))])

# # Fit the pipeline to the training data
# pipeline.fit(X_train, y_train)

# # Make predictions on the test data
# y_pred = pipeline.predict(X_test)

# # Evaluate the model
# mae = metrics.mean_absolute_error(y_test, y_pred)
# mse = metrics.mean_squared_error(y_test, y_pred)
# r2 = metrics.r2_score(y_test, y_pred)

# # Print the results
# print(f"Mean Absolute Error (MAE): {mae:,.2f}",)
# print(f"Mean Squared Error (MSE): {mse:,.2f}",)
# print(f"R-squared (R²): {r2:,.2f}")

# Results

---

The best model was the Random Forest Regressor, with a test-set MAE of 9.51 and an R^2 of 0.84. This model performed well with only minor pre-processing, which suggests that some features are strongly predictive of ADR. I will need to investigate further to confirm.

---