# Baseline Modeling - Regression

---

* Goal: develop baseline regression models before any feature engineering, so that their performance can be compared against models trained on the engineered features.

---

In [17]:
import numpy as np
import pandas as pd
import seaborn as sns

In [18]:
## SKLearn and Modeling Tools

from feature_engine.encoding import CountFrequencyEncoder
from feature_engine.outliers import Winsorizer
from feature_engine.pipeline import Pipeline as fePipeline

from sklearn import metrics
from sklearn import set_config
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler

set_config(transform_output='pandas')

from xgboost import XGBRegressor

## Load Data

In [19]:
# Load the full bookings dataset from a Feather file (path is relative to the
# notebook's working directory).
df_data_full = pd.read_feather('../../data/source/full_data.feather')

# NOTE(review): the .info() output further down reports 119390 rows while the index
# labels only run 0..79329, so index labels repeat -- confirm whether a
# reset_index(drop=True) is wanted before any index-aligned operations.
df_data_full

Unnamed: 0,IsCanceled,LeadTime,ArrivalDateYear,ArrivalDateMonth,ArrivalDateWeekNumber,ArrivalDateDayOfMonth,StaysInWeekendNights,StaysInWeekNights,Adults,Children,...,Agent,Company,DaysInWaitingList,CustomerType,ADR,RequiredCarParkingSpaces,TotalOfSpecialRequests,ReservationStatus,ReservationStatusDate,HotelNumber
0,0,342,2015,July,27,1,0,0,2,0.0,...,,,0,Transient,0.00,0,0,Check-Out,2015-07-01,H1
1,0,737,2015,July,27,1,0,0,2,0.0,...,,,0,Transient,0.00,0,0,Check-Out,2015-07-01,H1
2,0,7,2015,July,27,1,0,1,1,0.0,...,,,0,Transient,75.00,0,0,Check-Out,2015-07-02,H1
3,0,13,2015,July,27,1,0,1,1,0.0,...,304,,0,Transient,75.00,0,0,Check-Out,2015-07-02,H1
4,0,14,2015,July,27,1,0,2,2,0.0,...,240,,0,Transient,98.00,0,1,Check-Out,2015-07-03,H1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79325,0,23,2017,August,35,30,2,5,2,0.0,...,394,,0,Transient,96.14,0,0,Check-Out,2017-09-06,H2
79326,0,102,2017,August,35,31,2,5,3,0.0,...,9,,0,Transient,225.43,0,2,Check-Out,2017-09-07,H2
79327,0,34,2017,August,35,31,2,5,2,0.0,...,9,,0,Transient,157.71,0,4,Check-Out,2017-09-07,H2
79328,0,109,2017,August,35,31,2,5,2,0.0,...,89,,0,Transient,104.40,0,0,Check-Out,2017-09-07,H2


## Set Target Feature

In [20]:
# Column to predict; used below to split the features (X) from the target (y).
target_feature = 'ADR'

## Quick Overview

In [21]:
df_data_full.info()

<class 'pandas.core.frame.DataFrame'>
Index: 119390 entries, 0 to 79329
Data columns (total 32 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   IsCanceled                   119390 non-null  int64  
 1   LeadTime                     119390 non-null  int64  
 2   ArrivalDateYear              119390 non-null  int64  
 3   ArrivalDateMonth             119390 non-null  object 
 4   ArrivalDateWeekNumber        119390 non-null  int64  
 5   ArrivalDateDayOfMonth        119390 non-null  int64  
 6   StaysInWeekendNights         119390 non-null  int64  
 7   StaysInWeekNights            119390 non-null  int64  
 8   Adults                       119390 non-null  int64  
 9   Children                     119386 non-null  float64
 10  Babies                       119390 non-null  int64  
 11  Meal                         119390 non-null  object 
 12  Country                      118902 non-null  object 
 13  Marke

In [22]:
df_data_full.describe(include = 'number').T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
IsCanceled,119390.0,0.370416,0.482918,0.0,0.0,0.0,1.0,1.0
LeadTime,119390.0,104.011416,106.863097,0.0,18.0,69.0,160.0,737.0
ArrivalDateYear,119390.0,2016.156554,0.707476,2015.0,2016.0,2016.0,2017.0,2017.0
ArrivalDateWeekNumber,119390.0,27.165173,13.605138,1.0,16.0,28.0,38.0,53.0
ArrivalDateDayOfMonth,119390.0,15.798241,8.780829,1.0,8.0,16.0,23.0,31.0
StaysInWeekendNights,119390.0,0.927599,0.998613,0.0,0.0,1.0,2.0,19.0
StaysInWeekNights,119390.0,2.500302,1.908286,0.0,1.0,2.0,3.0,50.0
Adults,119390.0,1.856403,0.579261,0.0,2.0,2.0,2.0,55.0
Children,119386.0,0.10389,0.398561,0.0,0.0,0.0,0.0,10.0
Babies,119390.0,0.007949,0.097436,0.0,0.0,0.0,0.0,10.0


In [23]:
df_data_full.describe(exclude = 'number').T

Unnamed: 0,count,unique,top,freq
ArrivalDateMonth,119390,12,August,13877
Meal,119390,5,BB,92310
Country,118902,177,PRT,48590
MarketSegment,119390,8,Online TA,56477
DistributionChannel,119390,5,TA/TO,97870
ReservedRoomType,119390,10,A,85994
AssignedRoomType,119390,12,A,74053
DepositType,119390,3,No Deposit,104641
Agent,119390,334,9,31961
Company,119390,353,,112593


In [24]:
df_data_full[target_feature].describe().round(2)

count    119390.00
mean        101.83
std          50.54
min          -6.38
25%          69.29
50%          94.58
75%         126.00
max        5400.00
Name: ADR, dtype: float64

# Drop Questionable Features

---

There are several features that I can identify as being too strongly predictive of the ADR. These features indicate whether or not a guest stayed (if they cancel or no-show, the revenue is zero).
* `IsCanceled`, `ReservationStatus`

Additionally, there are some temporal features that are either irrelevant to predictive modeling (`ArrivalDateYear`) or too closely related to the predictive features above (`ReservationStatusDate`).

I will drop these features to match real-world data more closely/realistically.

---

In [25]:
# Columns that reveal the booking outcome or are not useful for prediction
# (see the markdown notes above this cell).
leaky_or_irrelevant_cols = ['IsCanceled', 'ReservationStatus', 'ReservationStatusDate', 'ArrivalDateYear']

# errors='ignore' keeps the cell idempotent: the frame is overwritten in place, so
# re-running the cell after the columns are gone is a no-op instead of a KeyError.
df_data_full = df_data_full.drop(columns = leaky_or_irrelevant_cols, errors = 'ignore')

In [26]:
df_data_full.head().T

Unnamed: 0,0,1,2,3,4
LeadTime,342,737,7,13,14
ArrivalDateMonth,July,July,July,July,July
ArrivalDateWeekNumber,27,27,27,27,27
ArrivalDateDayOfMonth,1,1,1,1,1
StaysInWeekendNights,0,0,0,0,0
StaysInWeekNights,0,0,1,1,2
Adults,2,2,1,1,2
Children,0.0,0.0,0.0,0.0,0.0
Babies,0,0,0,0,0
Meal,BB,BB,BB,BB,BB


# Train-Test Split and Preprocessor

In [27]:
df_data_full.head()

Unnamed: 0,LeadTime,ArrivalDateMonth,ArrivalDateWeekNumber,ArrivalDateDayOfMonth,StaysInWeekendNights,StaysInWeekNights,Adults,Children,Babies,Meal,...,BookingChanges,DepositType,Agent,Company,DaysInWaitingList,CustomerType,ADR,RequiredCarParkingSpaces,TotalOfSpecialRequests,HotelNumber
0,342,July,27,1,0,0,2,0.0,0,BB,...,3,No Deposit,,,0,Transient,0.0,0,0,H1
1,737,July,27,1,0,0,2,0.0,0,BB,...,4,No Deposit,,,0,Transient,0.0,0,0,H1
2,7,July,27,1,0,1,1,0.0,0,BB,...,0,No Deposit,,,0,Transient,75.0,0,0,H1
3,13,July,27,1,0,1,1,0.0,0,BB,...,0,No Deposit,304.0,,0,Transient,75.0,0,0,H1
4,14,July,27,1,0,2,2,0.0,0,BB,...,0,No Deposit,240.0,,0,Transient,98.0,0,1,H1


In [28]:
# Separate the target column from the predictors.
y = df_data_full[target_feature]
X = df_data_full.drop(columns = [target_feature])

# No test_size is given, so the library default applies; the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state = 903,
)

# Create Feature-Engine Pipeline

In [29]:
# Encoder that replaces categories using the 'frequency' method; `unseen` and
# `missing_values` are set so unseen categories / NaNs are presumably tolerated
# rather than raising -- confirm against the feature_engine docs.
cff_encoder = CountFrequencyEncoder(encoding_method = 'frequency', unseen = 'encode', missing_values = 'ignore')
# Outlier capping on both tails, without adding indicator columns.
# NOTE(review): this is fitted after the encoder below, when every column is numeric --
# check whether capping the frequency-encoded columns as well is intended.
winsorizer = Winsorizer(tail = 'both', add_indicators= False, missing_values= 'ignore')

# Draft of the same steps as a single pipeline (not used yet). Each step must be a
# (name, estimator) tuple:
# pipeline = fePipeline([('encoder', CountFrequencyEncoder(encoding_method = 'frequency', unseen = 'encode', missing_values = 'ignore')),
#                        ('outlier_handling', Winsorizer(tail = 'both', add_indicators= True, missing_values= 'ignore')),
#                        ('rfr', RandomForestRegressor(n_jobs = -1))])

In [30]:
# Fit the encoder on the training rows only, then encode them.
X_train = cff_encoder.fit(X_train).transform(X_train)

# Fit the winsorizer on the encoded training data, then apply it.
X_train = winsorizer.fit(X_train).transform(X_train)

X_train

Unnamed: 0,LeadTime,ArrivalDateMonth,ArrivalDateWeekNumber,ArrivalDateDayOfMonth,StaysInWeekendNights,StaysInWeekNights,Adults,Children,Babies,Meal,...,AssignedRoomType,BookingChanges,DepositType,Agent,Company,DaysInWaitingList,CustomerType,RequiredCarParkingSpaces,TotalOfSpecialRequests,HotelNumber
50660,118.0,0.091354,24,6,1.0,3.0,1.0,0.000000,0.0,0.773101,...,0.210594,1.0,0.876840,0.001698,0.943166,0.0,0.211722,0.0,0.0,0.663119
29994,41.0,0.057057,45,5,2.0,5.0,2.0,0.000000,0.0,0.773101,...,0.620111,0.0,0.876840,0.117185,0.943166,0.0,0.749313,0.0,2.0,0.336881
29551,129.0,0.091354,22,2,1.0,2.0,2.0,0.000000,0.0,0.773101,...,0.620111,0.0,0.121697,0.001530,0.943166,0.0,0.749313,0.0,0.0,0.663119
72603,227.0,0.099149,21,24,0.0,3.0,2.0,0.000000,0.0,0.773101,...,0.620111,0.0,0.876840,0.266925,0.943166,0.0,0.749313,0.0,2.0,0.663119
10892,309.0,0.099149,20,12,0.0,2.0,2.0,0.000000,0.0,0.773101,...,0.620111,0.0,0.121697,0.059972,0.943166,0.0,0.749313,0.0,0.0,0.663119
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11592,98.0,0.099149,20,19,2.0,6.0,2.0,1.312338,0.0,0.773101,...,0.019778,0.0,0.876840,0.117185,0.943166,0.0,0.749313,0.0,2.0,0.336881
29022,230.0,0.099149,21,25,0.0,3.0,2.0,0.000000,0.0,0.773101,...,0.620111,1.0,0.876840,0.266925,0.943166,0.0,0.749313,0.0,1.0,0.663119
73246,118.0,0.091354,22,2,1.0,2.0,2.0,0.000000,0.0,0.773101,...,0.620111,0.0,0.876840,0.030611,0.943166,0.0,0.749313,0.0,1.0,0.663119
36592,105.0,0.099149,20,18,2.0,5.0,2.0,0.000000,0.0,0.773101,...,0.210594,0.0,0.876840,0.004300,0.943166,0.0,0.034096,0.0,1.0,0.336881


In [31]:
X_train.describe().round(2)

Unnamed: 0,LeadTime,ArrivalDateMonth,ArrivalDateWeekNumber,ArrivalDateDayOfMonth,StaysInWeekendNights,StaysInWeekNights,Adults,Children,Babies,Meal,...,AssignedRoomType,BookingChanges,DepositType,Agent,Company,DaysInWaitingList,CustomerType,RequiredCarParkingSpaces,TotalOfSpecialRequests,HotelNumber
count,89542.0,89542.0,89542.0,89542.0,89542.0,89542.0,89542.0,89539.0,89542.0,89542.0,...,89542.0,89542.0,89542.0,89542.0,89542.0,89542.0,89542.0,89542.0,89542.0,89542.0
mean,103.01,0.09,27.14,15.8,0.92,2.45,1.85,0.08,0.0,0.62,...,0.44,0.2,0.78,0.11,0.9,1.32,0.61,0.05,0.57,0.55
std,103.88,0.02,13.61,8.79,0.95,1.67,0.49,0.3,0.03,0.28,...,0.24,0.51,0.25,0.1,0.16,7.85,0.25,0.19,0.78,0.15
min,0.0,0.05,1.0,1.0,0.0,0.0,0.07,0.0,0.0,0.01,...,0.0,0.0,0.04,0.0,0.24,0.0,0.0,0.0,0.0,0.34
25%,18.0,0.08,16.0,8.0,0.0,1.0,2.0,0.0,0.0,0.77,...,0.21,0.0,0.88,0.01,0.94,0.0,0.21,0.0,0.0,0.34
50%,69.0,0.09,27.0,16.0,1.0,2.0,2.0,0.0,0.0,0.77,...,0.62,0.0,0.88,0.12,0.94,0.0,0.75,0.0,0.0,0.66
75%,160.0,0.1,38.0,23.0,2.0,3.0,2.0,0.0,0.0,0.77,...,0.62,0.0,0.88,0.27,0.94,0.0,0.75,0.0,1.0,0.66
max,424.25,0.12,53.0,31.0,3.92,8.22,3.65,1.31,0.31,0.77,...,0.62,2.19,0.88,0.27,0.94,53.77,0.75,0.8,2.95,0.66


In [33]:
# Apply the already-fitted transformers to the test set (no refitting):
# encode first, then winsorize the encoded frame.
X_test = winsorizer.transform(cff_encoder.transform(X_test))

# Test Models with Feature-Engine

In [35]:
# Seeded like the other stochastic steps in this notebook (903) so the score and MAE
# cells below produce the same numbers on every Restart & Run All.
rfr = RandomForestRegressor(n_jobs = -1, random_state = 903)

rfr.fit(X_train, y_train)

preds = rfr.predict(X_test)

In [38]:
rfr.score(X_test, y_test)

0.8270545207960885

In [40]:
# scikit-learn metrics take (y_true, y_pred). MAE is symmetric, so the value does not
# change, but this matches the argument order used in create_and_test_bl_model.
metrics.mean_absolute_error(y_test, preds)

8.580115402254467

In [32]:
# Presumably a deliberate hard stop so "Run All" halts here and does not continue into
# the not-yet-executed cells below -- remove once those cells are ready to run.
raise Exception('End of Testing')

Exception: End of Testing

# Convert NaN and Non-Positive ADRs to .0001

In [None]:
# Start from the target column of the loaded frame so this cell runs on a fresh
# kernel (`df_data_target` is not created by any cell above).
df_data_target = df_data_full[target_feature]

# Keep strictly positive ADRs only. `NaN > 0` is False, so NaN, zero and negative
# values are all replaced by .0001. Series.where keeps the original index and name,
# unlike wrapping np.where(...) in a new Series.
df_data_target = df_data_target.where(df_data_target > 0, .0001)
df_data_target

In [None]:
df_data_target.describe()

# Pipeline

In [None]:
def create_and_test_bl_model(X_train, y_train,
                             X_test, y_test,
                             regressor,
                             show_metrics = True):
    """Fit a baseline preprocessing + regression pipeline and optionally print test metrics.

    Columns are routed by the dtypes of ``X_train``:

    * numeric columns -> most-frequent imputation, then standard scaling;
    * object columns  -> most-frequent imputation, then one-hot encoding
      (unknown categories are ignored at predict time).

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; its dtypes determine the numeric / categorical column lists.
    y_train : array-like
        Training target.
    X_test : pandas.DataFrame
        Held-out features; only used when ``show_metrics`` is truthy.
    y_test : array-like
        Held-out target; only used when ``show_metrics`` is truthy.
    regressor : estimator
        Regressor used as the final pipeline step.
    show_metrics : bool, default True
        When truthy, predict on ``X_test`` and print MAE, RMSE and R2.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline with steps ``'preprocessor'`` and ``'regressor'``.
    """
    ### ---  Creating ColumnTransformer and sub-transformers for imputation and encoding --- ###
    num_cols = X_train.select_dtypes('number').columns
    cat_cols = X_train.select_dtypes('object').columns

    cat_pipe = Pipeline(steps=[('cat_imp', SimpleImputer(strategy = 'most_frequent')),
                               ('ohe', OneHotEncoder(drop = 'if_binary',
                                                     handle_unknown = 'ignore',
                                                     sparse_output = False))])

    # NOTE(review): the imputer step is named 'cat_imp' here as well (copied from the
    # categorical pipeline). The name is kept so nested parameter names stay stable.
    num_pipe = Pipeline(steps=[('cat_imp', SimpleImputer(strategy = 'most_frequent')),
                               ('scaler', StandardScaler())])

    preprocessor = ColumnTransformer(transformers=[('num', num_pipe, num_cols),
                                                   ('cat', cat_pipe, cat_cols)])

    # Integrating the preprocessor with the regressor into a pipeline
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('regressor', regressor)])

    pipeline.fit(X_train, y_train)

    if show_metrics:
        preds = pipeline.predict(X_test)
        mae = metrics.mean_absolute_error(y_test, preds)
        rmse = metrics.root_mean_squared_error(y_test, preds)
        r2 = metrics.r2_score(y_test, preds)

        # One concatenated string: mixing a comma with implicit concatenation made
        # print() insert a stray space after the MAE value.
        print(f'\nThe MAE is: {mae:.2f}'
              f'\nThe RMSE is: {rmse:.2f}'
              f'\nThe R2 is: {r2:.2f}')

    return pipeline

## DummyRegressor

In [None]:
# DummyRegressor has no `random_state` parameter (only strategy / constant / quantile),
# so passing one raises TypeError. The 'mean' strategy always predicts the
# training-target mean, which gives the floor every real model must beat.
create_and_test_bl_model(X_train, y_train, X_test, y_test, DummyRegressor(strategy = 'mean'))

In [None]:
# Gradient-boosted baseline run through the shared helper; seeded for reproducibility.
create_and_test_bl_model(
    X_train = X_train,
    y_train = y_train,
    X_test = X_test,
    y_test = y_test,
    regressor = HistGradientBoostingRegressor(random_state = 903),
)

In [None]:
# Random forest baseline. Seeded with the notebook-wide seed (903) so the reported
# metrics and the tree-depth / importance cells below are reproducible.
rfr_model = create_and_test_bl_model(X_train, y_train, X_test, y_test,
                                     RandomForestRegressor(n_jobs = -1,
                                                           min_samples_split = 2,
                                                           max_depth = 75,
                                                           random_state = 903))

In [None]:
# Depth actually reached by each fitted tree of the forest (the pipeline's 'regressor' step).
depths = [fitted_tree.get_depth()
          for fitted_tree in rfr_model.named_steps['regressor'].estimators_]

sns.histplot(depths);

In [None]:
rfr_model[-1]

In [None]:
rfr_model[-1].feature_importances_

## SGDRegressor

In [None]:
# ### ---  Creating ColumnTransformer and sub-transformers for imputation and encoding --- ###
# num_cols = X.select_dtypes('number').columns
# cat_cols = X.select_dtypes('object').columns

# cat_pipe = Pipeline(steps=[('cat_imp', SimpleImputer(strategy = 'most_frequent')),
#                            ('ohe',
#                             OneHotEncoder(drop = 'if_binary',
#                                           handle_unknown='ignore',
#                                           sparse_output=False))])

# num_pipe = Pipeline(steps=[('cat_imp', SimpleImputer(strategy = 'most_frequent')),
#                            ('scaler', StandardScaler())])

# preprocessor = ColumnTransformer(transformers=[('num', num_pipe, num_cols),
#                                                ('cat', cat_pipe, cat_cols)])

# # Integrating the preprocessor with the SGDRegressor into a pipeline
# pipeline = Pipeline(steps=[('preprocessor', preprocessor),
#                            ('regressor', SGDRegressor(loss='huber',
#                                                       penalty='elasticnet',
#                                                       random_state=903))])

# pipeline.fit(X_train, y_train)


# preds = pipeline.predict(X_test)
# mae = metrics.mean_absolute_error(y_test, preds)
# rmse = metrics.root_mean_squared_error(y_test, preds)
# r2 = metrics.r2_score(y_test, preds)

# print(f'\nThe MAE is: {mae:.2f}',
#       f'\nThe RMSE is: {rmse:.2f}'
#       f'\nThe R2 is: {r2:.2f}')

## XGBRegressor

In [None]:
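# NOTE(review): `preprocessor` is only created inside create_and_test_bl_model, so this
# draft would raise NameError if uncommented as-is; pass an XGBRegressor to that helper instead.
# pipeline = Pipeline(steps=[('preprocessor', preprocessor),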
#                            ('regressor', XGBRegressor(objective='reg:squarederror', random_state=42))])

# # Fit the pipeline to the training data
# pipeline.fit(X_train, y_train)

# # Make predictions on the test data
# y_pred = pipeline.predict(X_test)

# # Evaluate the model
# mae = metrics.mean_absolute_error(y_test, y_pred)
# mse = metrics.mean_squared_error(y_test, y_pred)
# r2 = metrics.r2_score(y_test, y_pred)

# # Print the results
# print(f"Mean Absolute Error (MAE): {mae:,.2f}",)
# print(f"Mean Squared Error (MSE): {mse:,.2f}",)
# print(f"R-squared (R²): {r2:,.2f}")

# Results

---

The best model was the Random Forest Regressor, with an MAE of ≈8.58 and an R² of ≈0.83 on the test set (values taken from the feature-engine Random Forest run above; update them after re-running the baseline cells). This model performed well with only minor pre-processing, which suggests that some features may be strongly predictive of the ADR. I will need to investigate further to confirm.

---