# Kalman Filter - Testing and Evaluation for Modeling Process

- Possible additional modeling approach
- Still linear style; requires validation of assumptions

- Exploration shows strong performance with predicting ADR *(no assumption validations)*:
    - MAE:  6.81
    - RMSE:	8.58
    - MAPE:	0.06%
    - R^2:	0.91

In [1]:
## Used to re-import custom functions during development
%load_ext autoreload
%autoreload 2

In [2]:
## Enabling access to custom functions in separate directory

# Import necessary modules
import sys
import os

# Construct the absolute path to the 'src' directory
src_path = os.path.abspath(os.path.join('..', 'src'))

# Append the path to 'sys.path'
if src_path not in sys.path:
    sys.path.append(src_path)

import db_utils,eda

In [3]:
## Data Handling
import numpy as np
import pandas as pd


from pykalman import KalmanFilter
from sklearn.metrics import (mean_absolute_error, mean_squared_error,
                            mean_absolute_percentage_error, r2_score)
from sklearn.model_selection import train_test_split

In [4]:
## Settings
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: f'{x:,.2f}')
pd.set_option('display.max_rows', 50)

In [5]:
data_uuid = '../data/data_condensed_with_uuid.parquet'
data_fe_dates = '../data/engineered_data_dates.parquet'

df_data_uuid = pd.read_parquet(data_uuid)
df_data_fe_dates = pd.read_parquet(data_fe_dates)

data = df_data_uuid.merge(right = df_data_fe_dates, how = 'left', on = 'UUID')
data.head()

Unnamed: 0,IsCanceled,LeadTime,ArrivalDateYear,ArrivalDateMonth,ArrivalDateWeekNumber,ArrivalDateDayOfMonth,StaysInWeekendNights,StaysInWeekNights,Adults,Children,Babies,Meal,Country,MarketSegment,DistributionChannel,IsRepeatedGuest,PreviousCancellations,PreviousBookingsNotCanceled,ReservedRoomType,AssignedRoomType,BookingChanges,DepositType,Agent,Company,DaysInWaitingList,CustomerType,ADR,RequiredCarParkingSpaces,TotalOfSpecialRequests,ReservationStatus,ReservationStatusDate_x,HotelNumber,UUID,ReservationStatusDate_y,ArrivalDate,DepartureDate,BookingDate,ArrivalDate_DaysBeforeHoliday,ArrivalDate_DaysAfterHoliday,DepartureDate_DaysBeforeHoliday,DepartureDate_DaysAfterHoliday,BookingDate_DaysBeforeHoliday,BookingDate_DaysAfterHoliday,ArrivalDate_WeekNumber,ArrivalDate_DayOfWeek,DepartureDate_WeekNumber,DepartureDate_DayOfWeek,BookingDate_WeekNumber,BookingDate_DayOfWeek
0,0,342,2015,July,27,1,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,3,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01,1,8ca998d6-fae7-4ee4-a706-3765721aaff5,2015-07-01,2015-07-01,2015-07-01,2014-07-24,45,21,45,21,22,44,27,3,27,3,30,4
1,0,737,2015,July,27,1,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,4,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01,1,e535835e-b19a-4e32-9e9f-6d70a0182d4b,2015-07-01,2015-07-01,2015-07-01,2013-06-24,45,21,45,21,52,14,27,3,27,3,26,1
2,0,7,2015,July,27,1,0,1,1,0.0,0,BB,GBR,Direct,Direct,0,0,0,A,C,0,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02,1,9429383d-0efd-4c37-bb9b-0aaa63d5aade,2015-07-02,2015-07-01,2015-07-02,2015-06-24,45,21,44,22,52,14,27,3,27,4,26,3
3,0,13,2015,July,27,1,0,1,1,0.0,0,BB,GBR,Corporate,Corporate,0,0,0,A,A,0,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02,1,dd6424ee-6838-4007-ad85-de9ff96be14b,2015-07-02,2015-07-01,2015-07-02,2015-06-18,45,21,44,22,58,8,27,3,27,4,25,4
4,0,14,2015,July,27,1,0,2,2,0.0,0,BB,GBR,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03,1,50ff56ee-6a72-40dc-8ff1-4246b831c779,2015-07-03,2015-07-01,2015-07-03,2015-06-17,45,21,43,23,59,7,27,3,27,5,25,3


In [8]:
# Adjusting the approach to manually construct the 'ArrivalDate' string
data['ArrivalDateString'] = data['ArrivalDateYear'].astype(str) + '-' + \
                                  data['ArrivalDateMonth'].astype(str) + '-' + \
                                  data['ArrivalDateDayOfMonth'].astype(str)

# Now, convert the 'ArrivalDateString' to a datetime object
data['ArrivalDate'] = pd.to_datetime(data['ArrivalDateString'])

# Recalculate the daily ADR based on the corrected 'ArrivalDate'
daily_adr_corrected = data.groupby('ArrivalDate')['ADR'].mean().round(2).reset_index()

# Display the first few rows of the corrected aggregated data
daily_adr_corrected.head()

Unnamed: 0,ArrivalDate,ADR
0,2015-07-01,92.83
1,2015-07-02,82.21
2,2015-07-03,97.18
3,2015-07-04,85.58
4,2015-07-05,100.0


In [9]:
# Splitting the dataset
train, test = train_test_split(daily_adr_corrected, test_size=0.25, shuffle=False)

# Initialize the Kalman Filter
kf = KalmanFilter(initial_state_mean=train['ADR'][0], n_dim_obs=1)

# Estimate parameters using the EM algorithm based on observed ADR
kf = kf.em(train['ADR'], n_iter=5)

# Initialize lists to store predictions and predictions' covariance
predicted_adr = []
predicted_adr_covariance = []

# Current state is the last state of the training period
current_state = train['ADR'].iloc[-1]
current_covariance = kf.initial_state_covariance

for index, row in test.iterrows():
    # Predict the next state without observing yet
    next_state_prediction, next_state_covariance = kf.filter_update(
        current_state, current_covariance, observation=None)
    
    # Store the prediction
    predicted_adr.append(next_state_prediction[0][0])
    predicted_adr_covariance.append(next_state_covariance[0][0])
    
    # Update current state with the actual observation (if available)
    current_state, current_covariance = kf.filter_update(
        next_state_prediction, next_state_covariance, observation=row['ADR'])

# Add predictions to the test dataframe for comparison
test['Predicted_ADR'] = predicted_adr

test

Unnamed: 0,ArrivalDate,ADR,Predicted_ADR
594,2017-02-14,83.10,80.55
595,2017-02-15,80.54,81.63
596,2017-02-16,77.85,81.05
597,2017-02-17,77.45,79.26
598,2017-02-18,72.56,78.24
...,...,...,...
788,2017-08-27,153.05,164.06
789,2017-08-28,151.28,157.85
790,2017-08-29,144.66,154.14
791,2017-08-30,150.54,148.80


In [16]:
## Evaluate performance across multiple metrics

mae = mean_absolute_error(test['ADR'], test['Predicted_ADR'])
rmse = np.sqrt(mean_squared_error(test['ADR'], test['Predicted_ADR']))
mape = mean_absolute_percentage_error(test['ADR'], test['Predicted_ADR'])
r_squared = r2_score(test['ADR'], test['Predicted_ADR'])

print(f"MAE:\t{mae:.2f}")
print(f"RMSE:\t{rmse:.2f}")
print(f"MAPE:\t{mape:.2f}%")
print(f"R^2:\t{r_squared:.2f}")

MAE:	6.81
RMSE:	8.58
MAPE:	0.06%
R^2:	0.91


## Kalman Smoother - Future Forecasting

In [34]:
raise Exception('Computationally intensive calculations below. Avoid running!')

Exception: Computationally intensive calculations below. Avoid running!

In [27]:
# Training data
observations_train = train['ADR'].values

# Test data
observations_test = test['ADR'].values

# Initialize the Kalman Filter with parameters estimated from the training set
kf = KalmanFilter(initial_state_mean=observations_train[0], n_dim_obs=1)
kf = kf.em(observations_train, n_iter=5)

# Prepare a list to collect smoothed states iteratively
iterative_smoothed_states = []

# Start with training data as the initial dataset
current_observations = observations_train.tolist()

for new_observation in observations_test:
    # Append each new observation from the test set to the current observations
    current_observations.append(new_observation)
    
    # Apply the smoother to the updated dataset
    smoothed_states, _ = kf.smooth(np.array(current_observations))
    
    # Store the last smoothed state
    iterative_smoothed_states.append(smoothed_states[-1])

# Optionally, convert smoothed states to a numpy array for easier handling
iterative_smoothed_states = np.array(iterative_smoothed_states)

# Example: Print the last few smoothed states
print("Last few smoothed states from the iterative process:")
print(iterative_smoothed_states[-5:])

Last few smoothed states from the iterative process:
[[158.83667099]
 [155.44720618]
 [150.608719  ]
 [150.57789582]
 [150.5967812 ]]


In [18]:
# Assuming observations_test are your actual values for the test set
y_true = observations_test

# The smoothed predictions for the test set
y_pred = iterative_smoothed_states[:, 0]  # Adjust depending on your state vector structure

# Calculate MSE
mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)
mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
r2 = r2_score(y_true, y_pred)

print(f"MSE: {mse:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"MAPE: {mape:.2f}%")
print(f"R2: {r2:.2f}")

MSE: 21.64
RMSE: 4.65
MAPE: 3.20%
R2: 0.97
