# Demand Forecast: LSTM Model

In [2]:
# Import necessary libraries
#import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from itertools import product
from skopt import gp_minimize
from skopt.space import Real, Integer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
from datetime import datetime, timedelta
from statsmodels.tsa.stattools import adfuller
import itertools
import warnings
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from scikeras.wrappers import KerasClassifier, KerasRegressor
from sklearn.model_selection import GridSearchCV
from kerastuner.tuners import Hyperband
from tensorflow import keras
from scipy.stats import skewnorm
import re
import json


  from kerastuner.tuners import Hyperband


### scrapping weather data

In [4]:
def read_weather_data():
    try:
        # Open the JSON file
        with open('weather_data.json', 'r') as file:
            # Load data from the JSON file
            weather_data = json.load(file)
        
        # Print the weather data
        for entry in weather_data:
            print(entry)
    except FileNotFoundError:
        print("File not found. Please check the filename and try again.")
    except json.JSONDecodeError:
        print("Error decoding JSON. Please check the JSON file for errors.")

# Run the function
#read_weather_data()


### weather data processing

In [8]:
from datetime import timedelta, date
def clean_text(text):
    """Remove HTML tags and unwanted spaces from text."""
    text = re.sub('<[^<]+?>', '', text)  # Remove HTML tags
    text = text.replace('&nbsp;', ' ')   # Replace non-breaking spaces
    text = re.sub('\s+', ' ', text)      # Replace multiple spaces with a single space
    return text.strip()

def extract_date(time_str, current_year):
    """Extract the date from the time string and append the current year."""
    match = re.search(r'\d{2}:\d{2}(\w{3}, \d{1,2} \w{3})', time_str)
    if match:
        date_str = match.group(1)
        date = datetime.strptime(date_str + f' {current_year}', '%a, %d %b %Y')
        return date.strftime('%Y-%m-%d')
    return None

def read_and_process_data(file_path, starting_date=date.today().strftime('%Y-%m-%d')):
    """Read JSON data from a file and process it into a DataFrame."""
    with open(file_path, 'r') as file:
        weather_data = json.load(file)
    
    cleaned_data = []
    current_date = starting_date
    last_time = None
    for entry in weather_data:
        time = clean_text(entry['time'])
        if "ConditionsComfortPrecipitation" in time:
            continue

        new_date = extract_date(time, current_date[:4])
        if new_date:
            current_date = new_date
            time = re.sub(r'\w{3}, \d{1,2} \w{3}', '', time).strip()

        if last_time and int(last_time[:2]) > int(time[:2]):
            current_date = (datetime.strptime(current_date, '%Y-%m-%d') + timedelta(days=1)).strftime('%Y-%m-%d')
        
        temperature = clean_text(entry['temperature'])
        weather_condition = entry['weatherCondition']
        temperature = int(re.sub('[^\d]', '', temperature))
        
        cleaned_data.append({
            'Date': current_date,
            'Time': time,
            'Temperature': temperature,
            'Weather Condition': weather_condition
        })
        last_time = time
    
    df = pd.DataFrame(cleaned_data)
    # Adding the 'Is Heavy Rainfall' column based on weather conditions
    df['Is Heavy Rainfall'] = df['Weather Condition'].apply(lambda x: 1 if 'heavy' in x.lower() or 'shower' in x.lower() else 0)
    return df

def process_weather_data(file_path, starting_date=date.today().strftime('%Y-%m-%d')):
    """Orchestrate the process of reading, cleaning, and organizing weather data."""
    return read_and_process_data(file_path, starting_date)

# Assuming your JSON file is named 'weather_data.json' in the current directory
weather_df = process_weather_data('weather_data.json')
weather_df


Unnamed: 0,Date,Time,Temperature,Weather Condition,Is Heavy Rainfall
0,2024-04-18,16:00,30,Thundershowers. Overcast.,1
1,2024-04-18,17:00,30,Thundershowers. Overcast.,1
2,2024-04-18,18:00,29,Thundershowers. Overcast.,1
3,2024-04-18,19:00,28,Rain showers. Overcast.,1
4,2024-04-18,20:00,28,Light showers. Overcast.,1
5,2024-04-18,21:00,27,Light showers. Overcast.,1
6,2024-04-18,22:00,27,A few showers. Overcast.,1
7,2024-04-18,23:00,27,Passing showers. Overcast.,1
8,2024-04-18,00:00,27,A few showers. Overcast.,1
9,2024-04-18,01:00,27,Light showers. Overcast.,1


### prepare sg public holiday dataset in 2024

In [11]:
def generate_holiday_df(year):
    import datetime  # Importing datetime module within the function scope
    start_date = datetime.date(year, 1, 1)
    end_date = datetime.date(year, 12, 31)

    # List of holidays in Singapore for the given year
    holidays_sg = [
        datetime.date(year, 1, 1), datetime.date(year, 2, 10), datetime.date(year, 2, 11),
        datetime.date(year, 3, 29), datetime.date(year, 4, 10), datetime.date(year, 5, 1),
        datetime.date(year, 5, 20), datetime.date(year, 6, 17), datetime.date(year, 8, 9),
        datetime.date(year, 10, 31), datetime.date(year, 12, 25)
    ]

    # Dictionary to convert day names to numbers
    day_to_number = {
        'Monday': 1,
        'Tuesday': 2,
        'Wednesday': 3,
        'Thursday': 4,
        'Friday': 5,
        'Saturday': 6,
        'Sunday': 7
    }

    holiday_sg_24 = []
    current_date = start_date
    while current_date <= end_date:
        day_of_week = current_date.strftime('%A')
        holiday_flg_sg = 1 if current_date in holidays_sg else 0
        holiday_sg_24.append([current_date.strftime('%Y-%m-%d'), day_of_week, holiday_flg_sg])
        current_date += datetime.timedelta(days=1)

    # Create DataFrame
    holiday_sg_24 = pd.DataFrame(holiday_sg_24, columns=['calendar_date', 'day_of_week', 'holiday_flg_sg'])

    # Map day_of_week from name to number
    holiday_sg_24['day_of_week'] = holiday_sg_24['day_of_week'].map(day_to_number)

    return holiday_sg_24

holiday_sg_24 = generate_holiday_df(2024)

### LSTM Model Training

In [16]:
# Load visitor data
visitor_data = pd.read_csv("../data/raw/synthetic_visit_data.csv")
visitor_data['visit_date'] = pd.to_datetime(visitor_data['visit_date'])
#visitor_data.head()

weather_data = pd.read_csv("../data/processed/weather_data_cleaned.csv")
weather_data['Date'] = pd.to_datetime(weather_data['Date'])
columns_to_drop = weather_data.columns[0:3].tolist()  # Dropping columns by indices
weather_data.drop(columns=columns_to_drop, inplace=True)
#weather_data.head()

holiday_data = pd.read_csv("../data/raw/date_info_2324.csv")
holiday_data['calendar_date'] = pd.to_datetime(holiday_data['calendar_date'])
holiday_data.rename(columns={'calendar_date': 'calender_date'}, inplace=True)
holiday_data.drop(columns="day_of_week", inplace=True)
#holiday_data.head()

merged_data = pd.merge(visitor_data, weather_data, left_on='visit_date', right_on='Date', how='left')
merged_data = pd.merge(merged_data, holiday_data, left_on='visit_date', right_on='calender_date', how='left')
merged_data.drop(columns=["Date","calender_date","Highest 30 min Rainfall (mm)","Highest 60 min Rainfall (mm)","Highest 120 min Rainfall (mm)","Maximum Temperature (°C)","Minimum Temperature (°C)","Mean Wind Speed (km/h)","Max Wind Speed (km/h)"], inplace=True)
merged_data.head()

# 1. Encoding Categorical Variables
df = pd.get_dummies(merged_data, columns=['day_of_week'])

# 2. Creating a binary feature for heavy rainfall
merged_data['heavy_rainfall_flg'] = (merged_data['Daily Rainfall Total (mm)'] > 20).astype(int)

# 3. Categorizing temperature
merged_data['temperature_category'] = pd.cut(df['Mean Temperature (°C)'],
                                    bins=[-np.inf, 15, 25, np.inf],
                                    labels=['cold', 'mild', 'hot'])

# 4. Consolidating Holiday Flags
merged_data['is_holiday'] = merged_data[['holiday_flg_sg', 'holiday_flg_cn', 'holiday_flg_in']].max(axis=1)

# 5. Adding week of the year
merged_data['week_of_year'] = pd.to_datetime(merged_data['visit_date']).dt.isocalendar().week

# It's essential to transform these new categorical variables into a form suitable for modeling
merged_data = pd.get_dummies(merged_data, columns=['temperature_category'])

merged_data.drop(columns=["Daily Rainfall Total (mm)","Mean Temperature (°C)", "holiday_flg_sg", "holiday_flg_cn", "holiday_flg_in"], inplace = True)
merged_data.head()

# Split the data into training and testing sets
train_percent = 0.8  # Use 80% of the data for training
split_index = int(len(merged_data) * train_percent)
train_data = visitor_data.iloc[:split_index]
test_data = visitor_data.iloc[split_index:]

# Convert the index (visit_date) to datetime
merged_data.index = pd.to_datetime(merged_data.index)

# Create lag features for visitors
for lag in range(1, 8):  # 7 days lag
    merged_data[f'visitors_lag_{lag}'] = merged_data['visitors'].shift(lag)

# Create rolling window features (7-day rolling mean and std deviation)
merged_data['rolling_mean_visitors'] = merged_data['visitors'].rolling(window=7).mean().shift(1)
merged_data['rolling_std_visitors'] = merged_data['visitors'].rolling(window=7).std().shift(1)

# Fill any NaN values that have been introduced by lag/rolling features
merged_data.fillna(method='bfill', inplace=True)

merged_data['year'] = merged_data['visit_date'].dt.year
merged_data['month'] = merged_data['visit_date'].dt.month
merged_data['day'] = merged_data['visit_date'].dt.day
lstm_data = merged_data.drop(['visit_date','day_of_week'], axis=1, inplace=False)
X = lstm_data.drop(['visitors'], axis=1)
y = lstm_data['visitors']
scaler = MinMaxScaler(feature_range=(0, 1))
X_scaled = scaler.fit_transform(X)
y_scaled = scaler.fit_transform(y.values.reshape(-1,1))
X_scaled = np.reshape(X_scaled, (X_scaled.shape[0], 1, X_scaled.shape[1]))

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size=0.2, random_state=42)

# Select features and target
X = lstm_data.drop(['visitors'], axis=1)
y = lstm_data['visitors']

# Scale features
scaler = MinMaxScaler(feature_range=(0, 1))
X_scaled = scaler.fit_transform(X)
y_scaled = scaler.fit_transform(y.values.reshape(-1,1))

# Reshape for LSTM [samples, time steps, features]
X_scaled = np.reshape(X_scaled, (X_scaled.shape[0], 1, X_scaled.shape[1]))

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size=0.2, random_state=42)

# Build the LSTM model with dropout
model = Sequential()
model.add(LSTM(100, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(0.2))
model.add(LSTM(50, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(1))

model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
history = model.fit(X_train, y_train, epochs=100, batch_size=64, validation_split=0.1, verbose=2)

''''
# Plot training history
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='validation')
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend()
plt.show()
'''

# Predict and inverse transform predictions
predictions = model.predict(X_test)
predictions_inv = scaler.inverse_transform(predictions)

# Inversely transform y_test for comparison
y_test_inv = scaler.inverse_transform(y_test)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test_inv, predictions_inv))
print(f'Test RMSE: {rmse:.3f}')

'''
# Plot predictions against actual values
plt.figure(figsize=(10,6))
plt.plot(y_test_inv, label='Actual')
plt.plot(predictions_inv, label='Predicted')
plt.legend()
plt.show()
'''

def build_model(hp):
    model = Sequential()
    model.add(LSTM(units=hp.Int('units1', min_value=50, max_value=200, step=50),
                   return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
    model.add(Dropout(rate=hp.Float('dropout1', min_value=0.1, max_value=0.5, step=0.1)))
    model.add(LSTM(units=hp.Int('units2', min_value=20, max_value=100, step=20), return_sequences=False))
    model.add(Dropout(rate=hp.Float('dropout2', min_value=0.1, max_value=0.5, step=0.1)))
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

tuner = Hyperband(
    build_model,
    objective='val_loss',
    max_epochs=50,
    factor=3,
    directory='my_dir',
    project_name='intro_to_kt'
)

tuner.search(X_train, y_train, epochs=50, validation_split=0.2, callbacks=[keras.callbacks.EarlyStopping(patience=10)])

best_model = tuner.get_best_models(num_models=1)[0]
loss = best_model.evaluate(X_test, y_test)

2024-04-18 15:09:00.659039: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2024-04-18 15:09:00.659171: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-04-18 15:09:00.659191: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-04-18 15:09:00.659225: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-04-18 15:09:00.659257: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
  super().__init__(**kwargs)


Epoch 1/100


2024-04-18 15:09:01.385331: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


5/5 - 2s - 469ms/step - loss: 0.2130 - val_loss: 0.2179
Epoch 2/100
5/5 - 0s - 42ms/step - loss: 0.1594 - val_loss: 0.1543
Epoch 3/100
5/5 - 0s - 35ms/step - loss: 0.1104 - val_loss: 0.0964
Epoch 4/100
5/5 - 0s - 36ms/step - loss: 0.0756 - val_loss: 0.0576
Epoch 5/100
5/5 - 0s - 36ms/step - loss: 0.0655 - val_loss: 0.0492
Epoch 6/100
5/5 - 0s - 37ms/step - loss: 0.0683 - val_loss: 0.0477
Epoch 7/100
5/5 - 0s - 33ms/step - loss: 0.0632 - val_loss: 0.0478
Epoch 8/100
5/5 - 0s - 34ms/step - loss: 0.0556 - val_loss: 0.0517
Epoch 9/100
5/5 - 0s - 36ms/step - loss: 0.0557 - val_loss: 0.0507
Epoch 10/100
5/5 - 0s - 36ms/step - loss: 0.0528 - val_loss: 0.0476
Epoch 11/100
5/5 - 0s - 34ms/step - loss: 0.0491 - val_loss: 0.0426
Epoch 12/100
5/5 - 0s - 34ms/step - loss: 0.0450 - val_loss: 0.0403
Epoch 13/100
5/5 - 0s - 35ms/step - loss: 0.0400 - val_loss: 0.0393
Epoch 14/100
5/5 - 0s - 34ms/step - loss: 0.0371 - val_loss: 0.0386
Epoch 15/100
5/5 - 0s - 33ms/step - loss: 0.0358 - val_loss: 0.0383


  super().__init__(**kwargs)
  model.build_from_config(
  trackable.load_own_variables(weights_store.get(inner_path))


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step - loss: 0.0249 


### prediction dataframe

In [26]:
def prediction_data(visitor_data, weather_df, holiday_sg_24):
    visitor_data['visit_date'] = pd.to_datetime(visitor_data['visit_date'])
    weather_df['Date'] = pd.to_datetime(weather_df['Date'])
    holiday_sg_24['calendar_date'] = pd.to_datetime(holiday_sg_24['calendar_date'])
    prediction_df = pd.merge(visitor_data, weather_df, left_on='visit_date', right_on='Date', how='right')
    prediction_df = pd.merge(prediction_df, holiday_sg_24, left_on='Date', right_on='calendar_date', how='left')
    prediction_df['day_of_week'] = prediction_df['Date'].dt.day_name()
    prediction_df['week_of_year'] = pd.to_datetime(prediction_df['Date']).dt.isocalendar().week
    prediction_df.drop_duplicates(inplace=True)
    prediction_df['is_heavy_rainfall_flg'] = prediction_df['Weather Condition'].apply(lambda x: 1 if 'shower' in x else 0)
    prediction_df.drop(columns=['visit_date', 'Weather Condition','Time','day_of_week_y', 'calendar_date','day_of_week_x'], inplace=True)
    prediction_df.rename(columns={'Date': 'visit_date', 'holiday_flg_sg': 'is_holiday'}, inplace=True)
    prediction_df['temperature_category'] = pd.cut(prediction_df['Temperature'],
                                        bins=[-np.inf, 15, 25, np.inf],
                                        labels=['cold', 'mild', 'hot'])
    prediction_df.drop(columns='Temperature',inplace=True)
    prediction_df = pd.get_dummies(prediction_df, columns=['temperature_category'])
    
    # Create lag features based on your model needs
    for lag in range(1, 8):  # Example: Creating 7 days of lag features
        prediction_df[f'visitors_lag_{lag}'] = prediction_df['visitors'].shift(lag)

    # Rolling window features (mean and std deviation)
    prediction_df['rolling_mean_visitors'] = prediction_df['visitors'].rolling(window=7).mean().shift(1)
    prediction_df['rolling_std_visitors'] = prediction_df['visitors'].rolling(window=7).std().shift(1)

    # Fill any NaN values that have been introduced by lag/rolling features
    prediction_df.fillna(method='bfill', inplace=True)

    # Adding date parts
    prediction_df['year'] = prediction_df['visit_date'].dt.year
    prediction_df['month'] = prediction_df['visit_date'].dt.month
    prediction_df['day'] = prediction_df['visit_date'].dt.day

    # Prepare feature matrix X for the LSTM model
    X = prediction_df.drop(['visit_date', 'day_of_week'], axis=1)
    X_scaled = MinMaxScaler(feature_range=(0, 1)).fit_transform(X)
    X_scaled = np.reshape(X_scaled, (X_scaled.shape[0], 1, X_scaled.shape[1]))

    return X_scaled

weather_df = process_weather_data('weather_data.json')
holiday_sg_24 = generate_holiday_df(2024)
visitor_data = pd.read_csv("../data/raw/synthetic_visit_data.csv")
X_predict = prediction_data(visitor_data, weather_df, holiday_sg_24)
X_predict

  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))


array([[[nan,  1.,  0.,  0.,  1.,  0.,  0.,  0., nan, nan, nan, nan,
         nan, nan, nan, nan, nan,  0.,  0.,  0.]],

       [[nan,  1.,  0.,  0.,  1.,  0.,  0.,  0., nan, nan, nan, nan,
         nan, nan, nan, nan, nan,  0.,  0.,  0.]],

       [[nan,  1.,  0.,  0.,  1.,  0.,  0.,  0., nan, nan, nan, nan,
         nan, nan, nan, nan, nan,  0.,  0.,  0.]],

       [[nan,  1.,  0.,  0.,  1.,  0.,  0.,  0., nan, nan, nan, nan,
         nan, nan, nan, nan, nan,  0.,  0.,  0.]],

       [[nan,  1.,  0.,  0.,  1.,  0.,  0.,  0., nan, nan, nan, nan,
         nan, nan, nan, nan, nan,  0.,  0.,  0.]],

       [[nan,  1.,  0.,  0.,  1.,  0.,  0.,  0., nan, nan, nan, nan,
         nan, nan, nan, nan, nan,  0.,  0.,  0.]],

       [[nan,  1.,  0.,  0.,  1.,  0.,  0.,  0., nan, nan, nan, nan,
         nan, nan, nan, nan, nan,  0.,  0.,  0.]],

       [[nan,  1.,  0.,  0.,  1.,  0.,  0.,  0., nan, nan, nan, nan,
         nan, nan, nan, nan, nan,  0.,  0.,  0.]],

       [[nan,  1.,  0.,  0.,  1.

In [27]:
predictions = best_model.predict(X_predict)
#predictions = best_model.predict(X_predict) 
'用X_predict的时候kernal crash了。目前用X_test先跑'

predictions_inv = scaler.inverse_transform(predictions)

# Create a DataFrame for predictions 7 days from today
future_dates = pd.date_range(start=datetime.today() + timedelta(days=1), periods=7, freq='D')
future_predictions = pd.DataFrame({
    'date': future_dates,
    'predicted_visitors': np.round(predictions_inv.flatten()[:7])  # Assuming you have at least 7 predictions
})

# Function to simulate hourly arrival based on daily visitor prediction
def simulate_hourly_arrival(predicted_daily_visitors):
    time_intervals_1 = np.linspace(8, 16, 9)  # From 8 AM to 4 PM (9 hours)
    time_intervals_2 = np.linspace(17, 23, 7)  # From 5 PM to 11 PM (7 hours)
    time_intervals_strings = [f'{int(hour)}:00' for hour in np.concatenate((time_intervals_1, time_intervals_2), axis=None)]
    
    visitor_counts_1 = skewnorm.pdf(time_intervals_1, a=2, loc=12)  # Adjust skew and location as needed
    visitor_counts_2 = skewnorm.pdf(time_intervals_2, a=5, loc=19)
    visitor_counts_1 = visitor_counts_1 / visitor_counts_1.max()  # Normalize to [0, 1]
    visitor_counts_2 = visitor_counts_2 / visitor_counts_2.max()
    
    visitor_counts_1 = np.round(visitor_counts_1 * predicted_daily_visitors * 0.6 / visitor_counts_1.sum()).astype(int)  # Adjust ratio as needed
    visitor_counts_2 = np.round(visitor_counts_2 * predicted_daily_visitors * 0.4 / visitor_counts_2.sum()).astype(int)
    
    hourly_counts = np.concatenate((visitor_counts_1, visitor_counts_2), axis=None)
    return time_intervals_strings, hourly_counts

def generate_and_merge_hourly_data(future_predictions, holiday_sg_24):
    # Empty DataFrame to store the results
    output_df = pd.DataFrame(columns=['date', 'time', 'estimated_arrival_count'])

    # Generate hourly data for each predicted day
    for idx, row in future_predictions.iterrows():
        time_strings, hourly_visitors = simulate_hourly_arrival(row['predicted_visitors'])
        day_df = pd.DataFrame({
            'date': [row['date']] * len(time_strings),
            'time': time_strings,
            'estimated_arrival_count': hourly_visitors
        })
        output_df = pd.concat([output_df, day_df], ignore_index=True)

    # Ensure the 'date' columns are in datetime format
    output_df['date'] = pd.to_datetime(output_df['date']).dt.date
    holiday_sg_24['calendar_date'] = pd.to_datetime(holiday_sg_24['calendar_date']).dt.date

    # Merge output_df with holiday_sg_24 on date
    merged_df = pd.merge(output_df, holiday_sg_24, left_on='date', right_on='calendar_date', how='left')

    # Drop the extra date columns if no longer needed
    merged_df.drop(columns=['calendar_date'], inplace=True)

    return merged_df

result_df = generate_and_merge_hourly_data(future_predictions, holiday_sg_24)
result_df

: 