# Feature Engineering: Datetime Features

In [1]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell executes, so edits to local helper modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from bmc_functions import db_utils
import holidays
import pandas as pd

# Read Data from DuckDB

In [3]:
# Path to the DuckDB database file
db_path = './data/hotel_reservations.duckdb'

## Preview query: first five rows of the reservations table, all columns
q = 'SELECT * FROM res_data LIMIT 5'

# duckdb_connection comes from the project's db_utils helper and is used as a
# context manager; presumably it closes the connection on exit — TODO confirm.
with db_utils.duckdb_connection(db_path) as conn:
    display(conn.execute(q).df())

Unnamed: 0,IsCanceled,LeadTime,ArrivalDateYear,ArrivalDateMonth,ArrivalDateWeekNumber,ArrivalDateDayOfMonth,StaysInWeekendNights,StaysInWeekNights,Adults,Children,...,Company,DaysInWaitingList,CustomerType,ADR,RequiredCarParkingSpaces,TotalOfSpecialRequests,ReservationStatus,ReservationStatusDate,HotelNumber,UUID
0,0,342,2015,July,27,1,0,0,2,0,...,,0,Transient,0.0,0,0,Check-Out,2015-07-01,1,6f4f201b-62de-4c33-b9da-15081cf7e359
1,0,737,2015,July,27,1,0,0,2,0,...,,0,Transient,0.0,0,0,Check-Out,2015-07-01,1,ad3d542c-41eb-4957-9724-32d3638d6bec
2,0,7,2015,July,27,1,0,1,1,0,...,,0,Transient,75.0,0,0,Check-Out,2015-07-02,1,6a0684e0-9157-4456-9a53-1685d506a951
3,0,13,2015,July,27,1,0,1,1,0,...,,0,Transient,75.0,0,0,Check-Out,2015-07-02,1,512c5081-3be1-4f78-87dc-2b0c8a9e9bde
4,0,14,2015,July,27,1,0,2,2,0,...,,0,Transient,98.0,0,1,Check-Out,2015-07-03,1,444ea515-6e6b-45b4-9870-fcf46b5b529c


In [4]:
## Load the reservation key plus the raw columns used to derive the
## arrival, departure, and booking dates

q = ('''
SELECT uuid, ArrivalDateYear, ArrivalDateMonth, ArrivalDateDayOfMonth,
StaysInWeekNights, StaysInWeekendNights, LeadTime 
FROM res_data''')

with db_utils.duckdb_connection(db_path) as conn:
    df_data = conn.execute(q).df()

df_data.head()

Unnamed: 0,UUID,ArrivalDateYear,ArrivalDateMonth,ArrivalDateDayOfMonth,StaysInWeekNights,StaysInWeekendNights,LeadTime
0,6f4f201b-62de-4c33-b9da-15081cf7e359,2015,July,1,0,0,342
1,ad3d542c-41eb-4957-9724-32d3638d6bec,2015,July,1,0,0,737
2,6a0684e0-9157-4456-9a53-1685d506a951,2015,July,1,1,0,7
3,512c5081-3be1-4f78-87dc-2b0c8a9e9bde,2015,July,1,1,0,13
4,444ea515-6e6b-45b4-9870-fcf46b5b529c,2015,July,1,2,0,14


# Feature Engineering: Arrival, Departure, and Booking Dates

## Arrival Date

In [5]:
## Build a 'YYYY-MonthName-D' string per row (e.g. '2015-July-1'), then parse
## it to datetime with an explicit format

arrival_details = ['ArrivalDateYear', 'ArrivalDateMonth', 'ArrivalDateDayOfMonth']

# Cast the three parts to str so they can be joined with .str.cat
df_data[arrival_details] = df_data[arrival_details].astype(str)

df_data['ArrivalDate'] = df_data['ArrivalDateYear'].str.cat(
    df_data[['ArrivalDateMonth', 'ArrivalDateDayOfMonth']],
    sep='-',
)

# ArrivalDateMonth holds full month names ('July'), so the joined string is
# not ISO formatted. Passing the format explicitly avoids relying on pandas'
# format inference and parses every row with the same rule.
# NOTE(review): %B matches English month names under the default locale —
# confirm if the kernel ever runs with a non-English LC_TIME.
df_data['ArrivalDate'] = pd.to_datetime(df_data['ArrivalDate'], format='%Y-%B-%d')

df_data.head()

Unnamed: 0,UUID,ArrivalDateYear,ArrivalDateMonth,ArrivalDateDayOfMonth,StaysInWeekNights,StaysInWeekendNights,LeadTime,ArrivalDate
0,6f4f201b-62de-4c33-b9da-15081cf7e359,2015,July,1,0,0,342,2015-07-01
1,ad3d542c-41eb-4957-9724-32d3638d6bec,2015,July,1,0,0,737,2015-07-01
2,6a0684e0-9157-4456-9a53-1685d506a951,2015,July,1,1,0,7,2015-07-01
3,512c5081-3be1-4f78-87dc-2b0c8a9e9bde,2015,July,1,1,0,13,2015-07-01
4,444ea515-6e6b-45b4-9870-fcf46b5b529c,2015,July,1,2,0,14,2015-07-01


## Departure Date

In [6]:
# The stay length is split across two columns; turn each into a day-based
# Timedelta so it can be added to the arrival date.
timedelta_wk = pd.to_timedelta(df_data['StaysInWeekNights'], unit='D')
timedelta_wknd = pd.to_timedelta(df_data['StaysInWeekendNights'], unit='D')

# Departure = arrival + week nights + weekend nights
nights_stayed = timedelta_wk + timedelta_wknd
df_data['DepartureDate'] = df_data['ArrivalDate'] + nights_stayed

df_data.head()

Unnamed: 0,UUID,ArrivalDateYear,ArrivalDateMonth,ArrivalDateDayOfMonth,StaysInWeekNights,StaysInWeekendNights,LeadTime,ArrivalDate,DepartureDate
0,6f4f201b-62de-4c33-b9da-15081cf7e359,2015,July,1,0,0,342,2015-07-01,2015-07-01
1,ad3d542c-41eb-4957-9724-32d3638d6bec,2015,July,1,0,0,737,2015-07-01,2015-07-01
2,6a0684e0-9157-4456-9a53-1685d506a951,2015,July,1,1,0,7,2015-07-01,2015-07-02
3,512c5081-3be1-4f78-87dc-2b0c8a9e9bde,2015,July,1,1,0,13,2015-07-01,2015-07-02
4,444ea515-6e6b-45b4-9870-fcf46b5b529c,2015,July,1,2,0,14,2015-07-01,2015-07-03


## Booking Date

In [7]:
# Inspect the raw LeadTime column before it is converted to a Timedelta
df_data['LeadTime']

0         342
1         737
2           7
3          13
4          14
         ... 
119385     23
119386    102
119387     34
119388    109
119389    205
Name: LeadTime, Length: 119390, dtype: int64

In [8]:
# Interpret LeadTime as a count of days and store it as a Timedelta column
df_data['LeadTimeDelta'] = pd.to_timedelta(df_data['LeadTime'], unit = 'D')
df_data['LeadTimeDelta']

0        342 days
1        737 days
2          7 days
3         13 days
4         14 days
           ...   
119385    23 days
119386   102 days
119387    34 days
119388   109 days
119389   205 days
Name: LeadTimeDelta, Length: 119390, dtype: timedelta64[ns]

In [9]:
# Booking date = arrival date minus the lead time
df_data['BookingDate'] = df_data['ArrivalDate'] - df_data['LeadTimeDelta']

df_data.head(10)

Unnamed: 0,UUID,ArrivalDateYear,ArrivalDateMonth,ArrivalDateDayOfMonth,StaysInWeekNights,StaysInWeekendNights,LeadTime,ArrivalDate,DepartureDate,LeadTimeDelta,BookingDate
0,6f4f201b-62de-4c33-b9da-15081cf7e359,2015,July,1,0,0,342,2015-07-01,2015-07-01,342 days,2014-07-24
1,ad3d542c-41eb-4957-9724-32d3638d6bec,2015,July,1,0,0,737,2015-07-01,2015-07-01,737 days,2013-06-24
2,6a0684e0-9157-4456-9a53-1685d506a951,2015,July,1,1,0,7,2015-07-01,2015-07-02,7 days,2015-06-24
3,512c5081-3be1-4f78-87dc-2b0c8a9e9bde,2015,July,1,1,0,13,2015-07-01,2015-07-02,13 days,2015-06-18
4,444ea515-6e6b-45b4-9870-fcf46b5b529c,2015,July,1,2,0,14,2015-07-01,2015-07-03,14 days,2015-06-17
5,5c55ead3-f96e-44b6-b167-f0f0c87e013c,2015,July,1,2,0,14,2015-07-01,2015-07-03,14 days,2015-06-17
6,ba6dccfe-4082-4ce8-86dd-9fc130c84d3c,2015,July,1,2,0,0,2015-07-01,2015-07-03,0 days,2015-07-01
7,ea4d5557-6767-458a-8f3d-929cea401bc5,2015,July,1,2,0,9,2015-07-01,2015-07-03,9 days,2015-06-22
8,d5ef1deb-f28a-45bb-b7bc-aac85dad3425,2015,July,1,3,0,85,2015-07-01,2015-07-04,85 days,2015-04-07
9,f07c2799-6be5-4a8d-93a5-ee099cfa3e3c,2015,July,1,3,0,75,2015-07-01,2015-07-04,75 days,2015-04-17


In [10]:
# The raw component columns and the helper Timedelta column are no longer
# needed once the three date columns exist.
drop_cols = [
    'ArrivalDateYear',
    'ArrivalDateMonth',
    'ArrivalDateDayOfMonth',
    'StaysInWeekNights',
    'StaysInWeekendNights',
    'LeadTime',
    'LeadTimeDelta',
]
df_data = df_data.drop(drop_cols, axis='columns')
df_data.head()

Unnamed: 0,UUID,ArrivalDate,DepartureDate,BookingDate
0,6f4f201b-62de-4c33-b9da-15081cf7e359,2015-07-01,2015-07-01,2014-07-24
1,ad3d542c-41eb-4957-9724-32d3638d6bec,2015-07-01,2015-07-01,2013-06-24
2,6a0684e0-9157-4456-9a53-1685d506a951,2015-07-01,2015-07-02,2015-06-24
3,512c5081-3be1-4f78-87dc-2b0c8a9e9bde,2015-07-01,2015-07-02,2015-06-18
4,444ea515-6e6b-45b4-9870-fcf46b5b529c,2015-07-01,2015-07-03,2015-06-17


In [11]:
# Earliest value in each date column; presumably used to decide how far back
# the holiday calendar has to reach (BookingDate goes back furthest).
df_data[['ArrivalDate', 'DepartureDate', 'BookingDate']].min()

ArrivalDate     2015-07-01
DepartureDate   2015-07-01
BookingDate     2013-06-24
dtype: datetime64[ns]

In [12]:
# Fetch holidays for country code 'PT' for the years 2013-2017. 2013 is
# included because the earliest BookingDate falls in 2013.
# NOTE(review): CountryHoliday is deprecated in newer `holidays` releases in
# favor of holidays.country_holidays — confirm against the installed version.
pt_holidays = holidays.CountryHoliday('PT', years=[2013, 2014, 2015, 2016, 2017])

# Function to calculate the proximity to holidays for a list of dates
def calculate_holiday_proximity(dates, holidays):
    """Return, for each date, the day distance to the surrounding holidays.

    Parameters
    ----------
    dates : iterable
        Objects exposing a ``.date()`` method that returns a
        ``datetime.date`` (e.g. the Timestamps of a datetime column).
    holidays : iterable of datetime.date
        Holiday dates. Only iteration is used, so a dict keyed by date can
        be passed directly (iterating it yields its keys). Note that this
        parameter shadows the ``holidays`` module inside the function.

    Returns
    -------
    tuple of (list, list)
        ``(days_after_recent_holiday, days_before_next_holiday)``, each
        aligned with ``dates``. An entry is ``None`` when no holiday lies
        strictly before (respectively strictly after) the date.

    Notes
    -----
    Comparisons are strict: a date that is itself a holiday is measured
    against the previous and the next *other* holiday, never against itself.
    """
    # Imported locally so the function does not depend on the import cell.
    from bisect import bisect_left, bisect_right

    # Sort once up front; each lookup is then a binary search instead of a
    # full scan of the calendar for every date.
    holiday_dates = sorted(holidays)
    n_holidays = len(holiday_dates)

    days_after_recent_holiday = []
    days_before_next_holiday = []

    for dt in dates:
        date = dt.date()  # Convert Timestamp to datetime.date

        # Everything left of bisect_left is strictly earlier than `date`,
        # so the element just before it is the most recent past holiday.
        prev_idx = bisect_left(holiday_dates, date) - 1
        if prev_idx >= 0:
            days_after = (date - holiday_dates[prev_idx]).days
        else:
            days_after = None

        # Everything from bisect_right onward is strictly later than `date`,
        # so the element at that position is the next upcoming holiday.
        next_idx = bisect_right(holiday_dates, date)
        if next_idx < n_holidays:
            days_before = (holiday_dates[next_idx] - date).days
        else:
            days_before = None

        days_after_recent_holiday.append(days_after)
        days_before_next_holiday.append(days_before)

    return days_after_recent_holiday, days_before_next_holiday

In [13]:
# Add two holiday-distance features for every date column: days until the
# next holiday and days since the most recent one.
date_columns = ['ArrivalDate', 'DepartureDate', 'BookingDate']

for column in date_columns:
    days_after, days_before = calculate_holiday_proximity(df_data[column], pt_holidays)
    df_data[f'{column}_DaysBeforeHoliday'] = days_before
    df_data[f'{column}_DaysAfterHoliday'] = days_after

df_data

Unnamed: 0,UUID,ArrivalDate,DepartureDate,BookingDate,ArrivalDate_DaysBeforeHoliday,ArrivalDate_DaysAfterHoliday,DepartureDate_DaysBeforeHoliday,DepartureDate_DaysAfterHoliday,BookingDate_DaysBeforeHoliday,BookingDate_DaysAfterHoliday
0,6f4f201b-62de-4c33-b9da-15081cf7e359,2015-07-01,2015-07-01,2014-07-24,45,21,45,21,22,44
1,ad3d542c-41eb-4957-9724-32d3638d6bec,2015-07-01,2015-07-01,2013-06-24,45,21,45,21,52,14
2,6a0684e0-9157-4456-9a53-1685d506a951,2015-07-01,2015-07-02,2015-06-24,45,21,44,22,52,14
3,512c5081-3be1-4f78-87dc-2b0c8a9e9bde,2015-07-01,2015-07-02,2015-06-18,45,21,44,22,58,8
4,444ea515-6e6b-45b4-9870-fcf46b5b529c,2015-07-01,2015-07-03,2015-06-17,45,21,43,23,59,7
...,...,...,...,...,...,...,...,...,...,...
119385,11bdfedc-3836-43f6-b052-477b5e09129a,2017-08-30,2017-09-06,2017-08-07,36,15,29,22,8,53
119386,b05f9186-a99c-4929-aedd-ac9589b90a66,2017-08-31,2017-09-07,2017-05-21,35,16,28,23,20,20
119387,1d946cd4-5f2e-4c5b-99b6-745cc33ecc82,2017-08-31,2017-09-07,2017-07-28,35,16,28,23,18,43
119388,11e03b45-b117-4d7f-acec-e2a303b4e573,2017-08-31,2017-09-07,2017-05-14,35,16,28,23,27,13


In [14]:
# Day of week from the .dt accessor (zero-based: Monday=0 ... Sunday=6)
df_data['ArrivalDate'].dt.dayofweek

0         2
1         2
2         2
3         2
4         2
         ..
119385    2
119386    3
119387    3
119388    3
119389    1
Name: ArrivalDate, Length: 119390, dtype: int32

In [15]:
# ISO calendar breakdown with year / week / day columns; the ISO day is
# one-based, so it is one higher than .dt.dayofweek for the same date.
df_data['ArrivalDate'].dt.isocalendar()

Unnamed: 0,year,week,day
0,2015,27,3
1,2015,27,3
2,2015,27,3
3,2015,27,3
4,2015,27,3
...,...,...,...
119385,2017,35,3
119386,2017,35,4
119387,2017,35,4
119388,2017,35,4


In [16]:
# Add the ISO week number and ISO day of week (Monday=1 ... Sunday=7) for
# each date column.
for column in ['ArrivalDate', 'DepartureDate', 'BookingDate']:
    # Build the ISO calendar frame once per column and reuse it for both
    # features instead of recomputing it for each assignment.
    iso_calendar = df_data[column].dt.isocalendar()
    df_data[f'{column}_WeekNumber'] = iso_calendar['week']
    df_data[f'{column}_DayOfWeek'] = iso_calendar['day']

df_data

Unnamed: 0,UUID,ArrivalDate,DepartureDate,BookingDate,ArrivalDate_DaysBeforeHoliday,ArrivalDate_DaysAfterHoliday,DepartureDate_DaysBeforeHoliday,DepartureDate_DaysAfterHoliday,BookingDate_DaysBeforeHoliday,BookingDate_DaysAfterHoliday,ArrivalDate_WeekNumber,ArrivalDate_DayOfWeek,DepartureDate_WeekNumber,DepartureDate_DayOfWeek,BookingDate_WeekNumber,BookingDate_DayOfWeek
0,6f4f201b-62de-4c33-b9da-15081cf7e359,2015-07-01,2015-07-01,2014-07-24,45,21,45,21,22,44,27,3,27,3,30,4
1,ad3d542c-41eb-4957-9724-32d3638d6bec,2015-07-01,2015-07-01,2013-06-24,45,21,45,21,52,14,27,3,27,3,26,1
2,6a0684e0-9157-4456-9a53-1685d506a951,2015-07-01,2015-07-02,2015-06-24,45,21,44,22,52,14,27,3,27,4,26,3
3,512c5081-3be1-4f78-87dc-2b0c8a9e9bde,2015-07-01,2015-07-02,2015-06-18,45,21,44,22,58,8,27,3,27,4,25,4
4,444ea515-6e6b-45b4-9870-fcf46b5b529c,2015-07-01,2015-07-03,2015-06-17,45,21,43,23,59,7,27,3,27,5,25,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119385,11bdfedc-3836-43f6-b052-477b5e09129a,2017-08-30,2017-09-06,2017-08-07,36,15,29,22,8,53,35,3,36,3,32,1
119386,b05f9186-a99c-4929-aedd-ac9589b90a66,2017-08-31,2017-09-07,2017-05-21,35,16,28,23,20,20,35,4,36,4,20,7
119387,1d946cd4-5f2e-4c5b-99b6-745cc33ecc82,2017-08-31,2017-09-07,2017-07-28,35,16,28,23,18,43,35,4,36,4,30,5
119388,11e03b45-b117-4d7f-acec-e2a303b4e573,2017-08-31,2017-09-07,2017-05-14,35,16,28,23,27,13,35,4,36,4,19,7


In [17]:
# Write the engineered frame to a snappy-compressed Parquet file
df_data.to_parquet('./data/engineered_data_dates.parquet', compression = 'snappy')