# Dataset Explosion

---

The hotel data is originally in a form such that each record represents a reservation in it entirety.

To gain more insight into the impact of temporal features (e.g., month, week, day of week, day of year), I will "explode" the data to expand each reservation into a separate row for each night of the guest's stay.

---

# Import Packages and Load Data

In [2]:
import numpy as np
import pandas as pd

In [3]:
# data = pd.read_parquet('../../data/3.2_data_with_occupancies.parquet')
data = pd.read_parquet('../../data/source/full_data.parquet')
data

Unnamed: 0,IsCanceled,LeadTime,ArrivalDateYear,ArrivalDateMonth,ArrivalDateWeekNumber,ArrivalDateDayOfMonth,StaysInWeekendNights,StaysInWeekNights,Adults,Children,...,Company,DaysInWaitingList,CustomerType,ADR,RequiredCarParkingSpaces,TotalOfSpecialRequests,ReservationStatus,ReservationStatusDate,HotelNumber,UUID
0,0,342,2015,July,27,1,0,0,2,0.0,...,,0,Transient,0.00,0,0,Check-Out,2015-07-01,H1,9af79666-f290-45c5-868c-2f9601b8f98b
1,0,737,2015,July,27,1,0,0,2,0.0,...,,0,Transient,0.00,0,0,Check-Out,2015-07-01,H1,81440274-e84e-4502-89f3-e01681d0672a
2,0,7,2015,July,27,1,0,1,1,0.0,...,,0,Transient,75.00,0,0,Check-Out,2015-07-02,H1,60fe936c-f7ba-48d9-ac73-71c21e1b3978
3,0,13,2015,July,27,1,0,1,1,0.0,...,,0,Transient,75.00,0,0,Check-Out,2015-07-02,H1,5b2aae61-1d0c-4314-b4c1-603595e43163
4,0,14,2015,July,27,1,0,2,2,0.0,...,,0,Transient,98.00,0,1,Check-Out,2015-07-03,H1,e92881a3-faf8-402b-beff-64dad4707236
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119385,0,23,2017,August,35,30,2,5,2,0.0,...,,0,Transient,96.14,0,0,Check-Out,2017-09-06,H2,834c6a25-b7c4-4170-b8eb-3a00d830e397
119386,0,102,2017,August,35,31,2,5,3,0.0,...,,0,Transient,225.43,0,2,Check-Out,2017-09-07,H2,1c6d7a7a-2ffe-4d85-b44a-717666d8d7cc
119387,0,34,2017,August,35,31,2,5,2,0.0,...,,0,Transient,157.71,0,4,Check-Out,2017-09-07,H2,12af19d6-1594-4eef-bba8-643bc102cfea
119388,0,109,2017,August,35,31,2,5,2,0.0,...,,0,Transient,104.40,0,0,Check-Out,2017-09-07,H2,1e62e04f-8fd3-4304-818c-07b37c4208e0


# Create ArrivalDate Column Using Existing Features

In [4]:
arrival_date_cols = ['ArrivalDateYear', 'ArrivalDateMonth',	'ArrivalDateDayOfMonth']

data[arrival_date_cols]

Unnamed: 0,ArrivalDateYear,ArrivalDateMonth,ArrivalDateDayOfMonth
0,2015,July,1
1,2015,July,1
2,2015,July,1
3,2015,July,1
4,2015,July,1
...,...,...,...
119385,2017,August,30
119386,2017,August,31
119387,2017,August,31
119388,2017,August,31


In [5]:
## Combine the columns into a single datetime column
data['ArrivalDate'] = pd.to_datetime(
                        (data[arrival_date_cols]
                         .astype(str)
                         .agg('-'.join,
                              axis=1)))
data['ArrivalDate']

0        2015-07-01
1        2015-07-01
2        2015-07-01
3        2015-07-01
4        2015-07-01
            ...    
119385   2017-08-30
119386   2017-08-31
119387   2017-08-31
119388   2017-08-31
119389   2017-08-29
Name: ArrivalDate, Length: 119390, dtype: datetime64[ns]

# Calculate Length of Stay (LoS)

## As Numeric

In [6]:
data['LoS_Numeric'] = data[['StaysInWeekendNights','StaysInWeekNights']].sum(axis = 1)
data['LoS_Numeric']

0         0
1         0
2         1
3         1
4         2
         ..
119385    7
119386    7
119387    7
119388    7
119389    9
Name: LoS_Numeric, Length: 119390, dtype: int64

## As TimeDelta

In [7]:
data['LoS_Days'] = pd.to_timedelta(data['LoS_Numeric'], unit='D')
data['LoS_Days']

0        0 days
1        0 days
2        1 days
3        1 days
4        2 days
          ...  
119385   7 days
119386   7 days
119387   7 days
119388   7 days
119389   9 days
Name: LoS_Days, Length: 119390, dtype: timedelta64[ns]

# Calculate Departure Date

In [8]:
# Calculate DepartureDate
data['DepartureDate'] = data['ArrivalDate'] + data['LoS_Days']
data['DepartureDate']

0        2015-07-01
1        2015-07-01
2        2015-07-02
3        2015-07-02
4        2015-07-03
            ...    
119385   2017-09-06
119386   2017-09-07
119387   2017-09-07
119388   2017-09-07
119389   2017-09-07
Name: DepartureDate, Length: 119390, dtype: datetime64[ns]

# Drop Unnecessary Columns

In [9]:
drop_cols = ['ArrivalDateYear','ArrivalDateMonth','ArrivalDateWeekNumber', 'ArrivalDateDayOfMonth','LoS_Days']

data = data.drop(columns=drop_cols)
data

Unnamed: 0,IsCanceled,LeadTime,StaysInWeekendNights,StaysInWeekNights,Adults,Children,Babies,Meal,Country,MarketSegment,...,ADR,RequiredCarParkingSpaces,TotalOfSpecialRequests,ReservationStatus,ReservationStatusDate,HotelNumber,UUID,ArrivalDate,LoS_Numeric,DepartureDate
0,0,342,0,0,2,0.0,0,BB,PRT,Direct,...,0.00,0,0,Check-Out,2015-07-01,H1,9af79666-f290-45c5-868c-2f9601b8f98b,2015-07-01,0,2015-07-01
1,0,737,0,0,2,0.0,0,BB,PRT,Direct,...,0.00,0,0,Check-Out,2015-07-01,H1,81440274-e84e-4502-89f3-e01681d0672a,2015-07-01,0,2015-07-01
2,0,7,0,1,1,0.0,0,BB,GBR,Direct,...,75.00,0,0,Check-Out,2015-07-02,H1,60fe936c-f7ba-48d9-ac73-71c21e1b3978,2015-07-01,1,2015-07-02
3,0,13,0,1,1,0.0,0,BB,GBR,Corporate,...,75.00,0,0,Check-Out,2015-07-02,H1,5b2aae61-1d0c-4314-b4c1-603595e43163,2015-07-01,1,2015-07-02
4,0,14,0,2,2,0.0,0,BB,GBR,Online TA,...,98.00,0,1,Check-Out,2015-07-03,H1,e92881a3-faf8-402b-beff-64dad4707236,2015-07-01,2,2015-07-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119385,0,23,2,5,2,0.0,0,BB,BEL,Offline TA/TO,...,96.14,0,0,Check-Out,2017-09-06,H2,834c6a25-b7c4-4170-b8eb-3a00d830e397,2017-08-30,7,2017-09-06
119386,0,102,2,5,3,0.0,0,BB,FRA,Online TA,...,225.43,0,2,Check-Out,2017-09-07,H2,1c6d7a7a-2ffe-4d85-b44a-717666d8d7cc,2017-08-31,7,2017-09-07
119387,0,34,2,5,2,0.0,0,BB,DEU,Online TA,...,157.71,0,4,Check-Out,2017-09-07,H2,12af19d6-1594-4eef-bba8-643bc102cfea,2017-08-31,7,2017-09-07
119388,0,109,2,5,2,0.0,0,BB,GBR,Online TA,...,104.40,0,0,Check-Out,2017-09-07,H2,1e62e04f-8fd3-4304-818c-07b37c4208e0,2017-08-31,7,2017-09-07


In [10]:
data

Unnamed: 0,IsCanceled,LeadTime,StaysInWeekendNights,StaysInWeekNights,Adults,Children,Babies,Meal,Country,MarketSegment,...,ADR,RequiredCarParkingSpaces,TotalOfSpecialRequests,ReservationStatus,ReservationStatusDate,HotelNumber,UUID,ArrivalDate,LoS_Numeric,DepartureDate
0,0,342,0,0,2,0.0,0,BB,PRT,Direct,...,0.00,0,0,Check-Out,2015-07-01,H1,9af79666-f290-45c5-868c-2f9601b8f98b,2015-07-01,0,2015-07-01
1,0,737,0,0,2,0.0,0,BB,PRT,Direct,...,0.00,0,0,Check-Out,2015-07-01,H1,81440274-e84e-4502-89f3-e01681d0672a,2015-07-01,0,2015-07-01
2,0,7,0,1,1,0.0,0,BB,GBR,Direct,...,75.00,0,0,Check-Out,2015-07-02,H1,60fe936c-f7ba-48d9-ac73-71c21e1b3978,2015-07-01,1,2015-07-02
3,0,13,0,1,1,0.0,0,BB,GBR,Corporate,...,75.00,0,0,Check-Out,2015-07-02,H1,5b2aae61-1d0c-4314-b4c1-603595e43163,2015-07-01,1,2015-07-02
4,0,14,0,2,2,0.0,0,BB,GBR,Online TA,...,98.00,0,1,Check-Out,2015-07-03,H1,e92881a3-faf8-402b-beff-64dad4707236,2015-07-01,2,2015-07-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119385,0,23,2,5,2,0.0,0,BB,BEL,Offline TA/TO,...,96.14,0,0,Check-Out,2017-09-06,H2,834c6a25-b7c4-4170-b8eb-3a00d830e397,2017-08-30,7,2017-09-06
119386,0,102,2,5,3,0.0,0,BB,FRA,Online TA,...,225.43,0,2,Check-Out,2017-09-07,H2,1c6d7a7a-2ffe-4d85-b44a-717666d8d7cc,2017-08-31,7,2017-09-07
119387,0,34,2,5,2,0.0,0,BB,DEU,Online TA,...,157.71,0,4,Check-Out,2017-09-07,H2,12af19d6-1594-4eef-bba8-643bc102cfea,2017-08-31,7,2017-09-07
119388,0,109,2,5,2,0.0,0,BB,GBR,Online TA,...,104.40,0,0,Check-Out,2017-09-07,H2,1e62e04f-8fd3-4304-818c-07b37c4208e0,2017-08-31,7,2017-09-07


In [11]:
## Ensure ArrivalDate and DepartureDate are in datetime format
data['ArrivalDate'] = pd.to_datetime(data['ArrivalDate'])
data['DepartureDate'] = pd.to_datetime(data['DepartureDate'])

## Create a date range for each row
data['DateRange'] = data.apply(lambda row: pd.date_range(row['ArrivalDate'],
                                                         row['DepartureDate']), 
                               axis=1)
## Explode the DataFrame
exploded_data = data.explode('DateRange')

## Rename the exploded column to 'Date'
exploded_data = exploded_data.rename(columns={'DateRange': 'Date'})

## Drop the original ArrivalDate and DepartureDate columns if no longer needed
exploded_data = exploded_data.drop(columns=['ArrivalDate', 'DepartureDate'])

## Reset the index
exploded_data = exploded_data.reset_index(drop=True)

exploded_data

Unnamed: 0,IsCanceled,LeadTime,StaysInWeekendNights,StaysInWeekNights,Adults,Children,Babies,Meal,Country,MarketSegment,...,CustomerType,ADR,RequiredCarParkingSpaces,TotalOfSpecialRequests,ReservationStatus,ReservationStatusDate,HotelNumber,UUID,LoS_Numeric,Date
0,0,342,0,0,2,0.0,0,BB,PRT,Direct,...,Transient,0.0,0,0,Check-Out,2015-07-01,H1,9af79666-f290-45c5-868c-2f9601b8f98b,0,2015-07-01
1,0,737,0,0,2,0.0,0,BB,PRT,Direct,...,Transient,0.0,0,0,Check-Out,2015-07-01,H1,81440274-e84e-4502-89f3-e01681d0672a,0,2015-07-01
2,0,7,0,1,1,0.0,0,BB,GBR,Direct,...,Transient,75.0,0,0,Check-Out,2015-07-02,H1,60fe936c-f7ba-48d9-ac73-71c21e1b3978,1,2015-07-01
3,0,7,0,1,1,0.0,0,BB,GBR,Direct,...,Transient,75.0,0,0,Check-Out,2015-07-02,H1,60fe936c-f7ba-48d9-ac73-71c21e1b3978,1,2015-07-02
4,0,13,0,1,1,0.0,0,BB,GBR,Corporate,...,Transient,75.0,0,0,Check-Out,2015-07-02,H1,5b2aae61-1d0c-4314-b4c1-603595e43163,1,2015-07-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
528642,0,205,2,7,2,0.0,0,HB,DEU,Online TA,...,Transient,151.2,0,2,Check-Out,2017-09-07,H2,732b34e2-bbcf-4a43-b63b-73839173be8e,9,2017-09-03
528643,0,205,2,7,2,0.0,0,HB,DEU,Online TA,...,Transient,151.2,0,2,Check-Out,2017-09-07,H2,732b34e2-bbcf-4a43-b63b-73839173be8e,9,2017-09-04
528644,0,205,2,7,2,0.0,0,HB,DEU,Online TA,...,Transient,151.2,0,2,Check-Out,2017-09-07,H2,732b34e2-bbcf-4a43-b63b-73839173be8e,9,2017-09-05
528645,0,205,2,7,2,0.0,0,HB,DEU,Online TA,...,Transient,151.2,0,2,Check-Out,2017-09-07,H2,732b34e2-bbcf-4a43-b63b-73839173be8e,9,2017-09-06


# Save Results

In [13]:
exploded_data.to_parquet('../../data/5.1_dataset_exploded.parquet', compression = 'zstd')