# Calculating Hotel Occupancy

---

> Hotel occupancy is a critical factor during the booking process and can provide additional insight into the likelihood of cancellations and/or forecasting future ADR.
> 
> However, *there's no clear indication of the total number of guest rooms for either hotel.*
>
> 
> I will determine the maximum number of rooms occupied for each date for each hotel, which can be used as a placeholder max occupancy number.

---

# Import Packages and Read Data

In [18]:
## Used to upload 
%load_ext autoreload
%autoreload 2

In [19]:
## Enabling access to custom functions in separate directory

# # Import necessary modules
# import os
# import sys

# # Construct the absolute path to the 'src' directory
# src_path = os.path.abspath(os.path.join('../..', 'src'))

# # Append the path to 'sys.path'
# if src_path not in sys.path:
#     sys.path.append(src_path)

# import db_utils, eda

## Data Handling
import datetime as dt
import pandas as pd
import numpy as np
import sweetviz as sv

## Settings
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: f'{x:,.2f}')
pd.set_option('display.max_rows', 50)
%matplotlib inline

## Load Pre-Reviewed Data

In [26]:
data_path = '../../data/2.2_temporally_updated_data.feather'
df_data = pd.read_feather(data_path)
df_data

Unnamed: 0_level_0,IsCanceled,Adults,Children,Babies,Meal,Country,MarketSegment,DistributionChannel,IsRepeatedGuest,PreviousCancellations,PreviousBookingsNotCanceled,ReservedRoomType,AssignedRoomType,BookingChanges,DepositType,Agent,Company,DaysInWaitingList,CustomerType,ADR,RequiredCarParkingSpaces,TotalOfSpecialRequests,ReservationStatus,HotelNumber,ArrivalDate,DepartureDate,Length of Stay,BookingDate,DaysSinceBooking,ArrivalDate_DaysBeforeHoliday,ArrivalDate_DaysAfterHoliday,DepartureDate_DaysBeforeHoliday,DepartureDate_DaysAfterHoliday,BookingDate_DaysBeforeHoliday,BookingDate_DaysAfterHoliday,ArrivalWeek,ArrivalDay,ArrivalDateDayName,ArrivalDateMonthName,ADR_lag_1,ADR_lag_7,ADR_7d_avg,ADR_30d_avg,ADR_7d_std,ADR_30d_std,ADR_ewm_3,ADR_ewm_7
UUID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1
b6bcab0c-31d0-44e4-b75f-8829827e31e9,0,2,0.00,0,BB,PRT,Direct,Direct,0,0,0,C,C,3,No Deposit,,,0,Transient,0.00,0,0,Check-Out,H1,2015-07-01,2015-07-01,0,2014-07-24,342,45,21,45,21,22,44,27,3,Wednesday,July,,,,,,,,
208b168f-6b5f-48f0-a59a-608e7762301b,0,1,0.00,0,HB,PRT,Offline TA/TO,TA/TO,0,0,0,A,A,1,No Deposit,6,,0,Transient,80.00,0,0,Check-Out,H2,2015-07-01,2015-07-03,2,2014-10-17,259,45,21,43,23,52,63,27,3,Wednesday,July,0.00,,,,,,0.00,0.00
80e31a27-8602-4488-86ea-4a82bb9370db,0,2,0.00,0,HB,PRT,Offline TA/TO,TA/TO,0,0,0,A,A,0,No Deposit,6,,0,Transient,101.50,0,0,Check-Out,H2,2015-07-01,2015-07-03,2,2014-10-17,259,45,21,43,23,52,63,27,3,Wednesday,July,80.00,,,,,,40.00,20.00
3d65d16f-bc6e-4b2b-acc4-fce547978cb7,0,2,0.00,0,HB,PRT,Offline TA/TO,TA/TO,0,0,0,A,A,0,No Deposit,6,,0,Transient,101.50,0,0,Check-Out,H2,2015-07-01,2015-07-03,2,2014-10-17,259,45,21,43,23,52,63,27,3,Wednesday,July,101.50,,60.50,,53.49,,70.75,40.38
bf95f775-71a4-45b1-9c21-6f81ffa661a0,0,2,0.00,0,HB,PRT,Offline TA/TO,TA/TO,0,0,0,A,A,0,No Deposit,6,,0,Transient,101.50,0,0,Check-Out,H2,2015-07-01,2015-07-03,2,2014-10-17,259,45,21,43,23,52,63,27,3,Wednesday,July,101.50,,94.33,,12.41,,86.12,55.66
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
00dae7f5-4f45-44fd-8859-b4d70dcb2185,0,2,0.00,0,HB,GBR,Online TA,TA/TO,0,0,0,E,E,0,No Deposit,241,,0,Transient,207.03,0,1,Check-Out,H1,2017-08-31,2017-09-07,7,2017-05-15,115,35,16,28,23,26,14,35,4,Thursday,August,196.67,174.00,216.39,172.47,17.48,60.99,197.45,183.19
75010c8c-4902-46f8-a487-bb876adea6a4,0,2,1.00,0,HB,ITA,Online TA,TA/TO,0,0,0,G,G,3,No Deposit,240,,0,Transient,312.29,1,1,Check-Out,H1,2017-08-31,2017-09-07,7,2017-02-18,201,35,16,28,23,55,48,35,4,Thursday,August,207.03,207.50,211.23,177.19,17.06,62.39,202.24,189.15
291fa564-267f-421b-92eb-b17a782ffaa9,1,2,0.00,0,HB,ESP,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,240,,0,Transient,207.00,0,2,Canceled,H1,2017-08-31,2017-09-03,3,2017-08-14,0,35,16,32,19,1,60,35,4,Thursday,August,312.29,104.40,238.66,192.16,63.97,80.74,257.26,219.93
8bb4727f-ef00-4c8e-930a-3d0d92a69f9d,0,2,0.00,0,HB,GBR,Offline TA/TO,TA/TO,0,0,0,D,D,0,No Deposit,40,,0,Contract,114.80,0,0,Check-Out,H1,2017-08-31,2017-09-07,7,2017-02-21,198,35,16,28,23,52,51,35,4,Thursday,August,207.00,72.20,242.11,206.81,60.78,70.87,232.13,216.70


# Calculate Daily Occupancies

# Hotel 1

In [5]:
df_data['HotelNumber'].unique()

['H1', 'H2']
Categories (2, object): ['H1', 'H2']

In [6]:
hotel = 'H1'

hotel_filter = (df_data['HotelNumber'] == hotel)

df_data_h1 = df_data[hotel_filter]

df_data_h1

Unnamed: 0_level_0,IsCanceled,LeadTime,ArrivalDateYear,ArrivalDateMonth,ArrivalDateWeekNumber,ArrivalDateDayOfMonth,StaysInWeekendNights,StaysInWeekNights,Adults,Children,Babies,Meal,Country,MarketSegment,DistributionChannel,IsRepeatedGuest,PreviousCancellations,PreviousBookingsNotCanceled,ReservedRoomType,AssignedRoomType,BookingChanges,DepositType,Agent,Company,DaysInWaitingList,CustomerType,ADR,RequiredCarParkingSpaces,TotalOfSpecialRequests,ReservationStatus,ReservationStatusDate,HotelNumber
UUID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1
d2bdadb4-b32b-4568-ae20-bf70afd6e1d0,0.00,342.00,2015.00,July,27.00,1.00,0.00,0.00,2.00,0.00,0.00,BB,PRT,Direct,Direct,0.00,0.00,0.00,C,C,3.00,No Deposit,,,0.00,Transient,0.00,0.00,0.00,Check-Out,2015-07-01,H1
6a7bbfe5-94e9-4f91-bcbd-61e2a4052ba1,0.00,737.00,2015.00,July,27.00,1.00,0.00,0.00,2.00,0.00,0.00,BB,PRT,Direct,Direct,0.00,0.00,0.00,C,C,4.00,No Deposit,,,0.00,Transient,0.00,0.00,0.00,Check-Out,2015-07-01,H1
9032ebc0-52c0-42be-9458-c8b1db77b301,0.00,7.00,2015.00,July,27.00,1.00,0.00,1.00,1.00,0.00,0.00,BB,GBR,Direct,Direct,0.00,0.00,0.00,A,C,0.00,No Deposit,,,0.00,Transient,75.00,0.00,0.00,Check-Out,2015-07-02,H1
4f560d57-49e9-4862-8856-0e9ca45218fa,0.00,13.00,2015.00,July,27.00,1.00,0.00,1.00,1.00,0.00,0.00,BB,GBR,Corporate,Corporate,0.00,0.00,0.00,A,A,0.00,No Deposit,304,,0.00,Transient,75.00,0.00,0.00,Check-Out,2015-07-02,H1
b624acb5-2a05-4356-af79-19be0fb1ddee,0.00,14.00,2015.00,July,27.00,1.00,0.00,2.00,2.00,0.00,0.00,BB,GBR,Online TA,TA/TO,0.00,0.00,0.00,A,A,0.00,No Deposit,240,,0.00,Transient,98.00,0.00,1.00,Check-Out,2015-07-03,H1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6aa86b9f-a383-47c5-8322-052d5a177bc8,0.00,212.00,2017.00,August,35.00,31.00,2.00,8.00,2.00,1.00,0.00,BB,GBR,Offline TA/TO,TA/TO,0.00,0.00,0.00,A,A,1.00,No Deposit,143,,0.00,Transient,89.75,0.00,0.00,Check-Out,2017-09-10,H1
84f42c28-aba9-4d1d-96f9-70b7d2c19060,0.00,169.00,2017.00,August,35.00,30.00,2.00,9.00,2.00,0.00,0.00,BB,IRL,Direct,Direct,0.00,0.00,0.00,E,E,0.00,No Deposit,250,,0.00,Transient-Party,202.27,0.00,1.00,Check-Out,2017-09-10,H1
659d1db9-2ba0-4ecf-b4d1-c9f5f9f8077e,0.00,204.00,2017.00,August,35.00,29.00,4.00,10.00,2.00,0.00,0.00,BB,IRL,Direct,Direct,0.00,0.00,0.00,E,E,0.00,No Deposit,250,,0.00,Transient,153.57,0.00,3.00,Check-Out,2017-09-12,H1
c0be582d-cb37-4351-b435-9ea491009939,0.00,211.00,2017.00,August,35.00,31.00,4.00,10.00,2.00,0.00,0.00,HB,GBR,Offline TA/TO,TA/TO,0.00,0.00,0.00,D,D,0.00,No Deposit,40,,0.00,Contract,112.80,0.00,1.00,Check-Out,2017-09-14,H1


In [7]:
min_date = df_data_h1['ArrivalDate'].min()
max_date = df_data_h1['DepartureDate'].max()

print(f'The earliest date is: {min_date}.\nThe latest date is: {max_date}.')

KeyError: 'ArrivalDate'

### ChatGPT-Generated Code

#### Walkthrough of Code Below

Certainly! Let's break down the corrected code into its key steps, explaining each part and its purpose in calculating the number of active hotel reservations for each date:

**3. Generate Counts for Arrivals and Departures**

```python
arrivals = df_data_h1['ArrivalDate'].value_counts().rename('count')
departures = df_data_h1['DepartureDate'].value_counts().rename('count')
```
- Counts how many reservations start (`arrivals`) and end (`departures`) on each date. The `value_counts()` method tallies occurrences of each date, and `rename('count')` changes the Series name to `'count'`, which aids in clarity for later operations.

**4. Combine Arrival and Departure Counts**

```python
df_counts = pd.concat([arrivals, -departures]).sort_index().reset_index()
df_counts.columns = ['Date', 'Count']
```
- Combines the arrivals and departures into a single DataFrame, `df_counts`, with arrivals contributing positively to the count and departures negatively (indicating the end of a reservation). The data is then sorted by date.

**5. Aggregate Counts on the Same Date**

```python
df_counts = df_counts.groupby('Date').sum()
```
- Since the combination of arrivals and departures could result in multiple entries for the same date, this step aggregates (sums) all counts for each date. This ensures each date is unique, addressing the initial issue of duplicate labels.

**6. Generate a Complete Date Range and Reindex**

```python
date_range = pd.date_range(start=df_counts.index.min(), end=df_counts.index.max())
df_counts = df_counts.reindex(date_range, fill_value=0)
```
- Creates a continuous range of dates covering the entire period from the earliest to the latest date in `df_counts`. It then reindexes `df_counts` to include every date in this range, filling any dates without data with `0`, ensuring there's a record for every single day in the period.

**7. Calculate Cumulative Sum for Active Reservations**

```python
df_counts['Active_Reservations'] = df_counts['Count'].cumsum()
```
- Computes the cumulative sum of the daily net reservation counts (`Count`). This step effectively calculates the total number of active reservations for each date by adding up the arrivals and subtracting the departures as they occur over time.

**Conclusion**

- The final output, `df_counts['Active_Reservations']`, shows the total number of active reservations for each date in the range. This method is efficient and avoids the problem of duplicate labels by ensuring that each date is unique before the reindexing step, leveraging pandas' capabilities for handling time series data.

### Code

In [10]:
def get_counts(dataframe, arrivaldate, departuredate, name = 'count'):
    
    '''Generate counts for arrivals and departures on their respective dates.'''
    
    arrivals = (dataframe[arrivaldate]
                .value_counts()
                .rename(name))
    
    departures = (dataframe[departuredate]
                  .value_counts()
                  .rename(name))
    
    return arrivals, departures


def aggregate_counts_by_date(arrivals, departures):
    
    '''Create a DataFrame from arrivals and departures,
    marking departures as negative.'''
    
    df_counts = (pd.concat([arrivals, -departures])
                 .sort_index()
                 .reset_index())
    
    df_counts.columns = ['Date', 'Count']
    
    return df_counts.groupby('Date').sum()


def generate_date_range(df_counts):
    '''Generate a complete date range covering the period in df_data.'''
    
    return pd.date_range(start=df_counts.index.min(),
                         end=df_counts.index.max())


def reindex_and_fill_zero(df_counts, date_range):
    '''Reindex the aggregated count DataFrame to include all dates in the range,
    filling missing dates with 0'''
    
    return df_counts.reindex(date_range, fill_value=0)


def calculate_daily_active_res(df_counts):
    '''Calculate the cumulative sum to determine active reservations for each date'''
   
    return df_counts['Count'].cumsum()


def calculate_daily_occupancy(dataframe, arrivaldate, departuredate, name = 'count'):
    
    arrivals, departures = get_counts(dataframe, arrivaldate, departuredate, name = 'count')
    
    daily_counts = aggregate_counts_by_date(arrivals, departures)
    
    date_range = generate_date_range((daily_counts))
    
    df_reindexed = reindex_and_fill_zero(daily_counts, date_range)
    
    return calculate_daily_active_res(df_reindexed)

In [12]:
df_counts = calculate_daily_occupancy(df_data_h1, 'ArrivalDate', 'DepartureDate')
df_counts.name = 'Active_Reservations'
df_counts

2015-07-01     41
2015-07-02     78
2015-07-03    108
2015-07-04    147
2015-07-05    154
             ... 
2017-09-10      4
2017-09-11      3
2017-09-12      2
2017-09-13      2
2017-09-14      0
Freq: D, Name: Active_Reservations, Length: 807, dtype: int64

In [13]:
# # Generate counts for arrivals and departures on their respective dates
# arrivals = df_data_h1['ArrivalDate'].value_counts().rename('count')
# departures = df_data_h1['DepartureDate'].value_counts().rename('count')
# arrivals.head(), departures.head()

In [14]:
# # Create a DataFrame from arrivals and departures, marking departures as negative
# df_counts = pd.concat([arrivals, -departures]).sort_index().reset_index()
# df_counts.columns = ['Date', 'Count']
# df_counts

In [15]:
# # Aggregate counts on the same date to avoid duplicate labels
# df_counts = df_counts.groupby('Date').sum()
# df_counts

In [16]:
# # Generate a complete date range covering the period in df_data
# date_range = pd.date_range(start=df_counts.index.min(),
#                            end=df_counts.index.max())
# date_range

In [17]:
# # Reindex the aggregated count DataFrame to include all dates in the range, filling missing dates with 0
# df_counts = df_counts.reindex(date_range, fill_value=0)
# df_counts

In [18]:
# # Calculate the cumulative sum to determine active reservations for each date
# df_counts['Active_Reservations'] = df_counts['Count'].cumsum()

# df_counts['Active_Reservations']

#### Groupby.Sum vs. Cumsum

The use of both `groupby().sum()` and the `cumsum()` methods serves two different purposes in the process of calculating the total number of active reservations for each date. Here's a clarification of the roles each step plays in the computation:

**1. GroupBy().sum()**

- **Purpose:** This step aggregates the daily net changes in reservations (arrivals and departures) for each unique date. Since arrivals are counted positively and departures negatively, the sum for each date tells us the net reservation change on that day. 
- **What It Solves:** If, for instance, 5 reservations start (arrive) and 3 end (depart) on a particular date, the net change in reservations for that day would be +2. This calculation consolidates all changes into a single value per date, ensuring there's no duplication of dates in the dataset, which is necessary for the next steps.

**2. cumsum()**

- **Purpose:** The cumulative sum (`cumsum()`) takes these daily net changes and accumulates them over the entire period to calculate the total number of active reservations for each date. It essentially adds up the net changes from the start date, rolling forward, to show how many reservations are active on any given day.
- **What It Solves:** This step provides the running total of active reservations. It accounts for the ongoing balance of reservations as they begin and end over time, showing the total active reservations on each date. This is crucial for understanding the capacity or occupancy on any given day.

**Illustrative Example:**

Let's say you have data for three days:

- **Day 1:** 5 arrivals, 0 departures (net +5)
- **Day 2:** 3 arrivals, 1 departure (net +2)
- **Day 3:** 2 arrivals, 4 departures (net -2)

After `groupby().sum()`, you'd have a net change sequence of [+5, +2, -2].

Applying `cumsum()` to this sequence gives you the total active reservations for each day: [5, 7, 5]. This demonstrates how the occupancy evolves:

- **Day 1:** Starts with 5,
- **Day 2:** Increases to 7,
- **Day 3:** Decreases back to 5.

**Conclusion:**

- **`groupby().sum()`** is used for condensing the dataset into a form where each date has a single net change value, resolving any issues with duplicate dates.
- **`cumsum()`** transforms these net changes into a running total of active reservations, reflecting how the number of active reservations builds up or reduces over time.

## Adding Arrival/Departure Occupancies to Original Data

In [19]:
df_counts.head(10)

2015-07-01     41
2015-07-02     78
2015-07-03    108
2015-07-04    147
2015-07-05    154
2015-07-06    172
2015-07-07    190
2015-07-08    201
2015-07-09    213
2015-07-10    222
Freq: D, Name: Active_Reservations, dtype: int64

In [20]:
# Define active_reservations
active_reservations = df_counts

# Find the maximum occupancy
max_occupancy = active_reservations.max()

# Map the occupancy on arrival and departure dates to each reservation
df_data_h1.loc[:, 'occupancy_at_arrival'] = df_data_h1.loc[:, 'ArrivalDate'].map(active_reservations)
df_data_h1.loc[:, 'occupancy_at_departure'] = df_data_h1.loc[:, 'DepartureDate'].map(active_reservations)

# Convert these occupancies to percentages of the maximum occupancy
df_data_h1.loc[:, 'occupancy_pct_at_arrival'] = (df_data_h1.loc[:, 'occupancy_at_arrival'] / max_occupancy)
df_data_h1.loc[:, 'occupancy_pct_at_departure'] = (df_data_h1.loc[:, 'occupancy_at_departure'] / max_occupancy)

df_data_h1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_data_h1.loc[:, 'occupancy_at_arrival'] = df_data_h1.loc[:, 'ArrivalDate'].map(active_reservations)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_data_h1.loc[:, 'occupancy_at_departure'] = df_data_h1.loc[:, 'DepartureDate'].map(active_reservations)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

Unnamed: 0,IsCanceled,ArrivalDateWeekNumber,Adults,Children,Babies,Meal,Country,MarketSegment,DistributionChannel,IsRepeatedGuest,PreviousCancellations,PreviousBookingsNotCanceled,ReservedRoomType,AssignedRoomType,BookingChanges,DepositType,Agent,Company,DaysInWaitingList,CustomerType,ADR,RequiredCarParkingSpaces,TotalOfSpecialRequests,ReservationStatus,ReservationStatusDate,HotelNumber,ArrivalDate,DepartureDate,Length of Stay,BookingDate,DaysSinceBooking,ArrivalDate_DaysBeforeHoliday,ArrivalDate_DaysAfterHoliday,DepartureDate_DaysBeforeHoliday,DepartureDate_DaysAfterHoliday,BookingDate_DaysBeforeHoliday,BookingDate_DaysAfterHoliday,ArrivalWeek,ArrivalDay,occupancy_at_arrival,occupancy_at_departure,occupancy_pct_at_arrival,occupancy_pct_at_departure
0,0.00,27.00,2.00,0.00,0.00,BB,PRT,Direct,Direct,0.00,0.00,0.00,C,C,3.00,No Deposit,,,0.00,Transient,0.00,0.00,0.00,Check-Out,2015-07-01,H1,2015-07-01,2015-07-01,0.00,2014-07-24,342.00,45.00,21.00,45.00,21.00,22.00,44.00,27.00,3.00,41,41,0.12,0.12
1,0.00,27.00,2.00,0.00,0.00,BB,PRT,Direct,Direct,0.00,0.00,0.00,C,C,4.00,No Deposit,,,0.00,Transient,0.00,0.00,0.00,Check-Out,2015-07-01,H1,2015-07-01,2015-07-01,0.00,2013-06-24,737.00,45.00,21.00,45.00,21.00,52.00,14.00,27.00,3.00,41,41,0.12,0.12
2,0.00,27.00,1.00,0.00,0.00,BB,GBR,Direct,Direct,0.00,0.00,0.00,A,C,0.00,No Deposit,,,0.00,Transient,75.00,0.00,0.00,Check-Out,2015-07-02,H1,2015-07-01,2015-07-02,1.00,2015-06-24,8.00,45.00,21.00,44.00,22.00,52.00,14.00,27.00,3.00,41,78,0.12,0.22
3,0.00,27.00,1.00,0.00,0.00,BB,GBR,Corporate,Corporate,0.00,0.00,0.00,A,A,0.00,No Deposit,304,,0.00,Transient,75.00,0.00,0.00,Check-Out,2015-07-02,H1,2015-07-01,2015-07-02,1.00,2015-06-18,14.00,45.00,21.00,44.00,22.00,58.00,8.00,27.00,3.00,41,78,0.12,0.22
4,0.00,27.00,2.00,0.00,0.00,BB,GBR,Online TA,TA/TO,0.00,0.00,0.00,A,A,0.00,No Deposit,240,,0.00,Transient,98.00,0.00,1.00,Check-Out,2015-07-03,H1,2015-07-01,2015-07-03,2.00,2015-06-17,16.00,45.00,21.00,43.00,23.00,59.00,7.00,27.00,3.00,41,108,0.12,0.30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40055,0.00,35.00,2.00,1.00,0.00,BB,GBR,Offline TA/TO,TA/TO,0.00,0.00,0.00,A,A,1.00,No Deposit,143,,0.00,Transient,89.75,0.00,0.00,Check-Out,2017-09-10,H1,2017-08-31,2017-09-10,10.00,2017-01-31,222.00,35.00,16.00,25.00,26.00,73.00,30.00,35.00,4.00,271,4,0.76,0.01
40056,0.00,35.00,2.00,0.00,0.00,BB,IRL,Direct,Direct,0.00,0.00,0.00,E,E,0.00,No Deposit,250,,0.00,Transient-Party,202.27,0.00,1.00,Check-Out,2017-09-10,H1,2017-08-30,2017-09-10,11.00,2017-03-14,180.00,36.00,15.00,25.00,26.00,31.00,72.00,35.00,3.00,283,4,0.79,0.01
40057,0.00,35.00,2.00,0.00,0.00,BB,IRL,Direct,Direct,0.00,0.00,0.00,E,E,0.00,No Deposit,250,,0.00,Transient,153.57,0.00,3.00,Check-Out,2017-09-12,H1,2017-08-29,2017-09-12,14.00,2017-02-06,218.00,37.00,14.00,23.00,28.00,67.00,36.00,35.00,2.00,295,2,0.83,0.01
40058,0.00,35.00,2.00,0.00,0.00,HB,GBR,Offline TA/TO,TA/TO,0.00,0.00,0.00,D,D,0.00,No Deposit,40,,0.00,Contract,112.80,0.00,1.00,Check-Out,2017-09-14,H1,2017-08-31,2017-09-14,14.00,2017-02-01,225.00,35.00,16.00,21.00,30.00,72.00,31.00,35.00,4.00,271,0,0.76,0.00


In [None]:
df_data_h1

# Hotel 2

In [None]:
hotel = 'H2'

hotel_filter = (df_data['HotelNumber'] == hotel)

df_data_h2 = df_data[hotel_filter]

df_counts_h2 = calculate_daily_occupancy(df_data_h2, 'ArrivalDate', 'DepartureDate')
df_counts_h2.name = 'Active_Reservations'
df_counts_h2

In [None]:
# Define active_reservations
active_reservations = df_counts

# Find the maximum occupancy
max_occupancy = active_reservations.max()

# Map the occupancy on arrival and departure dates to each reservation
df_data_h2.loc[:, 'occupancy_at_arrival'] = df_data_h2.loc[:, 'ArrivalDate'].map(active_reservations)
df_data_h2.loc[:, 'occupancy_at_departure'] = df_data_h2.loc[:, 'DepartureDate'].map(active_reservations)

# Convert these occupancies to percentages of the maximum occupancy
df_data_h2.loc[:, 'occupancy_pct_at_arrival'] = (df_data_h2.loc[:, 'occupancy_at_arrival'] / max_occupancy)
df_data_h2.loc[:, 'occupancy_pct_at_departure'] = (df_data_h2.loc[:, 'occupancy_at_departure'] / max_occupancy)

df_data_h2

In [None]:
full_dataset = pd.concat([df_data_h1, df_data_h2], axis = 0)

full_dataset.to_feather('../../data/2.3_data_with_occupancies.feather', compression = 'lz4')

# Time Series Metrics and Analysis

In [None]:
# # Assume df_data is the DataFrame name, and the data is already sorted by Arrival_Date in ascending order
# # and that Arrival_Date is in datetime format

# # Calculate the 7-day rolling average of occupancy_pct_at_arrival
# df_data_h1.loc[:, 'occupancy_pct_at_arrival_7d_avg'] = df_data_h1['occupancy_pct_at_arrival'].rolling(window=7, min_periods=1).mean()

# # Display the updated DataFrame to verify the calculation
# df_data_h1[['ArrivalDate', 'occupancy_pct_at_arrival', 'occupancy_pct_at_arrival_7d_avg']].head()


In [None]:
# df_data_rollavg = df_data_h1[['ArrivalDate', 'occupancy_pct_at_arrival']].copy()
# df_data_rollavg

In [None]:
# df_data_rollavg = df_data_rollavg.set_index(keys = 'ArrivalDate')
# df_data_rollavg

In [None]:
# df_data_rollavg = df_data_rollavg.resample('D').mean()
# df_data_rollavg

In [None]:
# df_data_rollavg.index

In [None]:
# # Assume df_data is the DataFrame name, and the data is already sorted by Arrival_Date in ascending order
# # and that Arrival_Date is in datetime format

# # Calculate the 7-day rolling average of occupancy_pct_at_arrival
# df_data_rollavg.loc[:,'occupancy_pct_at_arrival_3d_avg'] = df_data_rollavg['occupancy_pct_at_arrival'].rolling(window=3, min_periods=1).mean()
# df_data_rollavg.loc[:,'occupancy_pct_at_arrival_7d_avg'] = df_data_rollavg['occupancy_pct_at_arrival'].rolling(window=7, min_periods=1).mean()
# df_data_rollavg.loc[:,'occupancy_pct_at_arrival_14d_avg'] = df_data_rollavg['occupancy_pct_at_arrival'].rolling(window=14, min_periods=1).mean()
# df_data_rollavg.loc[:,'occupancy_pct_at_arrival_28d_avg'] = df_data_rollavg['occupancy_pct_at_arrival'].rolling(window=28, min_periods=1).mean()
# # df_data_rollavg['occupancy_pct_at_arrival_90d_avg'] = df_data_rollavg['occupancy_pct_at_arrival'].rolling(window=90, min_periods=1).mean()

# # Display the updated DataFrame to verify the calculation
# df_data_rollavg.head()

In [None]:
# px.line(df_data_rollavg)

In [None]:
# rolling_avg_bookings = df_data_rollavg['occupancy_pct_at_arrival_7d_avg']

In [None]:
# # Define high-demand threshold as the 75th percentile of the 7-day rolling average
# high_demand_threshold = rolling_avg_bookings.quantile(0.75).round(4)
# high_demand_threshold

In [None]:
# # Identify days that are considered high demand
# high_demand_days = rolling_avg_bookings[rolling_avg_bookings > high_demand_threshold].index
# high_demand_days

In [None]:
# # Initialize the indicator column with 0 (normal pricing)
# df_data['Dynamic_Pricing_Indicator'] = 0

# # For each booking, check if the booking date falls within a high-demand period
# # Assuming Booking_Date is already in datetime format and corresponds to the date the booking was made
# for booking_date in df_data['Booking_Date']:
#     if booking_date in high_demand_days:
#         df_data.loc[df_data['Booking_Date'] == booking_date, 'Dynamic_Pricing_Indicator'] = 1


In [None]:
# df_data.loc[:,'Dynamic_Pricing_Indicator'].value_counts(dropna=0, normalize = 1, ascending=0)

In [None]:
# df_data.head()