In [11]:
import pandas as pd

In [12]:
data_path = '../../data/3.1_temporally_updated_data.feather'
data = pd.read_feather(data_path, columns = ['ADR', 'ArrivalDate', 'DepartureDate'])
data

Unnamed: 0,ADR,ArrivalDate,DepartureDate
0,0.00,2015-07-01,2015-07-01
1,80.00,2015-07-01,2015-07-03
2,101.50,2015-07-01,2015-07-03
3,101.50,2015-07-01,2015-07-03
4,101.50,2015-07-01,2015-07-03
...,...,...,...
119385,207.03,2017-08-31,2017-09-07
119386,312.29,2017-08-31,2017-09-07
119387,207.00,2017-08-31,2017-09-03
119388,114.80,2017-08-31,2017-09-07


In [14]:
def explode_reservations(df, start_date_col, end_date_col):
    """
    Explodes reservations in a DataFrame to individual days.

    Args:
        df (pandas.DataFrame): The DataFrame containing the reservations.
        start_date_col (str): The name of the column in `df` that represents the start date of each reservation.
        end_date_col (str): The name of the column in `df` that represents the end date of each reservation.

    Returns:
        pandas.DataFrame: The DataFrame with each reservation expanded to individual days.

    Raises:
        None

    Example:
        df = pd.DataFrame({'check_in': ['2022-01-01', '2022-02-01'],
                           'check_out': ['2022-01-04', '2022-02-03']})
        explode_reservations(df, 'check_in', 'check_out')

    """
    # Create a DataFrame with each guest's stay expanded to individual days
    # Generate a range of dates for each row and store it in a new column
    df['date_range'] = df.apply(lambda row: pd.date_range(start=row[start_date_col],
                                                          end=row[end_date_col] - pd.Timedelta(days=1)),
                                axis=1)

    # Explode the date_range column to create a row for each date in the range
    expanded_df = df.explode('date_range')

    # Drop the original check_in and check_out columns if they are no longer needed
    expanded_df = expanded_df.drop(columns=[start_date_col,
                                            end_date_col])

    # Rename the date_range column to date for clarity
    expanded_df = (expanded_df
                   .rename(columns={'date_range': 'Stay Date'})
                   .reset_index(drop=True)
                   .dropna(subset = 'Stay Date')
                   .sort_values(by = ['Stay Date', 'ADR']))

    # Reset index if needed
    expanded_df = expanded_df

    return expanded_df

In [15]:
data_exploded = explode_reservations(data, 'ArrivalDate', 'DepartureDate')
data_exploded

Unnamed: 0,ADR,Stay Date
164,0.00,2015-07-01
163,4.00,2015-07-01
135,55.68,2015-07-01
227,62.00,2015-07-01
154,62.50,2015-07-01
...,...,...
409000,153.57,2017-09-11
409699,99.06,2017-09-12
409822,112.80,2017-09-12
409700,99.06,2017-09-13


In [16]:
data_exploded.to_feather('../../data/3.3_data_exploded.feather')