# Preparing our Dataset to Model Demand

In [1]:
import os
import pandas as pd
import numpy as np
import datetime as dt
from sklearn.preprocessing import OneHotEncoder as ohe

from dbds import generate_hotel_dfs
from agg import prep_demand_features
from agg_utils import stly_cols_agg, ly_cols_agg, drop_cols_agg, stly_pace_cols, ty_pace_cols

pd.options.display.max_rows = 150
pd.options.display.max_columns = 250
pd.options.display.max_colwidth = None

DATE_FMT = "%Y-%m-%d"
h1_capacity = 187
h2_capacity = 226
AOD = "2017-08-01"
AOD_dt = pd.to_datetime(AOD)

h1_res = pd.read_pickle("pickle/h1_res.pick")
h2_res = pd.read_pickle("pickle/h2_res.pick")
h1_dbd = pd.read_pickle("pickle/h1_dbd.pick")
h2_dbd = pd.read_pickle("pickle/h2_dbd.pick")

In [2]:
h1_res, h1_dbd = generate_hotel_dfs("../data/H1.csv", capacity=h1_capacity)
h2_res, h2_dbd = generate_hotel_dfs("../data/H2.csv", capacity=h2_capacity)

h1_res.to_pickle("pickle/h1_res.pick")
h1_dbd.to_pickle("pickle/h1_dbd.pick")
h2_res.to_pickle("pickle/h2_res.pick")
h2_dbd.to_pickle("pickle/h2_dbd.pick")

Hotel dataframes generated successfully!
Hotel capacity: 187 rooms
Hotel data date range: 2015-07-01 to 2017-08-31
Hotel dataframes generated successfully!
Hotel capacity: 226 rooms
Hotel data date range: 2015-07-01 to 2017-08-31


## Combine Files Generated by save_sims.py

In [8]:
# generate list of relevant files
import datetime as dt
import pandas as pd
hotel_num = 1
h = 'h' + str(hotel_num)
SIM_AOD = pd.to_datetime(dt.date(2017, 8, 1), format=DATE_FMT)
sim_start = SIM_AOD - pd.DateOffset(365*2) # need > 364 days of actuals for each date, the rest future-looking

# FOLDER = "./sims2/"
# Get the directory of the current script
# IMPORTANT: Ensure your Jupyter Notebook or interactive session
# is started from the 'rms' directory (e.g., ~/Desktop/rms/rms/)
current_working_dir = os.getcwd()

# Now define paths relative to this current working directory
# Assuming 'code' and 'sims2' are direct subdirectories of the current working directory
code_dir = os.path.join(current_working_dir, "code")
FOLDER = os.path.join(current_working_dir, "sims2", "") # Path to the sims2 folder

# You'll also need to define pickle_dir if you're loading pickle files in this same notebook:
pickle_dir = os.path.join(code_dir, "pickle")
lam_include = lambda x: x[:2] == h and pd.to_datetime(x[7:17]) >= sim_start
h1_files = [f for f in os.listdir(FOLDER) if lam_include(f)]
m =len(h1_files)
print(m)
h1_files.sort()
len(h1_files), h1_files[0], h1_files[-1] # note STLY date of 8/1/17 == 8/2/16 (matching weekday)

0


IndexError: list index out of range

In [None]:
%%time
df_sim = pd.DataFrame()
df_list = [pd.read_pickle(FOLDER + otb_data) for otb_data in h1_files]
df_sim = pd.concat(df_list, ignore_index=True)

df_sim.shape

## Adding calculated features

In [None]:
df_sim.shape

In [None]:
# Add AsOfDate

def apply_aod(row):
    stay_date = row["Date"]
    stly_stay_date = pd.to_datetime(row["STLY_Date"])
    n_days_b4 = int(row["DaysUntilArrival"])
    as_of_date = pd.to_datetime(
        stay_date - pd.DateOffset(n_days_b4), format=DATE_FMT
    )
    stly_as_of_date = pd.to_datetime(
        stly_stay_date - pd.DateOffset(n_days_b4), format=DATE_FMT
    )
    return as_of_date, stly_as_of_date

df_sim[["AsOfDate","STLY_AsOfDate"]] = df_sim[["Date", "STLY_Date", "DaysUntilArrival"]].apply(apply_aod, axis=1, result_type='expand')
df_sim.rename(columns={"Date": "StayDate", "STLY_Date": "STLY_StayDate"}, inplace=True)

df_sim.head()

In [None]:
df_sim.shape
df_sim["AsOfDate"]

In [None]:
# add remaining supply ('RemSupply')
capacity = 187
df_sim["RemSupply"] = (
    capacity - df_sim.RoomsOTB.astype(int) + df_sim.CxlForecast.astype(int)
)

In [None]:
df_sim.shape

In [None]:
# add one-hot-encoded DOW ('Day of Week') columns

ohe_dow = pd.get_dummies(df_sim.DOW, drop_first=True)
dow_ohe_cols = list(ohe_dow.columns)
df_sim[dow_ohe_cols] = ohe_dow

In [None]:
df_sim.shape

In [None]:
# add NONTRN cols

df_sim["NONTRN_RoomsOTB"] = (
    df_sim.RoomsOTB - df_sim.TRN_RoomsOTB
)
df_sim["NONTRN_RevOTB"] = df_sim.RevOTB - df_sim.TRN_RevOTB
df_sim["NONTRN_ADR_OTB"] = round(df_sim["NONTRN_RevOTB"] / df_sim["NONTRN_RoomsOTB"], 2)
df_sim["NONTRN_CxlForecast"] = df_sim.CxlForecast - df_sim.TRN_CxlForecast

# df_sim["LYA_NONTRN_RoomsOTB"] = (
#     df_sim.LYA_TRNP_RoomsOTB + df_sim.LYA_GRP_RoomsOTB + df_sim.LYA_CNT_RoomsOTB
# )
# df_sim["LYA_NONTRN_RevOTB"] = df_sim.LYA_TRNP_RevOTB + df_sim.LYA_GRP_RevOTB + df_sim.LYA_CNT_RevOTB



In [None]:
df_sim.shape

In [None]:
len(ly_cols_agg)

In [None]:
tuple(np.zeros(7))

In [None]:
# Add last-year actual columns ("LYA_")

def apply_ly_cols(row):
    try:
        stly_date = pd.to_datetime(row["STLY_StayDate"])
        cutoff_date = pd.to_datetime('2015-08-01')
        if stly_date < cutoff_date:
            return tuple(np.zeros(len(ly_cols_agg)))
        stly_date_str = stly_date.strftime(DATE_FMT)
        df_lya = list(h1_dbd.loc[stly_date_str, ly_cols_agg])
        return tuple(df_lya)
    except:
        return tuple(np.zeros(len(ly_cols_agg)))

ly_new_cols = ["LYA_" + col for col in ly_cols_agg]
df_sim[ly_new_cols] = df_sim[["STLY_StayDate"]].apply(apply_ly_cols, axis=1, result_type="expand")

df_sim.fillna(0, inplace=True)

df_sim.tail()

In [None]:
actual_cols = ['RoomsSold', "ADR", "RoomRev", "NumCancels"]
def apply_ty_actuals(row):
    date = row["StayDate"]
    date_str = dt.datetime.strftime(date, format=DATE_FMT)
    results = list(h1_dbd.loc[date_str, actual_cols])
    return tuple(results)

new_actual_cols = ["ACTUAL_" + col for col in actual_cols]
df_sim[new_actual_cols] = df_sim[["StayDate"]].apply(apply_ty_actuals, axis=1, result_type="expand")

df_sim.fillna(0, inplace=True)

df_sim.tail()

In [None]:
df_sim["AsOfDate"]

In [None]:
mask = df_sim.StayDate == '2017-08-09'
df_sim[mask][["ACTUAL_RoomsSold"]]

In [None]:
h1_dbd.loc["2017-08-09"]

In [None]:
h1_dbd.columns

In [None]:
df_sim.columns
# df_sim["AsOfDate"]

In [None]:
df_sim["NONTRN_ADR_OTB"] = round(df_sim["NONTRN_RevOTB"] / df_sim["NONTRN_RoomsOTB"], 2)
# df_sim["TM30_NONTRN_RevOTB"]

In [None]:
df_sim["AsOfDate"]

In [None]:
# Calculate ADR for all segments first

df_sim["ADR_OTB"] = round(df_sim["RevOTB"] / df_sim["RoomsOTB"], 2)
df_sim["TRN_ADR_OTB"] = round(df_sim["TRN_RevOTB"] / df_sim["TRN_RoomsOTB"], 2)
df_sim["NONTRN_ADR_OTB"] = round(df_sim["NONTRN_RevOTB"] / df_sim["NONTRN_RoomsOTB"], 2)

# get recent pickup (tminus) columns
tms = ["TM30_", "TM15_", "TM05_"]
segs = ["", "TRN_"] # "" for total hotel

for tm in tms:
    # Calculate ADR for tminus windows first
    df_sim[tm + "ADR_OTB"] = round(df_sim[tm + "RevOTB"] / df_sim[tm + "RoomsOTB"], 2)
    df_sim[tm + "TRN_ADR_OTB"] = round(df_sim[tm + "TRN_RevOTB"] / df_sim[tm + "TRN_RoomsOTB"], 2)

    # Calculate NONTRN ADR for tminus windows
    # df_sim[tm + "NONTRN_ADR_OTB"] = round(df_sim[tm + "NONTRN_RevOTB"] / df_sim[tm + "NONTRN_RoomsOTB"], 2)
    
    for seg in segs:
        # Calculate pickup stats
        df_sim[tm + seg + "RoomsPickup"] = round(
            df_sim[seg + "RoomsOTB"] - df_sim[tm + seg + "RoomsOTB"], 2
        )
        df_sim[tm + seg + "RevPickup"] = round(
            df_sim[seg + "RevOTB"] - df_sim[tm + seg + "RevOTB"], 2
        )
        df_sim[tm + seg + "ADR_Pickup"] = round(
            df_sim[seg + "ADR_OTB"] - df_sim[tm + seg + "ADR_OTB"], 2
        )
    
    # Calculate NONTRN pickup stats
    tm_nontrn_rooms_otb = tm + "NONTRN_RoomsOTB"
    nontrn_rooms_otb = "NONTRN_RoomsOTB"
    tm_nontrn_rev_otb = tm + "NONTRN_RevOTB"
    nontrn_rev_otb = "NONTRN_RevOTB"
    tm_nontrn_adr_otb = tm + "NONTRN_ADR_OTB"
    nontrn_adr_otb = "NONTRN_ADR_OTB"

    if nontrn_rooms_otb in df_sim.columns:
        if tm_nontrn_rooms_otb not in df_sim.columns:
            df_sim[tm_nontrn_rooms_otb] = 0  # Fill missing column with 0
        df_sim[tm + "NONTRN_RoomsPickup"] = (
            df_sim[nontrn_rooms_otb] - df_sim[tm_nontrn_rooms_otb]
        )
    if nontrn_rev_otb in df_sim.columns:
        if tm_nontrn_rev_otb not in df_sim.columns:
            df_sim[tm_nontrn_rev_otb] = 0  # Fill missing column with 0
        df_sim[tm + "NONTRN_RevPickup"] = (
            df_sim[nontrn_rev_otb] - df_sim[tm_nontrn_rev_otb]
        )
    if nontrn_adr_otb in df_sim.columns:
        if tm_nontrn_adr_otb not in df_sim.columns:
            df_sim[tm_nontrn_adr_otb] = 0  # Fill missing column with 0
        df_sim[tm + "NONTRN_ADR_Pickup"] = (
            df_sim[nontrn_adr_otb] - df_sim[tm_nontrn_adr_otb]
        )

df_sim.head()

In [None]:
df_sim.shape
df_sim["AsOfDate"]

In [None]:
# add gap to LYA columns (by segment)
# must be done AFTER NONTRN cols added
df_sim["RoomsGapToLYA"] = df_sim.LYA_RoomsSold - df_sim.RoomsOTB
df_sim["RevGapToLYA"] = df_sim.LYA_RoomRev - df_sim.RevOTB
df_sim["ADR_GapToLYA"] = df_sim.LYA_ADR - df_sim.ADR_OTB

df_sim["TRN_RoomsGapToLYA"] = df_sim.LYA_TRN_RoomsSold - df_sim.TRN_RoomsOTB
df_sim["TRN_RevGapToLYA"] = df_sim.LYA_TRN_RoomRev - df_sim.TRN_RevOTB
df_sim["TRN_ADR_GapToLYA"] = df_sim.LYA_TRN_ADR - df_sim.TRN_ADR_OTB

df_sim["NONTRN_RoomsGapToLYA"] = df_sim["RoomsGapToLYA"] - df_sim["TRN_RoomsGapToLYA"]
df_sim["NONTRN_RevGapToLYA"] = df_sim["RevGapToLYA"] - df_sim["TRN_RevGapToLYA"]
df_sim["NONTRN_ADR_GapToLYA"] = df_sim["ADR_GapToLYA"] - df_sim["TRN_ADR_GapToLYA"]

In [None]:
df_sim.shape

In [None]:
df_sim.columns

In [None]:
# remove all non-gap, non-pickup actual/tminus columns
# I will want to move this down in our script to combine with removing stly cols (we only want pace)
# removing them here just to make it cleaner

df_sim.drop(columns=drop_cols_agg, inplace=True, errors='ignore')
df_sim.columns

In [None]:
df_sim.sample(random_state=0)

In [None]:
# df_sim.loc["2016-04-24"]

# EW- NEXT STEPS (THU 5PM)

1. drop unneeded, post-processed TM_nn columns in blank cell above (create list in agg_utils.py)
2. pull stly cols via merge below
3. calculate pace
4. drop unneeded, post-processed stly cols
5. add all of the steps in this notebook to agg.py
6. pull features from list at top of this NB
7. train/test split
8. linear regression (predict RoomsSold)
9. randomForest (predict RoomsSold)

**Time to pull STLY columns. I will accomplish this by merging df_sim on top of itself and pulling the below columns into the next year's row with the `'STLY_'` prefix.***

But before we do that, let's make sure we add in the ADR columns.

NEVERMIND - THIS STEP NEEDS TO COME LAST ONCE WE HAVE ALL OF THE OTHER COLUMNS

In [None]:
df_sim.head(2)

In [None]:
df_sim.shape

In [None]:
# pull STLY columns with self-merge to STLY date

# first, we need to create unique ID col (id) for each as-of-date/stay-date combo
# then, we manipulate strings to add a stly_id column that we can use as right key for our merge

df_sim_ids = df_sim.AsOfDate.astype(str) + ' - ' + df_sim.StayDate.astype(str)
df_sim.insert(0, "id", df_sim_ids)

df_sim_stly_ids = df_sim.STLY_AsOfDate.astype(str) + ' - ' + df_sim.STLY_StayDate.astype(str)
df_sim.insert(1, "stly_id", df_sim_stly_ids)
df_sim.head()

In [None]:
df_sim["DayOfWeek"] df_sim.StayDate.map(lambda x: dt.datetime.strftime(x, format="%a"))

In [None]:
df_sim.shape

In [None]:
# self-join df_sim to pull stly stats using the above keys

df_sim = df_sim.merge(df_sim[stly_cols_agg], left_on='stly_id', right_on='id', suffixes=(None, "_STLY"))
df_sim.head(2)

In [None]:
df_sim.shape

In [None]:
df_sim[['id', 'stly_id', 'AsOfDate', 'StayDate', 'AsOfDate_STLY', 'StayDate_STLY', 'RoomsOTB_STLY', 'RevOTB_STLY']]




In [None]:
len(df_sim[df_sim.AsOfDate_STLY.isna()])

In [None]:
len(df_sim.dropna())

In [None]:
[c for c in df_sim.columns if c[-5:] == '_STLY']

In [None]:
df_sim.shape

In [None]:
df_test_stly = pd.read_pickle("./sims/pickle/h1_sim_2016-08-02.pick")
df_test_stly.loc["2016-08-06"]