# Adding UUIDs to Reservations

In [16]:
## Import pandas to handle data and `uuid` to create unique reservation IDs
import pandas as pd
import uuid

# Load and Concatenate Data

In [17]:
## Load datasets and add column to indicate hotel type/location

df_h1 = pd.read_csv('../../data/source/H1.csv')
df_h1['HotelNumber'] = 'H1'
df_h1['HotelNumber'] = df_h1['HotelNumber'].astype('category')

df_h2 = pd.read_csv('../../data/source/H2.csv')
df_h2['HotelNumber'] = 'H2'
df_h2['HotelNumber'] = df_h2['HotelNumber'].astype('category')

In [18]:
data = pd.concat([df_h1,df_h2],axis = 0).reset_index(drop = True)
data

Unnamed: 0,IsCanceled,LeadTime,ArrivalDateYear,ArrivalDateMonth,ArrivalDateWeekNumber,ArrivalDateDayOfMonth,StaysInWeekendNights,StaysInWeekNights,Adults,Children,...,Agent,Company,DaysInWaitingList,CustomerType,ADR,RequiredCarParkingSpaces,TotalOfSpecialRequests,ReservationStatus,ReservationStatusDate,HotelNumber
0,0,342,2015,July,27,1,0,0,2,0.0,...,,,0,Transient,0.00,0,0,Check-Out,2015-07-01,H1
1,0,737,2015,July,27,1,0,0,2,0.0,...,,,0,Transient,0.00,0,0,Check-Out,2015-07-01,H1
2,0,7,2015,July,27,1,0,1,1,0.0,...,,,0,Transient,75.00,0,0,Check-Out,2015-07-02,H1
3,0,13,2015,July,27,1,0,1,1,0.0,...,304,,0,Transient,75.00,0,0,Check-Out,2015-07-02,H1
4,0,14,2015,July,27,1,0,2,2,0.0,...,240,,0,Transient,98.00,0,1,Check-Out,2015-07-03,H1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119385,0,23,2017,August,35,30,2,5,2,0.0,...,394,,0,Transient,96.14,0,0,Check-Out,2017-09-06,H2
119386,0,102,2017,August,35,31,2,5,3,0.0,...,9,,0,Transient,225.43,0,2,Check-Out,2017-09-07,H2
119387,0,34,2017,August,35,31,2,5,2,0.0,...,9,,0,Transient,157.71,0,4,Check-Out,2017-09-07,H2
119388,0,109,2017,August,35,31,2,5,2,0.0,...,89,,0,Transient,104.40,0,0,Check-Out,2017-09-07,H2


# Generate UUIDs and Append to DataFrame

In [19]:
# Generate UUIDs for each row in the dataframe
data['UUID'] = [uuid.uuid4() for _ in range(len(data))]
data

Unnamed: 0,IsCanceled,LeadTime,ArrivalDateYear,ArrivalDateMonth,ArrivalDateWeekNumber,ArrivalDateDayOfMonth,StaysInWeekendNights,StaysInWeekNights,Adults,Children,...,Company,DaysInWaitingList,CustomerType,ADR,RequiredCarParkingSpaces,TotalOfSpecialRequests,ReservationStatus,ReservationStatusDate,HotelNumber,UUID
0,0,342,2015,July,27,1,0,0,2,0.0,...,,0,Transient,0.00,0,0,Check-Out,2015-07-01,H1,9af79666-f290-45c5-868c-2f9601b8f98b
1,0,737,2015,July,27,1,0,0,2,0.0,...,,0,Transient,0.00,0,0,Check-Out,2015-07-01,H1,81440274-e84e-4502-89f3-e01681d0672a
2,0,7,2015,July,27,1,0,1,1,0.0,...,,0,Transient,75.00,0,0,Check-Out,2015-07-02,H1,60fe936c-f7ba-48d9-ac73-71c21e1b3978
3,0,13,2015,July,27,1,0,1,1,0.0,...,,0,Transient,75.00,0,0,Check-Out,2015-07-02,H1,5b2aae61-1d0c-4314-b4c1-603595e43163
4,0,14,2015,July,27,1,0,2,2,0.0,...,,0,Transient,98.00,0,1,Check-Out,2015-07-03,H1,e92881a3-faf8-402b-beff-64dad4707236
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119385,0,23,2017,August,35,30,2,5,2,0.0,...,,0,Transient,96.14,0,0,Check-Out,2017-09-06,H2,834c6a25-b7c4-4170-b8eb-3a00d830e397
119386,0,102,2017,August,35,31,2,5,3,0.0,...,,0,Transient,225.43,0,2,Check-Out,2017-09-07,H2,1c6d7a7a-2ffe-4d85-b44a-717666d8d7cc
119387,0,34,2017,August,35,31,2,5,2,0.0,...,,0,Transient,157.71,0,4,Check-Out,2017-09-07,H2,12af19d6-1594-4eef-bba8-643bc102cfea
119388,0,109,2017,August,35,31,2,5,2,0.0,...,,0,Transient,104.40,0,0,Check-Out,2017-09-07,H2,1e62e04f-8fd3-4304-818c-07b37c4208e0


## Convert UUIDs to String Datatype

In [20]:
data['UUID'] = data['UUID'].astype(str)
data['UUID']

0         9af79666-f290-45c5-868c-2f9601b8f98b
1         81440274-e84e-4502-89f3-e01681d0672a
2         60fe936c-f7ba-48d9-ac73-71c21e1b3978
3         5b2aae61-1d0c-4314-b4c1-603595e43163
4         e92881a3-faf8-402b-beff-64dad4707236
                          ...                 
119385    834c6a25-b7c4-4170-b8eb-3a00d830e397
119386    1c6d7a7a-2ffe-4d85-b44a-717666d8d7cc
119387    12af19d6-1594-4eef-bba8-643bc102cfea
119388    1e62e04f-8fd3-4304-818c-07b37c4208e0
119389    732b34e2-bbcf-4a43-b63b-73839173be8e
Name: UUID, Length: 119390, dtype: object

## Set UUIDs as Index

---

By setting the UUIDs as the dataframe's index, I maintain the unique identifier for each reservation and enable myself to perform joins with other datasets (such as engineered datasets later in the workflow0).

---

In [21]:
# data = data.set_index('UUID')
# data

# Save Results

In [22]:
data.to_parquet('../../data/source/full_data.parquet', compression = 'zstd')

## Subset: Classification Data

In [11]:
data.columns.to_list()

['IsCanceled',
 'LeadTime',
 'ArrivalDateYear',
 'ArrivalDateMonth',
 'ArrivalDateWeekNumber',
 'ArrivalDateDayOfMonth',
 'StaysInWeekendNights',
 'StaysInWeekNights',
 'Adults',
 'Children',
 'Babies',
 'Meal',
 'Country',
 'MarketSegment',
 'DistributionChannel',
 'IsRepeatedGuest',
 'PreviousCancellations',
 'PreviousBookingsNotCanceled',
 'ReservedRoomType',
 'AssignedRoomType',
 'BookingChanges',
 'DepositType',
 'Agent',
 'Company',
 'DaysInWaitingList',
 'CustomerType',
 'ADR',
 'RequiredCarParkingSpaces',
 'TotalOfSpecialRequests',
 'ReservationStatus',
 'ReservationStatusDate',
 'HotelNumber',
 'UUID']

In [13]:
classification_columns = [
'LeadTime',
 'ArrivalDateMonth',
 'ArrivalDateWeekNumber',
 'ArrivalDateDayOfMonth',
 'StaysInWeekendNights',
 'StaysInWeekNights',
 'Adults',
 'Children',
 'Babies',
 'Country',
 'MarketSegment',
 'DistributionChannel',
 'IsRepeatedGuest',
 'PreviousCancellations',
 'PreviousBookingsNotCanceled',
 'ReservedRoomType',
 'BookingChanges',
 'DepositType',
 'Agent',
 'Company',
 'DaysInWaitingList',
 'CustomerType',
 'ADR',
 'TotalOfSpecialRequests',
 'ReservationStatus',
 'HotelNumber']
data[classification_columns]

Unnamed: 0,LeadTime,ArrivalDateMonth,ArrivalDateWeekNumber,ArrivalDateDayOfMonth,StaysInWeekendNights,StaysInWeekNights,Adults,Children,Babies,Country,...,BookingChanges,DepositType,Agent,Company,DaysInWaitingList,CustomerType,ADR,TotalOfSpecialRequests,ReservationStatus,HotelNumber
0,342,July,27,1,0,0,2,0.0,0,PRT,...,3,No Deposit,,,0,Transient,0.00,0,Check-Out,H1
1,737,July,27,1,0,0,2,0.0,0,PRT,...,4,No Deposit,,,0,Transient,0.00,0,Check-Out,H1
2,7,July,27,1,0,1,1,0.0,0,GBR,...,0,No Deposit,,,0,Transient,75.00,0,Check-Out,H1
3,13,July,27,1,0,1,1,0.0,0,GBR,...,0,No Deposit,304,,0,Transient,75.00,0,Check-Out,H1
4,14,July,27,1,0,2,2,0.0,0,GBR,...,0,No Deposit,240,,0,Transient,98.00,1,Check-Out,H1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119385,23,August,35,30,2,5,2,0.0,0,BEL,...,0,No Deposit,394,,0,Transient,96.14,0,Check-Out,H2
119386,102,August,35,31,2,5,3,0.0,0,FRA,...,0,No Deposit,9,,0,Transient,225.43,2,Check-Out,H2
119387,34,August,35,31,2,5,2,0.0,0,DEU,...,0,No Deposit,9,,0,Transient,157.71,4,Check-Out,H2
119388,109,August,35,31,2,5,2,0.0,0,GBR,...,0,No Deposit,89,,0,Transient,104.40,0,Check-Out,H2


In [14]:
data[classification_columns].to_parquet('../../data/source/classification_data.parquet', compression = 'zstd')