# Converting Data Source Files to Database

In [1]:
import duckdb
import pandas as pd
import uuid

# Load Data

In [2]:
h1 = pd.read_csv('../../data/source/H1.csv')
h2 = pd.read_csv('../../data/source/H2.csv')

In [3]:
h1.head()

Unnamed: 0,IsCanceled,LeadTime,ArrivalDateYear,ArrivalDateMonth,ArrivalDateWeekNumber,ArrivalDateDayOfMonth,StaysInWeekendNights,StaysInWeekNights,Adults,Children,...,DepositType,Agent,Company,DaysInWaitingList,CustomerType,ADR,RequiredCarParkingSpaces,TotalOfSpecialRequests,ReservationStatus,ReservationStatusDate
0,0,342,2015,July,27,1,0,0,2,0,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,0,737,2015,July,27,1,0,0,2,0,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,0,7,2015,July,27,1,0,1,1,0,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,0,13,2015,July,27,1,0,1,1,0,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,0,14,2015,July,27,1,0,2,2,0,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [4]:
h1['HotelName'] = 'H1'
h2['HotelName'] = 'H2'

In [7]:
data = pd.concat([h1, h2], axis = 0)
data.head()

Unnamed: 0,IsCanceled,LeadTime,ArrivalDateYear,ArrivalDateMonth,ArrivalDateWeekNumber,ArrivalDateDayOfMonth,StaysInWeekendNights,StaysInWeekNights,Adults,Children,...,Agent,Company,DaysInWaitingList,CustomerType,ADR,RequiredCarParkingSpaces,TotalOfSpecialRequests,ReservationStatus,ReservationStatusDate,HotelName
0,0,342,2015,July,27,1,0,0,2,0.0,...,,,0,Transient,0.0,0,0,Check-Out,2015-07-01,H1
1,0,737,2015,July,27,1,0,0,2,0.0,...,,,0,Transient,0.0,0,0,Check-Out,2015-07-01,H1
2,0,7,2015,July,27,1,0,1,1,0.0,...,,,0,Transient,75.0,0,0,Check-Out,2015-07-02,H1
3,0,13,2015,July,27,1,0,1,1,0.0,...,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02,H1
4,0,14,2015,July,27,1,0,2,2,0.0,...,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03,H1


# Add UUIDs to Reservations

In [8]:
data

Unnamed: 0,IsCanceled,LeadTime,ArrivalDateYear,ArrivalDateMonth,ArrivalDateWeekNumber,ArrivalDateDayOfMonth,StaysInWeekendNights,StaysInWeekNights,Adults,Children,...,Agent,Company,DaysInWaitingList,CustomerType,ADR,RequiredCarParkingSpaces,TotalOfSpecialRequests,ReservationStatus,ReservationStatusDate,HotelName
0,0,342,2015,July,27,1,0,0,2,0.0,...,,,0,Transient,0.00,0,0,Check-Out,2015-07-01,H1
1,0,737,2015,July,27,1,0,0,2,0.0,...,,,0,Transient,0.00,0,0,Check-Out,2015-07-01,H1
2,0,7,2015,July,27,1,0,1,1,0.0,...,,,0,Transient,75.00,0,0,Check-Out,2015-07-02,H1
3,0,13,2015,July,27,1,0,1,1,0.0,...,304,,0,Transient,75.00,0,0,Check-Out,2015-07-02,H1
4,0,14,2015,July,27,1,0,2,2,0.0,...,240,,0,Transient,98.00,0,1,Check-Out,2015-07-03,H1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79325,0,23,2017,August,35,30,2,5,2,0.0,...,394,,0,Transient,96.14,0,0,Check-Out,2017-09-06,H2
79326,0,102,2017,August,35,31,2,5,3,0.0,...,9,,0,Transient,225.43,0,2,Check-Out,2017-09-07,H2
79327,0,34,2017,August,35,31,2,5,2,0.0,...,9,,0,Transient,157.71,0,4,Check-Out,2017-09-07,H2
79328,0,109,2017,August,35,31,2,5,2,0.0,...,89,,0,Transient,104.40,0,0,Check-Out,2017-09-07,H2


In [9]:
# Generate UUIDs for each row in the dataframe
data['UUID'] = [uuid.uuid4() for _ in range(len(data))]
data['UUID'].head()

0    f7add1ce-8fdb-4351-a20a-3c5cc66fc87d
1    990a199b-069b-4e78-9036-fe8ea73a1f3e
2    91d6303b-16f8-41b2-a195-f6bcb9ee605a
3    df309bdf-5fd8-4789-9b52-01de65e43a22
4    007c51b0-4d0e-40ca-b550-34c93d2ca9a7
Name: UUID, dtype: object

In [10]:
data

Unnamed: 0,IsCanceled,LeadTime,ArrivalDateYear,ArrivalDateMonth,ArrivalDateWeekNumber,ArrivalDateDayOfMonth,StaysInWeekendNights,StaysInWeekNights,Adults,Children,...,Company,DaysInWaitingList,CustomerType,ADR,RequiredCarParkingSpaces,TotalOfSpecialRequests,ReservationStatus,ReservationStatusDate,HotelName,UUID
0,0,342,2015,July,27,1,0,0,2,0.0,...,,0,Transient,0.00,0,0,Check-Out,2015-07-01,H1,f7add1ce-8fdb-4351-a20a-3c5cc66fc87d
1,0,737,2015,July,27,1,0,0,2,0.0,...,,0,Transient,0.00,0,0,Check-Out,2015-07-01,H1,990a199b-069b-4e78-9036-fe8ea73a1f3e
2,0,7,2015,July,27,1,0,1,1,0.0,...,,0,Transient,75.00,0,0,Check-Out,2015-07-02,H1,91d6303b-16f8-41b2-a195-f6bcb9ee605a
3,0,13,2015,July,27,1,0,1,1,0.0,...,,0,Transient,75.00,0,0,Check-Out,2015-07-02,H1,df309bdf-5fd8-4789-9b52-01de65e43a22
4,0,14,2015,July,27,1,0,2,2,0.0,...,,0,Transient,98.00,0,1,Check-Out,2015-07-03,H1,007c51b0-4d0e-40ca-b550-34c93d2ca9a7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79325,0,23,2017,August,35,30,2,5,2,0.0,...,,0,Transient,96.14,0,0,Check-Out,2017-09-06,H2,df4a8193-4b75-499a-980c-e965c9d0e078
79326,0,102,2017,August,35,31,2,5,3,0.0,...,,0,Transient,225.43,0,2,Check-Out,2017-09-07,H2,3151aa1f-530a-4650-b37e-9f1cbace4b9f
79327,0,34,2017,August,35,31,2,5,2,0.0,...,,0,Transient,157.71,0,4,Check-Out,2017-09-07,H2,0425bc44-e7d7-4c4a-988a-4596901c6028
79328,0,109,2017,August,35,31,2,5,2,0.0,...,,0,Transient,104.40,0,0,Check-Out,2017-09-07,H2,70b6aa09-9785-4205-8ee6-d51485789de1


# Create Database and Add Data

In [6]:
## Create connection to database and add data; create either if nonexistent
with duckdb.connect(database='../../data/reservation_data.duckdb', read_only=False) as conn:
    conn.execute("CREATE TABLE IF NOT EXISTS data AS SELECT * FROM data")

with duckdb.connect(database='reservation_data.duckdb', read_only=False) as conn:
    result = conn.execute("SELECT * FROM data LIMIT 100").fetchdf()
    display(result)

Unnamed: 0,IsCanceled,LeadTime,ArrivalDateYear,ArrivalDateMonth,ArrivalDateWeekNumber,ArrivalDateDayOfMonth,StaysInWeekendNights,StaysInWeekNights,Adults,Children,...,Agent_1,Company_1,DaysInWaitingList_1,CustomerType_1,ADR_1,RequiredCarParkingSpaces_1,TotalOfSpecialRequests_1,ReservationStatus_1,ReservationStatusDate_1,HotelName_1
0,0.0,342.0,2015.0,July,27.0,1.0,0.0,0.0,2.0,0.0,...,6,,0,Transient,0.0,0,0,Check-Out,2015-07-03,H2
1,0.0,737.0,2015.0,July,27.0,1.0,0.0,0.0,2.0,0.0,...,9,,0,Transient,76.5,0,1,Canceled,2015-07-01,H2
2,0.0,7.0,2015.0,July,27.0,1.0,0.0,1.0,1.0,0.0,...,9,,0,Transient,68.0,0,1,Canceled,2015-04-30,H2
3,0.0,13.0,2015.0,July,27.0,1.0,0.0,1.0,1.0,0.0,...,9,,0,Transient,76.5,0,2,Canceled,2015-06-23,H2
4,0.0,14.0,2015.0,July,27.0,1.0,0.0,2.0,2.0,0.0,...,9,,0,Transient,76.5,0,1,Canceled,2015-04-02,H2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1.0,72.0,2015.0,July,27.0,3.0,0.0,2.0,2.0,0.0,...,17,,0,Transient-Party,72.5,0,0,Check-Out,2015-07-14,H2
96,1.0,63.0,2015.0,July,27.0,3.0,2.0,5.0,2.0,0.0,...,17,,0,Transient-Party,94.5,0,0,Check-Out,2015-07-14,H2
97,0.0,63.0,2015.0,July,27.0,3.0,2.0,5.0,3.0,0.0,...,17,,0,Transient-Party,94.5,0,0,Canceled,2015-06-23,H2
98,0.0,101.0,2015.0,July,27.0,3.0,2.0,5.0,2.0,1.0,...,17,,0,Transient-Party,94.5,0,0,Canceled,2015-06-23,H2


In [7]:
conn.close()