# Converting Data Source Files to Database

In [1]:
import duckdb
import pandas as pd
from pathlib import Path

In [2]:
h1 = pd.read_csv('../../data/source/H1.csv')
h2 = pd.read_csv('../../data/source/H2.csv')

In [3]:
h1.head()

Unnamed: 0,IsCanceled,LeadTime,ArrivalDateYear,ArrivalDateMonth,ArrivalDateWeekNumber,ArrivalDateDayOfMonth,StaysInWeekendNights,StaysInWeekNights,Adults,Children,...,DepositType,Agent,Company,DaysInWaitingList,CustomerType,ADR,RequiredCarParkingSpaces,TotalOfSpecialRequests,ReservationStatus,ReservationStatusDate
0,0,342,2015,July,27,1,0,0,2,0,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,0,737,2015,July,27,1,0,0,2,0,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,0,7,2015,July,27,1,0,1,1,0,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,0,13,2015,July,27,1,0,1,1,0,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,0,14,2015,July,27,1,0,2,2,0,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [4]:
h1['HotelName'] = 'H1'
h2['HotelName'] = 'H2'

In [5]:
data = pd.concat([h1, h2], axis = 1)
data.head()

Unnamed: 0,IsCanceled,LeadTime,ArrivalDateYear,ArrivalDateMonth,ArrivalDateWeekNumber,ArrivalDateDayOfMonth,StaysInWeekendNights,StaysInWeekNights,Adults,Children,...,Agent,Company,DaysInWaitingList,CustomerType,ADR,RequiredCarParkingSpaces,TotalOfSpecialRequests,ReservationStatus,ReservationStatusDate,HotelName
0,0.0,342.0,2015.0,July,27.0,1.0,0.0,0.0,2.0,0.0,...,6,,0,Transient,0.0,0,0,Check-Out,2015-07-03,H2
1,0.0,737.0,2015.0,July,27.0,1.0,0.0,0.0,2.0,0.0,...,9,,0,Transient,76.5,0,1,Canceled,2015-07-01,H2
2,0.0,7.0,2015.0,July,27.0,1.0,0.0,1.0,1.0,0.0,...,9,,0,Transient,68.0,0,1,Canceled,2015-04-30,H2
3,0.0,13.0,2015.0,July,27.0,1.0,0.0,1.0,1.0,0.0,...,9,,0,Transient,76.5,0,2,Canceled,2015-06-23,H2
4,0.0,14.0,2015.0,July,27.0,1.0,0.0,2.0,2.0,0.0,...,9,,0,Transient,76.5,0,1,Canceled,2015-04-02,H2


In [6]:
## Create connection to database and add data; create either if nonexistent
with duckdb.connect(database='../../data/reservation_data.duckdb', read_only=False) as conn:
    conn.execute("CREATE TABLE IF NOT EXISTS data AS SELECT * FROM data")

with duckdb.connect(database='reservation_data.duckdb', read_only=False) as conn:
    result = conn.execute("SELECT * FROM data LIMIT 100").fetchdf()
    display(result)

Unnamed: 0,IsCanceled,LeadTime,ArrivalDateYear,ArrivalDateMonth,ArrivalDateWeekNumber,ArrivalDateDayOfMonth,StaysInWeekendNights,StaysInWeekNights,Adults,Children,...,Agent_1,Company_1,DaysInWaitingList_1,CustomerType_1,ADR_1,RequiredCarParkingSpaces_1,TotalOfSpecialRequests_1,ReservationStatus_1,ReservationStatusDate_1,HotelName_1
0,0.0,342.0,2015.0,July,27.0,1.0,0.0,0.0,2.0,0.0,...,6,,0,Transient,0.0,0,0,Check-Out,2015-07-03,H2
1,0.0,737.0,2015.0,July,27.0,1.0,0.0,0.0,2.0,0.0,...,9,,0,Transient,76.5,0,1,Canceled,2015-07-01,H2
2,0.0,7.0,2015.0,July,27.0,1.0,0.0,1.0,1.0,0.0,...,9,,0,Transient,68.0,0,1,Canceled,2015-04-30,H2
3,0.0,13.0,2015.0,July,27.0,1.0,0.0,1.0,1.0,0.0,...,9,,0,Transient,76.5,0,2,Canceled,2015-06-23,H2
4,0.0,14.0,2015.0,July,27.0,1.0,0.0,2.0,2.0,0.0,...,9,,0,Transient,76.5,0,1,Canceled,2015-04-02,H2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1.0,72.0,2015.0,July,27.0,3.0,0.0,2.0,2.0,0.0,...,17,,0,Transient-Party,72.5,0,0,Check-Out,2015-07-14,H2
96,1.0,63.0,2015.0,July,27.0,3.0,2.0,5.0,2.0,0.0,...,17,,0,Transient-Party,94.5,0,0,Check-Out,2015-07-14,H2
97,0.0,63.0,2015.0,July,27.0,3.0,2.0,5.0,3.0,0.0,...,17,,0,Transient-Party,94.5,0,0,Canceled,2015-06-23,H2
98,0.0,101.0,2015.0,July,27.0,3.0,2.0,5.0,2.0,1.0,...,17,,0,Transient-Party,94.5,0,0,Canceled,2015-06-23,H2


In [7]:
conn.close()