In [24]:
import sqlite3
import pandas as pd

In [25]:
# Load the whole `noshow` table into memory, then release the SQLite handle.
# Note: sqlite3's `with conn:` only scopes a transaction and does not close
# the connection, so an explicit try/finally is used here.
# `conn` stays bound but is closed afterwards; reopen it before issuing any
# further queries.
conn = sqlite3.connect('data/noshow.db')
try:
    df = pd.read_sql_query('SELECT * FROM noshow', conn)
finally:
    conn.close()

In [26]:
# Show the freshly loaded table; as the cell's last expression it is rendered by the notebook.
df

Unnamed: 0,no_show,branch,booking_month,arrival_month,arrival_day,checkout_month,checkout_day,country,first_time,room,price,platform,num_adults,num_children
0,1.0,Changi,July,May,19.0,May,20.0,China,Yes,King,,Email,2,1.0
1,0.0,Orchard,December,February,28.0,March,-4.0,India,Yes,,USD$ 723.34,Email,1,0.0
2,0.0,Orchard,December,May,22.0,May,24.0,Australia,Yes,Single,SGD$ 650.94,Website,1,0.0
3,0.0,Orchard,October,September,2.0,September,4.0,China,Yes,,SGD$ 978.67,Agent,1,1.0
4,0.0,Orchard,February,February,7.0,February,8.0,China,Yes,King,,Website,1,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119386,0.0,Changi,October,March,17.0,March,19.0,China,Yes,King,SGD$ 953.94,Email,2,1.0
119387,0.0,Changi,August,June,2.0,June,3.0,Indonesia,Yes,King,USD$ 636.84,Email,1,1.0
119388,0.0,Changi,January,July,1.0,July,3.0,Indonesia,Yes,King,,Website,1,0.0
119389,0.0,Changi,January,May,26.0,May,29.0,Indonesia,Yes,King,USD$ 720.1,Email,1,1.0


We can already see that there is some incorrectly entered data which will need to be dealt with as we do the EDA. For example, for the second data point (index 1), the checkout day is negative (-4.0). We will need to investigate whether that means something; otherwise, we will have to clean it.

We will start by checking whether there are any null values.

In [27]:
# Per-column tally of missing entries, to see which features need cleaning.
df.isnull().sum()

no_show               1
branch                1
booking_month         1
arrival_month         1
arrival_day           1
checkout_month        1
checkout_day          1
country               1
first_time            1
room              21613
price             24882
platform              1
num_adults            1
num_children          1
dtype: int64

There seems to be exactly one missing value in almost every column (apart from `room` and `price`, which have far more). I suspect these all come from the same data point, so let's check.

In [28]:
# Select the rows whose target column `no_show` is missing.
df.loc[df['no_show'].isnull()]

Unnamed: 0,no_show,branch,booking_month,arrival_month,arrival_day,checkout_month,checkout_day,country,first_time,room,price,platform,num_adults,num_children
115536,,,,,,,,,,,,,,


As suspected, this is a single data point (index 115536) whose values are all null. For ease of analysis, we will remove it now.

In [29]:
# Remove every row in which all columns are null (here only index 115536).
# Unlike dropping by a hard-coded label, this does not raise KeyError when the
# cell is re-run and does not depend on where the empty row sits in the table.
df = df.dropna(how='all')

In [30]:
# Re-count missing entries per column to confirm the empty row is gone.
df.isnull().sum()

no_show               0
branch                0
booking_month         0
arrival_month         0
arrival_day           0
checkout_month        0
checkout_day          0
country               0
first_time            0
room              21612
price             24881
platform              0
num_adults            0
num_children          0
dtype: int64

Yep, the null values for all of the features apart from `room` and `price` have been removed. Now let's take a closer look at `room` and `price`.

In [31]:
# Inspect the bookings that have no room type recorded.
df.loc[df['room'].isnull()]

Unnamed: 0,no_show,branch,booking_month,arrival_month,arrival_day,checkout_month,checkout_day,country,first_time,room,price,platform,num_adults,num_children
1,0.0,Orchard,December,February,28.0,March,-4.0,India,Yes,,USD$ 723.34,Email,1,0.0
3,0.0,Orchard,October,September,2.0,September,4.0,China,Yes,,SGD$ 978.67,Agent,1,1.0
5,0.0,Changi,January,April,17.0,April,19.0,Indonesia,Yes,,USD$ 659.68,Phone,1,0.0
7,1.0,Changi,December,March,31.0,April,2.0,China,Yes,,USD$ 665.39,Website,2,0.0
8,1.0,Changi,July,June,14.0,June,17.0,China,Yes,,USD$ 700.23,Website,one,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119363,0.0,Changi,October,November,19.0,November,20.0,Indonesia,Yes,,USD$ 587.81,Email,2,1.0
119368,1.0,Orchard,April,June,9.0,June,11.0,China,No,,SGD$ 1469.56,Phone,two,0.0
119377,1.0,Orchard,January,September,19.0,September,21.0,China,Yes,,USD$ 962.33,Website,2,2.0
119379,0.0,Orchard,April,June,6.0,June,7.0,Japan,Yes,,USD$ 651.44,Website,1,0.0


The `room` column is missing a lot of values. We will probably have to fill them in, as 21612 data points have no room type recorded. Let's take a look at the missing values for `price`.

In [32]:
# Inspect the bookings that have no price recorded.
df.loc[df['price'].isnull()]

Unnamed: 0,no_show,branch,booking_month,arrival_month,arrival_day,checkout_month,checkout_day,country,first_time,room,price,platform,num_adults,num_children
0,1.0,Changi,July,May,19.0,May,20.0,China,Yes,King,,Email,2,1.0
4,0.0,Orchard,February,February,7.0,February,8.0,China,Yes,King,,Website,1,2.0
19,0.0,Orchard,December,January,19.0,January,20.0,China,Yes,King,,Agent,1,0.0
27,1.0,Changi,December,March,12.0,March,13.0,China,Yes,King,,Phone,1,1.0
28,0.0,Changi,July,July,18.0,July,20.0,China,Yes,King,,Agent,2,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119364,0.0,Changi,October,October,28.0,October,31.0,China,Yes,King,,Website,2,1.0
119365,1.0,Changi,December,May,2.0,May,3.0,China,Yes,King,,Agent,2,1.0
119380,1.0,Orchard,March,February,2.0,February,3.0,China,Yes,King,,Website,1,1.0
119385,1.0,Orchard,July,April,25.0,April,27.0,China,Yes,Queen,,Website,1,0.0


I'm hoping that no row is missing both `room` and `price`, as that would make it easier to fill in the missing values: room type would be a good predictor of price. Let's take a look.

In [33]:
# Rows where `price` and `room` are both missing; an empty result means each
# such row still has one of the two fields available.
df[df[['price', 'room']].isna().all(axis=1)]

Unnamed: 0,no_show,branch,booking_month,arrival_month,arrival_day,checkout_month,checkout_day,country,first_time,room,price,platform,num_adults,num_children
