In [73]:
import inspect
import os
import sys

import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display
from scipy.stats import kurtosis
from scipy.stats import skew

# Show every column when a DataFrame is rendered instead of truncating wide frames
pd.options.display.max_columns = None

# Parent directory of sys.path[0], kept as a plain string so it can be
# concatenated with a relative path. os.path.dirname understands the platform's
# path separator, whereas splitting on a literal '/' yields '' for Windows paths.
parentPath = os.path.dirname(sys.path[0])

In [62]:
# Load the Expedia sample CSV from the data/ folder located under parentPath
df = pd.read_csv(filepath_or_buffer=parentPath + '/data/pd_dfExpediaSample.csv')

In [74]:
# Preview the first eight rows. All columns are visible because
# pd.options.display.max_columns is set to None in the setup cell.
display(df.head(n=8))

Unnamed: 0,date_time,site_name,posa_continent,user_location_country,user_location_region,user_location_city,orig_destination_distance,user_id,is_mobile,is_package,channel,srch_ci,srch_co,srch_adults_cnt,srch_children_cnt,srch_rm_cnt,srch_destination_id,srch_destination_type_id,is_booking,cnt,hotel_continent,hotel_country,hotel_market,hotel_cluster
0,2014-12-25 03:03:12,24,2,3,64,9448,,2451,True,False,9,2015-02-05 00:00:00,2015-02-06 00:00:00,2,0,1,8785,1,False,1,6,105,35,2
1,2014-07-10 01:21:52,25,2,23,48,4924,,3972,True,False,1,2014-08-13 00:00:00,2014-08-14 00:00:00,2,1,1,8278,1,False,1,2,50,368,41
2,2014-03-04 07:15:11,2,3,66,311,31592,1713.1043,7030,False,True,0,2014-05-06 00:00:00,2014-05-12 00:00:00,2,0,1,8791,1,False,4,4,8,110,87
3,2014-07-23 07:26:18,2,3,215,646,51733,426.0613,7573,False,False,9,2014-11-20 00:00:00,2014-11-23 00:00:00,2,0,1,33782,6,False,1,4,8,1532,86
4,2013-04-02 11:14:35,24,2,3,50,5703,,9616,False,False,1,2013-04-30 00:00:00,2013-05-03 00:00:00,2,1,1,20225,6,False,1,3,182,46,29
5,2014-09-08 01:54:39,24,2,3,45,31840,,10163,False,False,5,2014-09-16 00:00:00,2014-09-19 00:00:00,2,0,1,8253,1,False,1,6,70,19,11
6,2014-05-08 12:52:35,2,3,66,184,34619,90.4694,14795,False,False,1,2014-06-07 00:00:00,2014-06-08 00:00:00,2,3,1,8236,1,False,3,2,50,656,91
7,2014-08-25 22:13:33,2,3,66,462,52482,5667.1004,14943,False,False,1,2014-09-07 00:00:00,2014-09-11 00:00:00,2,0,1,21787,6,False,1,6,105,29,8


In [64]:
# Inspect the dtype pandas inferred for each column when the CSV was read
df.dtypes

date_time                     object
site_name                      int64
posa_continent                 int64
user_location_country          int64
user_location_region           int64
user_location_city             int64
orig_destination_distance    float64
user_id                        int64
is_mobile                      int64
is_package                     int64
channel                        int64
srch_ci                       object
srch_co                       object
srch_adults_cnt                int64
srch_children_cnt              int64
srch_rm_cnt                    int64
srch_destination_id            int64
srch_destination_type_id       int64
is_booking                     int64
cnt                            int64
hotel_continent                int64
hotel_country                  int64
hotel_market                   int64
hotel_cluster                  int64
dtype: object

### Data Dictionary:

Column name	- Description

date_time - Timestamp

site_name - ID of the Expedia point of sale (i.e. Expedia.com, Expedia.co.uk, Expedia.co.jp, ...)

posa_continent - ID of continent associated with site_name

user_location_country - The ID of the country in which the customer is located

user_location_region - The ID of the region in which the customer is located

user_location_city - The ID of the city in which the customer is located

orig_destination_distance - Physical distance between a hotel and a customer at the time of search. A null means the distance could not be calculated

user_id - ID of user

is_mobile - 1 when a user connected from a mobile device, 0 otherwise

is_package - 1 if the click/booking was generated as a part of a package (i.e. combined with a flight), 0 otherwise

channel - ID of a marketing channel

srch_ci - Checkin date

srch_co - Checkout date

srch_adults_cnt - The number of adults specified in the hotel room

srch_children_cnt - The number of (extra occupancy) children specified in the hotel room

srch_rm_cnt - The number of hotel rooms specified in the search

srch_destination_id - ID of the destination where the hotel search was performed

srch_destination_type_id - Type of destination

hotel_continent - Hotel continent

hotel_country - Hotel country

hotel_market - Hotel market

is_booking - 1 if a booking, 0 if a click

cnt - Number of similar events in the context of the same user session

hotel_cluster - ID of a hotel cluster

Many of these fields were read in with the wrong data type (ID codes and 0/1 flags were all inferred as integers), so I will convert them to the correct types before taking a closer look at the data.

In [65]:
# The ID columns hold integer codes that act as labels rather than quantities,
# so each one is re-typed as a pandas categorical.
id_columns = [
    'site_name',
    'posa_continent',
    'user_location_country',
    'user_location_region',
    'user_location_city',
    'user_id',
    'channel',
    'srch_destination_id',
    'srch_destination_type_id',
    'hotel_continent',
    'hotel_country',
    'hotel_market',
    'hotel_cluster',
]
for id_column in id_columns:
    df[id_column] = df[id_column].astype('category')

There are also boolean features that are currently stored as integers, so I will convert those to the correct type as well.

In [66]:
# The is_* columns are 0/1 indicators; store them as real booleans
for flag_column in ('is_mobile', 'is_package', 'is_booking'):
    df[flag_column] = df[flag_column].astype('bool')

In [67]:
# Re-check the column dtypes now that the category and bool conversions have run
df.dtypes

date_time                      object
site_name                    category
posa_continent               category
user_location_country        category
user_location_region         category
user_location_city           category
orig_destination_distance     float64
user_id                      category
is_mobile                        bool
is_package                       bool
channel                      category
srch_ci                        object
srch_co                        object
srch_adults_cnt                 int64
srch_children_cnt               int64
srch_rm_cnt                     int64
srch_destination_id          category
srch_destination_type_id     category
is_booking                       bool
cnt                             int64
hotel_continent              category
hotel_country                category
hotel_market                 category
hotel_cluster                category
dtype: object

Exploratory Data Analysis Report on categorical variables

In [70]:
# Keep every column whose dtype is not numeric. After the conversions above
# that means the object (date string), category and bool columns.
dfcat = df.select_dtypes(exclude='number')
dfcat.columns

Index(['date_time', 'site_name', 'posa_continent', 'user_location_country',
       'user_location_region', 'user_location_city', 'user_id', 'is_mobile',
       'is_package', 'channel', 'srch_ci', 'srch_co', 'srch_destination_id',
       'srch_destination_type_id', 'is_booking', 'hotel_continent',
       'hotel_country', 'hotel_market', 'hotel_cluster'],
      dtype='object')

In [71]:
# Create the categorical variable table
def summarize_categorical(frame):
    """Summarize each column of a non-numeric DataFrame in one row.

    Parameters
    ----------
    frame : pandas.DataFrame
        Columns to describe (here: the object, category and bool features).

    Returns
    -------
    pandas.DataFrame
        One row per column of ``frame`` with:
        'Variable Name'   - the column label
        'Data Type'       - the column dtype as a string
        'Levels'          - number of distinct non-missing values
        'Missing Values'  - count of missing entries
        'Percent Missing' - missing share of all rows, formatted to 2 decimals
        An empty ``frame`` yields an empty table with the same columns.
    """
    summary_columns = ['Variable Name', 'Data Type', 'Levels',
                       'Missing Values', 'Percent Missing']
    n_rows = len(frame)
    records = []
    # Positional access keeps this working even if column labels repeat
    for position, column_name in enumerate(frame.columns):
        column = frame.iloc[:, position]
        n_missing = int(column.isna().sum())
        pct_missing = n_missing / n_rows * 100 if n_rows else 0.0
        records.append({
            'Variable Name': column_name,
            'Data Type': str(column.dtype),
            # nunique() ignores NaN: missing values get their own column and
            # should not be counted as an extra level
            'Levels': column.nunique(),
            'Missing Values': n_missing,
            'Percent Missing': '%.2f' % pct_missing,
        })
    # Build the table once from the collected rows instead of growing it
    # one .loc assignment at a time
    return pd.DataFrame(records, columns=summary_columns)


report = summarize_categorical(dfcat)
report

Unnamed: 0,Variable Name,Data Type,Levels,Missing Values,Percent Missing
0,date_time,object,74906,0,0.0
1,site_name,category,42,0,0.0
2,posa_continent,category,5,0,0.0
3,user_location_country,category,197,0,0.0
4,user_location_region,category,740,0,0.0
5,user_location_city,category,9541,0,0.0
6,user_id,category,68475,0,0.0
7,is_mobile,bool,2,0,0.0
8,is_package,bool,2,0,0.0
9,channel,category,11,0,0.0


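Immediately, it stands out that several of the ID features are extremely high-cardinality (`date_time` has 74,906 distinct values, `user_id` 68,475 and `user_location_city` 9,541), while none of the variables shown above has any missing values.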

Exploratory Data Analysis Report on numeric variables

In [None]:
# Subset the numeric features
dfnum = df.select_dtypes(include=[np.number])


def summarize_numeric(frame):
    """Summarize each column of a numeric DataFrame in one row.

    All statistics are computed on the non-missing values of a column, so a
    column containing NaN still gets finite statistics.

    Parameters
    ----------
    frame : pandas.DataFrame
        Numeric columns to describe.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``frame`` with:
        'Variable Name'      - the column label
        'Data Type'          - the column dtype as a string
        'Mean'               - mean, rounded to 2 decimals
        'Standard Deviation' - population standard deviation (ddof=0),
                               rounded to 2 decimals
        'Skew.2SE'           - skewness / (2 * standard error of skewness)
        'Kurtosis.2SE'       - excess kurtosis / (2 * standard error of kurtosis)
        'Missing Values'     - count of missing entries
        'Percent Missing'    - missing share of all rows, formatted to 2 decimals
        The two *.2SE values are 'nan' when a column has too few observations
        for the standard error to be defined (fewer than 3 / 4 values).
    """
    summary_columns = ['Variable Name', 'Data Type', 'Mean',
                       'Standard Deviation', 'Skew.2SE', 'Kurtosis.2SE',
                       'Missing Values', 'Percent Missing']
    n_rows = len(frame)
    records = []
    # Positional access keeps this working even if column labels repeat
    for position, column_name in enumerate(frame.columns):
        column = frame.iloc[:, position]
        observed = column.dropna()
        n_obs = len(observed)
        n_missing = int(column.isna().sum())
        pct_missing = n_missing / n_rows * 100 if n_rows else 0.0

        # Standard errors of skewness and kurtosis depend only on the sample
        # size. (Dividing by the standard error of the *mean* would make the
        # ratio depend on the variable's units.)
        if n_obs > 2:
            se_skew = np.sqrt(
                6.0 * n_obs * (n_obs - 1)
                / ((n_obs - 2) * (n_obs + 1) * (n_obs + 3))
            )
            skew_2se = float(skew(observed)) / (2 * se_skew)
        else:
            skew_2se = np.nan
        if n_obs > 3:
            se_kurt = np.sqrt(
                24.0 * n_obs * (n_obs - 1) ** 2
                / ((n_obs - 3) * (n_obs - 2) * (n_obs + 3) * (n_obs + 5))
            )
            kurt_2se = float(kurtosis(observed)) / (2 * se_kurt)
        else:
            kurt_2se = np.nan

        records.append({
            'Variable Name': column_name,
            'Data Type': str(column.dtype),
            'Mean': round(float(observed.mean()), 2),
            # ddof=0 matches what np.std computes
            'Standard Deviation': round(float(observed.std(ddof=0)), 2),
            'Skew.2SE': '%.2f' % skew_2se,
            'Kurtosis.2SE': '%.2f' % kurt_2se,
            'Missing Values': n_missing,
            'Percent Missing': '%.2f' % pct_missing,
        })
    # Build the table once from the collected rows instead of growing it
    # one .loc assignment at a time
    return pd.DataFrame(records, columns=summary_columns)


# Create the numeric variable table
report = summarize_numeric(dfnum)
report

In [None]:
# Correlation matrix restricted to the numeric features of df.
# (The previous source frame name, pd_dfExpediaSamplenum, is not defined
# anywhere in this notebook and raised a NameError.)
dfnum = df.select_dtypes(include=[np.number])

# Pairwise correlation between the numeric columns (DataFrame.corr default method)
corr = dfnum.corr()

corr

In [None]:
# Heatmap of the correlation matrix `corr`. The colour scale is fixed to
# [-1, 1] and centred on 0 so positive and negative correlations are comparable.
ax = sns.heatmap(
    corr,
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True
)
# A title lets the figure stand on its own when the notebook is skimmed
ax.set_title('Correlation between numeric features')
# Rotate the x tick labels so long column names do not overlap;
# the trailing ';' suppresses the list-of-Text repr in the cell output
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=64,
    horizontalalignment='right'
);