In [None]:
import os

def scale_input_data(scale_factor):
  file_bases = ['./input/air_visit_data', './input/air_store_info', './input/air_reserve',
                './input/hpg_reserve', './input/store_id_relation', './input/sample_submission',
                './input/date_info']
  for file_base in file_bases:
    import pandas as pd
    import shutil
    if scale_factor == 1.0:
      shutil.copyfile(file_base + '.csv', file_base + '.scaled.csv')
      continue
    df_to_scale = pd.read_csv(file_base + '.csv')
    new_num_rows = int(scale_factor * len(df_to_scale))
    if scale_factor <= 1.0:
      df_to_scale = df_to_scale.iloc[:new_num_rows]
    else:
      while len(df_to_scale) < new_num_rows:
        df_to_scale = pd.concat([df_to_scale, df_to_scale[:min(new_num_rows - len(df_to_scale), len(df_to_scale))]])
    df_to_scale.to_csv(file_base + '.scaled.csv', index=False)

if 'INPUT_SCALE_FACTOR' in os.environ:
  scale_input_data(float(os.environ['INPUT_SCALE_FACTOR']))

# Feature Engineering based on`Suprise Me´
Because most of the Suprise Me kernels are very straight forward I created a more structured version of the feature engineering process with more comments. Maybe it can be a help for someone else too. At the end I added a small dictionary which describes all attributes that are created over the time.

In [1]:
import glob
import re
import numpy as np
# import pandas as pd
exec(os.environ['IREWR_IMPORTS'])

# Remove the restriction for the max dataframe width
# pd.options.display.max_columns = 250
# pd.options.display.max_rows = 250

# FIRST-AUTHOR: remove ML code
# from sklearn.preprocessing import LabelEncoder
from datetime import datetime
from IPython.display import display

# Load the datasets with pandas

In [2]:
data = {
    'gt_visits': pd.read_csv('./input/air_visit_data.scaled.csv'),
    'air_store_info': pd.read_csv('./input/air_store_info.scaled.csv'),
    #'hs': pd.read_csv('../input/hpg_store_info.csv'),
    'air_reserve_history': pd.read_csv('./input/air_reserve.scaled.csv'),
    'hpg_reserve_history': pd.read_csv('./input/hpg_reserve.scaled.csv'),
    
    'hpg_to_air_id': pd.read_csv('./input/store_id_relation.scaled.csv'),
    'subm_visits': pd.read_csv('./input/sample_submission.scaled.csv'),
    'holidays': pd.read_csv('./input/date_info.scaled.csv').rename(columns={'calendar_date':'visit_date'})
}

## Merge the reservation histories of both plattforms

In [3]:
###############################################################################################
# Get the air-reserve id  of the hpg restaurants
###############################################################################################
data['hpg_reserve_history'] = pd.merge(
    data['hpg_reserve_history'], data['hpg_to_air_id'], 
    how='inner', on=['hpg_store_id']
)

###############################################################################################
# Drop the HPG id
###############################################################################################
data['hpg_reserve_history'] = data['hpg_reserve_history'].drop('hpg_store_id', axis=1)

display(data['hpg_reserve_history'].shape)

(28183, 4)

In [4]:
###############################################################################################
# Append the HPG reservations to the air-reserve history
###############################################################################################
print("Shape before: ", data['air_reserve_history'].shape)

reservation_history = data['air_reserve_history'].append(data['hpg_reserve_history'], sort="True")
reservation_history = data['air_reserve_history'].sort_values(by=['air_store_id', 'reserve_datetime'])
reservation_history = data['air_reserve_history'].reset_index()
reservation_history = reservation_history.drop('index', axis=1)

display(reservation_history.head())
print("Shape after: ", reservation_history.shape)

Shape before:  (92378, 4)


  reservation_history = data['air_reserve_history'].append(data['hpg_reserve_history'], sort="True")
  reservation_history = data['air_reserve_history'].append(data['hpg_reserve_history'], sort="True")


Unnamed: 0,air_store_id,visit_datetime,reserve_datetime,reserve_visitors
0,air_877f79706adbfb06,2016-01-01 19:00:00,2016-01-01 16:00:00,1
1,air_db4b38ebe7a7ceff,2016-01-01 19:00:00,2016-01-01 19:00:00,3
2,air_db4b38ebe7a7ceff,2016-01-01 19:00:00,2016-01-01 19:00:00,6
3,air_877f79706adbfb06,2016-01-01 20:00:00,2016-01-01 16:00:00,2
4,air_db80363d35f10926,2016-01-01 20:00:00,2016-01-01 01:00:00,5


Shape after:  (92378, 4)


In [5]:
###############################################################################################
# Log transform the the ammount of reserved visitors for this day
###############################################################################################
reservation_history['reserve_visitors'] = np.log1p(reservation_history['reserve_visitors'])

In [6]:
###############################################################################################
# Convert dates into datetime objects / get the day of the week / 
# cut off hours, minutes and seconds
###############################################################################################
reservation_history['visit_datetime'] = pd.to_datetime(reservation_history['visit_datetime'])
reservation_history['visit_dow'] = reservation_history['visit_datetime'].dt.dayofweek
reservation_history['visit_datetime'] = reservation_history['visit_datetime'].dt.date
reservation_history['reserve_datetime'] = pd.to_datetime(reservation_history['reserve_datetime'])
reservation_history['reserve_datetime'] = reservation_history['reserve_datetime'].dt.date

###############################################################################################
# Calculate the time difference between reservation and visit
###############################################################################################
reservation_history['reserve_datetime_diff'] = reservation_history.apply(
    lambda r: (r['visit_datetime'] - r['reserve_datetime']).days,
    axis=1
)

In [7]:
reservation_history[reservation_history.air_store_id == 'air_00a91d42b08b08d9']

Unnamed: 0,air_store_id,visit_datetime,reserve_datetime,reserve_visitors,visit_dow,reserve_datetime_diff
17963,air_00a91d42b08b08d9,2016-10-31,2016-10-31,1.098612,0,0
30808,air_00a91d42b08b08d9,2016-12-05,2016-12-01,2.302585,0,4
34613,air_00a91d42b08b08d9,2016-12-14,2016-12-08,2.944439,2,6
36458,air_00a91d42b08b08d9,2016-12-17,2016-12-11,1.098612,5,6
37992,air_00a91d42b08b08d9,2016-12-20,2016-12-18,1.609438,1,2
63768,air_00a91d42b08b08d9,2017-02-18,2017-02-13,2.302585,5,5
65859,air_00a91d42b08b08d9,2017-02-23,2017-02-21,2.564949,3,2
68499,air_00a91d42b08b08d9,2017-03-01,2017-02-18,1.386294,2,11
73921,air_00a91d42b08b08d9,2017-03-14,2017-03-14,1.609438,1,0
77076,air_00a91d42b08b08d9,2017-03-21,2017-03-16,1.386294,1,5


In [8]:
###############################################################################################
# Group the reservations in to subgroubs:
# 
# EARLY RESERVATIONS
# sum_res_diff_er ==> SUM reservation_diff on this day
# sum_vis_er ==> SUM reservated visitors this day
# avg_res_diff_er ==> AVG reservation_diff on this day
# avg_vis_er ==> AVG reservated visitors this day
#
# LATE RESERVATIONS
# sum_res_diff_lr ==> SUM reservation_diff on this day
# sum_vis_lr ==> SUM reservated visitors this day
# avg_res_diff_lr ==> AVG reservation_diff on this day
# avg_vis_lr ==> AVG reservated visitors this day
###############################################################################################
reservation_history['early_reservation'] = reservation_history['reserve_datetime_diff'] > 2
reservation_history['late_reservation'] = reservation_history['reserve_datetime_diff'] <= 2

# SUM early reservations
tmp1 = reservation_history[reservation_history['early_reservation']].groupby(
    ['air_store_id','visit_datetime'], as_index=False
)[['reserve_datetime_diff', 'reserve_visitors']]
tmp1 = tmp1.sum()
tmp1 = tmp1.rename(columns={
    'visit_datetime':'visit_date',
    'reserve_datetime_diff': 'sum_res_diff_er',
    'reserve_visitors':'sum_vis_er'
})

# AVG early reservations
tmp2 = reservation_history[reservation_history['early_reservation']].groupby(
    ['air_store_id','visit_datetime'], as_index=False
)[['reserve_datetime_diff', 'reserve_visitors']]
tmp2 = tmp2.mean()
tmp2 = tmp2.rename(columns={
    'visit_datetime':'visit_date',
    'reserve_datetime_diff': 'avg_res_diff_er',
    'reserve_visitors':'avg_vis_er'
})

# SUM late reservations
tmp3 = reservation_history[reservation_history['late_reservation']].groupby(
    ['air_store_id','visit_datetime'], as_index=False
)[['reserve_datetime_diff', 'reserve_visitors']]
tmp3 = tmp3.sum()
tmp3 = tmp3.rename(columns={
    'visit_datetime':'visit_date', 
    'reserve_datetime_diff': 'sum_res_diff_lr', 
    'reserve_visitors':'sum_vis_lr'
})

# AVG late reservations
tmp4 = reservation_history[reservation_history['late_reservation']].groupby(
    ['air_store_id','visit_datetime'], as_index=False
)[['reserve_datetime_diff', 'reserve_visitors']]
tmp4 = tmp4.mean()
tmp4 = tmp4.rename(columns={
    'visit_datetime':'visit_date', 
    'reserve_datetime_diff': 'avg_res_diff_lr',
    'reserve_visitors':'avg_vis_lr'
})

reservation_history = pd.merge(tmp1, tmp2, how='inner', on=['air_store_id','visit_date'])
reservation_history = pd.merge(reservation_history, tmp3, how='outer', on=['air_store_id','visit_date'])
reservation_history = pd.merge(reservation_history, tmp4, how='outer', on=['air_store_id','visit_date'])

reservation_history.head()

Unnamed: 0,air_store_id,visit_date,sum_res_diff_er,sum_vis_er,avg_res_diff_er,avg_vis_er,sum_res_diff_lr,sum_vis_lr,avg_res_diff_lr,avg_vis_lr
0,air_00a91d42b08b08d9,2016-12-05,4.0,2.302585,4.0,2.302585,,,,
1,air_00a91d42b08b08d9,2016-12-14,6.0,2.944439,6.0,2.944439,,,,
2,air_00a91d42b08b08d9,2016-12-17,6.0,1.098612,6.0,1.098612,,,,
3,air_00a91d42b08b08d9,2017-02-18,5.0,2.302585,5.0,2.302585,,,,
4,air_00a91d42b08b08d9,2017-03-01,11.0,1.386294,11.0,1.386294,,,,


In [9]:
###############################################################################################
# Get all unique stores from the submission file
# Because the submission file contains the restaurant id and visit date in one attribute, 
# this information has to be splitted up
###############################################################################################

data['subm_visits']['visit_date'] = data['subm_visits']['id'].map(lambda x: str(x).split('_')[2])
data['subm_visits']['air_store_id'] = data['subm_visits']['id'].map(lambda x: '_'.join(x.split('_')[:2]))

# Extract unique store ids and create an empty dataframe for the store meta information
unique_stores = data['subm_visits']['air_store_id'].unique()
stores = pd.concat([pd.DataFrame({'air_store_id': unique_stores, 'dow': [i]*len(unique_stores)}) for i in range(7)], axis=0, ignore_index=True).reset_index(drop=True)

stores.head()

Unnamed: 0,air_store_id,dow
0,air_00a91d42b08b08d9,0
1,air_0164b9927d20bcc3,0
2,air_0241aa3964b7f861,0
3,air_0328696196e46f18,0
4,air_034a3d5b40d5b1b1,0


## Resample timeseries of the ground truth datasets to fill missing days

In [10]:
###############################################################################################
# Fill the gaps in the training dataset for each restaurant
# So every row step is exactly one day
###############################################################################################
data['gt_visits']['visit_date'] = pd.to_datetime(data['gt_visits']['visit_date'])
data['gt_visits'] = data['gt_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
data['gt_visits'] = data['gt_visits'].reset_index()
data['gt_visits'].head()

  data['gt_visits'] = data['gt_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  data['gt_visits'] = data['gt_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  data['gt_visits'] = data['gt_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  data['gt_visits'] = data['gt_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  data['gt_visits'] = data['gt_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  data['gt_visits'] = data['gt_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  data['gt_visits'] = data['gt_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  data['gt_visits'] = data['gt_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  data['gt_visits'] = data['gt_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
 

  data['gt_visits'] = data['gt_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  data['gt_visits'] = data['gt_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  data['gt_visits'] = data['gt_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  data['gt_visits'] = data['gt_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  data['gt_visits'] = data['gt_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  data['gt_visits'] = data['gt_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  data['gt_visits'] = data['gt_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  data['gt_visits'] = data['gt_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  data['gt_visits'] = data['gt_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
 

  data['gt_visits'] = data['gt_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  data['gt_visits'] = data['gt_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  data['gt_visits'] = data['gt_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  data['gt_visits'] = data['gt_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  data['gt_visits'] = data['gt_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  data['gt_visits'] = data['gt_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  data['gt_visits'] = data['gt_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  data['gt_visits'] = data['gt_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  data['gt_visits'] = data['gt_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
 

  data['gt_visits'] = data['gt_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  data['gt_visits'] = data['gt_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  data['gt_visits'] = data['gt_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  data['gt_visits'] = data['gt_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  data['gt_visits'] = data['gt_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  data['gt_visits'] = data['gt_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  data['gt_visits'] = data['gt_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  data['gt_visits'] = data['gt_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  data['gt_visits'] = data['gt_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
 

Unnamed: 0,air_store_id,visit_date,visitors
0,air_00a91d42b08b08d9,2016-07-01,35
1,air_00a91d42b08b08d9,2016-07-02,9
2,air_00a91d42b08b08d9,2016-07-03,0
3,air_00a91d42b08b08d9,2016-07-04,20
4,air_00a91d42b08b08d9,2016-07-05,25


In [11]:
###############################################################################################
# Also check if the submission data has the same stepsize for each restaurant
# One row step == one day
###############################################################################################
org_shape = data['subm_visits'].shape
data['subm_visits']['visit_date'] = pd.to_datetime(data['subm_visits']['visit_date'])
tmp = data['subm_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
tmp = tmp.reset_index()
resampled_shape = tmp.shape

if org_shape[0] == resampled_shape[0]:
    print('Submission has a stepsize of one day per row and restaurant')
    del org_shape, tmp, resampled_shape

  tmp = data['subm_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  tmp = data['subm_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  tmp = data['subm_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  tmp = data['subm_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  tmp = data['subm_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  tmp = data['subm_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  tmp = data['subm_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  tmp = data['subm_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  tmp = data['subm_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  tmp = data['subm_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  tmp = da

  tmp = data['subm_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  tmp = data['subm_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  tmp = data['subm_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  tmp = data['subm_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  tmp = data['subm_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  tmp = data['subm_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  tmp = data['subm_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  tmp = data['subm_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  tmp = data['subm_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  tmp = data['subm_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  tmp = da

  tmp = data['subm_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  tmp = data['subm_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  tmp = data['subm_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  tmp = data['subm_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  tmp = data['subm_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  tmp = data['subm_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  tmp = data['subm_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  tmp = data['subm_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  tmp = data['subm_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  tmp = data['subm_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  tmp = da

Submission has a stepsize of one day per row and restaurant


  tmp = data['subm_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  tmp = data['subm_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  tmp = data['subm_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  tmp = data['subm_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  tmp = data['subm_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  tmp = data['subm_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  tmp = data['subm_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  tmp = data['subm_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  tmp = data['subm_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  tmp = data['subm_visits'].groupby('air_store_id').resample('D', on='visit_date').sum().fillna(0)
  tmp = da

## Prepare the training and submission data (transform date information into week, month, year + DOW)

In [12]:
###############################################################################################
# Transform to datetime objects and split the dates up
###############################################################################################
data['gt_visits']['visit_date'] = pd.to_datetime(data['gt_visits']['visit_date'])
data['gt_visits']['dow'] = data['gt_visits']['visit_date'].dt.dayofweek
data['gt_visits']['year'] = data['gt_visits']['visit_date'].dt.year
data['gt_visits']['month'] = data['gt_visits']['visit_date'].dt.month
data['gt_visits']['week'] = data['gt_visits']['visit_date'].dt.week
data['gt_visits']['visit_date'] = data['gt_visits']['visit_date'].dt.date

# Also store the visit date as an integer value
data['gt_visits']['visit_date_int'] = data['gt_visits']['visit_date'].apply(
    lambda x: x.strftime('%Y%m%d')
).astype(int)

# Also log-transform the ground truth visitor values
data['gt_visits']['visitors'] = np.log1p(data['gt_visits']['visitors'].values.astype(np.int))

data['gt_visits'].head()

  data['gt_visits']['week'] = data['gt_visits']['visit_date'].dt.week
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  data['gt_visits']['visitors'] = np.log1p(data['gt_visits']['visitors'].values.astype(np.int))


Unnamed: 0,air_store_id,visit_date,visitors,dow,year,month,week,visit_date_int
0,air_00a91d42b08b08d9,2016-07-01,3.583519,4,2016,7,26,20160701
1,air_00a91d42b08b08d9,2016-07-02,2.302585,5,2016,7,26,20160702
2,air_00a91d42b08b08d9,2016-07-03,0.0,6,2016,7,26,20160703
3,air_00a91d42b08b08d9,2016-07-04,3.044522,0,2016,7,27,20160704
4,air_00a91d42b08b08d9,2016-07-05,3.258097,1,2016,7,27,20160705


In [13]:
###############################################################################################
# Transform to datetime objects and split the dates up
###############################################################################################
data['subm_visits']['visit_date'] = pd.to_datetime(data['subm_visits']['visit_date'])
data['subm_visits']['dow'] = data['subm_visits']['visit_date'].dt.dayofweek
data['subm_visits']['year'] = data['subm_visits']['visit_date'].dt.year
data['subm_visits']['month'] = data['subm_visits']['visit_date'].dt.month
data['subm_visits']['week'] = data['subm_visits']['visit_date'].dt.week
data['subm_visits']['visit_date'] = data['subm_visits']['visit_date'].dt.date

# Also store the visit date as an integer value
data['subm_visits']['visit_date_int'] = data['subm_visits']['visit_date'].apply(
    lambda x: x.strftime('%Y%m%d')
).astype(int)

data['subm_visits'].head()

  data['subm_visits']['week'] = data['subm_visits']['visit_date'].dt.week


Unnamed: 0,id,visitors,visit_date,air_store_id,dow,year,month,week,visit_date_int
0,air_00a91d42b08b08d9_2017-04-23,0,2017-04-23,air_00a91d42b08b08d9,6,2017,4,16,20170423
1,air_00a91d42b08b08d9_2017-04-24,0,2017-04-24,air_00a91d42b08b08d9,0,2017,4,17,20170424
2,air_00a91d42b08b08d9_2017-04-25,0,2017-04-25,air_00a91d42b08b08d9,1,2017,4,17,20170425
3,air_00a91d42b08b08d9_2017-04-26,0,2017-04-26,air_00a91d42b08b08d9,2,2017,4,17,20170426
4,air_00a91d42b08b08d9_2017-04-27,0,2017-04-27,air_00a91d42b08b08d9,3,2017,4,17,20170427


## Generate some more meta information about each restaurant

In [14]:
###############################################################################################
# Calculate the min, max, avg, median and overall reservations for each day of a week
###############################################################################################
# Min visits
tmp = data['gt_visits'].groupby(['air_store_id','dow'], as_index=False)['visitors']
tmp = tmp.min()
tmp = tmp.rename(columns={'visitors':'min_visitors'})
stores = pd.merge(stores, tmp, how='left', on=['air_store_id','dow']) 

# Mean visits
tmp = data['gt_visits'].groupby(['air_store_id','dow'], as_index=False)['visitors']
tmp = tmp.mean()
tmp = tmp.rename(columns={'visitors':'mean_visitors'})
stores = pd.merge(stores, tmp, how='left', on=['air_store_id','dow'])

# Median visits
tmp = data['gt_visits'].groupby(['air_store_id','dow'], as_index=False)['visitors']
tmp = tmp.median()
tmp = tmp.rename(columns={'visitors':'median_visitors'})
stores = pd.merge(stores, tmp, how='left', on=['air_store_id','dow'])

# Max visits
tmp = data['gt_visits'].groupby(['air_store_id','dow'], as_index=False)['visitors']
tmp = tmp.max()
tmp = tmp.rename(columns={'visitors':'max_visitors'})
stores = pd.merge(stores, tmp, how='left', on=['air_store_id','dow'])

# Overall visits on this week day
tmp = data['gt_visits'].groupby(['air_store_id','dow'], as_index=False)['visitors']
tmp = tmp.count()
tmp = tmp.rename(columns={'visitors':'count_observations'})
stores = pd.merge(stores, tmp, how='left', on=['air_store_id','dow']) 

###############################################################################################
# Merge this information with the remaining restaurant meta information
###############################################################################################
stores = pd.merge(stores, data['air_store_info'], how='left', on=['air_store_id'])

###############################################################################################
# Show one example
###############################################################################################
stores.loc[stores['air_store_id'] == 'air_00a91d42b08b08d9']

Unnamed: 0,air_store_id,dow,min_visitors,mean_visitors,median_visitors,max_visitors,count_observations,air_genre_name,air_area_name,latitude,longitude
0,air_00a91d42b08b08d9,0,0.0,2.547287,2.944439,3.871201,42,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595
821,air_00a91d42b08b08d9,1,0.0,2.985165,3.218876,3.78419,42,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595
1642,air_00a91d42b08b08d9,2,0.0,3.168832,3.34975,3.970292,42,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595
2463,air_00a91d42b08b08d9,3,0.0,3.07784,3.401197,3.871201,42,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595
3284,air_00a91d42b08b08d9,4,0.0,3.344289,3.583519,4.060443,43,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595
4105,air_00a91d42b08b08d9,5,0.0,2.214277,2.302585,4.60517,43,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595
4926,air_00a91d42b08b08d9,6,0.0,0.026157,0.0,1.098612,42,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595


## Prepare the Area and Genre names of each store

In [15]:
###############################################################################################
# Remove some char from the Genre name and area name
###############################################################################################
stores['air_genre_name'] = stores['air_genre_name'].map(
    lambda x: str(str(x).replace('/',' '))
)
stores['air_area_name'] = stores['air_area_name'].map(
    lambda x: str(str(x).replace('-',' '))
)

###############################################################################################
# Label-Encoding the cleanded names
###############################################################################################
# FIRST-AUTHOR: remove ML code
# le = LabelEncoder()
# stores['air_genre_name'] = le.fit_transform(stores['air_genre_name'])
# stores['air_area_name'] = le.fit_transform(stores['air_area_name'])
stores['air_genre_name'] = stores['air_genre_name']
stores['air_area_name'] = stores['air_area_name']

## Create sme features based on longitude and latitude

In [16]:
stores['lon_plus_lat'] = stores['longitude'] + stores['latitude']
stores['var_max_lat'] = stores['latitude'].max() - stores['latitude']
stores['var_max_lon'] = stores['longitude'].max() - stores['longitude']

## Add the store_id as an seperate feature for the prediction

In [17]:
# FIRST-AUTHOR: remove ML code
# le = LabelEncoder()
# stores['air_store_id_feat'] = le.fit_transform(stores['air_store_id'])
stores['air_store_id_feat'] = stores['air_store_id']

In [18]:
display(stores.head())

Unnamed: 0,air_store_id,dow,min_visitors,mean_visitors,median_visitors,max_visitors,count_observations,air_genre_name,air_area_name,latitude,longitude,lon_plus_lat,var_max_lat,var_max_lon,air_store_id_feat
0,air_00a91d42b08b08d9,0,0.0,2.547287,2.944439,3.871201,42,Italian French,Tōkyō to Chiyoda ku Kudanminami,35.694003,139.753595,175.447598,8.326629,4.519803,air_00a91d42b08b08d9
1,air_0164b9927d20bcc3,0,0.0,1.353902,1.386294,2.995732,29,Italian French,Tōkyō to Minato ku Shibakōen,35.658068,139.751599,175.409667,8.362564,4.521799,air_0164b9927d20bcc3
2,air_0241aa3964b7f861,0,0.0,2.025811,2.138333,3.178054,68,Izakaya,Tōkyō to Taitō ku Higashiueno,35.712607,139.779996,175.492603,8.308025,4.493403,air_0241aa3964b7f861
3,air_0328696196e46f18,0,0.0,0.498501,0.0,3.332205,42,Dining bar,Ōsaka fu Ōsaka shi Nakanochō,34.701279,135.52809,170.22937,9.319353,8.745308,air_0328696196e46f18
4,air_034a3d5b40d5b1b1,0,0.0,2.001122,2.191013,4.204693,42,Cafe Sweets,Ōsaka fu Ōsaka shi Ōhiraki,34.692337,135.472229,170.164566,9.328295,8.801169,air_034a3d5b40d5b1b1


## Add information about the holidays

In [19]:
###############################################################################################
# Prepare the datetime object and simplify it in a da and day of week
###############################################################################################
data['holidays']['visit_date'] = pd.to_datetime(data['holidays']['visit_date'])

# Attention: This day of week does not match the encoding of the 'dow' field
# FIRST-AUTHOR: remove ML code
# data['holidays']['day_of_week'] = le.fit_transform(data['holidays']['day_of_week'])
data['holidays']['day_of_week'] = data['holidays']['day_of_week']
data['holidays']['dow_holidays'] = data['holidays']['visit_date'].dt.dayofweek

data['holidays']['visit_date'] = data['holidays']['visit_date'].dt.date

display(data['holidays'].head())

Unnamed: 0,visit_date,day_of_week,holiday_flg,dow_holidays
0,2016-01-01,Friday,1,4
1,2016-01-02,Saturday,1,5
2,2016-01-03,Sunday,1,6
3,2016-01-04,Monday,0,0
4,2016-01-05,Tuesday,0,1


In [20]:
###############################################################################################
# Add holiday information to the training data
###############################################################################################
train = pd.merge(data['gt_visits'], data['holidays'], how='left', on=['visit_date'])
display(train.head())

Unnamed: 0,air_store_id,visit_date,visitors,dow,year,month,week,visit_date_int,day_of_week,holiday_flg,dow_holidays
0,air_00a91d42b08b08d9,2016-07-01,3.583519,4,2016,7,26,20160701,Friday,0,4
1,air_00a91d42b08b08d9,2016-07-02,2.302585,5,2016,7,26,20160702,Saturday,0,5
2,air_00a91d42b08b08d9,2016-07-03,0.0,6,2016,7,26,20160703,Sunday,0,6
3,air_00a91d42b08b08d9,2016-07-04,3.044522,0,2016,7,27,20160704,Monday,0,0
4,air_00a91d42b08b08d9,2016-07-05,3.258097,1,2016,7,27,20160705,Tuesday,0,1


In [21]:
###############################################################################################
# Add holiday information to the submission data
###############################################################################################
test = pd.merge(data['subm_visits'], data['holidays'], how='left', on=['visit_date'])
display(test.head())

Unnamed: 0,id,visitors,visit_date,air_store_id,dow,year,month,week,visit_date_int,day_of_week,holiday_flg,dow_holidays
0,air_00a91d42b08b08d9_2017-04-23,0,2017-04-23,air_00a91d42b08b08d9,6,2017,4,16,20170423,Sunday,0,6
1,air_00a91d42b08b08d9_2017-04-24,0,2017-04-24,air_00a91d42b08b08d9,0,2017,4,17,20170424,Monday,0,0
2,air_00a91d42b08b08d9_2017-04-25,0,2017-04-25,air_00a91d42b08b08d9,1,2017,4,17,20170425,Tuesday,0,1
3,air_00a91d42b08b08d9_2017-04-26,0,2017-04-26,air_00a91d42b08b08d9,2,2017,4,17,20170426,Wednesday,0,2
4,air_00a91d42b08b08d9_2017-04-27,0,2017-04-27,air_00a91d42b08b08d9,3,2017,4,17,20170427,Thursday,0,3


## Merge reservation history with fixed restaurant meta information

In [22]:
##############################################################################################
# Merge the training data with the prepared meta information
##############################################################################################
train = pd.merge(train, stores, how='inner', on=['air_store_id','dow'])
train = pd.merge(train, reservation_history, how='left', on=['air_store_id','visit_date'])

# Create the same ID that is used in the submission file
train['id'] = train.apply(
    lambda r: '_'.join([str(r['air_store_id']), str(r['visit_date'])]), 
    axis=1
)

###############################################################################################
# Merge the submission dataset with the prepared meta information
###############################################################################################
test = pd.merge(test, stores, how='left', on=['air_store_id','dow'])
test = pd.merge(test, reservation_history, how='left', on=['air_store_id','visit_date'])

In [23]:
###############################################################################################
# Sort the train and test dataframes again
###############################################################################################
train = train.sort_values(by=['air_store_id', 'visit_date'])
test = test.sort_values(by=['air_store_id', 'visit_date'])

In [24]:
###############################################################################################
# Fill NaN with an -1 value
###############################################################################################
train = train.fillna(-1)
test = test.fillna(-1)

In [25]:
display(train.head())
display(train.shape)
display(test.head())
display(test.shape)

Unnamed: 0,air_store_id,visit_date,visitors,dow,year,month,week,visit_date_int,day_of_week,holiday_flg,dow_holidays,min_visitors,mean_visitors,median_visitors,max_visitors,count_observations,air_genre_name,air_area_name,latitude,longitude,lon_plus_lat,var_max_lat,var_max_lon,air_store_id_feat,sum_res_diff_er,sum_vis_er,avg_res_diff_er,avg_vis_er,sum_res_diff_lr,sum_vis_lr,avg_res_diff_lr,avg_vis_lr,id
0,air_00a91d42b08b08d9,2016-07-01,3.583519,4,2016,7,26,20160701,Friday,0,4,0.0,3.344289,3.583519,4.060443,43,Italian French,Tōkyō to Chiyoda ku Kudanminami,35.694003,139.753595,175.447598,8.326629,4.519803,air_00a91d42b08b08d9,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,air_00a91d42b08b08d9_2016-07-01
43,air_00a91d42b08b08d9,2016-07-02,2.302585,5,2016,7,26,20160702,Saturday,0,5,0.0,2.214277,2.302585,4.60517,43,Italian French,Tōkyō to Chiyoda ku Kudanminami,35.694003,139.753595,175.447598,8.326629,4.519803,air_00a91d42b08b08d9,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,air_00a91d42b08b08d9_2016-07-02
86,air_00a91d42b08b08d9,2016-07-03,0.0,6,2016,7,26,20160703,Sunday,0,6,0.0,0.026157,0.0,1.098612,42,Italian French,Tōkyō to Chiyoda ku Kudanminami,35.694003,139.753595,175.447598,8.326629,4.519803,air_00a91d42b08b08d9,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,air_00a91d42b08b08d9_2016-07-03
128,air_00a91d42b08b08d9,2016-07-04,3.044522,0,2016,7,27,20160704,Monday,0,0,0.0,2.547287,2.944439,3.871201,42,Italian French,Tōkyō to Chiyoda ku Kudanminami,35.694003,139.753595,175.447598,8.326629,4.519803,air_00a91d42b08b08d9,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,air_00a91d42b08b08d9_2016-07-04
170,air_00a91d42b08b08d9,2016-07-05,3.258097,1,2016,7,27,20160705,Tuesday,0,1,0.0,2.985165,3.218876,3.78419,42,Italian French,Tōkyō to Chiyoda ku Kudanminami,35.694003,139.753595,175.447598,8.326629,4.519803,air_00a91d42b08b08d9,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,air_00a91d42b08b08d9_2016-07-05


(293886, 33)

Unnamed: 0,id,visitors,visit_date,air_store_id,dow,year,month,week,visit_date_int,day_of_week,holiday_flg,dow_holidays,min_visitors,mean_visitors,median_visitors,max_visitors,count_observations,air_genre_name,air_area_name,latitude,longitude,lon_plus_lat,var_max_lat,var_max_lon,air_store_id_feat,sum_res_diff_er,sum_vis_er,avg_res_diff_er,avg_vis_er,sum_res_diff_lr,sum_vis_lr,avg_res_diff_lr,avg_vis_lr
0,air_00a91d42b08b08d9_2017-04-23,0,2017-04-23,air_00a91d42b08b08d9,6,2017,4,16,20170423,Sunday,0,6,0.0,0.026157,0.0,1.098612,42,Italian French,Tōkyō to Chiyoda ku Kudanminami,35.694003,139.753595,175.447598,8.326629,4.519803,air_00a91d42b08b08d9,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
1,air_00a91d42b08b08d9_2017-04-24,0,2017-04-24,air_00a91d42b08b08d9,0,2017,4,17,20170424,Monday,0,0,0.0,2.547287,2.944439,3.871201,42,Italian French,Tōkyō to Chiyoda ku Kudanminami,35.694003,139.753595,175.447598,8.326629,4.519803,air_00a91d42b08b08d9,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2,air_00a91d42b08b08d9_2017-04-25,0,2017-04-25,air_00a91d42b08b08d9,1,2017,4,17,20170425,Tuesday,0,1,0.0,2.985165,3.218876,3.78419,42,Italian French,Tōkyō to Chiyoda ku Kudanminami,35.694003,139.753595,175.447598,8.326629,4.519803,air_00a91d42b08b08d9,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
3,air_00a91d42b08b08d9_2017-04-26,0,2017-04-26,air_00a91d42b08b08d9,2,2017,4,17,20170426,Wednesday,0,2,0.0,3.168832,3.34975,3.970292,42,Italian French,Tōkyō to Chiyoda ku Kudanminami,35.694003,139.753595,175.447598,8.326629,4.519803,air_00a91d42b08b08d9,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
4,air_00a91d42b08b08d9_2017-04-27,0,2017-04-27,air_00a91d42b08b08d9,3,2017,4,17,20170427,Thursday,0,3,0.0,3.07784,3.401197,3.871201,42,Italian French,Tōkyō to Chiyoda ku Kudanminami,35.694003,139.753595,175.447598,8.326629,4.519803,air_00a91d42b08b08d9,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


(32019, 33)

## Sort the columns

In [26]:
# FIRST-AUTHOR: remove non pandas code
# FEATURES = {
#     'air_store_id_feat' : 'LabelEncoded store ID as an input feature. It allows the network to seperate between the stores',
#     'dow' : 'Day of the week e.g. Monday, Tuesday, Wednesday,...',
#     'year' : 'Year of the visit',
#     'month' : 'Month of the visit',
#     'week' : 'Week of the visit',
#     'visit_date_int' : 'Complete visit date as a integer value',
#     'holiday_flg' : 'Is the current day in the holidays',
#     'min_visitors' : 'Minimum visitors of the current week day',
#     'mean_visitors' : 'Mean visitors of the current week day',
#     'median_visitors' : 'Median visitors of the current week day',
#     'max_visitors' : 'Maximum visitors of the current week day',
#     'count_observations' : 'Total number of reservations for this week day',
#     'air_genre_name' : 'Label encoded name of the cusine genre',
#     'air_area_name' : 'Label encoded name of the area the restaurant is located in',
#     'latitude' : 'Latitude of the restaurant location',
#     'longitude' : 'Longitude of the restaurant location',
#     'lon_plus_lat' : 'Linear combination of longitude and latitude',
#     'var_max_lat' : 'Max(Latitude) - Latitude of the current restaurant',
#     'var_max_lon' : 'Max(Longitude) - Longitude of the current restaurant',
#     'sum_res_diff_er' : 'Summed up differences between the reservation date and the visit date [Diff > 2 days]',
#     'sum_vis_er' : 'Summed up reservated visitors for this day [Diff > 2 days]',
#     'avg_res_diff_er' : 'AVG of differences between the reservation date and the visit date [Diff > 2 days]',
#     'avg_vis_er' : 'AVG reservated visitors for this day [Diff > 2 days]',
#     'sum_res_diff_lr' : 'Summed up differences between the reservation date and the visit date [Diff <= 2 days]',
#     'sum_vis_lr' : 'Summed up reservated visitors for this day [Diff <= 2 days]',
#     'avg_res_diff_lr' : 'AVG of differences between the reservation date and the visit date [Diff > 2 days]',
#     'avg_vis_lr' : 'AVG reservated visitors for this day [Diff <= 2 days]' 
# }

# EXCLUDED_FEATURES = {
#     'id' : 'Air reservation id of the restaurant',
#     'visit_date' : 'Use the numeric value instead!',
#     'air_store_id' : 'Air reservation id of the restaurant',
#     'day_of_week' : 'Day of the week encoded in the date_info.csv file',
#     'dow_holidays' : 'Day of the week encoded in the date_info.csv file'
# }

# GROUND_TRUTH_FEATURES = {
#     'visitors' : 'Ground truth information. The number os visitors is transformed with np.log1p()'
# }

In [27]:
# FIRST-AUTHOR: remove non pandas code
# FEATURE_COLS = list(FEATURES.keys())
# EXCLUDED_COLS = list(EXCLUDED_FEATURES.keys())
# GROUND_TRUTH_COLS = list(GROUND_TRUTH_FEATURES.keys())

# print('Number of cols: ', len(FEATURE_COLS) + len(EXCLUDED_COLS) + len(GROUND_TRUTH_COLS))