In [1]:
import pandas as pd

# 1. Download data

In [2]:
!kaggle competitions download -c recruit-restaurant-visitor-forecasting

In [3]:
!unzip -o recruit-restaurant-visitor-forecasting.zip -d data

In [None]:
!mkdir data

In [4]:
files = ['air_reserve.csv.zip','air_store_info.csv.zip','air_visit_data.csv.zip','date_info.csv.zip','hpg_reserve.csv.zip','hpg_store_info.csv.zip','sample_submission.csv.zip','store_id_relation.csv.zip']
for file in files:
    !unzip -o data/{file} -d data
    !rm data/{file}

# 2. Explore Data

In [5]:
data = {
    'air_reserve': pd.read_csv('data/air_reserve.csv'),
    'air_store_info': pd.read_csv('data/air_store_info.csv'),
    'air_visit_data': pd.read_csv('data/air_visit_data.csv'),
    'date_info': pd.read_csv('data/date_info.csv'),
    'hpg_reserve': pd.read_csv('data/hpg_reserve.csv'),
    'hpg_store_info': pd.read_csv('data/hpg_store_info.csv'),
    'sample_submission': pd.read_csv('data/sample_submission.csv'),
    'store_id_relation': pd.read_csv('data/store_id_relation.csv'),
}

## 2.1 Training Data

In [6]:
data['air_reserve'].head(3)

Unnamed: 0,air_store_id,visit_datetime,reserve_datetime,reserve_visitors
0,air_877f79706adbfb06,2016-01-01 19:00:00,2016-01-01 16:00:00,1
1,air_db4b38ebe7a7ceff,2016-01-01 19:00:00,2016-01-01 19:00:00,3
2,air_db4b38ebe7a7ceff,2016-01-01 19:00:00,2016-01-01 19:00:00,6


In [7]:
data['hpg_reserve'].head(3)

Unnamed: 0,hpg_store_id,visit_datetime,reserve_datetime,reserve_visitors
0,hpg_c63f6f42e088e50f,2016-01-01 11:00:00,2016-01-01 09:00:00,1
1,hpg_dac72789163a3f47,2016-01-01 13:00:00,2016-01-01 06:00:00,3
2,hpg_c8e24dcf51ca1eb5,2016-01-01 16:00:00,2016-01-01 14:00:00,2


In [8]:
data['air_store_info'].head(3)

Unnamed: 0,air_store_id,air_genre_name,air_area_name,latitude,longitude
0,air_0f0cdeee6c9bf3d7,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852
1,air_7cc17a324ae5c7dc,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852
2,air_fee8dcf4d619598e,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852


In [9]:
data['store_id_relation'].head(3)

Unnamed: 0,air_store_id,hpg_store_id
0,air_63b13c56b7201bd9,hpg_4bc649e72e2a239a
1,air_a24bf50c3e90d583,hpg_c34b496d0305a809
2,air_c7f78b4f3cba33ff,hpg_cd8ae0d9bbd58ff9


In [10]:
data['air_visit_data'].head(3)

Unnamed: 0,air_store_id,visit_date,visitors
0,air_ba937bf13d40fb24,2016-01-13,25
1,air_ba937bf13d40fb24,2016-01-14,32
2,air_ba937bf13d40fb24,2016-01-15,29


In [11]:
data['date_info'].head(7)

Unnamed: 0,calendar_date,day_of_week,holiday_flg
0,2016-01-01,Friday,1
1,2016-01-02,Saturday,1
2,2016-01-03,Sunday,1
3,2016-01-04,Monday,0
4,2016-01-05,Tuesday,0
5,2016-01-06,Wednesday,0
6,2016-01-07,Thursday,0


## 2.2 Submission data

In [42]:
data['sample_submission'].head(3)

Unnamed: 0,id,visitors
0,air_00a91d42b08b08d9_2017-04-23,0
1,air_00a91d42b08b08d9_2017-04-24,0
2,air_00a91d42b08b08d9_2017-04-25,0


In [43]:
sub_df = data['sample_submission'].copy()

def get_sub_store_id(sub_id):
    s = sub_id.split('_')
    return s[0] + '_' + s[1]

def get_sub_date(sub_id):
    s = sub_id.split('_')
    return s[2]

sub_df['store_id'] = sub_df.apply(lambda x: get_sub_store_id(x['id']),axis=1)
sub_df['date'] = sub_df.apply(lambda x: get_sub_date(x['id']),axis=1)

In [56]:
len(sub_df['store_id'].unique())

821

# 3. Naive model
calculate the avg visitor per genre name. 

In [73]:
naive_model_df = pd.merge(data['air_reserve'],data['air_store_info'][['air_store_id','air_genre_name']], left_on='air_store_id', right_on='air_store_id')
naive_model_df.head(3)

Unnamed: 0,air_store_id,visit_datetime,reserve_datetime,reserve_visitors,air_genre_name
0,air_877f79706adbfb06,2016-01-01 19:00:00,2016-01-01 16:00:00,1,Japanese food
1,air_877f79706adbfb06,2016-01-01 20:00:00,2016-01-01 16:00:00,2,Japanese food
2,air_877f79706adbfb06,2016-01-02 18:00:00,2016-01-01 16:00:00,2,Japanese food


In [102]:
naive_model_df.groupby('air_genre_name').mean()

Unnamed: 0_level_0,reserve_visitors
air_genre_name,Unnamed: 1_level_1
Asian,26.0
Bar/Cocktail,6.889011
Cafe/Sweets,3.294666
Creative cuisine,4.990551
Dining bar,5.789276
International cuisine,4.666667
Italian/French,3.705818
Izakaya,5.161932
Japanese food,4.079932
Karaoke/Party,22.833333


In [109]:
naive_predictions = pd.merge(data['air_store_info'][['air_store_id','air_genre_name']],naive_model_df.groupby('air_genre_name').mean(), left_on='air_genre_name', right_on='air_genre_name')

In [110]:
sub_df_pred = pd.merge(sub_df['store_id'],naive_predictions,left_on='store_id',right_on='air_store_id', how='left')
sub_df_pred.head(3)

Unnamed: 0,store_id,air_store_id,air_genre_name,reserve_visitors
0,air_00a91d42b08b08d9,air_00a91d42b08b08d9,Italian/French,3.705818
1,air_00a91d42b08b08d9,air_00a91d42b08b08d9,Italian/French,3.705818
2,air_00a91d42b08b08d9,air_00a91d42b08b08d9,Italian/French,3.705818


In [99]:
sub_df_pred[['id','reserve_visitors']].rename(columns={'reserve_visitors': 'visitors'}).to_csv('naive_sumission.csv',index=False)

In [101]:
!kaggle competitions submit -c recruit-restaurant-visitor-forecasting -f naive_sumission.csv -m naive_submission

100%|███████████████████████████████████████| 1.53M/1.53M [00:04<00:00, 367kB/s]
Successfully submitted to Recruit Restaurant Visitor Forecasting