# Chainer NumPy Output

Creates a giant 3-D NumPy array (store × day × feature) that Chainer will be comfortable munching on.

* X-axis: each store
* Y-axis: each day
* Z-axis: see below


1. Customers on every single day for each shop
1. Day of the week
1. Holiday flag

1. Reservations
  * reserved for this day in AIR
  * reserved for this day in HPG

1. Genre of cuisine (one-hot)
1. Location (one-hot)
  * todofuken
  * ku/shi
  * Latitude
  * Longitude

1. Whether the restaurant has opened yet
1. Customers stuff from my program
1. Golden week stuff
1. Average info from other stores, as before

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
%matplotlib inline
# Default figure size for every plot in this notebook, as [width, height] in inches.
plt.rcParams["figure.figsize"] = [12,11]

from datetime import datetime
from datetime import timedelta
# Store ids that later cells drop from both the visit log and the
# store-info table (described there as missing from the test data).
missing_stores = [
    "air_d63cfa6d6ab78446",
    "air_b2d8bc9c88b85f96",
    "air_0ead98dd07e7a82a",
    "air_cb083b4789a8d3a2",
    "air_cf22e368c1a71d53",
    "air_d0a7bd3339c3d12a",
    "air_229d7e508d9f1b5e",
    "air_2703dcb33192b181",
]

### Visitors

In [30]:
# Read visit data
visits = pd.read_csv("../input/air_visit_data.csv",
                     parse_dates=["visit_date"])

# Drop stores missing in the test data
visits = visits.set_index(["air_store_id"]).drop(missing_stores).reset_index()


# Pad with zeroes so all store / date combinations appear
# also extend to cover the test period

# Prepare all possible store / date pairs within range; the date range runs
# 39 days past the last recorded visit so that it also spans the test period.
stores = visits["air_store_id"].unique()
dates = pd.date_range(visits["visit_date"].min(), visits["visit_date"].max() + timedelta(days=39))

# Find all empty pairs with a single index set-difference instead of probing
# the index once per (store, date) combination in a Python double loop.
all_pairs = pd.MultiIndex.from_product([stores, dates])
seen_pairs = visits.set_index(["air_store_id", "visit_date"]).index
empty_pairs = all_pairs.difference(seen_pairs)
stores_to_fill = list(empty_pairs.get_level_values(0))
dates_to_fill = list(empty_pairs.get_level_values(1))

# Pad with zeroes (np.zeros keeps the visitor counts of the padding as floats)
to_fill = pd.DataFrame()
to_fill["air_store_id"] = stores_to_fill
to_fill["visit_date"] = dates_to_fill
to_fill["visitors"] = np.zeros(len(stores_to_fill))
visits = pd.concat([visits, to_fill])

# re-sort by (store, date); this also replaces the duplicated row labels
# left behind by the concat with a fresh 0..n-1 index
visits = visits.set_index(["air_store_id", "visit_date"]).sort_index().reset_index()

### Day of week, holidays

In [31]:
# Derive the abbreviated weekday name (strftime "%a") and the month number
# from every visit date.
weekday_names = visits["visit_date"].map(lambda day: day.strftime("%a"))
month_numbers = visits["visit_date"].map(lambda day: day.month)
visits = visits.assign(weekday=weekday_names)
visits = visits.assign(month=month_numbers)

# Swap the weekday column for one indicator column per weekday name.
weekday_onehot = pd.get_dummies(visits["weekday"])
visits = pd.concat([visits, weekday_onehot], axis=1)
visits.drop(["weekday"], axis=1, inplace=True)

# Load the calendar, indexed by its parsed dates, for the public-holiday lookup.
holidays = pd.read_csv(
    "../input/date_info.csv",
    index_col=["calendar_date"],
    parse_dates=["calendar_date"],
)

def is_holiday(x):
    """Return the ``holiday_flg`` entry of the ``holidays`` table for ``x``.

    ``x`` is used as a row label, so it has to be present in the index of
    ``holidays``.
    """
    flag = holidays.loc[x, "holiday_flg"]
    return flag

# Look up the holiday flag for every visit date and store it as a new column.
holiday_flags = visits["visit_date"].map(is_holiday)
visits = visits.assign(holiday=holiday_flags)

### Reservations

In [32]:
# Load both reservation logs (AIR = 6 MB, HPG = 120 MB), parsing the visit
# and booking timestamps as datetimes.
timestamp_columns = ["visit_datetime", "reserve_datetime"]
reservations_air = pd.read_csv("../input/air_reserve.csv", parse_dates=timestamp_columns)
reservations_hpg = pd.read_csv("../input/hpg_reserve.csv", parse_dates=timestamp_columns)

# HPG -> AIR store id lookup table, keyed by the HPG store id.
store_id = pd.read_csv("../input/store_id_relation.csv").set_index(["hpg_store_id"])

def get_air_store(hpg_store):
    """Translate an HPG store id into the matching AIR store id.

    Returns NaN when ``hpg_store`` does not appear in the index of the
    ``store_id`` lookup table.
    """
    if hpg_store not in store_id.index:
        return np.nan
    return store_id.loc[hpg_store, "air_store_id"]
    
# Tag every HPG reservation with its AIR store id (NaN when there is none).
hpg_to_air = reservations_hpg["hpg_store_id"].map(get_air_store)
air_ids = hpg_to_air.values
reservations_hpg = reservations_hpg.assign(air_store_id=air_ids)

# Discard HPG rows that contain a missing value, then the HPG id column.
reservations_hpg.dropna(axis=0, inplace=True)
reservations_hpg.drop(["hpg_store_id"], axis=1, inplace=True)

# Index both reservation tables by AIR store id.
for table in (reservations_air, reservations_hpg):
    table.set_index(["air_store_id"], inplace=True)

# Keep only reservations booked on a different calendar day than the visit.
booked_ahead_air = (reservations_air["reserve_datetime"].map(datetime.date) !=
                    reservations_air["visit_datetime"].map(datetime.date))
booked_ahead_hpg = (reservations_hpg["reserve_datetime"].map(datetime.date) !=
                    reservations_hpg["visit_datetime"].map(datetime.date))
reservations_air = reservations_air[booked_ahead_air]
reservations_hpg = reservations_hpg[booked_ahead_hpg]

# The booking timestamp has served its purpose.
reservations_air.drop(["reserve_datetime"], axis=1, inplace=True)
reservations_hpg.drop(["reserve_datetime"], axis=1, inplace=True)


In [33]:
# Get reservations for each date
# Takes a minute or two

def _daily_reservations(reservations):
    """Sum a reservation table per (air_store_id, visit_date).

    Works on a copy: the visit timestamp is reduced to its calendar date,
    the timestamp column is dropped, and the remaining columns are summed
    per store and date.
    """
    dated = reservations.assign(
        visit_date=reservations["visit_datetime"].map(datetime.date)
    ).copy()
    dated.drop(["visit_datetime"], inplace=True, axis=1)
    return dated.reset_index().groupby(["air_store_id", "visit_date"]).sum()


res_air = _daily_reservations(reservations_air)
res_hpg = _daily_reservations(reservations_hpg)


# get reservations for each store and date
def get_res(store, datetime, file):
    """Return the reservation figure for ``store`` on the day of ``datetime``.

    ``file`` is a table indexed by (store, date) pairs. The first value of
    the matching row is returned, or 0 when the pair is not in the index.
    Only the calendar date of ``datetime`` takes part in the lookup.
    """
    key = (store, datetime.date())
    if key not in file.index:
        return 0
    return file.loc[key].values[0]

# Look up the AIR and HPG reservation totals for every row of visits.
# Only two columns are read, so iterate over those columns directly rather
# than having iterrows() build a full row Series for every record.
air, hpg = [], []
for store, visit_date in zip(visits["air_store_id"], visits["visit_date"]):
    air.append( get_res(store, visit_date, res_air) )
    hpg.append( get_res(store, visit_date, res_hpg) )

visits = visits.assign(res_air = air, res_hpg = hpg)

In [34]:
# Some AIR reservation totals look like mistakes: more than 100 reserved
# while fewer than 100 actually visited. Reset those totals to zero.
prune = visits[ (visits["visitors"] < 100) & (visits["res_air"] > 100) ]
visits.loc[prune.index, "res_air"] = 0

### Genre, location

In [35]:
# Load the store metadata keyed by store id, minus the ids in missing_stores.
stores = pd.read_csv("../input/air_store_info.csv", index_col=["air_store_id"])
stores.drop(missing_stores, inplace=True)

# The area name is split on whitespace; its first token is kept as the
# todofuken and its second token as the ku/shi.
address = stores["air_area_name"].map(str.split).values
todofuken = [tokens[0] for tokens in address]
kushi = [tokens[1] for tokens in address]

stores.drop(["air_area_name"], axis=1, inplace=True)
stores = stores.assign(todofuken=todofuken)
stores = stores.assign(kushi=kushi)

# One-hot the text columns (genre and address parts); numeric columns are
# carried over unchanged.
X_text = stores.select_dtypes(include=["object"]).copy()
X_numeric = stores.select_dtypes(exclude=["object"]).copy()
X_onehot = pd.get_dummies(X_text)
stores_onehot = pd.concat([X_numeric, X_onehot], axis=1)

# Drop every column whose total over all stores is below 5.
column_totals = stores_onehot.apply(np.sum, axis=0)
too_rare = column_totals < 5
to_drop = column_totals[too_rare].index
stores_onehot = stores_onehot.drop(to_drop, axis=1)

In [36]:
# Attach the per-store feature columns to every visit row of that store.
store_features = stores_onehot.reset_index()
visits = visits.merge(store_features, on="air_store_id")

### Restaurant is actually open in the AIR system

In [37]:
# assign opening date to stores: the earliest date with at least one visitor.
# One groupby pass replaces re-filtering every visit row once per store.
# Stores without any such date get NaT.
stores = stores.reset_index()
vis = visits[visits["visitors"]>0]
first_visit = vis.groupby("air_store_id")["visit_date"].min()
stores["open_date"] = stores["air_store_id"].map(first_visit)
stores = stores.set_index("air_store_id")

In [38]:
# assign a pre/post store-opening flag to each visit:
# 1 when the visit date is on or after the store's open_date, otherwise 0
def _is_open_on(visit):
    """Return 1 if this visit row is dated on/after its store's open_date."""
    opened = stores.loc[visit["air_store_id"], "open_date"]
    return int(visit["visit_date"] >= opened)


visits["is_open"] = visits.apply(_is_open_on, axis=1)

### Golden week

In [39]:
# Golden Week (GW) runs from 4/29 through 5/5
# in 2016 this is FRI ~ THU (MON is a "work day")
# in 2017 this is SAT ~ FRI (MON, TUE are "work days")

def check_golden_week(the_date):
    """Return 1 if ``the_date`` falls in Golden Week of 2016 or 2017, else 0.

    Each window starts at 4/29 00:00 (inclusive) and ends at 5/6 00:00
    (exclusive) of its year.
    """
    windows = (
        (datetime(2016, 4, 29), datetime(2016, 5, 6)),
        (datetime(2017, 4, 29), datetime(2017, 5, 6)),
    )
    for first_day, day_after in windows:
        if (the_date >= first_day) and (the_date < day_after):
            return 1
    return 0

    
# Flag every visit whose date falls inside one of the Golden Week windows.
golden_week_flags = visits["visit_date"].map(check_golden_week)
visits["golden_week"] = golden_week_flags

### Output

In [20]:
# Index the table by (store, date) and sort it, so rows are grouped by store
# and ordered by date within each store.
# NOTE: reset_index() also turns the previous row index into a regular
# column, which therefore ends up among the exported features.
visits = visits.reset_index().set_index(["air_store_id", "visit_date"])
visits = visits.sort_index()

# Derive the array dimensions from the data rather than hard-coding them
# (the constants used previously were 821 stores x 517 days x 68 features).
# reshape raises ValueError unless the row count equals n_stores * n_dates.
n_stores = visits.index.get_level_values("air_store_id").nunique()
n_dates = visits.index.get_level_values("visit_date").nunique()
n_features = visits.shape[1]
np_array = visits.values.reshape(n_stores, n_dates, n_features)
np.save("datapoop", np_array)

In [1]:
# Document the output: list the column names of visits, i.e. the order of
# the features along the last axis of the saved array.
print(visits.columns)