# Purpose
- Explore the restaurant visitor forecasting dataset
    - https://www.kaggle.com/c/recruit-restaurant-visitor-forecasting/data
    - It covers two restaurant booking systems
        - air
        - hpg
    - It contains
        - reservation data
        - basic store info (location + genre)
        - join table for some restaurants
        - actual visitor data (only air restaurants)
        - (japanese holidays).
    - The task is to predict actual visitors for a number of restaurants
        - Parallel: https://medium.com/spikelab/forecasting-multiples-time-series-using-prophet-in-parallel-2515abd1a245

In [25]:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from fbprophet import Prophet

# time and timing
from tqdm import tqdm
import time

%matplotlib inline

In [2]:
# Get data
!kaggle competitions download -c recruit-restaurant-visitor-forecasting
!unzip -o recruit-restaurant-visitor-forecasting.zip -d data
!mkdir data
files = ['air_reserve.csv.zip','air_store_info.csv.zip','air_visit_data.csv.zip','date_info.csv.zip','hpg_reserve.csv.zip','hpg_store_info.csv.zip','sample_submission.csv.zip','store_id_relation.csv.zip']
for file in files:
    !unzip -o data/{file} -d data
    !rm data/{file}

recruit-restaurant-visitor-forecasting.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  recruit-restaurant-visitor-forecasting.zip
  inflating: data/air_reserve.csv.zip  
  inflating: data/air_store_info.csv.zip  
  inflating: data/air_visit_data.csv.zip  
  inflating: data/date_info.csv.zip  
  inflating: data/hpg_reserve.csv.zip  
  inflating: data/hpg_store_info.csv.zip  
  inflating: data/sample_submission.csv.zip  
  inflating: data/store_id_relation.csv.zip  
mkdir: data: File exists
Archive:  data/air_reserve.csv.zip
  inflating: data/air_reserve.csv    
Archive:  data/air_store_info.csv.zip
  inflating: data/air_store_info.csv  
Archive:  data/air_visit_data.csv.zip
  inflating: data/air_visit_data.csv  
Archive:  data/date_info.csv.zip
  inflating: data/date_info.csv      
Archive:  data/hpg_reserve.csv.zip
  inflating: data/hpg_reserve.csv    
Archive:  data/hpg_store_info.csv.zip
  inflating: data/hpg_store_info.csv  
Archive: 

In [2]:
# Load every competition table into a dict keyed by table name;
# each table lives at data/<name>.csv.
data = {
    table: pd.read_csv(f'data/{table}.csv')
    for table in (
        'air_reserve',
        'air_store_info',
        'air_visit_data',
        'date_info',
        'hpg_reserve',
        'hpg_store_info',
        'sample_submission',
        'store_id_relation',
    )
}

# generate list of data

In [9]:
# Visit log, renamed to the 'ds' (date) / 'y' (value) column names that the
# frames passed to Prophet.fit use. The rename is done in place, so
# data["air_visit_data"] carries the new column names as well.
history = data["air_visit_data"]
history.rename(columns = {'visit_date':'ds','visitors':'y'}, inplace = True)
stores = pd.unique(history["air_store_id"])

# One (store_id, per-store frame) pair per store.
# A single groupby pass replaces one full boolean scan of `history` per store;
# sort=False keeps the groups in first-appearance order, i.e. the same order as `stores`.
hist_list = [
    (store_id, store_history)
    for store_id, store_history in history.groupby("air_store_id", sort=False)
]



In [32]:
# Number of distinct stores that have visit history
print("%d" % len(stores))

829


In [26]:
def run_prophet(raw_in, periods=90):
    """Fit a Prophet model on one series and forecast beyond its end.

    Takes a single tuple argument so it can be handed directly to ``map`` /
    ``Pool.imap`` over a list of ``(id, frame)`` pairs.

    Parameters
    ----------
    raw_in : sequence
        ``raw_in[0]`` is an identifier that is passed through untouched;
        ``raw_in[1]`` is the history frame given to ``Prophet.fit``.
    periods : int, default 90
        Number of future periods to forecast (previously hard-coded to 90).

    Returns
    -------
    tuple
        ``(identifier, forecast)`` where ``forecast`` is the result of
        ``model.predict`` on the future-only dataframe.
    """
    idd, timeserie = raw_in[0], raw_in[1]
    # Yearly and daily seasonality are switched off explicitly.
    model = Prophet(yearly_seasonality=False, daily_seasonality=False)
    model.fit(timeserie)
    # include_history=False: only the future dates are predicted, not the fitted range.
    future = model.make_future_dataframe(periods=periods, include_history=False)
    forecast = model.predict(future)
    return (idd, forecast)



In [27]:
# Smoke test: forecast a single store (the first one) and preview the top rows
# of the returned forecast frame.
f = run_prophet(hist_list[0])
f[1].head(n=5)

Unnamed: 0,ds,trend,yhat_lower,yhat_upper,trend_lower,trend_upper,additive_terms,additive_terms_lower,additive_terms_upper,weekly,weekly_lower,weekly_upper,multiplicative_terms,multiplicative_terms_lower,multiplicative_terms_upper,yhat
0,2017-04-23,15.405555,-10.129902,12.092882,15.405555,15.405555,-14.610081,-14.610081,-14.610081,-14.610081,-14.610081,-14.610081,0.0,0.0,0.0,0.795474
1,2017-04-24,15.374386,-2.879692,19.662629,15.374386,15.374386,-7.02448,-7.02448,-7.02448,-7.02448,-7.02448,-7.02448,0.0,0.0,0.0,8.349906
2,2017-04-25,15.343217,2.759732,25.223538,15.343217,15.343217,-2.120749,-2.120749,-2.120749,-2.120749,-2.120749,-2.120749,0.0,0.0,0.0,13.222468
3,2017-04-26,15.312049,7.638902,29.769596,15.312024,15.312049,3.162548,3.162548,3.162548,3.162548,3.162548,3.162548,0.0,0.0,0.0,18.474596
4,2017-04-27,15.28088,4.481972,26.080419,15.280294,15.281262,-0.343313,-0.343313,-0.343313,-0.343313,-0.343313,-0.343313,0.0,0.0,0.0,14.937568


# Non-parallel: first 10 stores

In [28]:
# Sequential baseline: forecast the first 10 stores one after another
# and report the wall-clock time taken.
series = hist_list[:10]
start_time = time.time()
result = [run_prophet(store_series) for store_series in tqdm(series)]
print("--- %s seconds ---" % (time.time() - start_time))

100%|██████████| 10/10 [00:27<00:00,  2.71s/it]

--- 27.07002902030945 seconds ---





# Parallel prediction: first 10 stores

In [31]:
# == Does not work == seems to hang indefinitely

# from multiprocessing import Pool, cpu_count
# series = hist_list[:10] 

# p = Pool(cpu_count()-1)
# print(p)
# predictions = list(tqdm(p.imap(run_prophet, series), total=len(series)))
# p.close()
# p.join()
# print("--- %s seconds ---" % (time.time() - start_time))

# Try exact toy example

https://medium.com/spikelab/forecasting-multiples-time-series-using-prophet-in-parallel-2515abd1a245
    

In [35]:
import pandas as pd
import numpy as np
def rnd_timeserie(min_date, max_date, seed=None):
    """Build a toy random time series with ``ds`` / ``y`` columns.

    Parameters
    ----------
    min_date, max_date :
        Bounds passed to ``pd.date_range`` (both ends included, default
        daily frequency).
    seed : int, optional
        When given, values are drawn from a private
        ``np.random.RandomState(seed)`` so the series is reproducible.
        When omitted, the global NumPy random state is used, as before.

    Returns
    -------
    pandas.DataFrame
        One row per date with columns ``ds`` (the dates) and ``y``
        (uniform random floats in [0, 10)), on a default 0..n-1 index.
    """
    time_index = pd.date_range(min_date, max_date)
    # Fall back to the module-level generator so unseeded calls behave as they always did.
    rng = np.random if seed is None else np.random.RandomState(seed)
    # A dict of array-likes already yields a 0..n-1 RangeIndex; no explicit index needed.
    dates = pd.DataFrame({'ds': pd.to_datetime(time_index.values)})
    dates['y'] = rng.random_sample(len(dates)) * 10
    return dates

SyntaxError: invalid syntax (<ipython-input-34-3ce43f55e4fc>, line 1)