In [1]:
%load_ext autoreload
%autoreload 2 
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import os
import pandas as pd
import numpy as np 
import pickle as pk
import glob
from fbprophet import Prophet
import sys
sys.path.append('../')
import wiki
from wiki import utils 
import multiprocessing as mp
total_proc = None
from tqdm import tqdm
from IPython.display import clear_output

In [3]:
# Root directory holding the Prophet inputs (pagedf.f, ds.f) and, per version,
# the forecast frames and pickled models written by this notebook.
PROPHET_PATH = '../data/prophet/'
# Sub-directory of PROPHET_PATH where the per-version score table is saved.
RESULTS_PATH = 'results/'

In [4]:
# Load the per-page traffic table and the shared date column from feather files.
pagedf = pd.read_feather(os.path.join(PROPHET_PATH, 'pagedf.f'))
ds = pd.read_feather(os.path.join(PROPHET_PATH, 'ds.f'))

## Version 1.6
Should set version directory name in next cell. Should describe version specifics (outliers, holidays, validation period)

* Validation on the last 60 days (training rows are `iloc[0:-60]`)
* No outlier fixing
* Linear growth
* Truncating predictions at 0
* Fill NaNs in the first row with 0.001
* Fill ALL other NaNs with 0

### Remarks
* ?

In [5]:
# Version tag; doubles as the output sub-directory name (keep the trailing slash).
VERSION = 'v1.6/'
# (start, stop) positional bounds of the training window: everything except
# the final 60 rows, which are held out for validation.
val_lims = (0,-60)
# Deliberately no exist_ok: this raises if the version directory is already
# there, which protects earlier results from being overwritten by accident.
os.makedirs(os.path.join(PROPHET_PATH, VERSION))

In [6]:
# Replace NaNs in the row(s) up to and including index label 0 with a small
# positive placeholder value.
pagedf.loc[:0] = pagedf.loc[:0].fillna(value=0.001)

In [7]:
# Every NaN that is still left becomes 0.
pagedf = pagedf.fillna(0)
pagedf.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,145053,145054,145055,145056,145057,145058,145059,145060,145061,145062
0,18.0,11.0,1.0,35.0,0.001,12.0,0.001,118.0,5.0,6.0,...,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001
1,11.0,14.0,0.0,13.0,0.0,7.0,0.0,26.0,23.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5.0,15.0,1.0,10.0,0.0,4.0,0.0,30.0,14.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,13.0,18.0,1.0,94.0,0.0,5.0,0.0,24.0,12.0,12.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,14.0,11.0,0.0,4.0,0.0,20.0,0.0,29.0,9.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# WARNING:
Turned off the chained-assignment warning. Slicing a DataFrame can sometimes return a copy instead of a view,
which means an assignment made on the slice won't be applied to the actual base df.
Not sure why it's still complaining at me when I'm using .loc for assignments everywhere... shitty

In [8]:
# Silence pandas' chained-assignment (SettingWithCopy) warning for the whole session.
pd.set_option('mode.chained_assignment', None)

In [9]:
def process_page(page):
    """Fit a Prophet model to one page's series, save the outputs and score it.

    Joins the shared date frame ``ds`` with column ``page`` of ``pagedf``,
    fits Prophet on the rows selected by ``val_lims``, forecasts over every
    row of ``ds``, then writes the forecast frame (feather) and the fitted
    model (pickle) under ``PROPHET_PATH + VERSION``.

    Parameters
    ----------
    page : str
        Column label in ``pagedf``. It is also concatenated into the output
        file names, so it has to be a string.

    Returns
    -------
    tuple
        ``(page, full_smape, val_smape)``: SMAPE over all rows, and SMAPE
        over only the rows that were not used for training.
    """
    df = ds.join(pagedf[page])
    # assumes `ds` carries exactly one column (the dates) -- TODO confirm
    df.columns = ['ds', 'y']

    # With val_lims = (0, -60) this trains on everything but the last 60 days.
    # Should also consider validating on the period we are actually forecasting.
    # `.assign` builds a new frame, so the flag is never written onto a slice
    # of `df` (that write is what triggers pandas' chained-assignment warning).
    # feather won't serialize bool, hence 1s and 0s.
    traindf = df.iloc[val_lims[0]:val_lims[1]].assign(train=1)
    # do outlier removal here
    #traindf.loc[traindf.y > traindf.y.quantile(.95), ['y']] = None

    m = Prophet(yearly_seasonality=True)
    m.fit(traindf)
    forecast = m.predict(ds)

    # Keep the raw prediction, then truncate negative forecasts at zero.
    forecast['yhat_org'] = forecast['yhat']
    forecast['yhat'] = forecast['yhat'].clip(lower=0.0)

    # Attach the observed values and the train flag; rows outside the
    # training window have no flag after the join and are marked 0.
    forecast = forecast.join(df.y)
    forecast = forecast.join(traindf.loc[:, ['train']]).fillna({'train': 0})

    forecast.to_feather(PROPHET_PATH+VERSION+page+'df.f')
    with open(PROPHET_PATH+VERSION+page+'m.pk', 'wb') as fh:
        pk.dump(m, fh)

    is_val = forecast['train'] == 0
    full_smape = wiki.val.smape(forecast.y, forecast.yhat)
    val_smape = wiki.val.smape(forecast.loc[is_val, 'y'], forecast.loc[is_val, 'yhat'])
    return (page, full_smape, val_smape)

In [10]:
def wrapper(pages):
    """Run ``process_page`` on each entry of ``pages`` behind a tqdm progress bar.

    Returns the per-page result tuples as a list, in the order of ``pages``.
    """
    return [process_page(page) for page in tqdm(pages)]

In [11]:
# Keep only the columns up to and including label '4' (label-based slice).
# NOTE(review): this looks like a small smoke-test subset -- widen or drop the
# slice to process every page; confirm intent.
pagedf = pagedf.loc[:,:'4']

In [12]:
# Use half of the available cores, but never fewer than one worker: on a
# single-core machine `cpu_count() >> 1` is 0, which would make both the
# column split and the pool creation raise.
total_proc = max(1, mp.cpu_count() >> 1)

In [13]:
# Partition the page columns into one chunk per worker, then start a pool
# with that many worker processes.
col_split = np.array_split(pagedf.columns, indices_or_sections=total_proc)
mp_pool = mp.Pool(processes=total_proc)

100%|██████████| 2/2 [00:03<00:00,  1.87s/it]
100%|██████████| 3/3 [00:04<00:00,  1.68s/it]


In [14]:
# Fan the column chunks out across the worker pool: each worker runs `wrapper`
# on one chunk and hands back that chunk's list of result tuples, so
# `val_results` is a list of lists.
# `%time` reports the timing of the map call; `utils.clock()` is a project
# helper -- judging by the cell output it prints the elapsed time (TODO confirm).
with utils.clock():
    %time val_results = mp_pool.map(wrapper, col_split)

CPU times: user 6.59 ms, sys: 6.11 ms, total: 12.7 ms
Wall time: 4.91 s
Elapsed time 4.913249969482422 seconds


In [15]:
# Flatten the per-worker lists into one list of (page, full, val) tuples and
# tabulate them; the score columns are prefixed with the version tag.
val_results = pd.DataFrame(
    [row for chunk in val_results for row in chunk],
    columns=['page_index', VERSION[:-1]+'_full', VERSION[:-1]+'_val'],
)

In [16]:
# The results directory is shared between versions and nothing above creates
# it, so make sure it exists before writing (no-op when it is already there).
os.makedirs(PROPHET_PATH+RESULTS_PATH, exist_ok=True)
# Persist this version's score table next to those of the other versions.
val_results.to_feather(PROPHET_PATH+RESULTS_PATH+VERSION[:-1]+'df.f')