In [1]:
# Load the autoreload extension and set it to mode 2, so edits to imported
# modules (e.g. the local `wiki` package) are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2 
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import os
import pandas as pd
import numpy as np 
import pickle as pk
import glob
from fbprophet import Prophet
import sys
sys.path.append('../')
import wiki
from wiki import utils 
import multiprocessing as mp
total_proc = None
from tqdm import tqdm
from IPython.display import clear_output

In [3]:
# Root directory for the Prophet input frames and the per-version output folders.
# The trailing '/' is required: paths below are built by plain string concatenation.
PROPHET_PATH = '../data/prophet/'

In [4]:
# Show what already exists under PROPHET_PATH (input feather files and earlier version directories).
glob.glob(PROPHET_PATH+'*')

['../data/prophet/ds.f',
 '../data/prophet/page_index.f',
 '../data/prophet/pagedf.f',
 '../data/prophet/v0',
 '../data/prophet/v0.1',
 '../data/prophet/v0.2']

In [5]:
# Load the two input frames from PROPHET_PATH:
#   pagedf - one column per page (each column is used as a series by process_page)
#   ds     - the frame joined with a page column to form Prophet's ('ds', 'y') input
pagedf, ds = (
    pd.read_feather(PROPHET_PATH + filename)
    for filename in ('pagedf.f', 'ds.f')
)

## Version 0.2
Should set version directory name in next cell. Should describe version specifics (outliers, holidays, validation period). How will the validation period be communicated to the SMAPE scoring code?

* Val indexing on -60
* No outlier fixing
* Linear growth
* Truncating predictions at 0

### Remarks
* ?

In [6]:
# should break if the dir already exists - avoids accidental overwriting
# (os.makedirs raises FileExistsError when the directory is already present)
# VERSION is the output sub-directory under PROPHET_PATH; the trailing '/' is
# required because output paths are built by string concatenation.
VERSION = 'v0.2/'
# (start, stop) positions used with .iloc to pick the training rows:
# (0, -60) trains on everything except the last 60 rows, which form the validation set.
val_lims = (0,-60)
os.makedirs(PROPHET_PATH+VERSION)

FileExistsError: [Errno 17] File exists: '../data/prophet/v0.2/'

In [7]:
# Keep only the columns up to and including the label '10' (.loc label slices are
# inclusive on both ends). Presumably a small subset for a quick trial run -
# remove this line to process every page.
pagedf = pagedf.loc[:,:'10']

# WARNING:
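Turned off the chained-assignment warning. When slicing DataFrames, pandas can sometimes return a copy instead of a view,
which means an assignment won't be applied to the actual base DataFrame.
Not sure why it's still complaining when I'm using .loc for assignments everywhere... annoying.
A likely trigger is assigning a new column to a DataFrame that was itself obtained by slicing (e.g. via `.iloc`); taking an explicit `.copy()` of the slice first avoids the warning.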

In [8]:
# Silence pandas' SettingWithCopyWarning for the whole session
# (None disables the check; the other accepted values are 'warn' and 'raise').
pd.options.mode.chained_assignment = None

In [9]:
def process_page(page):
    """Fit a Prophet model to one page's series, save the results and score them.

    Reads the notebook-level ``ds``, ``pagedf``, ``val_lims``, ``PROPHET_PATH``
    and ``VERSION``. As a side effect writes two files into
    ``PROPHET_PATH + VERSION``: ``<page>df.f`` (the forecast frame, feather)
    and ``<page>m.pk`` (the pickled fitted model).

    Parameters
    ----------
    page : str
        Column label in ``pagedf`` selecting the series to model; also used
        as the prefix of the output file names.

    Returns
    -------
    tuple
        ``(page, full_smape, val_smape, full_smape_df, val_smape_df)``:
        ``wiki.val.smape`` / ``wiki.val.smape_df`` evaluated over all rows and
        over the validation rows only (those not used for training).
    """
    print(page)
    df = ds.join(pagedf[page])
    # assumes `ds` contributes exactly one column, so the joined frame has two - TODO confirm
    df.columns = ['ds', 'y']

    # note this is doing validation on the rows excluded by val_lims (the last 60 days
    # for val_lims == (0, -60)); should also consider doing validation on the time
    # period we are forecasting.
    # .copy() makes traindf an independent frame, so adding the 'train' column below is
    # an unambiguous assignment rather than a write to a slice of `df`.
    traindf = df.iloc[val_lims[0]:val_lims[1]].copy()
    traindf['train'] = 1  # feather won't serialize bool so 1s and 0s...
    # TODO: outlier removal would go here (e.g. blank out y above its 95th percentile).

    m = Prophet(yearly_seasonality=True)
    m.fit(traindf)

    # Predict over every date in `ds`, i.e. training and validation rows alike.
    forecast = m.predict(ds)
    forecast['yhat_org'] = forecast['yhat']  # keep the untruncated prediction
    forecast.loc[forecast['yhat'] < 0, ['yhat']] = 0.0  # truncate negative predictions at 0
    # Index-aligned joins - assumes forecast and df share the same index; TODO confirm.
    forecast = forecast.join(df.y)
    # Rows outside the training slice have no 'train' value after the join -> mark them 0.
    forecast = forecast.join(traindf.loc[:, ['train']]).fillna({'train': 0})

    out_prefix = PROPHET_PATH + VERSION + page
    forecast.to_feather(out_prefix + 'df.f')
    with open(out_prefix + 'm.pk', 'wb') as file:
        pk.dump(m, file)

    # Build the validation subset once instead of re-filtering for every metric.
    val_rows = forecast[forecast['train'] == 0]
    full_smape = wiki.val.smape(forecast.y, forecast.yhat)
    val_smape = wiki.val.smape(val_rows.y, val_rows.yhat)
    full_smape_df = wiki.val.smape_df(forecast, 'y', 'yhat')
    val_smape_df = wiki.val.smape_df(val_rows, 'y', 'yhat')
    return (page, full_smape, val_smape, full_smape_df, val_smape_df)

In [10]:
def wrapper(pages):
    """Run ``process_page`` on every page in ``pages`` behind a tqdm progress bar.

    Used as the per-chunk task handed to the multiprocessing pool.

    Returns
    -------
    list
        One ``process_page`` result tuple per page, in input order.
    """
    return [process_page(page) for page in tqdm(pages)]

In [11]:
# Number of worker processes: one per CPU reported by the OS.
total_proc = mp.cpu_count()

In [12]:
# Split the page column labels into total_proc chunks of near-equal size, one per worker.
col_split = np.array_split(pagedf.columns, total_proc)
# NOTE(review): no close()/join() on this pool is visible in the notebook - confirm the
# worker processes are cleaned up, or shut the pool down once the results are collected.
mp_pool = mp.Pool(total_proc)

  0%|          | 0/3 [00:00<?, ?it/s]

3


  0%|          | 0/2 [00:00<?, ?it/s]

6
0
9


 33%|███▎      | 1/3 [00:02<00:04,  2.43s/it]

1


 50%|█████     | 1/2 [00:02<00:02,  2.47s/it]

10


 33%|███▎      | 1/3 [00:02<00:04,  2.49s/it]

4


 33%|███▎      | 1/3 [00:02<00:05,  2.56s/it]

7


 67%|██████▋   | 2/3 [00:04<00:02,  2.46s/it]

5


 67%|██████▋   | 2/3 [00:04<00:02,  2.45s/it]


2


 67%|██████▋   | 2/3 [00:04<00:02,  2.52s/it]

8


100%|██████████| 3/3 [00:06<00:00,  2.31s/it]
100%|██████████| 3/3 [00:06<00:00,  2.30s/it]
100%|██████████| 3/3 [00:06<00:00,  2.35s/it]


In [13]:
# Fan the column chunks out to the pool; `fin` receives one list of result tuples per chunk.
# The run is timed twice: by utils.clock() and by the %time magic (both appear in the output).
with utils.clock():
    %time fin = mp_pool.map(wrapper, col_split)

CPU times: user 15.6 ms, sys: 12.5 ms, total: 28 ms
Wall time: 6.97 s
Elapsed time 6.9691290855407715 seconds


In [14]:
# Flatten the per-chunk result lists into a single list of per-page tuples.
fin2 = []
for sublist in fin:
    fin2.extend(sublist)

In [15]:
# Inspect the flattened results: one (page, full_smape, val_smape, full_smape_df, val_smape_df) tuple per page.
fin2

[('0',
  50.762019028294311,
  63.631257611154631,
  50.762019028294311,
  63.631257611154631),
 ('1',
  47.47865101487028,
  51.293459271888665,
  47.47865101487028,
  51.293459271888665),
 ('2',
  69.77089773423242,
  59.916423429826509,
  69.77089773423242,
  59.916423429826509),
 ('3',
  47.294630658769499,
  44.912849581367347,
  47.294630658769499,
  44.912849581367347),
 ('4',
  112.88369983371396,
  147.78164345169722,
  112.88369983371396,
  147.78164345169722),
 ('5',
  41.312534789626149,
  47.515460366720134,
  41.312534789626149,
  47.515460366720134),
 ('6',
  90.528941048415064,
  156.93273752848256,
  90.528941048415064,
  156.93273752848256),
 ('7',
  61.19347087119435,
  68.60909386853335,
  61.19347087119435,
  68.60909386853335),
 ('8',
  34.319141132261748,
  41.220362491473452,
  34.319141132261748,
  41.220362491473452),
 ('9',
  26.1060112287458,
  29.197131049681982,
  26.1060112287458,
  29.197131049681982),
 ('10',
  84.920968024207241,
  65.561638792196149,


In [16]:
# Tabulate the per-page scores; metric columns are prefixed with the version name
# (VERSION without its trailing '/').
version_tag = VERSION[:-1]
pd.DataFrame(
    fin2,
    columns=['index'] + [
        version_tag + suffix
        for suffix in ('_full', '_val', '_full_df', '_val_df')
    ],
)

Unnamed: 0,index,v0.2_full,v0.2_val,v0.2_full_df,v0.2_val_df
0,0,50.762019,63.631258,50.762019,63.631258
1,1,47.478651,51.293459,47.478651,51.293459
2,2,69.770898,59.916423,69.770898,59.916423
3,3,47.294631,44.91285,47.294631,44.91285
4,4,112.8837,147.781643,112.8837,147.781643
5,5,41.312535,47.51546,41.312535,47.51546
6,6,90.528941,156.932738,90.528941,156.932738
7,7,61.193471,68.609094,61.193471,68.609094
8,8,34.319141,41.220362,34.319141,41.220362
9,9,26.106011,29.197131,26.106011,29.197131


# Confirms that the safe SMAPE_df and the normal SMAPE give the same results
* So we don't need to worry (at all?) about the NaNs when computing SMAPE

In [17]:
# Reload the forecast frame that process_page saved for page '6'; used below to time the two SMAPE implementations.
df6 = pd.read_feather(PROPHET_PATH+VERSION+'6df.f')

In [18]:
%%timeit
# Time the series-based SMAPE on page 6's actuals vs. predictions.
wiki.val.smape(df6.y,df6.yhat)

2.92 ms ± 192 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [19]:
%%timeit
# Time the DataFrame-based SMAPE on the same frame and columns, for comparison.
wiki.val.smape_df(df6,'y','yhat')

3.75 ms ± 293 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


# Confirms the original SMAPE is faster