### Project: Web Traffic Forecasting


In [None]:
import edward as ed
import numpy as np
import tensorflow as tf
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import timedelta



from models import *
from utils import *
from pipeline import *
from cross_validation import cross_validation

In [None]:
%matplotlib inline
import matplotlib
# Notebook-wide plot styling: large figures, thicker lines, bigger tick
# labels. Tick labels are drawn in white ('w') -- presumably for a dark
# notebook theme; NOTE(review): confirm, they are invisible on white.
matplotlib.rcParams.update({
    'figure.figsize': (24, 12),
    'lines.linewidth': 2,
    'xtick.labelsize': 18,
    'ytick.labelsize': 18,
    'xtick.color': 'w',
    'ytick.color': 'w',
})

## Data

In [None]:
def get_timeseries(path):
    """Load a wide CSV (one row per page) into one DataFrame per page.

    Every row of the CSV becomes a two-column DataFrame: "ds" holds the
    row's column labels and "views" the matching cell values, both taken
    from the second column onwards. The page name is printed as each row
    is processed.

    Args:
        path: location of the CSV file, passed straight to pd.read_csv.
            The file must have a "Page" column; the [1:] slicing assumes
            it is the first column -- TODO confirm for other inputs.

    Returns:
        dict mapping each row's Page value to its DataFrame.
    """
    table = pd.read_csv(path)
    print("Loading timeseries:")
    series_by_page = {}
    for _, record in table.iterrows():
        page = record.Page
        # Skip position 0 so only the per-date entries remain.
        series_by_page[page] = pd.DataFrame({
            "ds": record.index[1:],
            "views": record.values[1:],
        })
        print(page)
    return series_by_page

FPATH = "./data/nfl_teams.csv"
timeseries = get_timeseries(FPATH)

# Pages to model in this run. Swap in the commented line below to use
# every loaded page except the last one.
pages = [
    "Atlanta_Falcons_en.wikipedia.org_mobile-web_all-agents",
    "Dallas_Cowboys_en.wikipedia.org_mobile-web_all-agents",
]
# pages = list(timeseries.keys())[:-1]

# One prepared DataFrame per selected page, in the same order as `pages`.
ts_dfs = []
for p in pages:
    print("Preparing timeseries %s" % p)
    ts_dfs.append(setup_dataframe(timeseries[p]))

## Preprocess

In [None]:
# Split every series into history (train) and future (test) at `sdate`.
# pd.Timestamp is used instead of the `pd.datetime` alias, which was
# deprecated in pandas 1.0 and removed in 2.0; Timestamp is a
# datetime.datetime subclass, so it is accepted wherever a datetime was.
sdate = pd.Timestamp(2017, 7, 10)
ts_data = []
for df in ts_dfs:
    history, future, y_scale = split_train_test(df, sdate)
    ts_data.append({
        "history": history, "future": future, "y_scale": y_scale
    })

print("Extracting features")
# Features are extracted from the first series only and reused for all of
# them (assumes every series covers the same dates -- TODO confirm).
ts = ts_data[0]
train_data = extract_features(ts["history"])
# The test features reuse the changepoints found on the training window.
test_data = extract_features(ts["future"], changepoints_t=train_data["t_change"])
# Sanity checks: train and test must share feature columns and changepoints.
assert all(train_data["X"].columns == test_data["X"].columns)
assert all(train_data["t_change"] == test_data["t_change"])

## Model

In [None]:
def visualize_results(ts_data, predictions, metrics):
    """Plot held-out actuals against predictions, then print a metrics table.

    Args:
        ts_data: sequence of dicts; each dict's "future" entry is indexed
            with "ds" and "y_scaled".
        predictions: sequence aligned with ts_data; each item is indexed
            with "ds" and "y_scaled_pred".
        metrics: passed to pd.DataFrame.from_dict; a 'mean' row holding the
            column means is added before the table is printed.
    """
    # Iterate over the ts_data argument itself rather than the notebook-level
    # ts_dfs, so the function plots exactly the series it was handed.
    for i, ts in enumerate(ts_data):
        future = ts["future"]
        plt.plot(future["ds"], future["y_scaled"])
        plt.plot(predictions[i]["ds"], predictions[i]["y_scaled_pred"], '#2ca02c')
        plt.show()
    m_pd = pd.DataFrame.from_dict(metrics)
    m_pd.loc['mean'] = m_pd.mean()
    print(m_pd)
    print()

def visualize_cross_validation(ts_dfs, predictions, metrics):
    """Plot cross-validation forecasts over the actuals and print avg errors.

    Args:
        ts_dfs: sequence of DataFrames with "ds" and "y" columns; only rows
            with "ds" after 2016-06-01 are plotted.
        predictions: sequence with one entry per cross-validation run; entry
            `k` is itself a sequence aligned with ts_dfs whose items are
            indexed with "ds" and "y_pred".
        metrics: sequence aligned with predictions; each entry is an
            iterable of dicts carrying "MAPE" and "SMAPE".
    """
    # pd.Timestamp replaces the pd.datetime alias (removed in pandas 2.0).
    plot_start = pd.Timestamp(2016, 6, 1)
    for i, df in enumerate(ts_dfs):
        recent = df[df["ds"] > plot_start]
        plt.plot(recent["ds"], recent["y"])
        # Overlay this series' forecast from every cross-validation run.
        for pred in predictions:
            plt.plot(pred[i]["ds"], pred[i]["y_pred"], '#2ca02c')
        plt.show()

    # Collect the rows first and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    rows = []
    for i, m_cutoff in enumerate(metrics):
        # The first series' prediction dates give the run's date span.
        run_dates = predictions[i][0]["ds"]
        rows.append({
            "start": run_dates.min(),
            "end": run_dates.max(),
            "MAPE_avg": np.mean([m["MAPE"] for m in m_cutoff]),
            "SMAPE_avg": np.mean([m["SMAPE"] for m in m_cutoff]),
        })
    metrics_df = pd.DataFrame(rows, columns=['start', 'end', 'MAPE_avg', 'SMAPE_avg'])

    print(metrics_df)

In [None]:
# Run the pipeline once per candidate model and keep each model's
# predictions and metrics, in model order.
candidate_models = [Model1(), Model2(), Model3()]
results = []
for model in candidate_models:
    model_predictions, model_metrics = pipeline(
        ts_data, model, train_data, test_data, ITR=5000
    )
    results.append({"predictions": model_predictions, "metrics": model_metrics})

In [None]:
# Show plots and the metrics table for each fitted model; the printed
# model number is the zero-based position in `results`.
for model_idx, outcome in enumerate(results):
    print("Model %d" % model_idx)
    visualize_results(ts_data, outcome["predictions"], outcome["metrics"])

## Benchmark

In [None]:
# def median_model(train, size, p=-50):
#     visits = np.nan_to_num(np.nanmedian(train[-p:]))
#     return np.ones(size) * visits

# for i, ts in enumerate(ts_data):
#     print("Median model for %d" % i)
#     y_true = ts["future"]["y_scaled"]
#     y_pred_median = median_model(ts["history"]["y_scaled"], len(y_true))
#     evaluate(y_true, y_pred_median)
#     print()

## Cross Validation

In [None]:
%%capture
# Cross-validate Model1 on all prepared series; the %%capture magic above
# keeps this cell's output out of the notebook.
predictions, metrics = cross_validation(ts_dfs, Model1())

In [None]:
# Plot the Model1 cross-validation forecasts over the actuals and print the
# averaged MAPE/SMAPE table.
visualize_cross_validation(ts_dfs, predictions, metrics)

In [None]:
%%capture
# Cross-validate Model2 on all prepared series; the %%capture magic above
# keeps this cell's output out of the notebook.
predictions2, metrics2 = cross_validation(ts_dfs, Model2())

In [None]:
# Plot the Model2 cross-validation forecasts over the actuals and print the
# averaged MAPE/SMAPE table.
visualize_cross_validation(ts_dfs, predictions2, metrics2)

In [None]:
%%capture
# Cross-validate Model3 on all prepared series; the %%capture magic above
# keeps this cell's output out of the notebook.
predictions3, metrics3 = cross_validation(ts_dfs, Model3())

In [None]:
# Plot the Model3 cross-validation forecasts over the actuals and print the
# averaged MAPE/SMAPE table.
visualize_cross_validation(ts_dfs, predictions3, metrics3)

In [None]:
# Training error 
# y_train_pred = np.array([sess.run([y_post], 
#                                   feed_dict={t: X_train['t'],
#                                              A: X_train['A'], X: X_train['X'].as_matrix(), 
#                                              sigmas: X_train['sigmas'], t_change: changepoints_t}
#                                                 #tau: changepoint_prior_scale}))
#                                  ) for _ in range(500)]).mean(axis=0)[0]

In [None]:
# # Posterior check
# kmean, kstddev = sess.run([qk.mean(), qk.stddev()])
# print("Inferred posterior k: mean = %f, stddev = %f" % (kmean, kstddev))
# mmean, mstddev = sess.run([qm.mean(), qm.stddev()])
# print("Inferred posterior m: mean = %f, stddev = %f" % (mmean, mstddev))
# tau_mean, tau_stddev = sess.run([qtau.mean(), qtau.stddev()])
# print("Inferred posterior tau: mean = %f, stddev = %f" % (tau_mean, tau_stddev))


# noise_mean, noise_stddev = sess.run([qsigma_obs.mean(), qsigma_obs.stddev()])
# print("Inferred posterior noise: mean = %f, stddev = %f" % (noise_mean, noise_stddev))

# nburn = 500
# stride = 10
# sns.distplot(qk.params.eval()[nburn:ITR:stride])
# plt.show()
# sns.distplot(qm.params.eval()[nburn:ITR:stride])
# plt.show()

# sns.distplot(qtau.params.eval()[nburn:ITR:stride])
# plt.show()