In [None]:
!pip install arize[MimicExplainer] -q

# Timeseries Forecasting
In this example, we show how timeseries forecasting data can be observed in the Arize platform by leveraging a lag: the delta between the "run date", which indicates the day the inference was made ***on***, and the "prediction timestamp", which indicates the day the prediction was made ***for***.

This example predicts the therms to be generated each day for the next 14 days, based on expected temperatures in western cities and renewable energy source outputs. Sales forecasts, product demand, etc. can all be ingested in a similar fashion.

In [3]:
import os
from datetime import datetime, date, timezone

import pandas as pd
from pandas.core.frame import DataFrame

from arize.pandas.logger import Client, Schema
from arize.utils.types import ModelTypes, Environments

In [185]:
# Setup Arize client.
# Credentials are read from the ARIZE_SPACE_KEY / ARIZE_API_KEY environment
# variables when they are set, so real keys never have to be saved inside the
# notebook. Without them, replace the placeholder strings below by hand.
SPACE_KEY = os.environ.get("ARIZE_SPACE_KEY", "SPACE_KEY")
API_KEY = os.environ.get("ARIZE_API_KEY", "API_KEY")
RECEIVER_URI = "https://api.arize.com/v1"

# Fail fast while either credential is still its placeholder value.
if SPACE_KEY == "SPACE_KEY" or API_KEY == "API_KEY":
    raise ValueError("❌ NEED TO CHANGE SPACE AND/OR API_KEY")

arize_client = Client(space_key=SPACE_KEY, api_key=API_KEY, uri=RECEIVER_URI)

### Prediction IDs need to be unique, even when several predictions are made for the same day, so actuals can be joined against the correct target. A hermetic way to accomplish this is by concatenating the run_date and the prediction_date.

---
### For the sake of this example, we are manipulating the timestamps so that they coincide with the time someone runs this notebook - for convenience purposes only.

In [191]:


def prep_data(df: DataFrame) -> "tuple[DataFrame, DataFrame]":
    """Re-anchor the sample data on today's date and split it by actuals availability.

    Every row is shifted by the same whole number of days so that the latest
    ``run_date`` lands on today; the gap between ``run_date`` and
    ``prediction_timestamp`` is therefore unchanged. A ``Date`` column and a
    compound ``prediction_id`` (``<run_date>_<Date>``) are then added.

    Note:
        ``df`` is modified in place: ``run_date`` and ``prediction_timestamp``
        are overwritten and the ``Date`` / ``prediction_id`` columns are added.

    Args:
        df: Frame with a ``run_date`` column (``YYYY-MM-DD`` strings or
            datetime values), a ``prediction_timestamp`` column in epoch
            seconds, and a ``reported_thermal`` column that is null wherever
            no actual is available.

    Returns:
        ``(historical_data, forecast_data)``: independent copies of the rows
        that do and do not have a ``reported_thermal`` value, respectively.
    """
    # Parsing with pandas accepts both "YYYY-MM-DD" strings and datetime values.
    run_dates = pd.to_datetime(df["run_date"])
    latest_run = run_dates.max()

    # We'll shift timestamps so the last "run" of the prediction timeline occurs today.
    # An empty frame has no latest run (NaT) and needs no shift.
    if pd.isna(latest_run):
        days_to_shift = 0
    else:
        days_to_shift = (pd.Timestamp.today().normalize() - latest_run.normalize()).days

    df["run_date"] = (run_dates + pd.DateOffset(days=days_to_shift)).dt.strftime("%Y-%m-%d")

    # Prediction dates are shifted by the same amount to line up with our model
    # use case, which predicts the next 14 days of energy use.
    seconds_per_day = 24 * 60 * 60
    df["prediction_timestamp"] = df["prediction_timestamp"] + days_to_shift * seconds_per_day
    df["Date"] = pd.to_datetime(df["prediction_timestamp"], unit="s").astype(str)

    # The same target date is predicted on several run dates leading up to it,
    # so the prediction ID is a compound key: the date the model ran plus the
    # date the model is forecasting for.
    df["prediction_id"] = df["run_date"] + "_" + df["Date"]

    # Rows with actuals form the historical set; rows predicted for future
    # dates (no actual yet) form the forecast set. Copies are returned so later
    # edits to either set never write through to `df`.
    has_actual = df["reported_thermal"].notnull()
    historical_data = df[has_actual].copy()
    forecast_data = df[~has_actual].copy()
    return historical_data, forecast_data

# Now we can send the data in.

In [None]:
# Load the example energy-consumption dataset.
df = pd.read_parquet('https://storage.googleapis.com/arize-assets/tutorials/fixture_data/timeseries_energy_consumption_example.parquet.out')

# First we prep the data, then we'll send it over to Arize for visualizations.
historical_data, forecast_data = prep_data(df)

# We'll list out feature columns for convenience: every column that is not an
# identifier, timestamp, label or tag. Filtering by membership (rather than
# dropping by name) does not fail when one of these columns is absent from `df`.
NON_FEATURE_COLUMNS = {
    "Date",
    "run_date",
    "prediction_id",
    "prediction_timestamp",
    "predicted_thermal",
    "reported_thermal",
    "lag",
}
features = [column for column in df.columns if column not in NON_FEATURE_COLUMNS]

MODEL_ID = "Timeseries-Forecast-Model-5"
MODEL_VERSION = "1.0"

# First send just the historical data (data which we already have actuals for)
historical_response = arize_client.log(
    dataframe=historical_data,
    model_id=MODEL_ID,
    model_version=MODEL_VERSION,
    model_type=ModelTypes.NUMERIC,
    environment=Environments.PRODUCTION,
    schema=Schema(
        prediction_id_column_name="prediction_id",
        feature_column_names=features,
        timestamp_column_name="prediction_timestamp",
        prediction_label_column_name="predicted_thermal",
        actual_label_column_name="reported_thermal",
        tag_column_names=[
            "run_date",  # date which the inference was made on
            "lag",  # number of days between run_date and prediction_timestamp
        ],
    ),
    # Since we don't have pre-calculated SHAP values, we can enable Surrogate
    # Explainability to get SHAP values for our features
    surrogate_explainability=True,
)

# Surface the outcome so that a rejected upload does not go unnoticed.
if historical_response.status_code == 200:
    print(f"✅ Logged {len(historical_data)} historical rows to Arize")
else:
    print(
        f"❌ Logging historical data failed with status "
        f"{historical_response.status_code}: {historical_response.text}"
    )

# Next send forecast data, notice we removed the actual_label_column_name
# argument from the Schema definition (these rows have no actuals yet)
forecast_response = arize_client.log(
    dataframe=forecast_data,
    model_id=MODEL_ID,
    model_version=MODEL_VERSION,
    model_type=ModelTypes.NUMERIC,
    environment=Environments.PRODUCTION,
    schema=Schema(
        prediction_id_column_name="prediction_id",
        feature_column_names=features,
        timestamp_column_name="prediction_timestamp",
        prediction_label_column_name="predicted_thermal",
        tag_column_names=[
            "run_date",  # date which the inference was made on
            "lag",  # number of days between run_date and prediction_timestamp
        ],
    ),
    surrogate_explainability=True,
)

# Surface the outcome so that a rejected upload does not go unnoticed.
if forecast_response.status_code == 200:
    print(f"✅ Logged {len(forecast_data)} forecast rows to Arize")
else:
    print(
        f"❌ Logging forecast data failed with status "
        f"{forecast_response.status_code}: {forecast_response.text}"
    )