In [337]:
%load_ext autoreload
%autoreload 2

import os

if 'cachai' not in os.listdir('.'):
    os.chdir('../')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [338]:
from enum import Enum, auto


class ObservationType(Enum):
    HIT = auto()
    MISS = auto()
    STALE = auto()
    VALID_TTL = auto()

    def __str__(self):
        return self.name

In [339]:
import numpy as np
import pandas as pd
from sklearn.metrics import root_mean_squared_error, mean_absolute_error


class TTLSimulator:

    def __init__(self, iterations=1_000):
        self._iterations = iterations
        self._target_params = [
            (50, 5),
            (200, 10),
            (400, 30),
        ]
        # means = np.linspace(10, 500, 10).astype(int)
        # std = np.arange(1, len(means) + 1)**2
        # self._target_params = np.array([means, std]).T

    def generate_features(self, target, num_features=1, correlation=0.8):
        cov_matrix = np.eye(num_features) * (1 - correlation) + np.ones((num_features, num_features)) * correlation
        features = np.random.multivariate_normal(np.ones(num_features) * target, cov_matrix)
        return features.reshape(1, -1)
        # return np.full((1, num_features), fill_value=target)

    def update_target_params(self, progress):
        target_params = []
        for param in self._target_params:
            mean = float(
                round(param[0]*np.sin(progress*2*np.pi/2)/(param[0]/2) + param[0], 2)
            )
            std = param[1]
            target_params.append((mean, std))
        self._target_params = target_params

    def feedback(self, y_true, y_pred):
        observation_time = int(min(y_true, y_pred)[0])
        hits = max(0, observation_time-1)
        observation_type = None
        if y_pred[0] < y_true[0]:
            observation_type = ObservationType.MISS
        elif y_pred[0] > y_true[0]:
            observation_type = ObservationType.STALE
        else:
            observation_type = ObservationType.VALID_TTL
        return observation_time, observation_type, hits

    def generate(self):
        target_param_index = np.random.randint(0, len(self._target_params))
        target_params = self._target_params[target_param_index]
        y = np.random.normal(target_params[0], target_params[1], 1)
        X = self.generate_features(y)
        return X, y

In [340]:
class Experiment():

    DF_COLUMNS = [
        'experiment_name', 'model_name', 'iteration', 'observation_type', 'observation_time',
        'y_true', 'y_pred', 'hits', 'mae'
    ]

    def __init__(self, simulator, iterations=1_000):
        self._simulator = simulator
        self._iterations = iterations

    def run(self, experiments):
        df = []
        for experiment in experiments:
            experiment_name = experiment['name']
            model = experiment['model']
            for i in range(self._iterations):
                X, y_true = self._simulator.generate()
                y_pred = model.predict(X)
                observation_time, observation_type, hits = self._simulator.feedback(y_true, y_pred)
                mae = mean_absolute_error(y_true, y_pred)
                model.observe(observation_time, observation_type, hits, y_pred)
                df.append([
                    experiment_name, model.NAME, i, observation_type, observation_time,
                    y_pred[0], y_true[0], hits, mae
                ])
        return pd.DataFrame(df, columns=Experiment.DF_COLUMNS)

In [341]:
def evaluate(df):
    metrics = []
    y_true = df['y_true']
    y_pred = df['y_pred']
    hits = df['hits']
    rmse = root_mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    metrics.append([rmse, mae, hits.sum(), len(df)])
    return pd.DataFrame(metrics, columns=[
        'rmse', 'mae', 'hits', 'len'
    ])

In [342]:
from abc import ABC, abstractmethod


class BaseModel(ABC):

    @property
    @abstractmethod
    def NAME(self) -> str:
        pass

    @abstractmethod
    def predict(self, X: np.array) -> np.array:
        pass

    @abstractmethod
    def observe(
        self,
        observation_time: int,
        observation_type: ObservationType,
        hits: int,
        prev_prediction: float
    ) -> None:
        pass

In [343]:
class Model(BaseModel):
    NAME = 'OPTIMAL'

    def __init__(self):
        pass

    def predict(self, X):
        output = X.mean()
        return np.array([output])

    def observe(self, observation_time, observation_type, hits, prev_prediction):
        a = 2+2

In [344]:
simulator = TTLSimulator()
experiment = Experiment(simulator=simulator)
df = experiment.run([
    {
        'name': 'test 1',
        'model': Model(),
    },
    {
        'name': 'test 2',
        'model': Model(),
    }
])
df

Unnamed: 0,experiment_name,model_name,iteration,observation_type,observation_time,y_true,y_pred,hits,mae
0,test 1,OPTIMAL,0,STALE,222,223.816042,222.902006,221,0.914035
1,test 1,OPTIMAL,1,MISS,190,190.836575,191.629154,189,0.792578
2,test 1,OPTIMAL,2,MISS,47,47.824450,48.794813,46,0.970362
3,test 1,OPTIMAL,3,MISS,365,365.306744,367.617022,364,2.310277
4,test 1,OPTIMAL,4,STALE,196,196.725712,196.204182,195,0.521531
...,...,...,...,...,...,...,...,...,...
1995,test 2,OPTIMAL,995,STALE,207,208.867573,207.973498,206,0.894075
1996,test 2,OPTIMAL,996,MISS,424,424.508040,425.828477,423,1.320437
1997,test 2,OPTIMAL,997,STALE,52,52.401968,52.285087,51,0.116881
1998,test 2,OPTIMAL,998,STALE,52,54.985911,52.943788,51,2.042123


In [345]:
df.groupby(['experiment_name', 'model_name']).apply(evaluate)

  df.groupby(['experiment_name', 'model_name']).apply(evaluate)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,rmse,mae,hits,len
experiment_name,model_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
test 1,OPTIMAL,0,0.982372,0.798285,211672,1000
test 2,OPTIMAL,0,0.987047,0.789464,220437,1000
