In [None]:
from argparse import Namespace
from tqdm import tqdm

from agent import Agent
from levers import Levers
from memory import Memory

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Configuration

In [None]:
# Settings object handed to every `Trial`.
args = Namespace(
    k=10,         # passed to both Levers(k=...) and Agent(k=...)
    nsteps=1000,  # number of iterations of the loop in Trial.run
    eps=0.1,      # forwarded to Agent.select_action(eps=...)
)

In [None]:
class Trial:
    """A single run that pairs a newly built ``Levers`` with a newly built ``Agent``.

    Both collaborators are constructed with ``k=args.k``. The remaining
    settings (``args.nsteps`` and ``args.eps``) are read by :meth:`run`.
    """

    def __init__(self, args: Namespace):
        """Keep a reference to ``args`` and create the levers and the agent."""
        self.args = args
        self.levers = Levers(k=args.k)
        self.agent = Agent(k=args.k)

    def run(self) -> pd.Series:
        """Play ``args.nsteps`` steps and return the running average reward.

        On each step the agent picks a lever via ``select_action(eps=args.eps)``,
        the lever is pulled, and the reward is both appended to
        ``agent.memory.received_rewards`` and passed to
        ``agent.memory.update_estimates``.

        Returns:
            A ``pd.Series`` whose i-th entry is the mean of the first ``i + 1``
            values in ``agent.memory.received_rewards``.
        """
        for _ in range(self.args.nsteps):
            chosen = self.agent.select_action(eps=self.args.eps)
            payoff = self.levers.pull(chosen)
            memory = self.agent.memory
            memory.received_rewards.append(payoff)
            memory.update_estimates(chosen, payoff)

        rewards = pd.Series(self.agent.memory.received_rewards)
        # The default index is 0-based, so index + 1 is the count of rewards so far.
        rewards_so_far = (rewards.index + 1).to_numpy()
        return rewards.cumsum() / rewards_so_far

## Training

In [None]:
# eps values to compare; each one maps to the list of per-trial result Series
# returned by Trial.run.
epss = [0.0, 0.01, 0.1]
trial_results = {eps: [] for eps in epss}

# Number of trials run for every eps value.
ntrials = 2000

for eps in epss:
    # Build a per-sweep copy of the settings instead of rewriting the shared
    # `args.eps` in place, so the configuration cell's values are never
    # changed by running this cell.
    trial_args = Namespace(**{**vars(args), "eps": eps})
    print(f"Now training with eps = {eps}")
    for _ in tqdm(range(ntrials)):
        # Each iteration constructs a new Trial (and with it a new Levers and
        # a new Agent), runs it, and keeps only the resulting Series.
        trial_result = Trial(trial_args).run()
        trial_results[eps].append(trial_result)

In [None]:
# For every eps value, place the per-trial Series side by side (one column per
# trial) and take the row-wise mean across those columns.
averaged_rewards = {
    eps: pd.concat(trial_results[eps], axis=1).mean(axis=1)
    for eps in epss
}

In [None]:
# Quick check: show which eps values have an averaged curve available.
averaged_rewards.keys()

In [None]:
# Draw one curve per eps value present in `averaged_rewards` (rather than
# hard-coding the keys), and label everything so the figure can be read on
# its own.
fig, ax = plt.subplots()
for eps, curve in averaged_rewards.items():
    ax.plot(curve, label=f"eps = {eps}")
ax.set(
    xlabel="Step",
    ylabel="Running average reward (mean over trials)",
    title="Average reward by eps",
)
ax.legend();