In [2]:
from scipy.stats import norm
import torch.distributions as tdist
import numpy as np

sample_norm = norm(
    loc=13,
    scale=13 * .75
).cdf(21) + np.random.uniform(-0.1, 0, 100).mean()

sample_norm

0.739570586828752

In [3]:
sample_norm = norm(
    loc=21,
    scale=20 * .75
).cdf(37) + np.random.uniform(-0.15, -.05, 100).mean()
sample_norm

0.7543362158187632

In [33]:
sample_norm = norm(
    33,
    scale=33 * .75
).cdf(75) + np.random.uniform(-0.25, -.15, 100).mean()

sample_norm

0.7615784537382475

In [27]:
s.mean()

26.966019494179836

In [1]:
!pip install python-dotenv --quiet
!pip install gym stable-baselines3[extra] awscli boto3 pqdm awscli --quiet


[0m

In [2]:
%load_ext dotenv
%dotenv env

In [3]:
!aws s3 sync s3://dissertation-data-dmiller/rl_ready_data_conv/files_used_30/batched_train rl_ready_data_conv

In [4]:
# %load rl_constant.py
# %load rl_constant.py
FEATURE_COLUMNS = [
    "country_count", 
    "date_hour_sin", 
    "date_hour_cos",
    "date_minute_sin",
    "date_minute_cos",
    
    "session_30_count",
    "session_5_count",
    "cum_session_event",
    "cum_session_time",
    "expanding_click_average",
   
    "cum_platform_time",
    "cum_platform_event",
    "cum_projects",
    "average_event_time",
    "delta_last_event",
    
    "rolling_session_time",
    "rolling_session_events",
    "rolling_session_gap",
    "previous_session_time",
    "previous_session_events",
]

METADATA = [
    "user_id",
    "session_30_count_raw",
    "cum_platform_event_raw",
    "cum_platform_time_raw",
    "cum_session_time_raw",
    "cum_session_event_raw",
    "date_time"
]

RL_STAT_COLS = [
    'session_size',
    'session_minutes',
    'sim_size',
    'sim_minutes',
    'reward'
]

PREDICTION_COLS = [
    "seq_40",
    "session_terminates_30_minutes"
    # "label"
]

LOAD_COLS = list(set(FEATURE_COLUMNS + METADATA + RL_STAT_COLS + PREDICTION_COLS))

In [5]:
# %load environment
import gym
import numpy as np
from scipy.stats import norm

import numpy as np
from scipy.stats import norm 
import gym
from datetime import datetime
from copy import deepcopy

class CitizenScienceEnv(gym.Env):
    
    metadata = {'render.modes': ['human']}
    
    def __init__(self, dataset, out_features, n_sequences, evaluation=False):
        """
        trajectories: dictionary of user_id to their respective trajectories.
        n_sequences: number of sequences used for preprocessing.
        n_features: number of features used for preprocessing.
        """
        super(CitizenScienceEnv, self).__init__()
        self.dataset = dataset
        self.n_sequences = n_sequences
        self.current_session = None
        self.current_session_index = 0
        self.reward = 0
        self.n_sequences = n_sequences
        self.out_features = out_features
        
        self.action_space = gym.spaces.Discrete(2)
        self.observation_space = gym.spaces.Box(low=-1, high=1, shape=(len(out_features), n_sequences + 1), dtype=np.float32)
        self.evalution = evaluation
        self.episode_bins = []
        self.exp_runs = 0

    def reset(self):
        user_to_run, session_to_run = self.dataset.sample(1)[['user_id', 'session_30_count_raw']].values[0]
        self.current_session = self._get_events(user_to_run, session_to_run)
        self.metadata = self._metadata()
        self.current_session_index = 0
        self.reward = 0
        return self._state()
    
    def _row_to_dict(self, metadata):
        """
        Convert a row of metadata to a dictionary.
        """
        return metadata.to_dict()

    def step(self, action):
        
        self._take_action(action)
        next_state, done, meta = self._calculate_next_state()
        
        if done:
            current_session_index = self.current_session_index if \
                self.current_session_index != self.current_session.shape[0] else self.current_session.shape[0] - 1
            
            self.exp_runs += 1
        
            self.metadata['ended'] = self.current_session.iloc[current_session_index]['cum_session_event_raw']
            self.metadata['reward'] = self.current_session.iloc[current_session_index]['reward']
            self.metadata['session_exp_time'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            self.metadata['exp_runs'] = self.exp_runs
            self.episode_bins.append(self._row_to_dict(self.metadata))
            
            return next_state, float(self.reward), done, {}
        else:
            self.reward = self.current_session.iloc[self.current_session_index]['reward'] 
            self.current_session_index += 1        
            return next_state, float(self.reward), done, meta
    
    def _metadata(self):
        session_metadata = self.current_session.iloc[0][RL_STAT_COLS]
        session_metadata['ended'] = 0
        session_metadata['incentive_index'] = 0
        return session_metadata
    
    
    def flush_episode_bins(self):
        episode_bins = self.episode_bins.copy()
        self.episode_bins = []
        return episode_bins
    
    def _calculate_next_state(self):
        
        if (self.current_session_index == self.current_session.shape[0]):
            return None, True, {}

        if self._continuing_in_session():
            return self._state(), False, {}
    
        return None, True, {}
         
    def _continuing_in_session(self):
        sim_minutes = self.current_session.iloc[self.current_session_index]['sim_minutes']
        current_session_minutes = self.current_session.iloc[self.current_session_index]['cum_session_time_raw']
        if current_session_minutes < sim_minutes:
            return True
        
        extending_session = self._probability_extending_session(current_session_minutes)
        
        return all([extending_session >= .3, extending_session <= .7])
        
    
    def _probability_extending_session(self, current_session_count):
        if self.metadata['incentive_index'] == 0:
            return 0
        
        scale = max(5, int(self.metadata['session_minutes'] / 5))
        continue_session = norm(
            loc=self.metadata['incentive_time'],
            scale=scale
        ).cdf(current_session_count)
        
        return continue_session
        

    def _get_events(self, user_id, session):
        subset = self.dataset[
            (self.dataset['user_id'] == user_id) &
            (self.dataset['session_30_count_raw'] == session).copy()
        ]

        subset = subset.sort_values(by=['date_time'])
        return subset
    
    def _take_action(self, action):
        if action == 0 or self.metadata['incentive_index'] > 0:
            return
        
        current_session_index = self.current_session_index if \
            self.current_session_index != self.current_session.shape[0] else self.current_session.shape[0] - 1
        
        self.metadata['incentive_index'] = self.current_session.iloc[current_session_index]['cum_session_event_raw']
        self.metadata['incentive_time'] = self.current_session.iloc[current_session_index]['cum_session_time_raw']
        
    def _state(self):

        if self.current_session_index > self.n_sequences:
            events = self.current_session.iloc[self.current_session_index - (self.n_sequences + 1):self.current_session_index][self.out_features].values
            
        else:
            delta = min((self.n_sequences + 1)- self.current_session_index, self.n_sequences)
            zero_cat = np.zeros((delta, len(self.out_features)))
            events = self.current_session.iloc[:max(self.current_session_index, 1)][self.out_features].values
            events = np.concatenate((zero_cat, events), axis=0)
            

        return events.astype(np.float32).T

In [6]:
# %load callback
# %load callback
import pandas as pd
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.logger import TensorBoardOutputFormat
from datetime import datetime

class DistributionCallback(BaseCallback):
    
    @classmethod
    def tensorboard_setup(cls, log_dir, log_freq):
        cls._log_dir = log_dir
        cls._log_freq = log_freq

    
    def _on_step(self) -> bool:
        if self.n_calls % self._log_freq == 0:
            dist_list = self.training_env.env_method('flush_episode_bins')
            values_to_log = [item for sublist in dist_list for item in sublist if len(sublist) > 0]

            values_df = pd.DataFrame(
                values_to_log
            )
            
            
            session_size, sim_size, session_minutes, sim_minutes, ended, reward, inc_time, inc_index = (
                values_df['session_size'].mean(),
                values_df['sim_size'].mean(),
                values_df['session_minutes'].mean(),
                values_df['sim_minutes'].mean(),
                values_df['ended'].mean(),
                values_df['reward'].mean(),
                values_df['incentive_time'].mean(),
                values_df['incentive_index'].mean()
            )
            
            size_stats = {
                'session_size': session_size,
                'sim_size': sim_size,
                'ended': ended,
                'inc_index': inc_index
            }
            
            
            time_stats = {
                'session_minutes': session_minutes,
                'sim_minutes': sim_minutes,
                'reward': reward,
                'inc_time': inc_time   
            }
            
            for key, value in size_stats.items():
                self.logger.record(f'size/{key}', value)
            
            for key, value in time_stats.items():
                self.logger.record(f'time/{key}', value)

            
        return True

In [7]:
# %load policies/cnn_policy
# %load policies/cnn_policy
from typing import Dict, List, Type, Union

import gym
import torch
from gym import spaces
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.dqn.policies import DQNPolicy
from torch import nn


class CustomConv1dFeatures(BaseFeaturesExtractor):
    
    @classmethod
    def setup_sequences_features(cls, n_sequences, n_features):
        cls.n_sequences = n_sequences
        cls.n_features = n_features
        
    
    def __init__(self, observation_space: spaces.Box, features_dim=20):
        super().__init__(observation_space, features_dim)
        
        
        self.cnn_1 = nn.Sequential(
            nn.Conv1d(self.n_features, self.n_features*2, kernel_size=3, padding=1),
            nn.BatchNorm1d(self.n_features*2),
            nn.ReLU(),
            
            nn.Conv1d(self.n_features*2, self.n_features*2, kernel_size=3, padding=1),
            nn.BatchNorm1d(self.n_features*2),
            nn.ReLU(),
            
            nn.Conv1d(self.n_features*2, self.n_features*2, kernel_size=3, padding=1),
            nn.BatchNorm1d(self.n_features*2),
            nn.Conv1d(self.n_features*2, self.n_features*2, kernel_size=3, padding=1),
            
            nn.AvgPool1d(2)
        )
        
        self.cnn_2 = nn.Sequential(
            nn.Conv1d(self.n_features*2, self.n_features, kernel_size=3, padding=1),
            nn.BatchNorm1d(self.n_features),
            nn.ReLU(),
            
            nn.Conv1d(self.n_features, self.n_features, kernel_size=3, padding=1),
            nn.BatchNorm1d(self.n_features),
            nn.ReLU()
        )
        
        self.act = nn.Sequential(
            nn.MaxPool1d(2),
            nn.Flatten(),
        )
        
        with torch.no_grad():
            out_shape = self.act(self.cnn_2(self.cnn_1(torch.zeros((1, self.n_features, self.n_sequences))))).shape[1]
            self.linear = nn.Linear(out_shape, features_dim)
    
    def forward(self, obs):
        out = self.cnn_1(obs)
        out = self.cnn_2(out)
        out = self.act(out)
        return self.linear(out)


        

In [9]:
# %load incentive_reinforcement_learning_cpu.py
import argparse
import logging
import os
from datetime import datetime
from functools import reduce
from pprint import pformat
from typing import Callable
import boto3
import random
import numpy as np
import pandas as pd
import torch


from stable_baselines3 import A2C, DQN, PPO
from stable_baselines3.common.callbacks import (CallbackList,
                                                CheckpointCallback,
                                                StopTrainingOnMaxEpisodes)
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.vec_env import DummyVecEnv, VecMonitor
from stable_baselines3.dqn.policies import DQNPolicy


logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO)
np.set_printoptions(precision=4, linewidth=200, suppress=True)
torch.set_printoptions(precision=2, linewidth=200, sci_mode=False)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_rows', 500)
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
global logger
logger = logging.getLogger('rl_exp_train')
logger.setLevel(logging.INFO)

S3_BASELINE_PATH = 's3://dissertation-data-dmiller/'
N_SEQUENCES = 40
CHECKPOINT_FREQ = 250_000
TB_LOG = 10_000
WINDOW = 1

def parse_args():
    parse = argparse.ArgumentParser()
    parse.add_argument('--read_path', type=str, default='rl_ready_data_conv')
    parse.add_argument('--n_files', type=int, default=2)
    parse.add_argument('--n_episodes', type=int, default=50)
    parse.add_argument('--n_envs', type=int, default=100)
    parse.add_argument('--lstm', type=str, default='label')
    parse.add_argument('--part', type=str, default='train')
    parse.add_argument('--feature_extractor', type=str, default='cnn') 
    args = parse.parse_args()
    return args


def load_and_dedupe(read_path, cols):
    
    df = pd.read_parquet(read_path, columns=cols)
    return df



def main(args):
    
    
    logger.info('Starting Incentive Reinforcement Learning')
    logger.info(pformat(args.__dict__))
    exec_time = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    
    read_path, n_files, n_episodes, n_envs, lstm, part, feature_ext = (
        args.read_path, 
        args.n_files, 
        args.n_episodes, 
        args.n_envs,
        args.lstm,
        args.part,
        args.feature_extractor,
    )

    base_read_path = os.path.join('rl_ready_data_conv', f'files_used_{n_files}')
    full_read_path = os.path.join(base_read_path, f'window_{WINDOW}_{part}_final.parquet')
    vec_df_path = os.path.join(base_read_path, f'window_{WINDOW}_{part}_batched')
    load_cols = LOAD_COLS

    base_exp_path = '/'.join(
        [
            'rl_experiments',
            f'n_files_{n_files}',
            f'{feature_ext}_{lstm}',
            exec_time,
        ]
    )

    if not os.path.exists(full_read_path):
        client = boto3.client('s3')
        download_dataset_from_s3(client,  base_read_path, full_read_path)
        
    if not os.path.exists(vec_df_path):
        df = load_and_dedupe(full_read_path, cols=load_cols)
        df = df.sort_values(['date_time', 'user_id'])
        logger.info(f'Loaded data with shape {df.shape}')
        os.makedirs(vec_df_path, exist_ok=True)
        logger.info(f'Writing vectorized data to {vec_df_path}')
        batch_environments_for_vectorization(df, n_envs, vec_df_path)
        logger.info(f'Vectorized environments created')
        del df
    

    logger.info(f'Loading vectorized data from {vec_df_path}')
    vectorized_df = [
        pd.read_parquet(os.path.join(vec_df_path, f'batch_{i}.parquet'), columns=LOAD_COLS)
        for i in range(n_envs)
    ]

    out_features = FEATURE_COLUMNS + [lstm] if lstm else FEATURE_COLUMNS


    citizen_science_vec =DummyVecEnv([lambda: CitizenScienceEnv(vec_df, out_features, N_SEQUENCES) for vec_df in vectorized_df])
    monitor_train = VecMonitor(citizen_science_vec)
    
    logger.info(f'Vectorized environments created')



    tensorboard_dir, checkpoint_dir = (
        os.path.join(base_exp_path, 'training_metrics'),
        os.path.join(base_exp_path, 'checkpoints')
    )

    if not os.path.exists(tensorboard_dir):
        logger.info(f'Creating directory {tensorboard_dir} for tensorboard logs')
        os.makedirs(tensorboard_dir)
   
    if not os.path.exists(checkpoint_dir):
        logger.info(f'Creating directory {checkpoint_dir} for checkpoints')
        os.makedirs(checkpoint_dir) 

    callback_max_episodes = StopTrainingOnMaxEpisodes(max_episodes=n_episodes, verbose=1)
    checkpoint_freq = int(CHECKPOINT_FREQ // n_envs)
    log_freq = int(TB_LOG // n_envs)
    checkpoint_callback = CheckpointCallback(
        save_freq=checkpoint_freq,
        save_path=checkpoint_dir, 
        verbose=2
    )
    
    DistributionCallback.tensorboard_setup(tensorboard_dir, log_freq)
    logger_callback = DistributionCallback()
    
    callback_list = CallbackList([checkpoint_callback, logger_callback, callback_max_episodes])
    
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    
    if feature_ext == 'cnn':
        CustomConv1dFeatures.setup_sequences_features(N_SEQUENCES + 1, len(out_features))
        logger.info('Using custom 1 dimensional CNN feature extractor')
        policy_kwargs = dict(
            features_extractor_class=CustomConv1dFeatures,
            net_arch=[10]
        )
        model = DQN(policy='CnnPolicy', env=monitor_train, verbose=1, tensorboard_log=tensorboard_dir, policy_kwargs=policy_kwargs, device=device, stats_window_size=1000)
    else:
        logger.info('Using default MLP feature extractor')
        model = DQN(policy='MlpPolicy', env=monitor_train, verbose=1, tensorboard_log=tensorboard_dir, device=device, stats_window_size=1000)
        
    logger.info(f'Model created: policy')
    
    logger.info(pformat(model.policy))
        
    logger.info(f'Beginning training') 
    
            
    logger.info(pformat([
        'n_episodes: {}'.format(n_episodes),
        'read_path: {}'.format(read_path),
        'n_files: {}'.format(n_files),
        'n_sequences: {}'.format(N_SEQUENCES),
        'n_envs: {}'.format(n_envs),
        'device: {}'.format(device),
        'lstm: {}'.format(lstm),
        'part: {}'.format(part),
        'feature_extractor: {}'.format(feature_ext),
        'tensorboard_dir: {}'.format(tensorboard_dir),
        'checkpoint_dir: {}'.format(checkpoint_dir),
        'checkpoint_freq: {}'.format(checkpoint_freq),
        'tb_freq: {}'.format(log_freq),
    ]))
    
    model.learn(total_timesteps=25_000_000, progress_bar=True, log_interval=log_freq, callback=callback_list)
    



In [10]:
class Argument:
    read_path = 'rl_ready_data_conv'
    n_files = 30
    n_episodes = 500_000
    n_envs = 100
    lstm = None
    part = 'train'
    feature_extractor = 'cnn'

In [11]:

main(Argument)

06/03/2023 07:33:59 PM Starting Incentive Reinforcement Learning
06/03/2023 07:33:59 PM mappingproxy({'__dict__': <attribute '__dict__' of 'Argument' objects>,
              '__doc__': None,
              '__module__': '__main__',
              '__weakref__': <attribute '__weakref__' of 'Argument' objects>,
              'feature_extractor': 'cnn',
              'lstm': None,
              'n_envs': 100,
              'n_episodes': 500000,
              'n_files': 30,
              'part': 'train',
              'read_path': 'rl_ready_data_conv'})
06/03/2023 07:33:59 PM Loading vectorized data from rl_ready_data_conv/files_used_30/window_1_train_batched
06/03/2023 07:34:00 PM Vectorized environments created
06/03/2023 07:34:00 PM Creating directory rl_experiments/n_files_30/cnn_None/2023-06-03_19-33-59/training_metrics for tensorboard logs
06/03/2023 07:34:00 PM Creating directory rl_experiments/n_files_30/cnn_None/2023-06-03_19-33-59/checkpoints for checkpoints
06/03/2023 07:34:00 PM 

Using cuda device


06/03/2023 07:34:01 PM Model created: policy
06/03/2023 07:34:01 PM CnnPolicy(
  (q_net): QNetwork(
    (features_extractor): CustomConv1dFeatures(
      (cnn_1): Sequential(
        (0): Conv1d(20, 40, kernel_size=(3,), stride=(1,), padding=(1,))
        (1): BatchNorm1d(40, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU()
        (3): Conv1d(40, 40, kernel_size=(3,), stride=(1,), padding=(1,))
        (4): BatchNorm1d(40, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (5): ReLU()
        (6): Conv1d(40, 40, kernel_size=(3,), stride=(1,), padding=(1,))
        (7): BatchNorm1d(40, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (8): Conv1d(40, 40, kernel_size=(3,), stride=(1,), padding=(1,))
        (9): AvgPool1d(kernel_size=(2,), stride=(2,), padding=(0,))
      )
      (cnn_2): Sequential(
        (0): Conv1d(40, 20, kernel_size=(3,), stride=(1,), padding=(1,))
        (1): BatchNorm1d(20, eps=1e-05

Logging to rl_experiments/n_files_30/cnn_None/2023-06-03_19-33-59/training_metrics/DQN_1


Output()

-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 13.9      |
|    ep_rew_mean      | 27.743235 |
|    exploration_rate | 0.999     |
| time/               |           |
|    episodes         | 100       |
|    fps              | 614       |
|    time_elapsed     | 5         |
|    total_timesteps  | 3600      |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 21.7      |
|    ep_rew_mean      | 38.780884 |
|    exploration_rate | 0.997     |
| time/               |           |
|    episodes         | 200       |
|    fps              | 735       |
|    time_elapsed     | 10        |
|    total_timesteps  | 7400      |
-----------------------------------


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 23.6     |
|    ep_rew_mean      | 40.47379 |
|    exploration_rate | 0.996    |
| size/               |          |
|    ended            | 23.2     |
|    inc_index        | 2.04     |
|    session_size     | 32.7     |
|    sim_size         | 23.2     |
| time/               |          |
|    episodes         | 300      |
|    fps              | 770      |
|    inc_time         | 2.29     |
|    reward           | 2.62     |
|    session_minutes  | 57.9     |
|    sim_minutes      | 39.9     |
|    time_elapsed     | 14       |
|    total_timesteps  | 11200    |
----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 27.5      |
|    ep_rew_mean      | 44.626587 |
|    exploration_rate | 0.994     |
| time/               |           |
|    episodes         | 400       |
|    fps              | 811       |
|    time_elapsed     | 18        |
|    total_timesteps  | 14900     |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 30.3      |
|    ep_rew_mean      | 48.101593 |
|    exploration_rate | 0.993     |
| time/               |           |
|    episodes         | 500       |
|    fps              | 841       |
|    time_elapsed     | 22        |
|    total_timesteps  | 19000     |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 31.4      |
|    ep_rew_mean      | 49.834805 |
|    exploration_rate | 0.991     |
| size/               |           |
|    ended            | 38.1      |
|    inc_index        | 1.97      |
|    session_size     | 54        |
|    sim_size         | 38.1      |
| time/               |           |
|    episodes         | 600       |
|    fps              | 860       |
|    inc_time         | 2.11      |
|    reward           | 1.94      |
|    session_minutes  | 81.2      |
|    sim_minutes      | 57.8      |
|    time_elapsed     | 26        |
|    total_timesteps  | 23000     |
-----------------------------------


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 32.4     |
|    ep_rew_mean      | 50.60825 |
|    exploration_rate | 0.99     |
| time/               |          |
|    episodes         | 700      |
|    fps              | 869      |
|    time_elapsed     | 30       |
|    total_timesteps  | 26300    |
----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 33.2      |
|    ep_rew_mean      | 51.732357 |
|    exploration_rate | 0.989     |
| size/               |           |
|    ended            | 38.1      |
|    inc_index        | 1.85      |
|    session_size     | 53.9      |
|    sim_size         | 38.1      |
| time/               |           |
|    episodes         | 800       |
|    fps              | 882       |
|    inc_time         | 1.85      |
|    reward           | 2.11      |
|    session_minutes  | 82.3      |
|    sim_minutes      | 58.1      |
|    time_elapsed     | 34        |
|    total_timesteps  | 30200     |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 33.1      |
|    ep_rew_mean      | 51.886257 |
|    exploration_rate | 0.987     |
| time/               |           |
|    episodes         | 900       |
|    fps              | 889       |
|    time_elapsed     | 38        |
|    total_timesteps  | 33900     |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 33.5      |
|    ep_rew_mean      | 52.237904 |
|    exploration_rate | 0.986     |
| time/               |           |
|    episodes         | 1000      |
|    fps              | 897       |
|    time_elapsed     | 42        |
|    total_timesteps  | 37900     |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 35.8      |
|    ep_rew_mean      | 55.223915 |
|    exploration_rate | 0.984     |
| size/               |           |
|    ended            | 34.7      |
|    inc_index        | 1.99      |
|    session_size     | 49.1      |
|    sim_size         | 34.7      |
| time/               |           |
|    episodes         | 1100      |
|    fps              | 904       |
|    inc_time         | 2.22      |
|    reward           | 2.14      |
|    session_minutes  | 80.1      |
|    sim_minutes      | 54.9      |
|    time_elapsed     | 45        |
|    total_timesteps  | 41400     |
-----------------------------------


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 36.7     |
|    ep_rew_mean      | 55.95528 |
|    exploration_rate | 0.983    |
| time/               |          |
|    episodes         | 1200     |
|    fps              | 911      |
|    time_elapsed     | 49       |
|    total_timesteps  | 44700    |
----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 37.5      |
|    ep_rew_mean      | 57.271664 |
|    exploration_rate | 0.982     |
| time/               |           |
|    episodes         | 1300      |
|    fps              | 920       |
|    time_elapsed     | 52        |
|    total_timesteps  | 48600     |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 36.8      |
|    ep_rew_mean      | 56.713383 |
|    exploration_rate | 0.98      |
| size/               |           |
|    ended            | 37.5      |
|    inc_index        | 2.07      |
|    session_size     | 53.1      |
|    sim_size         | 37.5      |
| time/               |           |
|    episodes         | 1400      |
|    fps              | 880       |
|    inc_time         | 2.54      |
|    reward           | 1.87      |
|    session_minutes  | 81.9      |
|    sim_minutes      | 57.4      |
|    time_elapsed     | 58        |
|    total_timesteps  | 51900     |
| train/              |           |
|    learning_rate    | 0.0001    |
|    loss             | 0.957     |
|    n_updates        | 4         |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 37.1      |
|    ep_rew_mean      | 57.767685 |
|    exploration_rate | 0.979     |
| time/               |           |
|    episodes         | 1500      |
|    fps              | 885       |
|    time_elapsed     | 62        |
|    total_timesteps  | 55500     |
| train/              |           |
|    learning_rate    | 0.0001    |
|    loss             | 0.717     |
|    n_updates        | 13        |
-----------------------------------


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 36.8     |
|    ep_rew_mean      | 56.92637 |
|    exploration_rate | 0.977    |
| time/               |          |
|    episodes         | 1600     |
|    fps              | 890      |
|    time_elapsed     | 67       |
|    total_timesteps  | 59900    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 1.09     |
|    n_updates        | 24       |
----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 37.2      |
|    ep_rew_mean      | 57.834183 |
|    exploration_rate | 0.976     |
| size/               |           |
|    ended            | 36.9      |
|    inc_index        | 1.97      |
|    session_size     | 52.2      |
|    sim_size         | 36.9      |
| time/               |           |
|    episodes         | 1700      |
|    fps              | 892       |
|    inc_time         | 2.5       |
|    reward           | 1.86      |
|    session_minutes  | 83.2      |
|    sim_minutes      | 58.1      |
|    time_elapsed     | 71        |
|    total_timesteps  | 64000     |
| train/              |           |
|    learning_rate    | 0.0001    |
|    loss             | 0.485     |
|    n_updates        | 34        |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 37.9      |
|    ep_rew_mean      | 58.378834 |
|    exploration_rate | 0.974     |
| time/               |           |
|    episodes         | 1800      |
|    fps              | 897       |
|    time_elapsed     | 76        |
|    total_timesteps  | 68700     |
| train/              |           |
|    learning_rate    | 0.0001    |
|    loss             | 0.46      |
|    n_updates        | 46        |
-----------------------------------


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 38.7     |
|    ep_rew_mean      | 59.0444  |
|    exploration_rate | 0.972    |
| size/               |          |
|    ended            | 43.6     |
|    inc_index        | 1.94     |
|    session_size     | 61.7     |
|    sim_size         | 43.5     |
| time/               |          |
|    episodes         | 1900     |
|    fps              | 900      |
|    inc_time         | 1.71     |
|    reward           | 1.78     |
|    session_minutes  | 91.6     |
|    sim_minutes      | 63.8     |
|    time_elapsed     | 80       |
|    total_timesteps  | 72800    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.69     |
|    n_updates        | 56       |
----------------------------------


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 38.6     |
|    ep_rew_mean      | 59.06325 |
|    exploration_rate | 0.971    |
| time/               |          |
|    episodes         | 2000     |
|    fps              | 903      |
|    time_elapsed     | 85       |
|    total_timesteps  | 76900    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.696    |
|    n_updates        | 67       |
----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 39.8      |
|    ep_rew_mean      | 60.312485 |
|    exploration_rate | 0.969     |
| size/               |           |
|    ended            | 41.1      |
|    inc_index        | 1.98      |
|    session_size     | 58.2      |
|    sim_size         | 41.1      |
| time/               |           |
|    episodes         | 2100      |
|    fps              | 906       |
|    inc_time         | 1.84      |
|    reward           | 2.03      |
|    session_minutes  | 87.9      |
|    sim_minutes      | 62        |
|    time_elapsed     | 90        |
|    total_timesteps  | 81900     |
| train/              |           |
|    learning_rate    | 0.0001    |
|    loss             | 1.41      |
|    n_updates        | 79        |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 40.8      |
|    ep_rew_mean      | 61.900784 |
|    exploration_rate | 0.967     |
| time/               |           |
|    episodes         | 2200      |
|    fps              | 908       |
|    time_elapsed     | 94        |
|    total_timesteps  | 86100     |
| train/              |           |
|    learning_rate    | 0.0001    |
|    loss             | 0.31      |
|    n_updates        | 90        |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 42        |
|    ep_rew_mean      | 63.043247 |
|    exploration_rate | 0.966     |
| size/               |           |
|    ended            | 46.8      |
|    inc_index        | 2         |
|    session_size     | 66.4      |
|    sim_size         | 46.8      |
| time/               |           |
|    episodes         | 2300      |
|    fps              | 909       |
|    inc_time         | 2.14      |
|    reward           | 1.76      |
|    session_minutes  | 98.3      |
|    sim_minutes      | 68.9      |
|    time_elapsed     | 99        |
|    total_timesteps  | 90200     |
| train/              |           |
|    learning_rate    | 0.0001    |
|    loss             | 1.19      |
|    n_updates        | 100       |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 41.7      |
|    ep_rew_mean      | 62.469112 |
|    exploration_rate | 0.964     |
| time/               |           |
|    episodes         | 2400      |
|    fps              | 910       |
|    time_elapsed     | 102       |
|    total_timesteps  | 93800     |
| train/              |           |
|    learning_rate    | 0.0001    |
|    loss             | 1.32      |
|    n_updates        | 109       |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 40.8      |
|    ep_rew_mean      | 61.047516 |
|    exploration_rate | 0.963     |
| time/               |           |
|    episodes         | 2500      |
|    fps              | 911       |
|    time_elapsed     | 107       |
|    total_timesteps  | 97800     |
| train/              |           |
|    learning_rate    | 0.0001    |
|    loss             | 1.31      |
|    n_updates        | 119       |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 41.3      |
|    ep_rew_mean      | 61.673733 |
|    exploration_rate | 0.961     |
| size/               |           |
|    ended            | 35.2      |
|    inc_index        | 1.9       |
|    session_size     | 49.8      |
|    sim_size         | 35.2      |
| time/               |           |
|    episodes         | 2600      |
|    fps              | 911       |
|    inc_time         | 1.81      |
|    reward           | 1.78      |
|    session_minutes  | 76.9      |
|    sim_minutes      | 53.7      |
|    time_elapsed     | 111       |
|    total_timesteps  | 101500    |
| train/              |           |
|    learning_rate    | 0.0001    |
|    loss             | 1.02      |
|    n_updates        | 128       |
-----------------------------------


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 40.9     |
|    ep_rew_mean      | 61.02838 |
|    exploration_rate | 0.96     |
| time/               |          |
|    episodes         | 2700     |
|    fps              | 911      |
|    time_elapsed     | 115      |
|    total_timesteps  | 105300   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.683    |
|    n_updates        | 138      |
----------------------------------


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 40       |
|    ep_rew_mean      | 60.19645 |
|    exploration_rate | 0.959    |
| time/               |          |
|    episodes         | 2800     |
|    fps              | 912      |
|    time_elapsed     | 118      |
|    total_timesteps  | 108500   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.806    |
|    n_updates        | 146      |
----------------------------------


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 39.6     |
|    ep_rew_mean      | 59.82985 |
|    exploration_rate | 0.957    |
| size/               |          |
|    ended            | 36.4     |
|    inc_index        | 2.08     |
|    session_size     | 51.4     |
|    sim_size         | 36.4     |
| time/               |          |
|    episodes         | 2900     |
|    fps              | 913      |
|    inc_time         | 2.61     |
|    reward           | 2.07     |
|    session_minutes  | 81.7     |
|    sim_minutes      | 57       |
|    time_elapsed     | 122      |
|    total_timesteps  | 112200   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 2.8      |
|    n_updates        | 155      |
----------------------------------


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 40.3     |
|    ep_rew_mean      | 60.28913 |
|    exploration_rate | 0.956    |
| time/               |          |
|    episodes         | 3000     |
|    fps              | 916      |
|    time_elapsed     | 127      |
|    total_timesteps  | 117000   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.975    |
|    n_updates        | 167      |
----------------------------------


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 39.7     |
|    ep_rew_mean      | 59.65618 |
|    exploration_rate | 0.954    |
| size/               |          |
|    ended            | 43.1     |
|    inc_index        | 1.96     |
|    session_size     | 61       |
|    sim_size         | 43.1     |
| time/               |          |
|    episodes         | 3100     |
|    fps              | 917      |
|    inc_time         | 1.92     |
|    reward           | 1.9      |
|    session_minutes  | 88.7     |
|    sim_minutes      | 61.9     |
|    time_elapsed     | 131      |
|    total_timesteps  | 121000   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 1.14     |
|    n_updates        | 177      |
----------------------------------


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 39       |
|    ep_rew_mean      | 58.93857 |
|    exploration_rate | 0.953    |
| time/               |          |
|    episodes         | 3200     |
|    fps              | 918      |
|    time_elapsed     | 135      |
|    total_timesteps  | 124700   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 1.64     |
|    n_updates        | 186      |
----------------------------------


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 37.7     |
|    ep_rew_mean      | 57.09935 |
|    exploration_rate | 0.951    |
| time/               |          |
|    episodes         | 3300     |
|    fps              | 920      |
|    time_elapsed     | 139      |
|    total_timesteps  | 128700   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 1.29     |
|    n_updates        | 196      |
----------------------------------


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 39.3     |
|    ep_rew_mean      | 59.46882 |
|    exploration_rate | 0.949    |
| size/               |          |
|    ended            | 38.8     |
|    inc_index        | 2.05     |
|    session_size     | 54.9     |
|    sim_size         | 38.8     |
| time/               |          |
|    episodes         | 3400     |
|    fps              | 922      |
|    inc_time         | 2.34     |
|    reward           | 1.95     |
|    session_minutes  | 83.7     |
|    sim_minutes      | 58.3     |
|    time_elapsed     | 144      |
|    total_timesteps  | 132900   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 1.54     |
|    n_updates        | 207      |
----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 39.3      |
|    ep_rew_mean      | 59.197853 |
|    exploration_rate | 0.948     |
| time/               |           |
|    episodes         | 3500      |
|    fps              | 921       |
|    time_elapsed     | 148       |
|    total_timesteps  | 136500    |
| train/              |           |
|    learning_rate    | 0.0001    |
|    loss             | 2.76      |
|    n_updates        | 216       |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 39        |
|    ep_rew_mean      | 58.937786 |
|    exploration_rate | 0.947     |
| time/               |           |
|    episodes         | 3600      |
|    fps              | 922       |
|    time_elapsed     | 151       |
|    total_timesteps  | 139900    |
| train/              |           |
|    learning_rate    | 0.0001    |
|    loss             | 0.865     |
|    n_updates        | 224       |
-----------------------------------


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 38.8     |
|    ep_rew_mean      | 58.77157 |
|    exploration_rate | 0.945    |
| size/               |          |
|    ended            | 38.2     |
|    inc_index        | 1.95     |
|    session_size     | 54       |
|    sim_size         | 38.2     |
| time/               |          |
|    episodes         | 3700     |
|    fps              | 922      |
|    inc_time         | 1.94     |
|    reward           | 2.11     |
|    session_minutes  | 85.2     |
|    sim_minutes      | 59.5     |
|    time_elapsed     | 155      |
|    total_timesteps  | 143700   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 1.68     |
|    n_updates        | 234      |
----------------------------------


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 38.7     |
|    ep_rew_mean      | 58.90962 |
|    exploration_rate | 0.944    |
| time/               |          |
|    episodes         | 3800     |
|    fps              | 921      |
|    time_elapsed     | 159      |
|    total_timesteps  | 147000   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 1.56     |
|    n_updates        | 242      |
----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 38.8      |
|    ep_rew_mean      | 58.846134 |
|    exploration_rate | 0.942     |
| size/               |           |
|    ended            | 36.2      |
|    inc_index        | 2.07      |
|    session_size     | 51.3      |
|    sim_size         | 36.2      |
| time/               |           |
|    episodes         | 3900      |
|    fps              | 923       |
|    inc_time         | 2.61      |
|    reward           | 2.19      |
|    session_minutes  | 80.4      |
|    sim_minutes      | 56.9      |
|    time_elapsed     | 164       |
|    total_timesteps  | 151500    |
| train/              |           |
|    learning_rate    | 0.0001    |
|    loss             | 1.84      |
|    n_updates        | 253       |
-----------------------------------


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 38.3     |
|    ep_rew_mean      | 58.46743 |
|    exploration_rate | 0.941    |
| time/               |          |
|    episodes         | 4000     |
|    fps              | 923      |
|    time_elapsed     | 167      |
|    total_timesteps  | 155000   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 2.17     |
|    n_updates        | 262      |
----------------------------------
