In [1]:
%pip install python-dotenv --quiet
%pip install gym stable-baselines3[extra] awscli boto3 pqdm awscli --quiet


[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
%load_ext dotenv
%dotenv env

In [3]:
!rm -r rl_ready_data_conv

In [4]:
!aws s3 sync s3://dissertation-data-dmiller/rl_ready_data_conv/files_used_30/window_1/batched_train rl_ready_data_conv/files_used_30/window_1/batched_train --delete --quiet
# !aws s3 sync s3://dissertation-data-dmiller/rl_ready_data_conv/files_used_30/batched_train rl_ready_data_conv/files_used_30/batched_eval --quiet

In [5]:
# %load rl_constant.py

FEATURE_COLUMNS = [
    "country_count", 
    "date_hour_sin", 
    "date_hour_cos",
    "date_minute_sin",
    "date_minute_cos",
    
    "session_30_count",
    "session_5_count",
    "cum_session_event",
    "cum_session_time",
    "expanding_click_average",
   
    "cum_platform_time",
    "cum_platform_event",
    "cum_projects",
    "average_event_time",
    "delta_last_event",
    
    "rolling_session_time",
    "rolling_session_events",
    "rolling_session_gap",
    "previous_session_time",
    "previous_session_events",
]

METADATA = [
    "user_id",
    "session_30_count_raw",
    "cum_platform_event_raw",
    "cum_platform_time_raw",
    "cum_session_time_raw",
    "cum_session_event_raw",
    "date_time"
]

RL_STAT_COLS = [
    'session_size',
    'session_minutes',
    'size_cutoff',
    'time_cutoff',
    'reward'
]

PREDICTION_COLS = [
    "label",
    "pred"
]

LOAD_COLS = list(set(FEATURE_COLUMNS + METADATA + RL_STAT_COLS + PREDICTION_COLS))

In [6]:
# %load environment
import gym
import numpy as np
from scipy.stats import norm
MAX_EVAL_SIZE = 75
class CitizenScienceEnv(gym.Env):
    
    metadata = {'render.modes': ['human']}
    
    def __init__(self, dataset, out_features, n_sequences, evaluation=False):
        """
        trajectories: dictionary of user_id to their respective trajectories.
        n_sequences: number of sequences used for preprocessing.
        n_features: number of features used for preprocessing.
        """
        super(CitizenScienceEnv, self).__init__()
        self.dataset = dataset
        self.unique_sessions = self.dataset[['user_id', 'session_30_count_raw']].drop_duplicates()
        self.n_sequences = n_sequences
        self.current_session = None
        self.current_session_index = 0
        self.reward = 0
        self.n_sequences = n_sequences
        self.out_features = out_features
        
        self.action_space = gym.spaces.Discrete(4)
        self.observation_space = gym.spaces.Box(low=-1, high=1, shape=(len(out_features), n_sequences + 1), dtype=np.float32)
        self.evalution = evaluation
        self.episode_bins = []
        self.exp_runs = 0

    def reset(self):
        random_session = np.random.randint(0, self.unique_sessions.shape[0])
        
        user_to_run, session_to_run = self.unique_sessions.iloc[random_session][['user_id', 'session_30_count_raw']]
        self.current_session = self._get_events(user_to_run, session_to_run)
        self.metadata = self._metadata()
        self.current_session_index = 0
        self.reward = 0
        return self._state()
    
    def _row_to_dict(self, metadata):
        """
        Convert a row of metadata to a dictionary.
        """
        return metadata.to_dict()

    def step(self, action):
        
        self._take_action(action)
        next_state, done, meta = self._calculate_next_state()
        
        if done:
            current_session_index = self.current_session_index if \
                self.current_session_index != self.current_session.shape[0] else self.current_session.shape[0] - 1
            
            self.exp_runs += 1
        
            self.metadata['ended_event'] = self.current_session.iloc[current_session_index]['cum_session_event_raw']
            self.metadata['ended_time'] = self.current_session.iloc[current_session_index]['cum_session_time_raw']
            self.metadata['exp_runs'] = self.exp_runs
            self.episode_bins.append(self._row_to_dict(self.metadata))
            
            return next_state, float(self.reward), done, {}
        else:
            self.reward = self.current_session.iloc[self.current_session_index]['reward'] 
            self.current_session_index += 1        
            return next_state, float(self.reward), done, meta
    
    def _metadata(self):
        session_metadata = self.current_session.iloc[0][RL_STAT_COLS].copy()
        session_metadata['ended'] = 0
        for meta_col in ['small', 'medium', 'large']:
            session_metadata[f'inc_{meta_col}'] = 0
            session_metadata[f'time_{meta_col}'] = 0

        return session_metadata
    
    def flush_episode_bins(self):
        episode_bins = self.episode_bins.copy()
        self.episode_bins = []
        return episode_bins
    
    def _calculate_next_state(self):
        
        if (self.current_session_index == self.current_session.shape[0]):
            return None, True, {}

        if self._continuing_in_session():
            return self._state(), False, {}
    
        return None, True, {}
         
    def _continuing_in_session(self):
        time_cutoff = self.current_session.iloc[self.current_session_index]['time_cutoff']
        current_session_time = self.current_session.iloc[self.current_session_index]['cum_session_time_raw']
        if current_session_time <= time_cutoff or current_session_time >= MAX_EVAL_SIZE:
            return True
    
        extending_low = self._probability_extending(current_session_time, self.metadata['time_small']) + \
            np.random.uniform(-0.1, 0, 100).mean()
            
        
        extending_medium = self._probability_extending(current_session_time, self.metadata['time_medium']) + \
            np.random.uniform(-.15, -.05, 100).mean()
            
        extending_large = self._probability_extending(current_session_time, self.metadata['time_large']) + \
            np.random.uniform(-0.25, -.15, 100).mean()
            
        return any([
            extending_low > 0.4 and extending_low <= 0.75,
            extending_medium > 0.4 and extending_medium <= 0.75,
            extending_large > 0.4 and extending_large <= 0.75
        ])
        
           
    
    def _probability_extending(self, current_session_time, incentive_time):
        if incentive_time == 0:
            return 0
         
        continue_session = norm(
            loc=max(incentive_time, 1),
            scale=max(incentive_time *.75, 1)
        ).cdf(max(current_session_time, 1)) 
        
        return continue_session
        

    def _get_events(self, user_id, session):
        subset = self.dataset[
            (self.dataset['user_id'] == user_id) &
            (self.dataset['session_30_count_raw'] == session).copy()
        ]

        subset = subset.sort_values(by=['date_time'])
        return subset
    
    def _take_action(self, action):
        if action == 0:
            return
        
        if action == 1 and self.metadata['inc_small'] == 0:
            current_session_index = self.current_session_index if \
                self.current_session_index != self.current_session.shape[0] else self.current_session.shape[0] - 1
        
            self.metadata['inc_small'] = self.current_session.iloc[current_session_index]['cum_session_event_raw']
            self.metadata['time_small'] = self.current_session.iloc[current_session_index]['cum_session_time_raw']
            
        if action == 2 and all([(self.metadata['inc_small'] > 0), (self.metadata['inc_medium'] == 0)]):
            current_session_index = self.current_session_index if \
                self.current_session_index != self.current_session.shape[0] else self.current_session.shape[0] - 1
            
            self.metadata['inc_medium'] = self.current_session.iloc[current_session_index]['cum_session_event_raw']
            self.metadata['time_medium'] = self.current_session.iloc[current_session_index]['cum_session_time_raw']
            
        if action == 3 and all([(self.metadata['inc_small'] > 0), (self.metadata['inc_medium'] > 0), (self.metadata['inc_large'] == 0)]):
            current_session_index = self.current_session_index if \
                self.current_session_index != self.current_session.shape[0] else self.current_session.shape[0] - 1
            
            self.metadata['inc_large'] = self.current_session.iloc[current_session_index]['cum_session_event_raw']
            self.metadata['time_large'] = self.current_session.iloc[current_session_index]['cum_session_time_raw']
        

    def _state(self):

        if self.current_session_index > self.n_sequences:
            events = self.current_session.iloc[self.current_session_index - (self.n_sequences + 1):self.current_session_index][self.out_features].values
            
        else:
            delta = min((self.n_sequences + 1)- self.current_session_index, self.n_sequences)
            zero_cat = np.zeros((delta, len(self.out_features)))
            events = self.current_session.iloc[:max(self.current_session_index, 1)][self.out_features].values
            events = np.concatenate((zero_cat, events), axis=0)
            

        return events.astype(np.float32).T

In [7]:
# %load callback

import pandas as pd
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.logger import TensorBoardOutputFormat

class DistributionCallback(BaseCallback):
    
    @classmethod
    def tensorboard_setup(cls, log_dir, log_freq):
        cls._log_dir = log_dir
        cls._log_freq = log_freq

    
    def _on_step(self) -> bool:
        if self.n_calls % self._log_freq == 0:
            dist_list = self.training_env.env_method('flush_episode_bins')
            values_to_log = [item for sublist in dist_list for item in sublist if len(sublist) > 0]

            values_df = pd.DataFrame(
                values_to_log
            )
            
            
            session_size, size_cutoff, session_minutes, time_cutoff, ended_event, ended_time = (
                values_df['session_size'].mean(),
                values_df['size_cutoff'].mean(),
                values_df['session_minutes'].mean(),
                values_df['time_cutoff'].mean(),
                values_df['ended_event'].mean(),
                values_df['ended_time'].mean(),
            )
            
            inc_index_small, inc_index_medium, inc_index_large = (
                values_df['inc_small'].mean(),
                values_df['inc_medium'].mean(),
                values_df['inc_large'].mean()
            )
            
            time_minutes_small, time_minutes_medium, time_minutes_large = (
                values_df['time_small'].mean(),
                values_df['time_medium'].mean(),
                values_df['time_large'].mean()
            )
            
            size_stats = {
                'session_size': session_size,
                'size_cutoff': size_cutoff,
                'ended_size': ended_event,
                'inc_small': inc_index_small,
                'inc_medium': inc_index_medium,
                'inc_large': inc_index_large,
            }
            
            
            time_stats = {
                'session_minutes': session_minutes,
                'time_cutoff': time_cutoff,
                'ended_time': ended_time,
                'time_small': time_minutes_small,
                'time_medium': time_minutes_medium,
                'time_large': time_minutes_large,
            }
            
            for key, value in size_stats.items():
                self.logger.record(f'size/{key}', value)
            
            for key, value in time_stats.items():
                self.logger.record(f'sess_time/{key}', value)
                
            values_df.to_csv(f'{self._log_dir}/{self.n_calls // self._log_freq}.csv')
            
        return True

In [8]:
# %load policies/cnn_policy
from typing import Dict, List, Type, Union

import gym
import torch
from gym import spaces
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.dqn.policies import DQNPolicy
from torch import nn


class CustomConv1dFeatures(BaseFeaturesExtractor):
    
    @classmethod
    def setup_sequences_features(cls, n_sequences, n_features):
        cls.n_sequences = n_sequences
        cls.n_features = n_features
        
    
    def __init__(self, observation_space: spaces.Box, features_dim=20):
        super().__init__(observation_space, features_dim)
        
        
        self.cnn_1 = nn.Sequential(
            nn.Conv1d(self.n_features, self.n_features*2, kernel_size=3, padding=1),
            nn.BatchNorm1d(self.n_features*2),
            nn.ReLU(),
            
            nn.Conv1d(self.n_features*2, self.n_features*2, kernel_size=3, padding=1),
            nn.BatchNorm1d(self.n_features*2),
            nn.ReLU(),
            
            nn.Conv1d(self.n_features*2, self.n_features*2, kernel_size=3, padding=1),
            nn.BatchNorm1d(self.n_features*2),
            nn.Conv1d(self.n_features*2, self.n_features*2, kernel_size=3, padding=1),
            
            nn.AvgPool1d(2)
        )
        
        self.cnn_2 = nn.Sequential(
            nn.Conv1d(self.n_features*2, self.n_features, kernel_size=3, padding=1),
            nn.BatchNorm1d(self.n_features),
            nn.ReLU(),
            
            nn.Conv1d(self.n_features, self.n_features, kernel_size=3, padding=1),
            nn.BatchNorm1d(self.n_features),
            nn.ReLU()
        )
        
        self.act = nn.Sequential(
            nn.MaxPool1d(2),
            nn.Flatten(),
        )
        
        with torch.no_grad():
            out_shape = self.act(self.cnn_2(self.cnn_1(torch.zeros((1, self.n_features, self.n_sequences))))).shape[1]
            self.linear = nn.Linear(out_shape, features_dim)
    
    def forward(self, obs):
        out = self.cnn_1(obs)
        out = self.cnn_2(out)
        out = self.act(out)
        return self.linear(out)


        

In [9]:
# %load incentive_reinforcement_learning_cpu.py
# %load incentive_reinforcement_learning_cpu.py
import argparse
import logging
import os
from datetime import datetime
from functools import reduce
from pprint import pformat
from typing import Callable
import boto3
import random
import numpy as np
import pandas as pd
import torch


from stable_baselines3 import A2C, DQN, PPO
from stable_baselines3.common.callbacks import (CallbackList,
                                                CheckpointCallback,
                                                StopTrainingOnMaxEpisodes)
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.vec_env import DummyVecEnv, VecMonitor
from stable_baselines3.dqn.policies import DQNPolicy

logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO)
np.set_printoptions(precision=4, linewidth=200, suppress=True)
torch.set_printoptions(precision=2, linewidth=200, sci_mode=False)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_rows', 500)
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
global logger
logger = logging.getLogger('rl_exp_train')
logger.setLevel(logging.INFO)

S3_BASELINE_PATH = 's3://dissertation-data-dmiller/'
N_SEQUENCES = 40
CHECKPOINT_FREQ = 750_000
TB_LOG = 10_000
WINDOW = 1

def linear_schedule(initial_value: float) -> Callable[[float], float]:
    """
    Linear learning rate schedule.

    :param initial_value: Initial learning rate.
    :return: schedule that computes
      current learning rate depending on remaining progress
    """
    def func(progress_remaining: float) -> float:
        """
        Progress will decrease from 1 (beginning) to 0.

        :param progress_remaining:
        :return: current learning rate
        """
        return progress_remaining * initial_value

    return func

def parse_args():
    parse = argparse.ArgumentParser()
    parse.add_argument('--read_path', type=str, default='rl_ready_data_conv')
    parse.add_argument('--n_files', type=int, default=2)
    parse.add_argument('--n_episodes', type=int, default=10_000)
    parse.add_argument('--n_envs', type=int, default=100)
    parse.add_argument('--lstm', type=str, default='label')
    parse.add_argument('--part', type=str, default='train')
    parse.add_argument('--feature_extractor', type=str, default='cnn') 
    args = parse.parse_args()
    return args



def main(args):
   
    
    logger.info('Starting Incentive Reinforcement Learning')
    logger.info(pformat(args.__dict__))
    exec_time = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    
    read_path, n_files, n_episodes, n_envs, lstm, part, feature_ext = (
        args.read_path, 
        args.n_files, 
        args.n_episodes, 
        args.n_envs,
        args.lstm,
        args.part,
        args.feature_extractor,
    )

    base_read_path = os.path.join(read_path, f'files_used_{n_files}', f'window_{WINDOW}', f'batched_{part}')
    logger.info(f'Reading data from {base_read_path}')
    files= os.listdir(base_read_path)
    logger.info(f'Files found: {len(files)} for environment vectorization')


    vectorized_df = [
        pd.read_parquet(os.path.join(base_read_path, file), columns=LOAD_COLS)
        for file in files
    ]
    
    out_features = FEATURE_COLUMNS + [lstm] if lstm else FEATURE_COLUMNS
    logger.info(f'Out features: {out_features}')

    citizen_science_vec =DummyVecEnv([lambda: CitizenScienceEnv(vec_df, out_features, N_SEQUENCES) for vec_df in vectorized_df])
    monitor_train = VecMonitor(citizen_science_vec)
    
    logger.info(f'Vectorized environments created')

    base_exp_path = os.path.join('experiments', f'dqn_{lstm}_{feature_ext}/{exec_time}')


    tensorboard_dir, checkpoint_dir = (
        os.path.join(base_exp_path, 'training_metrics'),
        os.path.join(base_exp_path, 'checkpoints')
    )

    if not os.path.exists(tensorboard_dir):
        logger.info(f'Creating directory {tensorboard_dir} for tensorboard logs')
        os.makedirs(tensorboard_dir)
   
    if not os.path.exists(checkpoint_dir):
        logger.info(f'Creating directory {checkpoint_dir} for checkpoints')
        os.makedirs(checkpoint_dir) 

    callback_max_episodes = StopTrainingOnMaxEpisodes(max_episodes=n_episodes, verbose=1)
    checkpoint_freq = int(CHECKPOINT_FREQ // (n_envs // 8))
    log_freq = int(TB_LOG // (n_envs // 4))
    checkpoint_callback = CheckpointCallback(
        save_freq=checkpoint_freq,
        save_path=checkpoint_dir, 
        verbose=2
    )
    
    DistributionCallback.tensorboard_setup(tensorboard_dir, TB_LOG // (n_envs // 8))
    logger_callback = DistributionCallback()
    
    callback_list = CallbackList([checkpoint_callback, logger_callback, callback_max_episodes])
    
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    
    if feature_ext == 'cnn':
        CustomConv1dFeatures.setup_sequences_features(N_SEQUENCES + 1, len(out_features))
        logger.info('Using custom 1 dimensional CNN feature extractor')
        policy_kwargs = dict(
            features_extractor_class=CustomConv1dFeatures,
            net_arch=[10]
        )
        model = DQN(
            policy='CnnPolicy', 
            env=monitor_train, 
            verbose=1, 
            tensorboard_log=tensorboard_dir, 
            policy_kwargs=policy_kwargs, 
            device=device, 
            stats_window_size=1000)
    else:
        logger.info('Using default MLP feature extractor')
        model = DQN(policy='MlpPolicy', env=monitor_train, verbose=1, tensorboard_log=tensorboard_dir, device=device, stats_window_size=1000)
        
    logger.info(f'Model created: policy')
    
    logger.info(pformat(model.policy))
        
    logger.info(f'Beginning training') 
    
            
    logger.info(pformat([
        'n_episodes: {}'.format(n_episodes),
        'read_path: {}'.format(read_path),
        'n_files: {}'.format(n_files),
        'n_sequences: {}'.format(N_SEQUENCES),
        'n_envs: {}'.format(n_envs),
        'device: {}'.format(device),
        'lstm: {}'.format(lstm),
        'part: {}'.format(part),
        'feature_extractor: {}'.format(feature_ext),
        'tensorboard_dir: {}'.format(tensorboard_dir),
        'checkpoint_dir: {}'.format(checkpoint_dir),
        'checkpoint_freq: {}'.format(checkpoint_freq),
        'tb_freq: {}'.format(log_freq),
    ]))
    
    model.learn(total_timesteps=25_000_000, progress_bar=True, log_interval=log_freq, callback=callback_list)
    


In [10]:
class Argument:
    read_path = 'rl_ready_data_conv'
    n_files = 30
    n_episodes = 500_000
    lstm = 'pred'
    part = 'train'
    feature_extractor = 'cnn'
    n_envs = 100

In [11]:

main(Argument)

06/05/2023 03:36:01 PM Starting Incentive Reinforcement Learning
06/05/2023 03:36:01 PM mappingproxy({'__dict__': <attribute '__dict__' of 'Argument' objects>,
              '__doc__': None,
              '__module__': '__main__',
              '__weakref__': <attribute '__weakref__' of 'Argument' objects>,
              'feature_extractor': 'cnn',
              'lstm': 'label',
              'n_envs': 100,
              'n_episodes': 500000,
              'n_files': 30,
              'part': 'train',
              'read_path': 'rl_ready_data_conv'})
06/05/2023 03:36:01 PM Reading data from rl_ready_data_conv/files_used_30/window_1/batched_train
06/05/2023 03:36:01 PM Files found: 100 for environment vectorization
06/05/2023 03:36:05 PM Out features: ['country_count', 'date_hour_sin', 'date_hour_cos', 'date_minute_sin', 'date_minute_cos', 'session_30_count', 'session_5_count', 'cum_session_event', 'cum_session_time', 'expanding_click_average', 'cum_platform_time', 'cum_platform_event',

Using cuda device


06/05/2023 03:36:08 PM Model created: policy
06/05/2023 03:36:08 PM CnnPolicy(
  (q_net): QNetwork(
    (features_extractor): CustomConv1dFeatures(
      (cnn_1): Sequential(
        (0): Conv1d(21, 42, kernel_size=(3,), stride=(1,), padding=(1,))
        (1): BatchNorm1d(42, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU()
        (3): Conv1d(42, 42, kernel_size=(3,), stride=(1,), padding=(1,))
        (4): BatchNorm1d(42, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (5): ReLU()
        (6): Conv1d(42, 42, kernel_size=(3,), stride=(1,), padding=(1,))
        (7): BatchNorm1d(42, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (8): Conv1d(42, 42, kernel_size=(3,), stride=(1,), padding=(1,))
        (9): AvgPool1d(kernel_size=(2,), stride=(2,), padding=(0,))
      )
      (cnn_2): Sequential(
        (0): Conv1d(42, 21, kernel_size=(3,), stride=(1,), padding=(1,))
        (1): BatchNorm1d(21, eps=1e-05

Logging to experiments/dqn_label_cnn/2023-06-05_15-36-01/training_metrics/DQN_1


Output()

-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 7.98      |
|    ep_rew_mean      | 14.815564 |
|    exploration_rate | 0.998     |
| time/               |           |
|    episodes         | 400       |
|    fps              | 368       |
|    time_elapsed     | 12        |
|    total_timesteps  | 4500      |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 8.9       |
|    ep_rew_mean      | 17.063095 |
|    exploration_rate | 0.997     |
| time/               |           |
|    episodes         | 800       |
|    fps              | 442       |
|    time_elapsed     | 19        |
|    total_timesteps  | 8700      |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 10.3      |
|    ep_rew_mean      | 19.422132 |
|    exploration_rate | 0.995     |
| time/               |           |
|    episodes         | 1200      |
|    fps              | 465       |
|    time_elapsed     | 29        |
|    total_timesteps  | 13600     |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 11        |
|    ep_rew_mean      | 20.330784 |
|    exploration_rate | 0.993     |
| time/               |           |
|    episodes         | 1600      |
|    fps              | 488       |
|    time_elapsed     | 37        |
|    total_timesteps  | 18500     |
-----------------------------------


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 12.1     |
|    ep_rew_mean      | 21.3188  |
|    exploration_rate | 0.991    |
| time/               |          |
|    episodes         | 2000     |
|    fps              | 509      |
|    time_elapsed     | 46       |
|    total_timesteps  | 23600    |
----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 12.5      |
|    ep_rew_mean      | 21.291468 |
|    exploration_rate | 0.989     |
| time/               |           |
|    episodes         | 2400      |
|    fps              | 521       |
|    time_elapsed     | 54        |
|    total_timesteps  | 28200     |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 12.8      |
|    ep_rew_mean      | 21.658516 |
|    exploration_rate | 0.987     |
| time/               |           |
|    episodes         | 2800      |
|    fps              | 532       |
|    time_elapsed     | 62        |
|    total_timesteps  | 33100     |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 12.6      |
|    ep_rew_mean      | 21.672949 |
|    exploration_rate | 0.985     |
| time/               |           |
|    episodes         | 3200      |
|    fps              | 543       |
|    time_elapsed     | 70        |
|    total_timesteps  | 38500     |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 12.6      |
|    ep_rew_mean      | 22.301949 |
|    exploration_rate | 0.983     |
| time/               |           |
|    episodes         | 3600      |
|    fps              | 553       |
|    time_elapsed     | 79        |
|    total_timesteps  | 44000     |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 13.4      |
|    ep_rew_mean      | 23.839216 |
|    exploration_rate | 0.981     |
| time/               |           |
|    episodes         | 4000      |
|    fps              | 559       |
|    time_elapsed     | 88        |
|    total_timesteps  | 49300     |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 13.4      |
|    ep_rew_mean      | 24.155434 |
|    exploration_rate | 0.979     |
| time/               |           |
|    episodes         | 4400      |
|    fps              | 533       |
|    time_elapsed     | 102       |
|    total_timesteps  | 54400     |
| train/              |           |
|    learning_rate    | 0.0001    |
|    loss             | 2.33      |
|    n_updates        | 10        |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 13.2      |
|    ep_rew_mean      | 22.886099 |
|    exploration_rate | 0.977     |
| time/               |           |
|    episodes         | 4800      |
|    fps              | 538       |
|    time_elapsed     | 110       |
|    total_timesteps  | 59600     |
| train/              |           |
|    learning_rate    | 0.0001    |
|    loss             | 3.85      |
|    n_updates        | 23        |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 13.4      |
|    ep_rew_mean      | 23.660732 |
|    exploration_rate | 0.975     |
| time/               |           |
|    episodes         | 5200      |
|    fps              | 544       |
|    time_elapsed     | 119       |
|    total_timesteps  | 65100     |
| train/              |           |
|    learning_rate    | 0.0001    |
|    loss             | 0.793     |
|    n_updates        | 37        |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 13.4      |
|    ep_rew_mean      | 23.885683 |
|    exploration_rate | 0.973     |
| time/               |           |
|    episodes         | 5600      |
|    fps              | 547       |
|    time_elapsed     | 127       |
|    total_timesteps  | 69800     |
| train/              |           |
|    learning_rate    | 0.0001    |
|    loss             | 1.36      |
|    n_updates        | 49        |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 12.7      |
|    ep_rew_mean      | 22.904833 |
|    exploration_rate | 0.972     |
| time/               |           |
|    episodes         | 6000      |
|    fps              | 551       |
|    time_elapsed     | 135       |
|    total_timesteps  | 74900     |
| train/              |           |
|    learning_rate    | 0.0001    |
|    loss             | 2.22      |
|    n_updates        | 62        |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 12.6      |
|    ep_rew_mean      | 22.155735 |
|    exploration_rate | 0.97      |
| time/               |           |
|    episodes         | 6400      |
|    fps              | 555       |
|    time_elapsed     | 144       |
|    total_timesteps  | 80200     |
| train/              |           |
|    learning_rate    | 0.0001    |
|    loss             | 1.19      |
|    n_updates        | 75        |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 12.6      |
|    ep_rew_mean      | 22.144798 |
|    exploration_rate | 0.968     |
| sess_time/          |           |
|    ended_time       | 22.1      |
|    session_minutes  | 29.7      |
|    time_cutoff      | 14        |
|    time_large       | 6.18      |
|    time_medium      | 5.34      |
|    time_small       | 3.13      |
| size/               |           |
|    ended_size       | 11.9      |
|    inc_large        | 3.44      |
|    inc_medium       | 3.02      |
|    inc_small        | 2.11      |
|    session_size     | 15.9      |
|    size_cutoff      | 8.21      |
| time/               |           |
|    episodes         | 6800      |
|    fps              | 557       |
|    time_elapsed     | 153       |
|    total_timesteps  | 85400     |
| train/              |           |
|    learning_rate    | 0.0001    |
|    loss             | 1.52      |
|    n_updates        | 88  

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 12.7     |
|    ep_rew_mean      | 22.2716  |
|    exploration_rate | 0.966    |
| time/               |          |
|    episodes         | 7200     |
|    fps              | 558      |
|    time_elapsed     | 161      |
|    total_timesteps  | 90000    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 2.2      |
|    n_updates        | 99       |
----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 12.7      |
|    ep_rew_mean      | 22.595716 |
|    exploration_rate | 0.964     |
| time/               |           |
|    episodes         | 7600      |
|    fps              | 561       |
|    time_elapsed     | 169       |
|    total_timesteps  | 95400     |
| train/              |           |
|    learning_rate    | 0.0001    |
|    loss             | 1.04      |
|    n_updates        | 113       |
-----------------------------------


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 12.2     |
|    ep_rew_mean      | 21.44815 |
|    exploration_rate | 0.962    |
| time/               |          |
|    episodes         | 8000     |
|    fps              | 562      |
|    time_elapsed     | 177      |
|    total_timesteps  | 100000   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 1.01     |
|    n_updates        | 124      |
----------------------------------


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 12.3     |
|    ep_rew_mean      | 21.61725 |
|    exploration_rate | 0.96     |
| time/               |          |
|    episodes         | 8400     |
|    fps              | 565      |
|    time_elapsed     | 186      |
|    total_timesteps  | 105600   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 1.48     |
|    n_updates        | 138      |
----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 13        |
|    ep_rew_mean      | 22.565933 |
|    exploration_rate | 0.958     |
| time/               |           |
|    episodes         | 8800      |
|    fps              | 566       |
|    time_elapsed     | 195       |
|    total_timesteps  | 110900    |
| train/              |           |
|    learning_rate    | 0.0001    |
|    loss             | 1.13      |
|    n_updates        | 152       |
-----------------------------------
