In [None]:
!python -m pip install --upgrade pip python-dotenv --quiet
!python -m pip install gym stable-baselines3[extra] --upgrade --quiet
!python -m pip install boto3 --upgrade --quiet

In [None]:
# Load the python-dotenv IPython extension and apply it to the file named `env`.
# NOTE(review): presumably `env` holds credentials/settings (e.g. for S3 access)
# that later cells rely on -- confirm which variables are expected.
%load_ext dotenv
%dotenv env

In [None]:
# %load rl_constant.py
# Column-name constants shared by the callback, environment and training cells.

# Label column(s) of the dataset.
LABEL = [
    "continue_work_session_30_minutes"
]

# Identifier / bookkeeping columns read alongside the features. The six
# date-part columns are combined into a `date_time` column in `main`.
METADATA = [
    "user_id",
    "session_30_raw",
    "cum_platform_event_raw",
    "cum_platform_time_raw",
    "cum_session_time_raw",
    "global_events_user",
    "global_session_time",
    
    "year",
    "month",
    "day",
    "hour",
    "minute",
    "second"
]

# The 20 feature columns that form the rows of an observation
# (`main` appends the optional prediction column to this list).
OUT_FEATURE_COLUMNS = [
    "country_count", 
    "date_hour_sin", 
    "date_hour_cos",
    "date_minute_sin",
    "date_minute_cos",
    
    "session_30_count",
    "session_5_count",
    "cum_session_event_count",
    "delta_last_event",
    "cum_session_time",
    
    "expanding_click_average",
    "cum_platform_time",
    "cum_platform_events",
    "cum_projects",
    "average_event_time",
    
    "rolling_session_time",
    "rolling_session_events",
    "rolling_session_gap",
    "previous_session_time",
    "previous_session_events",
]

# Name(s) of prediction columns.
# NOTE(review): not referenced elsewhere in the visible cells.
PREDICTION_COLS = [
    'seq_10',
]


# NOTE(review): not referenced elsewhere in the visible cells.
GROUPBY_COLS = ['user_id']

# Per-episode statistics captured by CitizenScienceEnv._metadata and logged by
# DistributionCallback; the order here is the column order of the logged frame.
RL_STAT_COLS = [
    'session_size',
    'sim_size',
    'session_minutes',
    'sim_minutes',
    'reward',
    'session_30_raw',
    'cum_platform_time_raw',
    'global_session_time',
]


In [None]:
# %load callback
import numpy as np
import pandas as pd
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.logger import TensorBoardOutputFormat


class DistributionCallback(BaseCallback):
    """Periodically log per-episode distance statistics to TensorBoard and parquet.

    Every `_log_freq` callback calls, the callback drains the finished-episode
    records of all training environments (via their `dists` method), writes the
    mean of several column differences as TensorBoard scalars and dumps the
    raw records to `<_log_dir>/dist_<n>.parquet`.

    `tensorboard_setup` must be called before training starts; it stores the
    configuration on the class, so it is shared by all instances.
    """
    
    @classmethod
    def tensorboard_setup(cls, log_dir, log_freq):
        """Store the output directory and the logging frequency (in callback calls)."""
        cls._log_dir = log_dir
        cls._log_freq = log_freq

    def _on_training_start(self) -> None:
        """Locate the TensorBoard output format of the model's logger."""
        output_formats = self.logger.output_formats
        self.tb_formatter = next(f for f in output_formats if isinstance(f, TensorBoardOutputFormat))
    
    def _on_step(self) -> bool:
        """Log distance statistics every `_log_freq` calls; always lets training continue."""
        if self.n_calls % self._log_freq == 0:
            dist_list = self.training_env.env_method('dists')
            finished = [d for d in dist_list if d.shape[0] > 0]
            if not finished:
                # No environment completed an episode since the last drain;
                # np.concatenate would raise ValueError on an empty list.
                return True
            values_to_log = np.concatenate(finished)

            values_df = pd.DataFrame(
                values_to_log, 
                columns=RL_STAT_COLS + ['ended', 'incentive_index', 'n_episodes']
            )
            
            dist_session_time = (values_df['session_minutes'] - values_df['reward']).mean()
            dist_session_end = (values_df['session_size'] - values_df['ended']).mean()
            
            # NOTE(review): `reward` is compared against `sim_size` here while the
            # session variant above uses `session_minutes`; confirm `sim_minutes`
            # was not intended.
            dist_sim_time = (values_df['reward'] - values_df['sim_size']).mean()
            dist_sim_end = (values_df['ended'] - values_df['sim_size']).mean()

            dist_inc_session = (values_df['session_size'] - values_df['incentive_index']).mean()
            dist_inc_end = (values_df['ended'] - values_df['incentive_index']).mean()
            dist_inc_sim_index = (values_df['sim_size'] - values_df['incentive_index']).mean()

            # TensorBoard step counter: number of completed logging rounds.
            n_call = self.n_calls // self._log_freq
            
            self.tb_formatter.writer.add_scalar('distance/session/max_reward::decrease', dist_session_time, n_call)
            self.tb_formatter.writer.add_scalar('distance/session/max_ended::decrease', dist_session_end, n_call)
           
            self.tb_formatter.writer.add_scalar('distance/session/sim_reward::increase', dist_sim_time, n_call)
            self.tb_formatter.writer.add_scalar('distance/session/sim_ended::increase', dist_sim_end, n_call)
            
            self.tb_formatter.writer.add_scalar('distance/incentive/max_incentive::decrease', dist_inc_session, n_call)
            self.tb_formatter.writer.add_scalar('distance/incentive/ended_incentive', dist_inc_end, n_call)
            self.tb_formatter.writer.add_scalar('distance/incentive/sim_inc_placement::decrease', dist_inc_sim_index, n_call)
            # NOTE(review): this tag logs the same value as the tag above;
            # the tag name suggests a different quantity may have been intended.
            self.tb_formatter.writer.add_scalar('distance/ended_sim_size::increase', dist_inc_sim_index, n_call)
            
            self.tb_formatter.writer.flush()
            
            values_df.to_parquet(f'{self._log_dir}/dist_{n_call}.parquet')
            
        return True

In [None]:
# %load environment
# %load environment
import numpy as np
from scipy.stats import norm 
import gym

class CitizenScienceEnv(gym.Env):
    """Gym environment that replays recorded user sessions.

    Each episode samples one (user, session) pair and walks through its
    events in `date_time` order. Action 1 places an incentive at the current
    event (only the first placement counts); action 0 does nothing. The
    episode ends when the session's events are exhausted or when the
    simulated user stops (see `_continuing_in_session`).

    Observations have shape `(len(out_features), n_sequences + 1)`: the most
    recent events' features, left-padded with zeros early in a session.
    """
    
    # NOTE(review): `reset` rebinds `self.metadata` on the instance to a pandas
    # Series of episode statistics, shadowing this gym class attribute.
    metadata = {'render.modes': ['human']}
    
    def __init__(self, dataset, unique_episodes, unique_sessions, out_features, n_sequences):
        """
        dataset: DataFrame of events; the methods read `user_id`,
            `session_30_raw`, `date_time`, `cum_session_event_raw`, `reward`,
            the RL_STAT_COLS columns and the `out_features` columns.
        unique_episodes: DataFrame of distinct (`user_id`, `session_30_raw`) pairs.
        unique_sessions: DataFrame of distinct `session_30_raw` values.
        out_features: list of feature column names that make up an observation.
        n_sequences: number of previous events included with the current one.
        """
        super(CitizenScienceEnv, self).__init__()
        self.dataset = dataset
        self.unique_episodes = unique_episodes
        self.n_episodes = 0
        self.n_sequences = n_sequences
        self.unique_sessions = unique_sessions
        self.current_session = None
        self.current_session_index = 0
        self.reward = 0
        # Finished-episode records, drained by `dists`.
        self.metadata_container = []
        self.out_features = out_features
        
        self.action_space = gym.spaces.Discrete(2)
        self.observation_space = gym.spaces.Box(low=-1, high=1, shape=(len(out_features), n_sequences + 1), dtype=np.float32)

    def reset(self):
        """Sample a session id, then a user having that session, and return the first observation."""
        self.n_episodes += 1
        session_to_run = self.unique_sessions.sample(1)['session_30_raw'].values[0]
        user_to_run = self.unique_episodes[self.unique_episodes['session_30_raw'] == session_to_run].sample(1)['user_id'].values[0]
        self.current_session = self._get_events(user_to_run, session_to_run)
        self.metadata = self._metadata()
        self.current_session_index = 0
        self.reward = 0
        return self._state()

    def step(self, action):
        """Apply `action` and advance one event.

        Returns `(next_state, reward, done, info)`. When `done` is True,
        `next_state` is None and the episode record is appended to
        `metadata_container`; the reward returned is the last reward read
        from the session.
        """
        self._take_action(action)
        next_state, done, meta = self._calculate_next_state()
        
        if done:
            # Clamp: the index equals the session length when events ran out.
            last_index = min(self.current_session_index, self.current_session.shape[0] - 1)
            self.metadata['ended'] = self.current_session.iloc[last_index]['cum_session_event_raw']
            self.metadata['reward'] = self.reward
            self.metadata_container.append(self.metadata.values)
            return next_state, float(self.reward), done, meta
        else:
            self.reward = self.current_session.iloc[self.current_session_index]['reward'] 
            self.current_session_index += 1        
        return next_state, float(self.reward), done, meta
    
    def _metadata(self):
        """Build the episode record from the session's first row plus zeroed counters."""
        session_metadata = self.current_session.iloc[0][RL_STAT_COLS]
        session_metadata['ended'] = 0
        session_metadata['incentive_index'] = 0
        session_metadata['n_episodes'] = self.n_episodes
        return session_metadata
    
    
    def _calculate_next_state(self):
        """Return `(state, done, info)`; done when events are exhausted or the user stops."""
        
        if (self.current_session_index == self.current_session.shape[0]):
            return None, True, {}

        if self._continuing_in_session():
            return self._state(), False, {}
    
        return None, True, {}
        
      
  
    def _continuing_in_session(self):
        """Decide whether the simulated user stays in the session.

        The user always continues while the current event count is below
        `sim_size`. Beyond that, the user continues only while the extension
        probability lies in [0.3, 0.7].
        """
        sim_counts = self.metadata['sim_size']
        current_session_count = self.current_session.iloc[self.current_session_index]['cum_session_event_raw']
        if current_session_count < sim_counts:
            return True
        
        extending_session = self._probability_extending_session(current_session_count)
        
        return all([extending_session >= .3, extending_session <= .7])
        
    
    def _probability_extending_session(self, current_session_count):
        """Normal CDF centred on the incentive placement, evaluated at the current event count.

        Returns 0 when no incentive has been placed. The scale is a quarter
        of the session size (truncated), but at least 5.
        """
        if self.metadata['incentive_index'] == 0:
            return 0
        
        scale = max(5, int(self.metadata['session_size'] / 4))
        continue_session = norm(
            loc=self.metadata['incentive_index'],
            scale=scale
        ).cdf(current_session_count)
        
        return continue_session
        

    def _get_events(self, user_id, session):
        """Return the events of one user's session, ordered by `date_time`, with a fresh index."""
        subset = self.dataset[
            (self.dataset['user_id'] == user_id) &
            (self.dataset['session_30_raw'] == session)
        ]
   
        return subset.sort_values(by=['date_time']).reset_index(drop=True)
    
    def _take_action(self, action):
        """Record the incentive placement for action 1; later placements in the same episode are ignored."""
        if action == 0 or self.metadata['incentive_index'] > 0:
            return
        
        current_session_index = min(self.current_session_index, self.current_session.shape[0] - 1)
        self.metadata['incentive_index'] = self.current_session.iloc[current_session_index]['cum_session_event_raw']
        
    def _state(self):
        """Return the observation as a float32 array of shape `(n_features, n_sequences + 1)`.

        Uses the `n_sequences + 1` events preceding the current index. Early
        in a session, the available events (at least the first one) are
        left-padded with zero rows up to the window length.
        """
        window = self.n_sequences + 1

        if self.current_session_index > self.n_sequences:
            events = self.current_session.iloc[self.current_session_index - window:self.current_session_index][self.out_features].values
            
        else:
            events = self.current_session.iloc[:max(self.current_session_index, 1)][self.out_features].values
            # Pad to the full window. The original hard-coded a cap of 10,
            # which only matches the observation space when n_sequences == 10.
            zero_cat = np.zeros((window - events.shape[0], len(self.out_features)))
            events = np.concatenate((zero_cat, events), axis=0)
            

        return events.astype(np.float32).T
  
    
    def dists(self):
        """Return the finished-episode records as an array and clear the container."""
        metadata_container = self.metadata_container.copy()
        self.metadata_container = []
        return np.array(metadata_container)

In [None]:
# %load policies/cnn_policy
from typing import Dict, List, Type, Union
import gym
from gym import spaces
import torch
from torch import nn
from stable_baselines3.dqn.policies import DQNPolicy
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor

class CustomConv1dFeatures(BaseFeaturesExtractor):
    """1-D convolutional feature extractor for `(n_features, n_sequences)` observations.

    observation_space: observation space of the environment.
    n_sequences: length of the sequence axis of an observation.
    n_features: number of feature channels of an observation.
    features_dim: size of the feature vector handed to the policy network.
    """

    def __init__(self, observation_space: spaces.Box, n_sequences=11, n_features=21, features_dim=20):
        super().__init__(observation_space, features_dim)
        
        
        self.cnn_1 = nn.Sequential(
            nn.Conv1d(n_features, n_features*2, kernel_size=3, padding=1),
            nn.BatchNorm1d(n_features*2),
            nn.ReLU(),
            
            nn.Conv1d(n_features*2, n_features*2, kernel_size=3, padding=1),
            nn.BatchNorm1d(n_features*2),
            nn.ReLU(),
            
            # NOTE(review): no activation between this BatchNorm and the next
            # convolution, unlike the preceding stages -- confirm intentional.
            nn.Conv1d(n_features*2, n_features*2, kernel_size=3, padding=1),
            nn.BatchNorm1d(n_features*2),
            nn.Conv1d(n_features*2, n_features*2, kernel_size=3, padding=1),
            
            nn.AvgPool1d(2)
        )
        
        self.cnn_2 = nn.Sequential(
            nn.Conv1d(n_features*2, n_features, kernel_size=3, padding=1),
            nn.BatchNorm1d(n_features),
            nn.ReLU(),
            
            nn.Conv1d(n_features, n_features, kernel_size=3, padding=1),
            nn.BatchNorm1d(n_features),
            nn.ReLU()
        )
        
        self.act = nn.Sequential(
            nn.MaxPool1d(2),
            nn.Flatten(1),
        )

        # The convolutions keep the sequence length (kernel 3, padding 1) and
        # each pooling layer halves it (floor), so the flattened size is
        # n_features * (n_sequences // 2 // 2) -- 42 with the defaults, not
        # `features_dim`. Project to `features_dim` so the output matches the
        # size the policy network is built for.
        n_flatten = n_features * ((n_sequences // 2) // 2)
        self.project = nn.Sequential(
            nn.Linear(n_flatten, features_dim),
            nn.ReLU(),
        )
    
    def forward(self, obs):
        """Map a batch of observations to a `(batch, features_dim)` tensor."""
        out = self.cnn_1(obs)
        out = self.cnn_2(out)
        out = self.act(out)
        return self.project(out)


        

    

In [None]:
# %load incentive_reinforcement_learning_cpu.py
import argparse
import numpy as np
import torch
from stable_baselines3.common.callbacks import CallbackList, StopTrainingOnMaxEpisodes, CheckpointCallback
from stable_baselines3 import A2C, DQN, PPO
from stable_baselines3.dqn.policies import DQNPolicy
from stable_baselines3.common.env_checker import check_env
import logging
import pandas as pd
from stable_baselines3.common.vec_env import DummyVecEnv, VecMonitor
from datetime import datetime
from pprint import pformat
import os
from functools import reduce

import numpy as np
import torch
from stable_baselines3.common.callbacks import CallbackList, StopTrainingOnMaxEpisodes, CheckpointCallback
from stable_baselines3 import A2C, DQN
import logging
import pandas as pd
from stable_baselines3.common.vec_env import DummyVecEnv, VecMonitor
from datetime import datetime
from pprint import pformat
import os


# Columns read from the parquet file: bookkeeping columns plus model features.
ALL_COLS = METADATA + OUT_FEATURE_COLUMNS 

# Global logging / display configuration for the notebook session.
logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO)
np.set_printoptions(precision=4, linewidth=200, suppress=True)
torch.set_printoptions(precision=2, linewidth=200, sci_mode=False)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_rows', 500)

# S3 location used by `main` both as the download source and as the root of
# the TensorBoard / checkpoint output paths. Note that it includes the
# `s3://` scheme.
S3_BASELINE_PATH = 's3://dissertation-data-dmiller'
# NOTE(review): the four index constants below are not referenced in the
# visible cells -- possibly leftovers from an array-based implementation.
USER_INDEX = 1
SESSION_INDEX = 2
CUM_SESSION_EVENT_RAW = 3
TIMESTAMP_INDEX = 11
# Fractions of the dataset used by `train_eval_split`; the remainder is test.
TRAIN_SPLIT = 0.7
EVAL_SPLIT = 0.15



def train_eval_split(dataset, logger):
    """Split `dataset` positionally into train / eval / test partitions.

    The first `TRAIN_SPLIT` fraction of rows is the training set, the next
    `EVAL_SPLIT` fraction the evaluation set and the remainder the test set.
    The slice boundaries are logged through `logger`.

    Returns a dict with the keys 'train', 'eval' and 'test'.
    """
    n_rows = dataset.shape[0]
    train_end = int(n_rows * TRAIN_SPLIT)
    eval_end = train_end + int(n_rows * EVAL_SPLIT)
    logger.info(f'Train size: 0:{train_end}, eval size: {train_end}:{eval_end}: test size: {eval_end}:{n_rows}')

    return {
        'train': dataset[:train_end],
        'eval': dataset[train_end:eval_end],
        'test': dataset[eval_end:]
    }

def generate_metadata(dataset):
    """Attach per-session summary columns and the per-event reward.

    For every (`user_id`, `session_30_raw`) group the following columns are
    computed and merged back onto each event row:

    * `session_size`    - maximum of `size_of_session`
    * `session_minutes` - maximum of `cum_session_time_raw`
    * `sim_minutes`     - 0.7 quantile (nearest) of `cum_session_time_raw`
    * `sim_size`        - 0.7 quantile (nearest) of `cum_session_event_raw`

    `reward` is a copy of `cum_session_time_raw`. A new frame is returned.
    """
    keys = ['user_id', 'session_30_raw']
    per_session = dataset.groupby(keys)

    summaries = [
        per_session['size_of_session'].max().reset_index(name='session_size'),
        per_session['cum_session_time_raw'].max().reset_index(name='session_minutes'),
        per_session['cum_session_time_raw'].quantile(.7, interpolation='nearest').reset_index(name='sim_minutes'),
        per_session['cum_session_event_raw'].quantile(.7, interpolation='nearest').reset_index(name='sim_size'),
    ]

    # Fold the summaries into one frame, left to right, on the session keys.
    combined = summaries[0]
    for summary in summaries[1:]:
        combined = pd.merge(combined, summary, on=keys)

    enriched = pd.merge(dataset, combined, on=keys)
    enriched['reward'] = enriched['cum_session_time_raw']
    return enriched


def parse_args(argv=None):
    """Parse the training options.

    argv: optional list of argument strings. When None (the default),
        `sys.argv[1:]` is parsed as before. Passing an explicit list (for
        example `[]`) allows use inside a Jupyter kernel, whose own `sys.argv`
        contains launcher arguments that this parser does not accept.

    Returns the populated `argparse.Namespace`.
    """
    parse = argparse.ArgumentParser()
    parse.add_argument('--read_path', type=str, default='datasets/rl_ready_data')
    parse.add_argument('--n_files', type=int, default=2)
    parse.add_argument('--n_episodes', type=int, default=50)
    parse.add_argument('--n_sequences', type=int, default=10)
    parse.add_argument('--n_envs', type=int, default=100)
    parse.add_argument('--lstm', type=str, default='seq_10')
    parse.add_argument('--device', type=str, default='cpu')
    parse.add_argument('--checkpoint_freq', type=int, default=1000)
    parse.add_argument('--tb_log', type=int, default=100)
    parse.add_argument('--feature_extractor', type=str, default='cnn') 
    args = parse.parse_args(argv)
    return args



def run_reinforcement_learning_incentives(environment, logger, n_episodes=1):
    """Roll out `n_episodes` episodes with a random policy, for smoke testing.

    environment: object exposing gym-style `reset()` and `step(action)`
        returning `(state, reward, done, info)`.
    logger: logger used for progress messages.
    n_episodes: number of episodes to run.

    Action 1 is chosen when a uniform draw from [0, 1) exceeds 0.8, otherwise
    action 0. The reward is logged every 100 steps and at the end of each
    episode.
    """
    for epoch in range(n_episodes):
        environment_comp = False
        state = environment.reset()
        i = 0
        while not environment_comp:
            next_action = (
                1 if np.random.uniform(low=0, high=1) > 0.8 else 0
            )
            state, rewards, environment_comp, meta = environment.step(next_action)
            i +=1
            if i % 100 == 0:
                logger.info(f'Step: {i} - Reward: {rewards}')
                
        logger.info(f'Epoch: {epoch} - Reward: {rewards}')
        # Only environments that expose `user_sessions` can show it; the
        # CitizenScienceEnv in this notebook does not define that attribute,
        # so unconditional access raised AttributeError.
        user_sessions = getattr(environment, 'user_sessions', None)
        if user_sessions is not None:
            print(user_sessions.head(10))


def remove_events_in_2_minute_window(df):
    """Keep only the last event per user, session and calendar minute.

    A `second_window` column (`second // 10`) is added to the result. It is
    not part of the de-duplication key, which is user, session and the date
    parts down to the minute.

    The input frame is left untouched (the original implementation added the
    column to the caller's frame, which may be a filtered slice); a new frame
    with a fresh 0..n-1 index is returned.
    """
    deduplicated = (
        df.assign(second_window=df['second'] // 10)
        .drop_duplicates(
            subset=['user_id', 'session_30_raw', 'year', 'month', 'day', 'hour', 'minute'],
            keep='last'
        )
        .reset_index(drop=True)
    )

    return deduplicated


def convolve_delta_events(df):
    """Smooth `delta_last_event` with a 2-minute rolling mean per user session.

    For every (`user_id`, `session_30_raw`) group, each event's
    `delta_last_event` is replaced by the mean over the events of the same
    group inside the trailing 2-minute window ending at that event's
    `date_time`. The smoothed value is also stored in `convolved_delta_event`.

    The frame is modified in place and returned. Rows may be in any order and
    the index may be arbitrary: values are written back by row position. (The
    original assigned the group-ordered result by index label, which
    misaligned values whenever rows of different groups were interleaved.)

    Raises ValueError if the rolling result does not cover every row, e.g.
    when a grouping key is missing.
    """
    keys = ['user_id', 'session_30_raw']

    # Sort by group then time so that the grouped rolling output (groups in
    # key order, rows in their existing order within a group) lines up with
    # `ordered` row for row; `_row` remembers the original position.
    ordered = (
        df[keys + ['date_time', 'delta_last_event']]
        .assign(_row=np.arange(len(df)))
        .sort_values(by=keys + ['date_time'], kind='stable')
    )

    smoothed = (
        ordered.set_index('date_time').groupby(by=keys, group_keys=False)
            .rolling('2min', min_periods=1)['delta_last_event']
            .mean()
    )

    if len(smoothed) != len(ordered):
        raise ValueError(
            'rolling mean produced {} values for {} rows; check for missing grouping keys'.format(
                len(smoothed), len(ordered)
            )
        )

    restored = np.empty(len(df), dtype=float)
    restored[ordered['_row'].to_numpy()] = smoothed.to_numpy()

    df['convolved_delta_event'] = restored
    df['delta_last_event'] = df['convolved_delta_event']

    return df

    
def main(args):
    """Prepare the session data and train a DQN agent on CitizenScienceEnv.

    args: object exposing the attributes `read_path`, `n_files`,
        `n_sequences`, `n_episodes`, `device`, `n_envs`, `lstm`, `tb_log`,
        `checkpoint_freq` and `feature_extractor` (an argparse namespace or
        any class/object with these attributes).

    Steps: read the parquet file (downloading it from S3 if it is not present
    locally), keep the first 70% of events by time, derive per-session
    metadata, thin the events to the last one per even minute, build `n_envs`
    environments and run `DQN.learn` with episode-limit, distribution-logging
    and checkpoint callbacks.
    """
    
    exec_time = datetime.now().strftime("%Y-%m-%d-%H-%M")
    logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO)
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    
    logger.info('Starting Incentive Reinforcement Learning')
    
    read_path, n_files, n_sequences, n_episodes, device, n_envs, lstm, tb_log, check_freq, feature_ext = (
        args.read_path, 
        args.n_files, 
        args.n_sequences, 
        args.n_episodes, 
        args.device,
        args.n_envs,
        args.lstm,
        args.tb_log,
        args.checkpoint_freq,
        args.feature_extractor
    )
    
    read_path = os.path.join(
        read_path,
        f'files_used_{n_files}',
        f'predicted_data.parquet'
    )
   
    if not os.path.exists(read_path):
        logger.info(f'Downloading data from {S3_BASELINE_PATH}/{read_path}') 
        # Imported here because the notebook's import cell does not import
        # boto3 and it is only needed on this download path.
        import boto3

        # download_file expects a bare bucket name, not an `s3://` URL.
        bucket = S3_BASELINE_PATH.split('://', 1)[-1]
        # download_file does not create missing local directories.
        local_dir = os.path.dirname(read_path)
        if local_dir:
            os.makedirs(local_dir, exist_ok=True)

        s3 = boto3.client('s3')
        s3.download_file(
            bucket,
            read_path,
            read_path
        )
             
            
    logger.info(f'Reading data from {read_path}')
    cols = ALL_COLS + [lstm] if lstm else ALL_COLS
    out_features = OUT_FEATURE_COLUMNS + [lstm] if lstm else OUT_FEATURE_COLUMNS
    if args.lstm:
        logger.info(f'Including LSTM prediction: {lstm}')
        df = pd.read_parquet(read_path, columns=cols)
    else:
        logger.info(f'Setting up baseline without prediction')
        df = pd.read_parquet(read_path, columns=ALL_COLS)
    df['date_time'] = pd.to_datetime(df[['year', 'month', 'day', 'hour', 'minute', 'second']], errors='coerce')


    df = df.sort_values(by=['date_time'])    
    
    logger.info(f'N events:: {df.shape[0]} creating training partitions')
    df = df.head(int(df.shape[0] * .7))
    logger.info(f'N events after 70% split: {df.shape[0]}')
    size_of_session = df.groupby(['user_id', 'session_30_raw']).size().reset_index(name='size_of_session')
    df = pd.merge(df, size_of_session, on=['user_id', 'session_30_raw'])
    # 1-based position of each event within its session.
    df['cum_session_event_raw'] = df.groupby(['user_id', 'session_30_raw'])['date_time'].cumcount() + 1
    
    logger.info(f'Convolution over 2 minute window')
    df = convolve_delta_events(df)
    logger.info(f'Convolving over 2 minute window complete: generating metadata')
    df = generate_metadata(df) 
    logger.info(f'Metadata generated: selecting events only at 2 minute intervals')
    df = df[df['minute'] % 2 == 0]
    logger.info(f'Data read: {df.shape[0]} rows, {df.shape[1]} columns, dropping events within 2 minute window')
    df = remove_events_in_2_minute_window(df)
    df = df.reset_index(drop=True)
    
    logger.info(f'Number of events after dropping events within 2 minute window: {df.shape[0]}')
    
    unique_episodes = df[['user_id', 'session_30_raw']].drop_duplicates()
    unique_sessions = df[['session_30_raw']].drop_duplicates()
    df = df.drop(columns=['year', 'month', 'day', 'hour', 'minute', 'second', 'second_window'])
    
    # Use the configured window length (previously hard-coded to 10).
    citizen_science_vec = DummyVecEnv([
        lambda: CitizenScienceEnv(df, unique_episodes, unique_sessions, out_features, n_sequences)
        for _ in range(n_envs)
    ])
    logger.info(f'Vectorized environments created')
    
    # NOTE(review): the conditional expression binds looser than `+`, so this
    # path component is `<feature_ext>_label` when `lstm` starts with
    # 'continue' and plain `lstm` otherwise (without the feature-extractor
    # prefix) -- confirm that is intended.
    base_path = os.path.join(
        S3_BASELINE_PATH,
        'reinforcement_learning_incentives',
        f'n_files_{n_files}',
        feature_ext + '_' + 'label' if lstm.startswith('continue') else lstm,
        'results',
        exec_time,
    ) 
    
    
    tensorboard_dir, checkpoint_dir = (
        os.path.join(base_path, 'training_metrics'),
        os.path.join(base_path, 'checkpoints')
    )

    logger.info(f'Creating callbacks, monitors and loggerss')
    callback_max_episodes = StopTrainingOnMaxEpisodes(max_episodes=n_episodes, verbose=1)
    checkpoint_callback = CheckpointCallback(save_freq=check_freq // n_envs, save_path=checkpoint_dir, name_prefix='rl_model')
    dist_callback = DistributionCallback()
    # Configuration is stored on the class, so it also applies to the
    # instance created above.
    DistributionCallback.tensorboard_setup(tensorboard_dir, tb_log)
    callback_list = CallbackList([callback_max_episodes, dist_callback, checkpoint_callback])
    monitor_train = VecMonitor(citizen_science_vec)
   
    if feature_ext == 'cnn':
        logger.info('Using custom 1 dimensional CNN feature extractor')
        policy_kwargs = dict(
            features_extractor_class=CustomConv1dFeatures,
            # Keep the extractor's expected input shape in step with the
            # environment's observation shape (features, n_sequences + 1).
            features_extractor_kwargs=dict(
                n_sequences=n_sequences + 1,
                n_features=len(out_features),
            ),
            net_arch=[20, 10]
        )
        model = DQN(policy='MlpPolicy', env=monitor_train, verbose=1, tensorboard_log=tensorboard_dir, policy_kwargs=policy_kwargs, device=device, stats_window_size=1000)
    else:
        logger.info('Using default MLP feature extractor')
        model = DQN(policy='MlpPolicy', env=monitor_train, verbose=1, tensorboard_log=tensorboard_dir, device=device, stats_window_size=1000)
    
    logger.info(f'Model created: policy')
    
    logger.info(pformat(model.policy))
        
    logger.info(f'Beginning training') 
            
    logger.info(pformat([
        'n_epochs: {}'.format(n_episodes),
        'read_path: {}'.format(read_path),
        'n_files: {}'.format(n_files),
        'n_sequences: {}'.format(n_sequences),
        'n_envs: {}'.format(n_envs),
        'total_timesteps: {}'.format(df.shape),
        f'unique_episodes: {unique_episodes.shape[0]}',
        'device: {}'.format(device),
        'tensorboard_dir: {}'.format(tensorboard_dir),
        'checkpoint_dir: {}'.format(checkpoint_dir)
    ]))
    
    # The timestep budget is an upper bound; StopTrainingOnMaxEpisodes is the
    # callback configured to end training.
    model.learn(total_timesteps=100_000_000, progress_bar=True, log_interval=1000, callback=callback_list)
    



In [None]:
class Argument:
    """Notebook stand-in for the command-line arguments read by `main`.

    `main` only reads attributes, so the class itself is passed instead of an
    argparse namespace.
    """
    # Base directory; `main` appends `files_used_<n_files>/predicted_data.parquet`.
    read_path = 'rl_ready_data'
    n_files = 30
    n_sequences = 10
    # Passed to StopTrainingOnMaxEpisodes as the episode limit.
    n_episodes = 500_000
    # Number of environments placed in the DummyVecEnv.
    n_envs = 1000
    # Extra column appended to the observation features.
    lstm = 'continue_work_session_30_minutes'
    device = 'cuda'
    # `main` divides this by n_envs to obtain the CheckpointCallback save frequency.
    checkpoint_freq = 100_000
    # DistributionCallback logs every `tb_log` callback calls.
    tb_log = 1000
    # Any value other than 'cnn' selects the default MLP extractor in `main`.
    feature_extractor = 'mlp'

In [9]:

# Start training with the notebook-defined settings from the `Argument` class.
main(Argument)

-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 7.1       |
|    ep_rew_mean      | 230.03746 |
|    exploration_rate | 0.942     |
| time/               |           |
|    episodes         | 85000     |
|    fps              | 403       |
|    time_elapsed     | 1516      |
|    total_timesteps  | 612000    |
| train/              |           |
|    learning_rate    | 0.0001    |
|    loss             | 36.6      |
|    n_updates        | 140       |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 7.04      |
|    ep_rew_mean      | 244.52078 |
|    exploration_rate | 0.941     |
| time/               |           |
|    episodes         | 86000     |
|    fps              | 403       |
|    time_elapsed     | 1532      |
|    total_timesteps  | 618000    |
| train/              |           |
|    learning_rate    | 0.0001    |
|    loss             | 35.7      |
|    n_updates        | 142       |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 6.28      |
|    ep_rew_mean      | 167.43466 |
|    exploration_rate | 0.941     |
| time/               |           |
|    episodes         | 87000     |
|    fps              | 402       |
|    time_elapsed     | 1551      |
|    total_timesteps  | 625000    |
| train/              |           |
|    learning_rate    | 0.0001    |
|    loss             | 29.8      |
|    n_updates        | 144       |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 7.55      |
|    ep_rew_mean      | 235.86134 |
|    exploration_rate | 0.94      |
| time/               |           |
|    episodes         | 88000     |
|    fps              | 403       |
|    time_elapsed     | 1570      |
|    total_timesteps  | 633000    |
| train/              |           |
|    learning_rate    | 0.0001    |
|    loss             | 17.7      |
|    n_updates        | 146       |
-----------------------------------


IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 6.97      |
|    ep_rew_mean      | 224.08922 |
|    exploration_rate | 0.927     |
| time/               |           |
|    episodes         | 107000    |
|    fps              | 403       |
|    time_elapsed     | 1911      |
|    total_timesteps  | 772000    |
| train/              |           |
|    learning_rate    | 0.0001    |
|    loss             | 21.1      |
|    n_updates        | 180       |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 7.5       |
|    ep_rew_mean      | 294.32407 |
|    exploration_rate | 0.925     |
| time/               |           |
|    episodes         | 109000    |
|    fps              | 404       |
|    time_elapsed     | 1947      |
|    total_timesteps  | 787000    |
| train/              |           |
|    learning_rate    | 0.0001    |
|    loss             | 27.5      |
|    n_updates        | 184       |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 7.27      |
|    ep_rew_mean      | 234.76776 |
|    exploration_rate | 0.925     |
| time/               |           |
|    episodes         | 110000    |
|    fps              | 404       |
|    time_elapsed     | 1964      |
|    total_timesteps  | 794000    |
| train/              |           |
|    learning_rate    | 0.0001    |
|    loss             | 37        |
|    n_updates        | 186       |
-----------------------------------


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 7.24     |
|    ep_rew_mean      | 268.0618 |
|    exploration_rate | 0.924    |
| time/               |          |
|    episodes         | 111000   |
|    fps              | 404      |
|    time_elapsed     | 1981     |
|    total_timesteps  | 801000   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 20.3     |
|    n_updates        | 188      |
----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 7.31      |
|    ep_rew_mean      | 260.81845 |
|    exploration_rate | 0.923     |
| time/               |           |
|    episodes         | 112000    |
|    fps              | 404       |
|    time_elapsed     | 2000      |
|    total_timesteps  | 809000    |
| train/              |           |
|    learning_rate    | 0.0001    |
|    loss             | 22.1      |
|    n_updates        | 190       |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 7.46      |
|    ep_rew_mean      | 344.89426 |
|    exploration_rate | 0.922     |
| time/               |           |
|    episodes         | 113000    |
|    fps              | 404       |
|    time_elapsed     | 2016      |
|    total_timesteps  | 816000    |
| train/              |           |
|    learning_rate    | 0.0001    |
|    loss             | 37.1      |
|    n_updates        | 191       |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 7.56      |
|    ep_rew_mean      | 318.98703 |
|    exploration_rate | 0.922     |
| time/               |           |
|    episodes         | 114000    |
|    fps              | 404       |
|    time_elapsed     | 2034      |
|    total_timesteps  | 823000    |
| train/              |           |
|    learning_rate    | 0.0001    |
|    loss             | 23.6      |
|    n_updates        | 193       |
-----------------------------------


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 7.58     |
|    ep_rew_mean      | 285.8113 |
|    exploration_rate | 0.921    |
| time/               |          |
|    episodes         | 115000   |
|    fps              | 404      |
|    time_elapsed     | 2053     |
|    total_timesteps  | 831000   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 37.6     |
|    n_updates        | 195      |
----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 7.5       |
|    ep_rew_mean      | 281.84744 |
|    exploration_rate | 0.92      |
| time/               |           |
|    episodes         | 116000    |
|    fps              | 404       |
|    time_elapsed     | 2070      |
|    total_timesteps  | 838000    |
| train/              |           |
|    learning_rate    | 0.0001    |
|    loss             | 27.7      |
|    n_updates        | 197       |
-----------------------------------


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 7.42     |
|    ep_rew_mean      | 279.5405 |
|    exploration_rate | 0.92     |
| time/               |          |
|    episodes         | 117000   |
|    fps              | 404      |
|    time_elapsed     | 2089     |
|    total_timesteps  | 846000   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 25.5     |
|    n_updates        | 199      |
----------------------------------


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 6.98     |
|    ep_rew_mean      | 217.4085 |
|    exploration_rate | 0.919    |
| time/               |          |
|    episodes         | 118000   |
|    fps              | 404      |
|    time_elapsed     | 2107     |
|    total_timesteps  | 853000   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 31       |
|    n_updates        | 201      |
----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 7.53      |
|    ep_rew_mean      | 299.41113 |
|    exploration_rate | 0.918     |
| time/               |           |
|    episodes         | 119000    |
|    fps              | 404       |
|    time_elapsed     | 2125      |
|    total_timesteps  | 860000    |
| train/              |           |
|    learning_rate    | 0.0001    |
|    loss             | 51.9      |
|    n_updates        | 202       |
-----------------------------------


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 7.36     |
|    ep_rew_mean      | 289.2404 |
|    exploration_rate | 0.918    |
| time/               |          |
|    episodes         | 120000   |
|    fps              | 404      |
|    time_elapsed     | 2143     |
|    total_timesteps  | 867000   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 22.4     |
|    n_updates        | 204      |
----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 6.97      |
|    ep_rew_mean      | 234.41624 |
|    exploration_rate | 0.917     |
| time/               |           |
|    episodes         | 121000    |
|    fps              | 404       |
|    time_elapsed     | 2160      |
|    total_timesteps  | 874000    |
| train/              |           |
|    learning_rate    | 0.0001    |
|    loss             | 50.7      |
|    n_updates        | 206       |
-----------------------------------


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 7        |
|    ep_rew_mean      | 224.8686 |
|    exploration_rate | 0.916    |
| time/               |          |
|    episodes         | 122000   |
|    fps              | 404      |
|    time_elapsed     | 2177     |
|    total_timesteps  | 881000   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 59.3     |
|    n_updates        | 208      |
----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 6.96      |
|    ep_rew_mean      | 228.19632 |
|    exploration_rate | 0.916     |
| time/               |           |
|    episodes         | 123000    |
|    fps              | 404       |
|    time_elapsed     | 2195      |
|    total_timesteps  | 888000    |
| train/              |           |
|    learning_rate    | 0.0001    |
|    loss             | 53        |
|    n_updates        | 209       |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 7.01      |
|    ep_rew_mean      | 247.87178 |
|    exploration_rate | 0.915     |
| time/               |           |
|    episodes         | 124000    |
|    fps              | 404       |
|    time_elapsed     | 2212      |
|    total_timesteps  | 895000    |
| train/              |           |
|    learning_rate    | 0.0001    |
|    loss             | 36.4      |
|    n_updates        | 211       |
-----------------------------------


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 7.41     |
|    ep_rew_mean      | 265.9421 |
|    exploration_rate | 0.914    |
| time/               |          |
|    episodes         | 125000   |
|    fps              | 404      |
|    time_elapsed     | 2230     |
|    total_timesteps  | 902000   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 24.5     |
|    n_updates        | 213      |
----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 7.38      |
|    ep_rew_mean      | 242.33218 |
|    exploration_rate | 0.914     |
| time/               |           |
|    episodes         | 126000    |
|    fps              | 404       |
|    time_elapsed     | 2249      |
|    total_timesteps  | 910000    |
| train/              |           |
|    learning_rate    | 0.0001    |
|    loss             | 31.8      |
|    n_updates        | 215       |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 7.56      |
|    ep_rew_mean      | 276.77054 |
|    exploration_rate | 0.913     |
| time/               |           |
|    episodes         | 127000    |
|    fps              | 404       |
|    time_elapsed     | 2265      |
|    total_timesteps  | 917000    |
| train/              |           |
|    learning_rate    | 0.0001    |
|    loss             | 46        |
|    n_updates        | 217       |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 6.94      |
|    ep_rew_mean      | 245.53355 |
|    exploration_rate | 0.912     |
| time/               |           |
|    episodes         | 128000    |
|    fps              | 404       |
|    time_elapsed     | 2282      |
|    total_timesteps  | 924000    |
| train/              |           |
|    learning_rate    | 0.0001    |
|    loss             | 28        |
|    n_updates        | 218       |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 6.87      |
|    ep_rew_mean      | 234.58653 |
|    exploration_rate | 0.912     |
| time/               |           |
|    episodes         | 129000    |
|    fps              | 404       |
|    time_elapsed     | 2299      |
|    total_timesteps  | 931000    |
| train/              |           |
|    learning_rate    | 0.0001    |
|    loss             | 47.7      |
|    n_updates        | 220       |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 7.15      |
|    ep_rew_mean      | 256.92474 |
|    exploration_rate | 0.911     |
| time/               |           |
|    episodes         | 130000    |
|    fps              | 404       |
|    time_elapsed     | 2317      |
|    total_timesteps  | 938000    |
| train/              |           |
|    learning_rate    | 0.0001    |
|    loss             | 18.5      |
|    n_updates        | 222       |
-----------------------------------


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 7.42     |
|    ep_rew_mean      | 242.1612 |
|    exploration_rate | 0.91     |
| time/               |          |
|    episodes         | 131000   |
|    fps              | 404      |
|    time_elapsed     | 2337     |
|    total_timesteps  | 946000   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 32.6     |
|    n_updates        | 224      |
----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 7.47      |
|    ep_rew_mean      | 282.19266 |
|    exploration_rate | 0.909     |
| time/               |           |
|    episodes         | 132000    |
|    fps              | 404       |
|    time_elapsed     | 2354      |
|    total_timesteps  | 953000    |
| train/              |           |
|    learning_rate    | 0.0001    |
|    loss             | 23.7      |
|    n_updates        | 226       |
-----------------------------------
