In [4]:
%pip install python-dotenv awscli --quiet
# %pip install gym stable-baselines3[extra] awscli boto3 pqdm awscli torch 


[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
%load_ext dotenv
%dotenv env


In [9]:
!aws s3 sync rl_ready_data s3://dissertation-data-dmiller/rl_ready_data --delete
!aws s3 sync rl_ready_data_conv s3://dissertation-data-dmiller/rl_ready_data_conv --delete
!aws s3 sync calculated_features s3://dissertation-data-dmiller/calculated_features --delete
!aws s3 sync labelled_session_count_data s3://dissertation-data-dmiller/labelled_session_count_data --delete
!aws s3 sync torch_ready_data s3://dissertation-data-dmiller/torch_ready_data --delete

^Cmpleted 1.6 GiB/~25.4 GiB (15.3 MiB/s) with ~13 file(s) remaining (calculating...)   
cancelled: ctrl-c received                                                             


In [7]:
!aws s3 sync experiments s3://dissertation-data-dmiller/experiments


The user-provided path experiments does not exist.


In [4]:
# !aws s3 sync s3://dissertation-data-dmiller/rl_ready_data_conv/files_used_30/window_1/batched_train rl_ready_data_conv/files_used_30/window_1/batched_train --delete

In [5]:
# %load rl_constant.py
FEATURE_COLUMNS = [
    
    "user_count",
    "project_count",
    "country_count", 
    "date_hour_sin", 
    "date_hour_cos",
    "date_minute_sin",
    "date_minute_cos",
    
    "session_30_count",
    "session_5_count",
    "cum_session_event",
    "cum_session_time",
    "expanding_click_average",
   
    "cum_platform_time",
    "cum_platform_event",
    "cum_projects",
    "average_event_time",
    "delta_last_event",
    
    "rolling_session_time",
    "rolling_session_events",
    "rolling_session_gap",
    "previous_session_time",
    "previous_session_events",
]


METADATA = [
    "user_id",
    "session_30_count_raw",
    "cum_platform_event_raw",
    "cum_platform_time_raw",
    "cum_session_time_raw",
    "cum_session_event_raw",
    "date_time"
]

RL_STAT_COLS = [
    'session_size',
    'session_minutes',
    'size_cutoff',
    'time_cutoff',
    'reward'
]

PREDICTION_COLS = [
    "label",
    "pred"
]

LOAD_COLS = list(set(FEATURE_COLUMNS + METADATA + RL_STAT_COLS + PREDICTION_COLS))

In [6]:
# %load environment_exp_replay
# %load environment
# %load environment
import gym
import numpy as np
from scipy.stats import norm

MAX_EVAL_SIZE = 75

class CitizenScienceEnvReplay(gym.Env):
    
    metadata = {'render.modes': ['human']}
    
    def __init__(self, dataset, out_features, n_sequences, evaluation=False):
        """
        trajectories: dictionary of user_id to their respective trajectories.
        n_sequences: number of sequences used for preprocessing.
        n_features: number of features used for preprocessing.
        """
        super(CitizenScienceEnvReplay, self).__init__()
        self.dataset = dataset
        self.unique_sessions = self.dataset[['user_id', 'session_30_count_raw']].drop_duplicates()
        self.n_sequences = n_sequences
        self.current_session = None
        self.current_session_index = 0
        self.reward = 0
        self.n_sequences = n_sequences
        self.out_features = out_features
        
        max_session_size = self.dataset['session_size'].max()
        
        self.action_space = gym.spaces.Discrete(4)
        self.observation_space = gym.spaces.Dict({
            "observation": gym.spaces.Box(low=-1, high=max_session_size, shape=(len(out_features) + 3, n_sequences + 1), dtype=np.float32),
            "achieved_goal": gym.spaces.Discrete(max_session_size),
            "desired_goal": gym.spaces.Discrete(max_session_size)
        })
        self.evalution = evaluation
        self.episode_bins = []
        self.exp_runs = 0

    def reset(self):
        random_session = np.random.randint(0, self.unique_sessions.shape[0])
        
        user_to_run, session_to_run = self.unique_sessions.iloc[random_session][['user_id', 'session_30_count_raw']]
        self.current_session = self._get_events(user_to_run, session_to_run)
        self.metadata = self._metadata()
        self.current_session_index = 1
        self.reward = 0
        return self._state()
    
    def _row_to_dict(self, metadata):
        """
        Convert a row of metadata to a dictionary.
        """
        return metadata.to_dict()

    def compute_reward(self, achieved_goal, desired_goal, info=None):
        return np.exp(-np.linalg.norm(desired_goal - achieved_goal)) * ((achieved_goal - self.metadata['size_cutoff']) * (achieved_goal / self.metadata['size_cutoff']))
            

    def step(self, action):
        
        self._take_action(action)
            
        next_state, done, meta = self._calculate_next_state()
        
        
        if done:
            current_session_index = self.current_session_index if \
                self.current_session_index != self.current_session.shape[0] else self.current_session.shape[0] - 1
            
            self.exp_runs += 1
            self.metadata['ended_event'] = self.current_session.iloc[current_session_index]['cum_session_event_raw']
            self.metadata['ended_time'] = self.current_session.iloc[current_session_index]['cum_session_time_raw']
            self.metadata['exp_runs'] = self.exp_runs
            self.episode_bins.append(self._row_to_dict(self.metadata))
            
            self.metadata['ended_event'] = self.current_session.iloc[current_session_index]['cum_session_event_raw']
            self.metadata['ended_time'] = self.current_session.iloc[current_session_index]['cum_session_time_raw']
            self.metadata['exp_runs'] = self.exp_runs
            self.episode_bins.append(self._row_to_dict(self.metadata))
           
            cum_session_event_raw = self.current_session.iloc[current_session_index]['cum_session_event_raw']
            reward_exp = self.compute_reward(cum_session_event_raw, self.metadata['session_size'])
            
            return next_state, reward_exp , done, {}
        else:
            self.reward = self.current_session.iloc[self.current_session_index]['cum_session_event_raw']
            cum_session_event_raw = self.current_session.iloc[self.current_session_index]['cum_session_event_raw']
            
            reward_exp = self.compute_reward(cum_session_event_raw, self.metadata['session_size'])
    
            self.current_session_index += 1        
            
            return next_state, reward_exp, done, meta
    
    def _metadata(self):
        session_metadata = self.current_session.iloc[0][RL_STAT_COLS].copy()
        session_metadata['ended'] = 0
        for meta_col in ['small', 'medium', 'large']:
            session_metadata[f'inc_{meta_col}'] = 0
            session_metadata[f'time_{meta_col}'] = 0

        return session_metadata
    
    def flush_episode_bins(self):
        episode_bins = self.episode_bins.copy()
        self.episode_bins = []
        return episode_bins
    
    def _calculate_next_state(self):
        
        if (self.current_session_index == self.current_session.shape[0]):
            return None, True, {}

        if self._continuing_in_session():
            return self._state(), False, {}
    
        return None, True, {}
         
    def _continuing_in_session(self):
        event_cutoff = self.current_session.iloc[self.current_session_index]['size_cutoff']
        current_session_event = self.current_session.iloc[self.current_session_index]['cum_session_event_raw']
        if current_session_event <= event_cutoff or current_session_event  >= MAX_EVAL_SIZE:
            return True
    
        extending_low = self._probability_extending(current_session_event, self.metadata['inc_small']) - \
            (0.05 + np.random.normal(-0.02, 0.1, 100).mean())

            
        extending_medium = self._probability_extending(current_session_event, self.metadata['inc_medium']) - \
            (0.1 + np.random.normal(-0.02, 0.1, 100).mean()) 
            
        extending_large = self._probability_extending(current_session_event, self.metadata['inc_large']) + \
            (0.2 + np.random.normal(-0.02, 0.1, 100).mean())
            
        return any([
            extending_low > 0.4 and extending_low <= 0.75,
            extending_medium > 0.4 and extending_medium <= 0.75,
            extending_large > 0.4 and extending_large <= 0.75
        ])
        
           
    
    def _probability_extending(self, current_session_event, incentive_event):
        if incentive_event == 0:
            return 0
         
        continue_session = norm(
            loc=max(incentive_event, 1),
            scale=max(incentive_event *.75, 1)
        ).cdf(max(current_session_event, 1)) 
        
        return continue_session
        

    def _get_events(self, user_id, session):
        subset = self.dataset[
            (self.dataset['user_id'] == user_id) &
            (self.dataset['session_30_count_raw'] == session).copy()
        ]

        subset = subset.sort_values(by=['date_time'])
        return subset
    
    def _take_action(self, action):
        if action == 0:
            return 1
        
        current_session_index = self.current_session_index if \
            self.current_session_index != self.current_session.shape[0] else self.current_session.shape[0] - 1
    
        if action == 1:
            if self.metadata['inc_small'] > 0:
                return 1

            self.metadata['inc_small'] = self.current_session.iloc[current_session_index]['cum_session_event_raw']
            self.metadata['time_small'] = self.current_session.iloc[current_session_index]['cum_session_time_raw']
            return 1
    
        elif action == 2:
            if self.metadata['inc_medium'] > 0:
                return 1
            self.metadata['inc_medium'] = self.current_session.iloc[current_session_index]['cum_session_event_raw']
            self.metadata['time_medium'] = self.current_session.iloc[current_session_index]['cum_session_time_raw']
            return 1
        
        else:
            if self.metadata['inc_large'] > 0:
                return 1
            self.metadata['inc_large'] = self.current_session.iloc[current_session_index]['cum_session_event_raw']
            self.metadata['time_large'] = self.current_session.iloc[current_session_index]['cum_session_time_raw']
            return 1

    def _state(self):

        if self.current_session_index > self.n_sequences:
            events = self.current_session.iloc[self.current_session_index - (self.n_sequences + 1):self.current_session_index][self.out_features]
            events['inc_small'] = self.metadata['inc_small']
            events['inc_medium'] = self.metadata['inc_medium']
            events['inc_large'] = self.metadata['inc_large']
            
            events = events.values
            
            
        else:
            current_session_index = self.current_session_index if \
                self.current_session_index != self.current_session.shape[0] else self.current_session.shape[0] - 1
            
            delta = min((self.n_sequences + 1)- current_session_index, self.n_sequences)
            zero_cat = np.zeros((delta, len(self.out_features) + 3))
            events = self.current_session.iloc[:max(current_session_index, 1)][self.out_features]
            
            events['inc_small'] = self.metadata['inc_small']
            events['inc_medium'] = self.metadata['inc_medium']
            events['inc_large'] = self.metadata['inc_large']
            
            
            events = np.concatenate((zero_cat, events), axis=0)
    
        current_session_index = self.current_session_index if \
            self.current_session_index != self.current_session.shape[0] else self.current_session.shape[0] - 1
       
        return {
            'observation': events.astype(np.float32).T.copy(),
            'achieved_goal': self.current_session.iloc[current_session_index]['cum_session_event_raw'].astype(np.float32).copy(),
            'desired_goal': self.metadata['session_size'].astype(np.float32).copy()
        } 



In [7]:
# %load callback
import pandas as pd
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.logger import TensorBoardOutputFormat
from datetime import datetime

class DistributionCallback(BaseCallback):
    
    @classmethod
    def tensorboard_setup(cls, log_dir, log_freq):
        cls._log_dir = log_dir
        cls._log_freq = log_freq

    
    def _on_step(self) -> bool:
        if self.n_calls % self._log_freq == 0:
            dist_list = self.training_env.env_method('flush_episode_bins')
            values_to_log = [item for sublist in dist_list for item in sublist if len(sublist) > 0]

            values_df = pd.DataFrame(
                values_to_log
            )
            
            
            session_size, size_cutoff, session_minutes, time_cutoff, ended_event, ended_time = (
                values_df['session_size'].mean(),
                values_df['size_cutoff'].mean(),
                values_df['session_minutes'].mean(),
                values_df['time_cutoff'].mean(),
                values_df['ended_event'].mean(),
                values_df['ended_time'].mean(),
            )
            
            inc_index_small, inc_index_medium, inc_index_large = (
                values_df['inc_small'].mean(),
                values_df['inc_medium'].mean(),
                values_df['inc_large'].mean()
            )
            
            time_minutes_small, time_minutes_medium, time_minutes_large = (
                values_df['time_small'].mean(),
                values_df['time_medium'].mean(),
                values_df['time_large'].mean()
            )
            
            size_stats = {
                'session_size': session_size,
                'size_cutoff': size_cutoff,
                'ended_size': ended_event,
                'inc_small': inc_index_small,
                'inc_medium': inc_index_medium,
                'inc_large': inc_index_large,
            }
            
            
            time_stats = {
                'session_minutes': session_minutes,
                'time_cutoff': time_cutoff,
                'ended_time': ended_time,
                'time_small': time_minutes_small,
                'time_medium': time_minutes_medium,
                'time_large': time_minutes_large,
            }
            
            for key, value in size_stats.items():
                self.logger.record(f'size/{key}', value)
            
            for key, value in time_stats.items():
                self.logger.record(f'sess_time/{key}', value)
                
            values_df.to_csv(f'{self._log_dir}/{self.n_calls // self._log_freq}.csv')
            
        return True

In [8]:
# %load policies/cnn_policy

from typing import Dict, List, Type, Union

import gym
import torch
from gym import spaces
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.dqn.policies import DQNPolicy
import torch.nn.functional as F
from torch import nn
import logging
global logger
logger = logging.getLogger(__name__)

class CustomConv1dFeatures(BaseFeaturesExtractor):
    
    @classmethod
    def setup_sequences_features(cls, n_sequences, n_features):
        cls.n_sequences = n_sequences
        cls.n_features = n_features
        
    
    def __init__(self, observation_space: spaces.Dict, features_dim=24):
        super().__init__(observation_space, features_dim)
        
        
        self.cnn_1 = nn.Sequential(
            nn.Conv1d(self.n_features, self.n_features*2, kernel_size=3, padding=1),
            nn.BatchNorm1d(self.n_features*2),
            nn.ELU(),
            
            nn.Conv1d(self.n_features*2, self.n_features*2, kernel_size=3, padding=1),
            nn.BatchNorm1d(self.n_features*2),
            nn.ELU(),
            
            nn.Conv1d(self.n_features*2, self.n_features*2, kernel_size=3, padding=1),
            nn.BatchNorm1d(self.n_features*2),
            nn.ELU()
            
        )
        
        self.conv_1_reshape = nn.Conv1d(
            self.n_features,
            self.n_features*2,
            kernel_size=1,
            padding=0
        
        )
        
        self.a_pool_1 = nn.AvgPool1d(kernel_size=2, stride=2)
        
        self.cnn_bottleneck_wide = nn.Sequential(
            nn.Conv1d(self.n_features*2, self.n_features*4, kernel_size=3, padding=1),
            nn.BatchNorm1d(self.n_features*4),
            nn.ELU(),
            
            nn.Conv1d(self.n_features*4, self.n_features*4, kernel_size=3, padding=1),
            nn.BatchNorm1d(self.n_features*4),
            nn.ELU(),
            
            nn.Conv1d(self.n_features*4, self.n_features*4, kernel_size=3, padding=1),
            nn.BatchNorm1d(self.n_features*4),
            nn.ELU()   
        )
        
        self.conv_2_reshape = nn.Conv1d(
            self.n_features*2,
            self.n_features*4,
            kernel_size=1,
            padding=0
        )
        
        
        self.cnn_bottleneck_narrow = nn.Sequential(
            nn.Conv1d(self.n_features*4, self.n_features*2, kernel_size=3, padding=1),
            nn.BatchNorm1d(self.n_features*2),
            nn.ELU(),
            
            nn.Conv1d(self.n_features*2, self.n_features*2, kernel_size=3, padding=1),
            nn.BatchNorm1d(self.n_features*2),
            nn.ELU(),
            
            nn.Conv1d(self.n_features*2, self.n_features*2, kernel_size=3, padding=1),
            nn.BatchNorm1d(self.n_features*2),
            nn.ELU()
        )
        
        self.conv_3_reshape = nn.Conv1d(
            self.n_features*4,
            self.n_features*2,
            kernel_size=1,
            padding=0
        )
        
        self.downsample = nn.Sequential(
            nn.Conv1d(self.n_features*2, self.n_features, kernel_size=3, padding=1),
            nn.BatchNorm1d(self.n_features),
            nn.ELU(),
            
            nn.Conv1d(self.n_features, self.n_features, kernel_size=3, padding=1),
            nn.BatchNorm1d(self.n_features),
            nn.ELU(),
            
            nn.Conv1d(self.n_features, self.n_features, kernel_size=3, padding=1),
            nn.BatchNorm1d(self.n_features),
            nn.ELU()
        )
        
        self.conv_4_reshape = nn.Conv1d(
            self.n_features*2,
            self.n_features,
            kernel_size=1,
            padding=0
        )
                
        self.down_max = nn.Sequential(
            nn.Conv1d(self.n_features, self.n_features // 2, kernel_size=3, padding=1),
            nn.BatchNorm1d(self.n_features // 2),
            nn.ELU(),
            
            nn.Conv1d(self.n_features // 2, self.n_features // 2, kernel_size=3, padding=1),
            nn.BatchNorm1d(self.n_features // 2),
            nn.ELU(),
            
            nn.Conv1d(self.n_features // 2, self.n_features // 2, kernel_size=3, padding=1),
            nn.BatchNorm1d(self.n_features // 2),
            nn.ELU(),
        )
        
        
        self.mpool_flat = nn.Sequential(
            nn.MaxPool1d(kernel_size=2, stride=2),
            nn.Flatten()
        )
        self.down_max_reshape = nn.Conv1d(
            self.n_features,
            self.n_features // 2,
            kernel_size=1,
            padding=0
        )
        
        with torch.no_grad():
            sample_tensor = torch.zeros((1, self.n_features, self.n_sequences))
            sample_tensor = self.cnn_1(sample_tensor) + self.conv_1_reshape(sample_tensor)
            sample_tensor = self.a_pool_1(sample_tensor)
            sample_tensor = self.cnn_bottleneck_wide(sample_tensor) + self.conv_2_reshape(sample_tensor)
            sample_tensor = self.cnn_bottleneck_narrow(sample_tensor) + self.conv_3_reshape(sample_tensor)
            sample_tensor = self.downsample(sample_tensor) + self.conv_4_reshape(sample_tensor)
            sample_tensor = self.down_max(sample_tensor) + self.down_max_reshape(sample_tensor)
            mpool_flat_out = self.mpool_flat(sample_tensor)
            linear_in = mpool_flat_out.shape[1]
            self.final_out_linear = nn.Sequential(

                nn.Linear(linear_in, features_dim),
                nn.ELU()
            )

        



        

    def forward(self, obs):
        
        obs_tensor = obs['observation']  
        
        obs_cnn_1 = self.cnn_1(obs_tensor) + self.conv_1_reshape(obs_tensor)
        
        obs_cnn_1 = self.a_pool_1(obs_cnn_1)
        
        obs_cnn_2 = self.cnn_bottleneck_wide(obs_cnn_1) + self.conv_2_reshape(obs_cnn_1)
        
        obs_cnn_3 = self.cnn_bottleneck_narrow(obs_cnn_2) + self.conv_3_reshape(obs_cnn_2)
        
        obs_cnn_4 = self.downsample(obs_cnn_3) + self.conv_4_reshape(obs_cnn_3)
       
        obs_cnn_5 = self.down_max(obs_cnn_4) + self.down_max_reshape(obs_cnn_4)
        
        mpool_flat_out = self.mpool_flat(obs_cnn_5)
        
        return self.final_out_linear(mpool_flat_out)
        
        
        

        
        

In [9]:
# %load incentive_reinforcement_learning_cpu.py
import argparse
import logging
import os
from datetime import datetime
from functools import reduce
from pprint import pformat
from typing import Callable
import boto3
import random
import numpy as np
import pandas as pd
import torch
torch.backends.cudnn.enabled = False



from stable_baselines3 import A2C, DQN, PPO
from stable_baselines3.common.callbacks import (CallbackList,
                                                CheckpointCallback,
                                                StopTrainingOnMaxEpisodes)
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.vec_env import DummyVecEnv, VecMonitor, VecNormalize

import argparse
import logging
import os
from datetime import datetime
from functools import reduce
from pprint import pformat
from typing import Callable
import boto3
import random
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import argparse
import logging
import os
from datetime import datetime
from functools import reduce
from pprint import pformat
from typing import Callable
import boto3
import random
import numpy as np
import pandas as pd
import torch


from stable_baselines3 import A2C, DQN, PPO, HerReplayBuffer
from stable_baselines3.common.callbacks import (CallbackList,
                                                CheckpointCallback,
                                                StopTrainingOnMaxEpisodes)
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.vec_env import DummyVecEnv, VecMonitor, VecNormalize
from stable_baselines3.dqn.policies import DQNPolicy


import argparse
import logging
import os
from datetime import datetime
from functools import reduce
from pprint import pformat
from typing import Callable
import boto3
import random
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F

logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO)
np.set_printoptions(precision=4, linewidth=1000, suppress=True)
torch.set_printoptions(precision=4, linewidth=500, sci_mode=False)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_rows', 500)
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
import torch.nn as nn
global logger
logger = logging.getLogger('rl_exp_train')
logger.setLevel(logging.INFO)

S3_BASELINE_PATH = 's3://dissertation-data-dmiller/'
N_SEQUENCES = 15
CHECKPOINT_FREQ = 300_000
TB_LOG = 10_000
WINDOW = 1
REWARD_CLIP = 90
MIN_MAX_RANGE = (10, 90)
"""
Reward clip based on achieving maximum reward for 90 minute session at
(s / 45) * (s - 45)
"""

def linear_schedule(initial_value: float) -> Callable[[float], float]:
    """
    Linear learning rate schedule.

    :param initial_value: Initial learning rate.
    :return: schedule that computes
      current learning rate depending on remaining progress
    """
    def func(progress_remaining: float) -> float:
        """
        Progress will decrease from 1 (beginning) to 0.

        :param progress_remaining:
        :return: current learning rate
        """
        return progress_remaining * initial_value

    return func

def parse_args():
    parse = argparse.ArgumentParser()
    parse.add_argument('--read_path', type=str, default='rl_ready_data_conv')
    parse.add_argument('--n_files', type=int, default=2)
    parse.add_argument('--n_episodes', type=int, default=10_000)
    parse.add_argument('--lstm', type=str, default='label')
    parse.add_argument('--part', type=str, default='train')
    parse.add_argument('--feature_extractor', type=str, default='cnn') 
    parse.add_argument('--clip_engagement', type=bool, default=False)
    parse.add_argument('--her_env', type=bool, default=True)
    args = parse.parse_args()
    return args


def simplify_experiment(vectorized_df, clip_engagement=False):
    vectorized_df = [
        df[(df['session_size'] >= MIN_MAX_RANGE[0]) & (df['session_size'] <= MIN_MAX_RANGE[1])] for df in vectorized_df
    ]
    if clip_engagement:
        vectorized_df_clip = []
        for df in vectorized_df:
            df['pred'] = df['pred'].apply(lambda x: 1 if x > .5 else 0)
            vectorized_df_clip.append(df)
        vectorized_df = vectorized_df_clip

    return vectorized_df


def main(args):
   
    
    logger.info('Starting Incentive Reinforcement Learning')
    logger.info(pformat(args.__dict__))
    exec_time = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    
    read_path, n_files, n_episodes, lstm, part, feature_ext, clip_engage, her = (
        args.read_path, 
        args.n_files, 
        args.n_episodes, 
        args.lstm,
        args.part,
        args.feature_extractor,
        args.clip_engagement,
        args.her_env
    )

    base_read_path = os.path.join(read_path, f'files_used_{n_files}', f'window_{WINDOW}', f'batched_{part}')
    logger.info(f'Reading data from {base_read_path}')
    files= os.listdir(base_read_path)
    n_envs = len(files)
    logger.info(f'Files found: {len(files)} for environment vectorization')


    df_files = [
        pd.read_parquet(os.path.join(base_read_path, file), columns=LOAD_COLS)
        for file in files
    ]
   
    df_files = simplify_experiment(df_files, clip_engagement=clip_engage)

    n_envs = len(df_files)
    
    logger.info(f'Files used: {len(df_files)} for environment vectorization')
    
    out_features = FEATURE_COLUMNS + [lstm] if lstm else FEATURE_COLUMNS
    
    logger.info(f'Out features: {out_features}')
    if her:
        citizen_science_vec = DummyVecEnv([lambda: CitizenScienceEnvReplay(vec_df, out_features, N_SEQUENCES) for vec_df in df_files])
    else:
        citizen_science_vec =DummyVecEnv([lambda: CitizenScienceEnv(vec_df, out_features, N_SEQUENCES) for vec_df in df_files])
    

    monitor_train = VecMonitor(citizen_science_vec)
    
    logger.info(f'Vectorized environments created')

    base_exp_path = os.path.join('experiments', f'dqn_{lstm}_{feature_ext}_her_{her}/{exec_time}')


    tensorboard_dir, checkpoint_dir = (
        os.path.join(base_exp_path, 'training_metrics'),
        os.path.join(base_exp_path, 'checkpoints')
    )

    if not os.path.exists(tensorboard_dir):
        logger.info(f'Creating directory {tensorboard_dir} for tensorboard logs')
        os.makedirs(tensorboard_dir)
   
    if not os.path.exists(checkpoint_dir):
        logger.info(f'Creating directory {checkpoint_dir} for checkpoints')
        os.makedirs(checkpoint_dir) 

    checkpoint_freq = int(CHECKPOINT_FREQ // (n_envs // 2))
    log_freq = int(TB_LOG // n_envs)
    checkpoint_callback = CheckpointCallback(
        save_freq=checkpoint_freq,
        save_path=checkpoint_dir, 
        verbose=2
    )
    
    DistributionCallback.tensorboard_setup(tensorboard_dir, (TB_LOG * 5) // n_envs)
    logger_callback = DistributionCallback()
    
    callback_list = CallbackList([checkpoint_callback, logger_callback])
    
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    
    if feature_ext == 'cnn':
        CustomConv1dFeatures.setup_sequences_features(N_SEQUENCES + 1, len(out_features) + 3)
        logger.info('Using custom 1 dimensional CNN feature extractor')
        policy_kwargs = dict(
            features_extractor_class=CustomConv1dFeatures,
            net_arch=[12],
            normalize_images=False,
            activation_fn=nn.ELU,
            
        )
        model = DQN(
            policy='MultiInputPolicy', 
            env=monitor_train, 
            verbose=1, 
            replay_buffer_class=HerReplayBuffer,
            replay_buffer_kwargs=dict(
                n_sampled_goal=8,
                goal_selection_strategy='future',
            ),
                
            tensorboard_log=tensorboard_dir, 
            policy_kwargs=policy_kwargs, 
            device=device, 
            stats_window_size=1000)
    else:
        model = DQN(
            policy='MlpPolicy', 
            env=monitor_train, 
            verbose=1, 
            tensorboard_log=tensorboard_dir, 
            policy_kwargs=dict(
                activation_fn=nn.ELU,
                normalize_images=False,
            ),
            device=device, 
            stats_window_size=1000
        )
        
    logger.info(f'Model created: policy')
    
    logger.info(pformat(model.policy))
        
    logger.info(f'Beginning training') 
    
            
    logger.info(pformat([
        'n_episodes: {}'.format(n_episodes),
        'read_path: {}'.format(read_path),
        'n_files: {}'.format(n_files),
        'n_sequences: {}'.format(N_SEQUENCES),
        'n_envs: {}'.format(n_envs),
        'device: {}'.format(device),
        'lstm: {}'.format(lstm),
        'part: {}'.format(part),
        'feature_extractor: {}'.format(feature_ext),
        'tensorboard_dir: {}'.format(tensorboard_dir),
        'checkpoint_dir: {}'.format(checkpoint_dir),
        'checkpoint_freq: {}'.format(checkpoint_freq),
        'tb_freq: {}'.format(log_freq),
        'hindsight experience replay: {}'.format(her),
    ]))
    

    model.learn(total_timesteps=8_000_000, log_interval=log_freq, progress_bar=True, callback=callback_list)

    # model.learn(total_timesteps=n_episodes, log_interval=log_freq, progress_bar=True, callback=callback_list)
    # model.learn(total_timesteps=8_000_000, log_interval=log_freq, progress_bar=True, callback=callback_list)
    


In [10]:
class Argument:
    read_path = 'rl_ready_data_conv'
    n_files = 30
    n_episodes = 600_000
    lstm = 'pred'
    part = 'train'
    feature_extractor = 'cnn'
    clip_engagement = False
    her_env = True
    # penalize_losses = True
    # include_her = False
    # upper_prob_bound = 0.75
    # lower_prob_bound = 0.4
    # large_inc_effect = .2
    # mid_inc_effect = .1
    # small_inc_effect = .05
    

In [11]:

main(Argument)

06/16/2023 05:17:04 PM Starting Incentive Reinforcement Learning
06/16/2023 05:17:04 PM mappingproxy({'__dict__': <attribute '__dict__' of 'Argument' objects>,
              '__doc__': None,
              '__module__': '__main__',
              '__weakref__': <attribute '__weakref__' of 'Argument' objects>,
              'clip_engagement': False,
              'feature_extractor': 'cnn',
              'her_env': True,
              'lstm': 'pred',
              'n_episodes': 600000,
              'n_files': 30,
              'part': 'train',
              'read_path': 'rl_ready_data_conv'})
06/16/2023 05:17:04 PM Reading data from rl_ready_data_conv/files_used_30/window_1/batched_train
06/16/2023 05:17:04 PM Files found: 100 for environment vectorization
06/16/2023 05:17:06 PM Files used: 100 for environment vectorization
06/16/2023 05:17:06 PM Out features: ['user_count', 'project_count', 'country_count', 'date_hour_sin', 'date_hour_cos', 'date_minute_sin', 'date_minute_cos', 'session

Using cuda device


06/16/2023 05:17:08 PM Model created: policy
06/16/2023 05:17:08 PM MultiInputPolicy(
  (q_net): QNetwork(
    (features_extractor): CustomConv1dFeatures(
      (cnn_1): Sequential(
        (0): Conv1d(26, 52, kernel_size=(3,), stride=(1,), padding=(1,))
        (1): BatchNorm1d(52, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ELU(alpha=1.0)
        (3): Conv1d(52, 52, kernel_size=(3,), stride=(1,), padding=(1,))
        (4): BatchNorm1d(52, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (5): ELU(alpha=1.0)
        (6): Conv1d(52, 52, kernel_size=(3,), stride=(1,), padding=(1,))
        (7): BatchNorm1d(52, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (8): ELU(alpha=1.0)
      )
      (conv_1_reshape): Conv1d(26, 52, kernel_size=(1,), stride=(1,))
      (a_pool_1): AvgPool1d(kernel_size=(2,), stride=(2,), padding=(0,))
      (cnn_bottleneck_wide): Sequential(
        (0): Conv1d(52, 104, kernel_size=(3,

Logging to experiments/dqn_pred_cnn_her_True/2023-06-16_17-17-04/training_metrics/DQN_1


Output()

-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 9.9       |
|    ep_rew_mean      | 1.5896614 |
|    exploration_rate | 0.998     |
| time/               |           |
|    episodes         | 100       |
|    fps              | 291       |
|    time_elapsed     | 6         |
|    total_timesteps  | 1800      |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 11.7      |
|    ep_rew_mean      | 1.4117608 |
|    exploration_rate | 0.996     |
| time/               |           |
|    episodes         | 200       |
|    fps              | 335       |
|    time_elapsed     | 9         |
|    total_timesteps  | 3200      |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 12.6      |
|    ep_rew_mean      | 1.3962388 |
|    exploration_rate | 0.994     |
| time/               |           |
|    episodes         | 300       |
|    fps              | 357       |
|    time_elapsed     | 13        |
|    total_timesteps  | 4700      |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 13.2      |
|    ep_rew_mean      | 1.4577384 |
|    exploration_rate | 0.993     |
| time/               |           |
|    episodes         | 400       |
|    fps              | 369       |
|    time_elapsed     | 16        |
|    total_timesteps  | 6200      |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 13.4      |
|    ep_rew_mean      | 1.6109136 |
|    exploration_rate | 0.991     |
| time/               |           |
|    episodes         | 500       |
|    fps              | 379       |
|    time_elapsed     | 20        |
|    total_timesteps  | 7600      |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 13.5      |
|    ep_rew_mean      | 1.7063746 |
|    exploration_rate | 0.989     |
| time/               |           |
|    episodes         | 600       |
|    fps              | 385       |
|    time_elapsed     | 23        |
|    total_timesteps  | 9100      |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 13.8      |
|    ep_rew_mean      | 1.6374981 |
|    exploration_rate | 0.988     |
| time/               |           |
|    episodes         | 700       |
|    fps              | 382       |
|    time_elapsed     | 27        |
|    total_timesteps  | 10500     |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 13.8      |
|    ep_rew_mean      | 1.6321081 |
|    exploration_rate | 0.986     |
| time/               |           |
|    episodes         | 800       |
|    fps              | 387       |
|    time_elapsed     | 30        |
|    total_timesteps  | 12000     |
-----------------------------------


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 13.9     |
|    ep_rew_mean      | 1.625745 |
|    exploration_rate | 0.984    |
| time/               |          |
|    episodes         | 900      |
|    fps              | 390      |
|    time_elapsed     | 34       |
|    total_timesteps  | 13500    |
----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 14        |
|    ep_rew_mean      | 1.6328787 |
|    exploration_rate | 0.982     |
| time/               |           |
|    episodes         | 1000      |
|    fps              | 391       |
|    time_elapsed     | 37        |
|    total_timesteps  | 14800     |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 14.4      |
|    ep_rew_mean      | 1.6359079 |
|    exploration_rate | 0.98      |
| time/               |           |
|    episodes         | 1100      |
|    fps              | 395       |
|    time_elapsed     | 41        |
|    total_timesteps  | 16500     |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 14.7      |
|    ep_rew_mean      | 1.7262291 |
|    exploration_rate | 0.979     |
| time/               |           |
|    episodes         | 1200      |
|    fps              | 397       |
|    time_elapsed     | 45        |
|    total_timesteps  | 17900     |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 14.7      |
|    ep_rew_mean      | 1.7890383 |
|    exploration_rate | 0.977     |
| time/               |           |
|    episodes         | 1300      |
|    fps              | 399       |
|    time_elapsed     | 48        |
|    total_timesteps  | 19300     |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 14.6      |
|    ep_rew_mean      | 1.7605326 |
|    exploration_rate | 0.975     |
| time/               |           |
|    episodes         | 1400      |
|    fps              | 400       |
|    time_elapsed     | 51        |
|    total_timesteps  | 20700     |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 14.5      |
|    ep_rew_mean      | 1.7250196 |
|    exploration_rate | 0.974     |
| time/               |           |
|    episodes         | 1500      |
|    fps              | 401       |
|    time_elapsed     | 55        |
|    total_timesteps  | 22100     |
-----------------------------------


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 14.5     |
|    ep_rew_mean      | 1.600773 |
|    exploration_rate | 0.972    |
| time/               |          |
|    episodes         | 1600     |
|    fps              | 404      |
|    time_elapsed     | 58       |
|    total_timesteps  | 23700    |
----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 14.6      |
|    ep_rew_mean      | 1.5848128 |
|    exploration_rate | 0.97      |
| time/               |           |
|    episodes         | 1700      |
|    fps              | 405       |
|    time_elapsed     | 61        |
|    total_timesteps  | 25100     |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 14.6      |
|    ep_rew_mean      | 1.4954957 |
|    exploration_rate | 0.968     |
| time/               |           |
|    episodes         | 1800      |
|    fps              | 406       |
|    time_elapsed     | 65        |
|    total_timesteps  | 26700     |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 14.7      |
|    ep_rew_mean      | 1.4639564 |
|    exploration_rate | 0.967     |
| time/               |           |
|    episodes         | 1900      |
|    fps              | 407       |
|    time_elapsed     | 68        |
|    total_timesteps  | 28100     |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 14.7      |
|    ep_rew_mean      | 1.4018296 |
|    exploration_rate | 0.965     |
| time/               |           |
|    episodes         | 2000      |
|    fps              | 407       |
|    time_elapsed     | 72        |
|    total_timesteps  | 29700     |
-----------------------------------


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 14.8     |
|    ep_rew_mean      | 1.352055 |
|    exploration_rate | 0.963    |
| time/               |          |
|    episodes         | 2100     |
|    fps              | 408      |
|    time_elapsed     | 76       |
|    total_timesteps  | 31300    |
----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 14.7      |
|    ep_rew_mean      | 1.3232366 |
|    exploration_rate | 0.961     |
| time/               |           |
|    episodes         | 2200      |
|    fps              | 408       |
|    time_elapsed     | 79        |
|    total_timesteps  | 32600     |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 14.7      |
|    ep_rew_mean      | 1.3723215 |
|    exploration_rate | 0.96      |
| time/               |           |
|    episodes         | 2300      |
|    fps              | 408       |
|    time_elapsed     | 83        |
|    total_timesteps  | 34100     |
-----------------------------------


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 14.7     |
|    ep_rew_mean      | 1.315244 |
|    exploration_rate | 0.958    |
| time/               |          |
|    episodes         | 2400     |
|    fps              | 409      |
|    time_elapsed     | 86       |
|    total_timesteps  | 35400    |
----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 14.8      |
|    ep_rew_mean      | 1.2052026 |
|    exploration_rate | 0.956     |
| time/               |           |
|    episodes         | 2500      |
|    fps              | 410       |
|    time_elapsed     | 89        |
|    total_timesteps  | 36800     |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 14.8      |
|    ep_rew_mean      | 1.2877417 |
|    exploration_rate | 0.954     |
| time/               |           |
|    episodes         | 2600      |
|    fps              | 411       |
|    time_elapsed     | 93        |
|    total_timesteps  | 38400     |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 14.6      |
|    ep_rew_mean      | 1.3843764 |
|    exploration_rate | 0.953     |
| time/               |           |
|    episodes         | 2700      |
|    fps              | 411       |
|    time_elapsed     | 96        |
|    total_timesteps  | 39900     |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 14.6      |
|    ep_rew_mean      | 1.3807771 |
|    exploration_rate | 0.951     |
| time/               |           |
|    episodes         | 2800      |
|    fps              | 412       |
|    time_elapsed     | 100       |
|    total_timesteps  | 41300     |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 14.5      |
|    ep_rew_mean      | 1.3799155 |
|    exploration_rate | 0.949     |
| time/               |           |
|    episodes         | 2900      |
|    fps              | 411       |
|    time_elapsed     | 103       |
|    total_timesteps  | 42700     |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 14.6      |
|    ep_rew_mean      | 1.4422411 |
|    exploration_rate | 0.948     |
| time/               |           |
|    episodes         | 3000      |
|    fps              | 412       |
|    time_elapsed     | 107       |
|    total_timesteps  | 44200     |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 14.5      |
|    ep_rew_mean      | 1.4008111 |
|    exploration_rate | 0.946     |
| time/               |           |
|    episodes         | 3100      |
|    fps              | 413       |
|    time_elapsed     | 110       |
|    total_timesteps  | 45600     |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 14.5      |
|    ep_rew_mean      | 1.4122112 |
|    exploration_rate | 0.944     |
| time/               |           |
|    episodes         | 3200      |
|    fps              | 413       |
|    time_elapsed     | 114       |
|    total_timesteps  | 47100     |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 14.5      |
|    ep_rew_mean      | 1.2558182 |
|    exploration_rate | 0.942     |
| time/               |           |
|    episodes         | 3300      |
|    fps              | 414       |
|    time_elapsed     | 117       |
|    total_timesteps  | 48700     |
-----------------------------------


-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 14.5      |
|    ep_rew_mean      | 1.3992202 |
|    exploration_rate | 0.941     |
| time/               |           |
|    episodes         | 3400      |
|    fps              | 414       |
|    time_elapsed     | 120       |
|    total_timesteps  | 49900     |
-----------------------------------


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.