In [2]:
!python -m pip install python-dotenv --quiet
!python -m pip install gym stable-baselines3[extra] --quiet


[0m

In [1]:
%load_ext dotenv
%dotenv env

In [3]:
# %load rl_constant.py
LABEL = [
    "continue_work_session_30_minutes"
]

METADATA = [
    "user_id",
    "session_30_raw",
    "cum_platform_event_raw",
    "cum_platform_time_raw",
    "cum_session_time_raw",
    "cum_session_event_raw",
    "global_events_user",
    "global_session_time",
    "date_time",
]

OUT_FEATURE_COLUMNS = [
    "country_count", 
    "date_hour_sin", 
    "date_hour_cos",
    "date_minute_sin",
    "date_minute_cos",
    
    "session_30_count",
    "session_5_count",
    "cum_session_event_count",
    "delta_last_event",
    "cum_session_time",
    
    "expanding_click_average",
    "cum_platform_time",
    "cum_platform_events",
    "cum_projects",
    "average_event_time",
    
    "rolling_session_time",
    "rolling_session_events",
    "rolling_session_gap",
    "previous_session_time",
    "previous_session_events",
]

PREDICTION_COLS = [
    'seq_40',
]


GROUPBY_COLS = ['user_id']

RL_STAT_COLS = [
    'session_size',
    'sim_size',
    'session_minutes',
    'sim_minutes',
    'reward',
]

TORCH_LOAD_COLS = list(set(LABEL + METADATA + OUT_FEATURE_COLUMNS + RL_STAT_COLS))

In [4]:
# %load environment
import gym
import numpy as np
from scipy.stats import norm

import numpy as np
from scipy.stats import norm 
import gym
from datetime import datetime

class CitizenScienceEnv(gym.Env):
    
    metadata = {'render.modes': ['human']}
    
    def __init__(self, dataset, out_features, n_sequences, evaluation=False):
        """
        trajectories: dictionary of user_id to their respective trajectories.
        n_sequences: number of sequences used for preprocessing.
        n_features: number of features used for preprocessing.
        """
        super(CitizenScienceEnv, self).__init__()
        self.dataset = dataset
        self.n_sequences = n_sequences
        self.current_session = None
        self.current_session_index = 0
        self.reward = 0
        self.n_sequences = n_sequences
        self.out_features = out_features
        
        self.action_space = gym.spaces.Discrete(2)
        self.observation_space = gym.spaces.Box(low=-1, high=1, shape=(len(out_features), n_sequences + 1), dtype=np.float32)
        self.evalution = evaluation
        self.episode_bins = []

    def reset(self):
        user_to_run, session_to_run = self.dataset.sample(1)[['user_id', 'session_30_raw']].values[0]
        self.current_session = self._get_events(user_to_run, session_to_run)
        self.metadata = self._metadata()
        self.current_session_index = 0
        self.reward = 0
        return self._state()
    
    def _row_to_dict(self, metadata):
        """
        Convert a row of metadata to a dictionary.
        """
        return metadata.to_dict()

    def step(self, action):
        
        self._take_action(action)
        next_state, done, meta = self._calculate_next_state()
        
        if done:
            current_session_index = self.current_session_index if \
                self.current_session_index != self.current_session.shape[0] else self.current_session.shape[0] - 1
        
            self.metadata['ended'] = self.current_session.iloc[current_session_index]['cum_session_event_raw']
            self.metadata['reward'] = self.current_session.iloc[current_session_index]['reward']
            if self.evalution:
                self.episode_bins.append(self._row_to_dict(self.metadata))
            return next_state, float(self.reward), done, {}
        else:
            self.reward = self.current_session.iloc[self.current_session_index]['reward'] 
            self.current_session_index += 1        
            return next_state, float(self.reward), done, meta
    
    def _metadata(self):
        session_metadata = self.current_session.iloc[0][RL_STAT_COLS]
        session_metadata['ended'] = 0
        session_metadata['incentive_index'] = 0
        return session_metadata
    
    
    def _calculate_next_state(self):
        
        if (self.current_session_index == self.current_session.shape[0]):
            return None, True, {}

        if self._continuing_in_session():
            return self._state(), False, {}
    
        return None, True, {}
        
      
  
    def _continuing_in_session(self):
        sim_counts = self.metadata['sim_size']
        current_session_count = self.current_session.iloc[self.current_session_index]['cum_session_event_raw']
        if current_session_count < sim_counts:
            return True
        
        extending_session = self._probability_extending_session(current_session_count)
        
        return all([extending_session >= .3, extending_session <= .7])
        
    
    def _probability_extending_session(self, current_session_count):
        if self.metadata['incentive_index'] == 0:
            return 0
        
        scale = max(5, int(self.metadata['session_size'] / 4))
        continue_session = norm(
            loc=self.metadata['incentive_index'],
            scale=scale
        ).cdf(current_session_count)
        
        return continue_session
        

    def _get_events(self, user_id, session):
        subset = self.dataset[
            (self.dataset['user_id'] == user_id) &
            (self.dataset['session_30_raw'] == session)
        ]
        
        subset = subset.sort_values(by=['session_30_raw', 'cum_session_event_raw'])
       
        print(subset[['date_time', 'cum_session_event_raw', 'cum_session_time_raw', 'reward']]) 
        assert subset['cum_session_event_raw'].is_monotonic_increasing
        assert subset['cum_session_time_raw'].is_monotonic_increasing
        assert subset['reward'].is_monotonic_increasing
        return subset
    
    def _take_action(self, action):
        if action == 0 or self.metadata['incentive_index'] > 0:
            return
        
        current_session_index = self.current_session_index if \
            self.current_session_index != self.current_session.shape[0] else self.current_session.shape[0] - 1
        
        self.metadata['incentive_index'] = self.current_session.iloc[current_session_index]['cum_session_event_raw']
        self.metadata['incentive_time'] = self.current_session.iloc[current_session_index]['cum_session_time_raw']
        
    def _state(self):

        if self.current_session_index > self.n_sequences:
            events = self.current_session.iloc[self.current_session_index - (self.n_sequences + 1):self.current_session_index][self.out_features].values
            
        else:
            delta = min((self.n_sequences + 1)- self.current_session_index, self.n_sequences)
            zero_cat = np.zeros((delta, len(self.out_features)))
            events = self.current_session.iloc[:max(self.current_session_index, 1)][self.out_features].values
            events = np.concatenate((zero_cat, events), axis=0)
            

        return events.astype(np.float32).T

In [5]:
# %load policies/cnn_policy
# %load policies/cnn_policy
from typing import Dict, List, Type, Union

import gym
import torch
from gym import spaces
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.dqn.policies import DQNPolicy
from torch import nn


class CustomConv1dFeatures(BaseFeaturesExtractor):
    
    @classmethod
    def setup_sequences_features(cls, n_sequences, n_features):
        cls.n_sequences = n_sequences
        cls.n_features = n_features
        
    
    def __init__(self, observation_space: spaces.Box, features_dim=20):
        super().__init__(observation_space, features_dim)
        
        
        self.cnn_1 = nn.Sequential(
            nn.Conv1d(self.n_features, self.n_features*2, kernel_size=3, padding=1),
            nn.BatchNorm1d(self.n_features*2),
            nn.ReLU(),
            
            nn.Conv1d(self.n_features*2, self.n_features*2, kernel_size=3, padding=1),
            nn.BatchNorm1d(self.n_features*2),
            nn.ReLU(),
            
            nn.Conv1d(self.n_features*2, self.n_features*2, kernel_size=3, padding=1),
            nn.BatchNorm1d(self.n_features*2),
            nn.Conv1d(self.n_features*2, self.n_features*2, kernel_size=3, padding=1),
            
            nn.AvgPool1d(2)
        )
        
        self.cnn_2 = nn.Sequential(
            nn.Conv1d(self.n_features*2, self.n_features, kernel_size=3, padding=1),
            nn.BatchNorm1d(self.n_features),
            nn.ReLU(),
            
            nn.Conv1d(self.n_features, self.n_features, kernel_size=3, padding=1),
            nn.BatchNorm1d(self.n_features),
            nn.ReLU()
        )
        
        self.act = nn.Sequential(
            nn.MaxPool1d(2),
            nn.Flatten(),
        )
        
        with torch.no_grad():
            out_shape = self.act(self.cnn_2(self.cnn_1(torch.zeros((1, self.n_features, self.n_sequences))))).shape[1]
            self.linear = nn.Linear(out_shape, features_dim)
    
    def forward(self, obs):
        out = self.cnn_1(obs)
        out = self.cnn_2(out)
        out = self.act(out)
        return self.linear(out)


        

In [6]:
# %load incentive_reinforcement_learning_cpu.py
import argparse
import logging
import os
from datetime import datetime
from functools import reduce
from pprint import pformat
from typing import Callable
import random
import numpy as np
import pandas as pd
import torch
from stable_baselines3 import A2C, DQN, PPO
from stable_baselines3.common.callbacks import (CallbackList,
                                                CheckpointCallback,
                                                StopTrainingOnMaxEpisodes)
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.vec_env import DummyVecEnv, VecMonitor
from stable_baselines3.dqn.policies import DQNPolicy



logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO)
np.set_printoptions(precision=4, linewidth=200, suppress=True)
torch.set_printoptions(precision=2, linewidth=200, sci_mode=False)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_rows', 500)
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)


def parse_args():
    parse = argparse.ArgumentParser()
    parse.add_argument('--read_path', type=str, default='rl_ready_data_conv')
    parse.add_argument('--n_files', type=int, default=2)
    parse.add_argument('--n_episodes', type=int, default=50)
    parse.add_argument('--n_envs', type=int, default=100)
    parse.add_argument('--lstm', type=str, default='seq_10')
    parse.add_argument('--part', type=str, default='train')
    parse.add_argument('--feature_extractor', type=str, default='cnn') 
    args = parse.parse_args()
    return args

def _lstm_loader(lstm):
    
    return LABEL[0] if lstm == 'label' else lstm

def load_and_dedupe(read_path, cols):
    df = pd.read_parquet(read_path, columns=cols)
    df = df.reset_index(drop=True)
    return df


S3_BASELINE_PATH = 's3://dissertation-data-dmiller'
N_SEQUENCES = 40
CHECKPOINT_FREQ = 100_000
TB_LOG = 10_000
WINDOW = 2


def main(args):
    
    exec_time = datetime.now().strftime("%Y-%m-%d-%H-%M")
    logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO)
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    
    logger.info('Starting Incentive Reinforcement Learning')
    logger.info(pformat(args.__dict__))
    
    read_path, n_files, n_episodes, n_envs, lstm, part, feature_ext = (
        args.read_path, 
        args.n_files, 
        args.n_episodes, 
        args.n_envs,
        args.lstm,
        args.part,
        args.feature_extractor,
    )
    
    read_path = os.path.join(
        'rl_ready_data_conv',
        f'files_used_{n_files}',
        f'window_{WINDOW}_{part}.parquet'
    )
    
    if not os.path.exists(read_path):
        raise ValueError(f'No data found at {read_path}')
    
    logger.info(f'Reading data from {read_path}')
    load_cols, out_features = (TORCH_LOAD_COLS + [lstm] if lstm else TORCH_LOAD_COLS, OUT_FEATURE_COLUMNS + [lstm] if lstm else OUT_FEATURE_COLUMNS)

    df = load_and_dedupe(read_path, load_cols)
    
    df = df.sort_values(by=['session_30_raw', 'cum_session_event_raw'])
    
    
    if not os.path.exists(read_path):
        raise ValueError(f'No data found at {read_path}')
 
    lstm = _lstm_loader(lstm)
     
    logger.info(f'Loaded data with shape {df.shape}')
    logger.info(f'Setting up convolution over {WINDOW}T minutes')
 
    citizen_science_vec =DummyVecEnv([lambda: CitizenScienceEnv(df, out_features, N_SEQUENCES) for i in range(n_envs)])
    logger.info(f'Vectorized environments created')
    
    base_path = os.path.join(
        S3_BASELINE_PATH,
        'reinforcement_learning_incentives_3',
        f'n_files_{n_files}',
        feature_ext + '_' + 'label' if lstm.startswith('continue') else feature_ext + f'_{lstm}',
        'results',
        exec_time,
    ) 
    
    
    tensorboard_dir, checkpoint_dir = (
        os.path.join(base_path, 'training_metrics'),
        os.path.join(base_path, 'checkpoints')
    )

    logger.info(f'Creating callbacks, monitors and loggerss')
    callback_max_episodes = StopTrainingOnMaxEpisodes(max_episodes=n_episodes, verbose=1)
    checkpoint_callback = CheckpointCallback(save_freq=CHECKPOINT_FREQ// (n_envs // 2), save_path=checkpoint_dir, name_prefix='rl_model')
    callback_list = CallbackList([callback_max_episodes, checkpoint_callback])
    monitor_train = VecMonitor(citizen_science_vec)
    
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    
    if feature_ext == 'cnn':
        CustomConv1dFeatures.setup_sequences_features(N_SEQUENCES + 1,len(out_features))
        logger.info('Using custom 1 dimensional CNN feature extractor')
        policy_kwargs = dict(
            features_extractor_class=CustomConv1dFeatures,
            net_arch=[10]
        )
        model = DQN(policy='CnnPolicy', env=monitor_train, verbose=1, tensorboard_log=tensorboard_dir, policy_kwargs=policy_kwargs, device=device, stats_window_size=1000)
    else:
        logger.info('Using default MLP feature extractor')
        model = DQN(policy='MlpPolicy', env=monitor_train, verbose=1, tensorboard_log=tensorboard_dir, device=device, stats_window_size=1000)
    
    logger.info(f'Model created: policy')
    
    logger.info(pformat(model.policy))
        
    logger.info(f'Beginning training') 
            
    logger.info(pformat([
        'n_episodes: {}'.format(n_episodes),
        'read_path: {}'.format(read_path),
        'n_files: {}'.format(n_files),
        'n_sequences: {}'.format(N_SEQUENCES),
        'n_envs: {}'.format(n_envs),
        'total_timesteps: {}'.format(df.shape),
        'device: {}'.format(device),
        'tensorboard_dir: {}'.format(tensorboard_dir),
        'checkpoint_dir: {}'.format(checkpoint_dir)
    ]))
    
    model.learn(total_timesteps=25_000_000, progress_bar=True, log_interval=TB_LOG, callback=callback_list)


In [7]:
class Argument:
    read_path = 'rl_ready_data'
    n_files = 30
    n_episodes = 500_000
    n_envs = 1000
    lstm = 'seq_40'
    part = 'train'
    feature_extractor = 'cnn'

In [8]:

main(Argument)

05/30/2023 09:37:37 AM Starting Incentive Reinforcement Learning
05/30/2023 09:37:37 AM mappingproxy({'__dict__': <attribute '__dict__' of 'Argument' objects>,
              '__doc__': None,
              '__module__': '__main__',
              '__weakref__': <attribute '__weakref__' of 'Argument' objects>,
              'feature_extractor': 'cnn',
              'lstm': 'seq_40',
              'n_envs': 1000,
              'n_episodes': 500000,
              'n_files': 30,
              'part': 'train',
              'read_path': 'rl_ready_data'})
05/30/2023 09:37:37 AM Reading data from rl_ready_data_conv/files_used_30/window_2_train.parquet
05/30/2023 09:37:38 AM Loaded data with shape (1048575, 36)
05/30/2023 09:37:38 AM Setting up convolution over 2T minutes
05/30/2023 09:37:38 AM Vectorized environments created
05/30/2023 09:37:38 AM Creating callbacks, monitors and loggerss
05/30/2023 09:37:38 AM Using custom 1 dimensional CNN feature extractor


Using cuda device


05/30/2023 09:37:39 AM Model created: policy
05/30/2023 09:37:39 AM CnnPolicy(
  (q_net): QNetwork(
    (features_extractor): CustomConv1dFeatures(
      (cnn_1): Sequential(
        (0): Conv1d(21, 42, kernel_size=(3,), stride=(1,), padding=(1,))
        (1): BatchNorm1d(42, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU()
        (3): Conv1d(42, 42, kernel_size=(3,), stride=(1,), padding=(1,))
        (4): BatchNorm1d(42, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (5): ReLU()
        (6): Conv1d(42, 42, kernel_size=(3,), stride=(1,), padding=(1,))
        (7): BatchNorm1d(42, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (8): Conv1d(42, 42, kernel_size=(3,), stride=(1,), padding=(1,))
        (9): AvgPool1d(kernel_size=(2,), stride=(2,), padding=(0,))
      )
      (cnn_2): Sequential(
        (0): Conv1d(42, 21, kernel_size=(3,), stride=(1,), padding=(1,))
        (1): BatchNorm1d(21, eps=1e-05

                 date_time  cum_session_event_raw  cum_session_time_raw     reward
470565 2020-12-12 16:34:57                      7              5.882812   5.882812
460245 2020-12-12 16:38:35                     14              9.515625   9.515625
459417 2020-12-12 16:40:26                     18             11.351562  11.351562
461097 2020-12-12 16:42:50                     20             13.750000  13.750000
468471 2020-12-12 16:44:52                     24             15.781250  15.781250
460546 2020-12-12 16:46:32                     31             17.468750  17.468750
460776 2020-12-12 16:48:36                     37             19.531250  19.531250
462194 2020-12-12 16:50:52                     45             21.796875  21.796875
471020 2020-12-12 16:52:07                     51             23.046875  23.046875
461800 2020-12-12 16:54:50                     60             25.765625  25.765625
466696 2020-12-12 16:56:43                     68             27.656250  27.656250
    

AssertionError: 