In [46]:
!python -m pip install torch --quiet
!python -m pip install gym stable-baselines3[extra] python-dotenv fsspec["s3"] s3fs==2022.11.0 --quiet

[0m

In [47]:
%load_ext dotenv
%dotenv env

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [48]:
LABEL = [
    "session_terminates_30_minutes"
]

METADATA = [
    "user_id",
    "session_30_raw",
    
    "cum_session_event_raw",
    "cum_session_time_raw",
    
    "cum_platform_event_raw",
    "global_events_user",
    "global_session_time_minutes",
]

DATE_TIME = [
    "date_time",
]

OUT_FEATURE_COLUMNS = [
    "country_count",
    "timestamp_raw",
    "date_hour_sin",
    
    "date_hour_cos",
    "session_5_count",
    "session_30_count",
    
    "cum_session_event_count",
    "delta_last_event",
    "cum_session_time_minutes",
    
    "expanding_click_average",
    "cum_platform_time_minutes",
    "cum_platform_events",
    
    "cum_projects",
    "average_event_time",
    "rolling_session_time",
    
    "rolling_session_events",
    "rolling_session_gap",
    "session_event_count",
]

In [49]:
import numpy as np
import torch
torch.set_printoptions(precision=4, linewidth=200, sci_mode=False)
np.set_printoptions(precision=4, linewidth=200, suppress=True)

USER_INDEX = 1
SESSION_INDEX = 2
TIMESTAMP_INDEX = 11
TRAIN_SPLIT = 0.7
EVAL_SPLIT = 0.15
TORCH_LOAD_COLS = LABEL + METADATA + DATE_TIME + OUT_FEATURE_COLUMNS

In [50]:
# %load callback
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.logger import TensorBoardOutputFormat
import numpy as np
import pandas as pd
class DistributionCallback(BaseCallback):

    def _on_training_start(self) -> None:
        self._log_freq = 100
        output_formats = self.logger.output_formats
        self.tb_formatter = next(f for f in output_formats if isinstance(f, TensorBoardOutputFormat))
    
    def _on_step(self) -> bool:
        if self.n_calls % self._log_freq == 0:
            dist_list = self.training_env.env_method('dists')
            values_to_log = np.concatenate([d for d in dist_list if d.shape[0] > 0])

            session_size, sim_size, session_minutes, ended, incentive_index, reward = (
                values_to_log[:, 0],
                values_to_log[:, 1],
                values_to_log[:, 2],
                values_to_log[:, 3],
                values_to_log[:, 4],
                values_to_log[:, 5]
            )
            
            dist_session_time = (session_minutes - reward).mean()
            dist_session_end = (session_size - ended).mean()
            dist_incentive_session = (session_size - incentive_index).mean()
            dist_incentive_end = (ended - incentive_index).mean()
            n_call = self.n_calls / 100
            
            self.tb_formatter.writer.add_scalar('time/session_time', dist_session_time, n_call)
            self.tb_formatter.writer.add_scalar('event/session_end', dist_session_end, n_call)
            self.tb_formatter.writer.add_scalar('event/incentive_session', dist_incentive_session, n_call)
            self.tb_formatter.writer.add_scalar('event/incentive_end', dist_incentive_end, n_call)
            
            self.tb_formatter.writer.flush()
        return True

In [51]:
# %load environment
import numpy as np
from scipy.stats import norm 

METADATA_STAT_COLUMNS = [
    'session_size',
    'sim_size',
    'session_minutes',
    'ended',
    'incentive_index',
    'reward'
]

import gym

class CitizenScienceEnv(gym.Env):
    
    metadata = {'render.modes': ['human']}
    
    def __init__(self, dataset, session_ranges, n_sequences):
        """
        trajectories: dictionary of user_id to their respective trajectories.
        n_sequences: number of sequences used for preprocessing.
        n_features: number of features used for preprocessing.
        """
        super(CitizenScienceEnv, self).__init__()
        self.dataset = dataset
        self.session_ranges = session_ranges
        self.n_sequences = n_sequences
        self.current_session = None
        self.current_session_index = 0
        self.reward = 0
        self.metadata_container = []

        self.action_space = gym.spaces.Discrete(2)
        self.observation_space = gym.spaces.Box(low=0, high=1, shape=(n_sequences + 1, 18), dtype=np.float16)
        self.n_sequences = n_sequences

    def reset(self):
        session_id = np.random.choice(self.session_ranges)
        self.current_session = self._get_events(session_id)
        self.metadata = self._metadata()
        self.current_session_index = 1
        self.reward = 0
        return self._state()

    def step(self, action):
        self._take_action(action)
        next_state, done, meta = self._calculate_next_state()
        if done:
            self.metadata['ended'] = self.current_session_index
            self.metadata['reward'] = self.reward
            self.metadata_container.append(self.metadata[METADATA_STAT_COLUMNS].values)
            return next_state, self.reward, done, meta
        self.reward += (self.current_session.iloc[self.current_session_index]['reward'] / 60)
        self.current_session_index += 1        
        return next_state, self.reward, done, meta
    
    def _metadata(self):
        session_metadata = self.current_session.iloc[0][['user_id', 'session_30_raw', 'session_size', 'sim_size', 'session_minutes']]
        session_metadata['ended'] = 0
        session_metadata['incentive_index'] = 0
        session_metadata['reward'] = 0
        return session_metadata
    
    
    def _calculate_next_state(self):
        
        if self.current_session_index == self.current_session.shape[0]:
            return None, True, {}
        
        if self._continuing_in_session():
            return self._state(), False, {}
      
        return None, True, {}
  
    def _continuing_in_session(self):
        sim_counts = self.metadata['sim_size']
        if self.current_session_index < sim_counts:
            return True
        
        extending_session = self._probability_extending_session()
        
        return all([extending_session >= .3, extending_session <= .8])
        
    
    def _probability_extending_session(self):
        if self.metadata['incentive_index'] == 0:
            return 0
        
        scale = max(5, int(self.metadata['session_size'] / 4))
        continue_session = norm(
            loc=self.metadata['incentive_index'],
            scale=scale
        ).cdf(self.current_session_index)
        
        return continue_session
        

    def _get_events(self, session_id):
        subset = self.dataset[self.dataset['session_30_raw'] == session_id]
        subset_user = subset['user_id'].sample(1).values[0]
        subset = subset[subset['user_id'] == subset_user]
        return subset.sort_values('cum_session_event_raw').reset_index(drop=True)
    
    def _take_action(self, action):
        if action == 0 or self.metadata['incentive_index'] > 0:
            return
        
        self.metadata['incentive_index'] = self.current_session_index
        
    def _state(self):

        if self.current_session_index > self.n_sequences:
            events = self.current_session.iloc[self.current_session_index - (self.n_sequences + 1):self.current_session_index][OUT_FEATURE_COLUMNS].values
            
        else:
            delta = (self.n_sequences + 1)- self.current_session_index
            zero_cat = np.zeros((delta, len(OUT_FEATURE_COLUMNS)))
            events = self.current_session.iloc[:max(self.current_session_index, 1)][OUT_FEATURE_COLUMNS].values
            events = np.concatenate((zero_cat, events), axis=0)
            

        return events.astype(np.float32)
  
    
    def dists(self):
        metadata_container = self.metadata_container.copy()
        self.metadata_container = []
        return np.array(metadata_container)


In [52]:
from stable_baselines3.common.callbacks import EveryNTimesteps, BaseCallback, EvalCallback
from stable_baselines3.common.logger import TensorBoardOutputFormat
from stable_baselines3.common.logger import Figure
import numpy as np
import pandas as pd
from torch.utils.tensorboard import SummaryWriter
class DistributionCallback(BaseCallback):

    def _on_training_start(self) -> None:
        self._log_freq = 20
        output_formats = self.logger.output_formats
        self.tb_formatter = next(f for f in output_formats if isinstance(f, TensorBoardOutputFormat))
    
    def _on_step(self) -> bool:
        if self.n_calls % self._log_freq == 0:
            dist_list = self.training_env.env_method('dists')
            values_to_log = np.concatenate([d for d in dist_list if d.shape[0] > 0])

            session_size, sim_size, session_minutes, ended, incentive_index, reward = (
                values_to_log[:, 0],
                values_to_log[:, 1],
                values_to_log[:, 2],
                values_to_log[:, 3],
                values_to_log[:, 4],
                values_to_log[:, 5]
            )
            
            dist_session_time = (session_minutes - reward).mean()
            dist_session_end = (session_size - ended).mean()
            dist_incentive_session = (session_size - incentive_index).mean()
            dist_incentive_end = (ended - incentive_index).mean()
            n_call = self.n_calls / 20
            
            self.tb_formatter.writer.add_scalar('event/dist_sess_time', dist_session_time, n_call)
            self.tb_formatter.writer.add_scalar('event/dist_sess_end', dist_session_end, n_call)
            self.tb_formatter.writer.add_scalar('event/dist_incentive_sess_size', dist_incentive_session, n_call)
            self.tb_formatter.writer.add_scalar('event/dist_incentive_sess_ended', dist_incentive_end, n_call)
            
            self.tb_formatter.writer.flush()
        return True

In [53]:
# %load incentive_reinforcement_learning_cpu.py
import numpy as np
import torch
torch.set_printoptions(precision=4, linewidth=200, sci_mode=False)
np.set_printoptions(precision=4, linewidth=200, suppress=True)
from stable_baselines3.common.callbacks import CallbackList, StopTrainingOnMaxEpisodes
from stable_baselines3 import A2C
import logging
USER_INDEX = 1
SESSION_INDEX = 2
CUM_SESSION_EVENT_RAW = 3
TIMESTAMP_INDEX = 11
TRAIN_SPLIT = 0.7
EVAL_SPLIT = 0.15
import pandas as pd
import cudf as gpu_pd
from stable_baselines3.common.vec_env import DummyVecEnv
from datetime import datetime
from stable_baselines3.common.vec_env import VecMonitor
from pprint import pformat
import os
from cuml.preprocessing import MinMaxScalerGPU
from sklearn.preprocessing import MinMaxScaler

logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO)
np.set_printoptions(precision=4, linewidth=200, suppress=True)
torch.set_printoptions(precision=2, linewidth=200, sci_mode=False)


S3_BASELINE_PATH = 's3://dissertation-data-dmiller'


def train_eval_split(dataset, logger):
    train_split = int(dataset.shape[0] * TRAIN_SPLIT)
    eval_split = int(dataset.shape[0] * EVAL_SPLIT)
    test_split = dataset.shape[0] - train_split - eval_split
    logger.info(f'Train size: 0:{train_split}, eval size: {train_split}:{train_split+eval_split}: test size: {train_split + eval_split}:{dataset.shape[0]}')
    train_dataset, eval_dataset, test_split = dataset[:train_split], dataset[train_split:train_split+eval_split], dataset[train_split+eval_split:]
    
    return {
        'train': train_dataset,
        'eval': eval_dataset,
        'test': test_split
    }

def generate_metadata(dataset):
    
    session_size = dataset.groupby(['user_id', 'session_30_raw']).size().reset_index(name='session_size')
    session_minutes = dataset.groupby(['user_id', 'session_30_raw'])['cum_session_time_raw'].max().reset_index(name='session_minutes')
    session_size['sim_size'] = (session_size['session_size'] * .75).astype(int).apply(lambda x: x if x > 1 else 1)
    dataset = dataset.merge(session_size, on=['user_id', 'session_30_raw'])
    dataset = dataset.merge(session_minutes, on=['user_id', 'session_30_raw'])
    return dataset
    



def run_reinforcement_learning_incentives(environment, logger, n_episodes=1):
    for epoch in range(n_episodes):
        environment_comp = False
        state = environment.reset()
        i = 0
        while not environment_comp:
            next_action = (
                1 if np.random.uniform(low=0, high=1) > 0.8 else 0
            )
            state, rewards, environment_comp, meta = environment.step(next_action)
            i +=1
            if i % 100 == 0:
                logger.info(f'Step: {i} - Reward: {rewards}')
                
        logger.info(f'Epoch: {epoch} - Reward: {rewards}')
        print(environment.user_sessions.head(10))

    

def main(args):
    
    exec_time = datetime.now().strftime("%Y-%m-%d-%H-%M")
    logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO)
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    
    logger.info('Starting Incentive Reinforcement Learning')
    
    read_path, n_files, n_sequences, n_episodes, device, n_envs = (
        args.read_path, 
        args.n_files, 
        args.n_sequences, 
        args.n_episodes, 
        args.device,
        args.n_envs
    )
    
    logger.info(f'Reading data from {read_path}_{n_files}')
    df = gpu_pd.read_parquet(f'{read_path}_{n_files}', columns=TORCH_LOAD_COLS)
    df['date_time'] = gpu_pd.to_datetime(df['date_time'])
    df = df.sort_values(by=['date_time'])
    logger.info('Data read: generating metadata')
    df['reward'] = df['delta_last_event']
    df = generate_metadata(df)
    df[OUT_FEATURE_COLUMNS] = MinMaxScalerGPU().fit_transform(df[OUT_FEATURE_COLUMNS])
    df[OUT_FEATURE_COLUMNS] = df[OUT_FEATURE_COLUMNS].astype(np.float16)
    df = df.to_pandas()
    max_session = df['session_30_raw'].max()
    unique_episodes = df[['user_id', 'session_30_raw']].drop_duplicates().shape[0]
    session_ranges = np.arange(1, max_session + 1)
    logger.info(f'Metadata generated: instantiating environment: session_ranges: 1 => {max_session}')
    logger.info(f'Parralelizing environment with {n_envs} environments')

    citizen_science_vec = DummyVecEnv([lambda: CitizenScienceEnv(df, session_ranges, n_sequences) for _ in range(args.n_envs)])
    callback_max_episodes = StopTrainingOnMaxEpisodes(max_episodes=1000, verbose=1)
    dist_callback = DistributionCallback()
    callback_list = CallbackList([callback_max_episodes, dist_callback])
    monitor_train = VecMonitor(citizen_science_vec)
    logger.info(f'Vectorized environments created, wrapping with monitor')

    base_path = os.path.join(
        S3_BASELINE_PATH,
        'reinforcement_learning_incentives',
        f'n_files_{n_files}',
        'results',
        exec_time,
    ) 
    
    tensorboard_dir, checkpoint_dir = (
        os.path.join(base_path, 'training_metrics'),
        os.path.join(base_path, 'checkpoints')
    )
    
    model = A2C("MlpPolicy", monitor_train, verbose=2, tensorboard_log=tensorboard_dir)
            
    logger.info(pformat([
        'n_epochs: {}'.format(n_episodes),
        'read_path: {}'.format(read_path),
        'n_files: {}'.format(n_files),
        'n_sequences: {}'.format(n_sequences),
        'n_envs: {}'.format(n_envs),
        'total_timesteps: {}'.format(df.shape),
        f'unique_episodes: {unique_episodes}',
        'device: {}'.format(device),
        'tensorboard_dir: {}'.format(tensorboard_dir),
        'checkpoint_dir: {}'.format(checkpoint_dir)
    ]))


    model.learn(total_timesteps=1_000_000, progress_bar=True, log_interval=10, callback=callback_list)


In [54]:
class Argument:
    read_path = 'calculated_features/files_used'
    n_files = 10
    n_sequences = 10
    n_episodes = 20
    n_envs = 500
    device = 'cpu'

In [55]:

main(Argument)

04/19/2023 08:45:44 AM Starting Incentive Reinforcement Learning
04/19/2023 08:45:44 AM Reading data from calculated_features/files_used_10
04/19/2023 08:45:54 AM Data read: generating metadata
04/19/2023 08:46:38 AM Metadata generated: instantiating environment: session_ranges: 1 => 458
04/19/2023 08:46:38 AM Parralelizing environment with 500 environments
04/19/2023 08:46:38 AM Vectorized environments created, wrapping with monitor
04/19/2023 08:46:38 AM ['n_epochs: 20',
 'read_path: calculated_features/files_used',
 'n_files: 10',
 'n_sequences: 10',
 'n_envs: 500',
 'total_timesteps: (12833662, 31)',
 'unique_episodes: 223297',
 'device: cpu',
 'tensorboard_dir: '
 's3://dissertation-data-dmiller/reinforcement_learning_incentives/n_files_10/results/2023-04-19-08-45/training_metrics',
 'checkpoint_dir: '
 's3://dissertation-data-dmiller/reinforcement_learning_incentives/n_files_10/results/2023-04-19-08-45/checkpoints']


Using cuda device
Logging to s3://dissertation-data-dmiller/reinforcement_learning_incentives/n_files_10/results/2023-04-19-08-45/training_metrics/A2C_1


Output()

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 28.4     |
|    ep_rew_mean        | 340.3898 |
| time/                 |          |
|    fps                | 611      |
|    iterations         | 10       |
|    time_elapsed       | 40       |
|    total_timesteps    | 25000    |
| train/                |          |
|    entropy_loss       | -0.692   |
|    explained_variance | 0.00362  |
|    learning_rate      | 0.0007   |
|    n_updates          | 9        |
|    policy_loss        | 27.5     |
|    value_loss         | 5.02e+03 |
------------------------------------


-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 68.2      |
|    ep_rew_mean        | 1367.9648 |
| time/                 |           |
|    fps                | 724       |
|    iterations         | 20        |
|    time_elapsed       | 69        |
|    total_timesteps    | 50000     |
| train/                |           |
|    entropy_loss       | -0.667    |
|    explained_variance | 0.00203   |
|    learning_rate      | 0.0007    |
|    n_updates          | 19        |
|    policy_loss        | 45.2      |
|    value_loss         | 1.27e+04  |
-------------------------------------


-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 103       |
|    ep_rew_mean        | 2342.6482 |
| time/                 |           |
|    fps                | 773       |
|    iterations         | 30        |
|    time_elapsed       | 96        |
|    total_timesteps    | 75000     |
| train/                |           |
|    entropy_loss       | -0.673    |
|    explained_variance | 0.00129   |
|    learning_rate      | 0.0007    |
|    n_updates          | 29        |
|    policy_loss        | 53.1      |
|    value_loss         | 1.68e+04  |
-------------------------------------


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 111      |
|    ep_rew_mean        | 2833.625 |
| time/                 |          |
|    fps                | 798      |
|    iterations         | 40       |
|    time_elapsed       | 125      |
|    total_timesteps    | 100000   |
| train/                |          |
|    entropy_loss       | -0.691   |
|    explained_variance | 0.00112  |
|    learning_rate      | 0.0007   |
|    n_updates          | 39       |
|    policy_loss        | 60.1     |
|    value_loss         | 1.92e+04 |
------------------------------------


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 145      |
|    ep_rew_mean        | 4265.547 |
| time/                 |          |
|    fps                | 818      |
|    iterations         | 50       |
|    time_elapsed       | 152      |
|    total_timesteps    | 125000   |
| train/                |          |
|    entropy_loss       | -0.682   |
|    explained_variance | 0.00132  |
|    learning_rate      | 0.0007   |
|    n_updates          | 49       |
|    policy_loss        | 64.8     |
|    value_loss         | 2.34e+04 |
------------------------------------


-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 138       |
|    ep_rew_mean        | 3562.1274 |
| time/                 |           |
|    fps                | 831       |
|    iterations         | 60        |
|    time_elapsed       | 180       |
|    total_timesteps    | 150000    |
| train/                |           |
|    entropy_loss       | -0.692    |
|    explained_variance | 0.000811  |
|    learning_rate      | 0.0007    |
|    n_updates          | 59        |
|    policy_loss        | 71.5      |
|    value_loss         | 2.89e+04  |
-------------------------------------


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 144      |
|    ep_rew_mean        | 4326.839 |
| time/                 |          |
|    fps                | 842      |
|    iterations         | 70       |
|    time_elapsed       | 207      |
|    total_timesteps    | 175000   |
| train/                |          |
|    entropy_loss       | -0.693   |
|    explained_variance | 0.00118  |
|    learning_rate      | 0.0007   |
|    n_updates          | 69       |
|    policy_loss        | 78.9     |
|    value_loss         | 3.15e+04 |
------------------------------------


-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 148       |
|    ep_rew_mean        | 4338.5913 |
| time/                 |           |
|    fps                | 852       |
|    iterations         | 80        |
|    time_elapsed       | 234       |
|    total_timesteps    | 200000    |
| train/                |           |
|    entropy_loss       | -0.692    |
|    explained_variance | 0.000838  |
|    learning_rate      | 0.0007    |
|    n_updates          | 79        |
|    policy_loss        | 89.5      |
|    value_loss         | 3.84e+04  |
-------------------------------------


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 201      |
|    ep_rew_mean        | 6317.937 |
| time/                 |          |
|    fps                | 859      |
|    iterations         | 90       |
|    time_elapsed       | 261      |
|    total_timesteps    | 225000   |
| train/                |          |
|    entropy_loss       | -0.691   |
|    explained_variance | 0.000729 |
|    learning_rate      | 0.0007   |
|    n_updates          | 89       |
|    policy_loss        | 91.4     |
|    value_loss         | 4.14e+04 |
------------------------------------


KeyboardInterrupt: 