In [1]:
!python -m pip install torch --quiet
!python -m pip install gym stable-baselines3[extra] python-dotenv fsspec["s3"] s3fs==2022.11.0 --quiet

[0m

In [2]:
%load_ext dotenv
%dotenv env

In [3]:
LABEL = [
    "session_terminates_30_minutes"
]

METADATA = [
    "user_id",
    "session_30_raw",
    
    "cum_session_event_raw",
    "cum_session_time_raw",
    
    "cum_platform_event_raw",
    "global_events_user",
    "global_session_time_minutes",
]

DATE_TIME = [
    "date_time",
]

OUT_FEATURE_COLUMNS = [
    "country_count",
    "timestamp_raw",
    "date_hour_sin",
    
    "date_hour_cos",
    "session_5_count",
    "session_30_count",
    
    "cum_session_event_count",
    "delta_last_event",
    "cum_session_time_minutes",
    
    "expanding_click_average",
    "cum_platform_time_minutes",
    "cum_platform_events",
    
    "cum_projects",
    "average_event_time",
    "rolling_session_time",
    
    "rolling_session_events",
    "rolling_session_gap",
    "session_event_count",
]

In [4]:
import numpy as np
import torch
torch.set_printoptions(precision=4, linewidth=200, sci_mode=False)
np.set_printoptions(precision=4, linewidth=200, suppress=True)

USER_INDEX = 1
SESSION_INDEX = 2
TIMESTAMP_INDEX = 11
TRAIN_SPLIT = 0.7
EVAL_SPLIT = 0.15
TORCH_LOAD_COLS = LABEL + METADATA + DATE_TIME + OUT_FEATURE_COLUMNS

In [5]:
# %load callback
from stable_baselines3.common.callbacks import EveryNTimesteps, BaseCallback, EvalCallback
from stable_baselines3.common.logger import TensorBoardOutputFormat
from stable_baselines3.common.logger import Figure
import numpy as np
import pandas as pd
from torch.utils.tensorboard import SummaryWriter
class DistributionCallback(BaseCallback):

    def _on_training_start(self) -> None:
        self.log_freq = 500
        output_formats = self.logger.output_formats
        self.tb_formatter = next(f for f in output_formats if isinstance(f, TensorBoardOutputFormat))
    
    def _on_step(self) -> bool:
        if self.n_calls % self._log_freq == 0:
            dist_list = self.training_env.env_method('dists')
            values_to_log = np.concatenate([d for d in dist_list if d.shape[0] > 0])

            session_size, sim_size, session_minutes, ended, incentive_index, reward = (
                values_to_log[:, 0],
                values_to_log[:, 1],
                values_to_log[:, 2],
                values_to_log[:, 3],
                values_to_log[:, 4],
                values_to_log[:, 5]
            )
            
            dist_session_time = (session_minutes - reward).mean()
            dist_session_end = (session_size - ended).mean()
            dist_incentive_session = (session_size - incentive_index).mean()
            dist_incentive_end = (ended - incentive_index).mean()
            dist_incentive_sim_size = (ended - sim_size).mean()
            
            n_call = self.n_calls // self._log_freq
            
            self.tb_formatter.writer.add_scalar('event/sess_time_sub_sime_time::decrease', dist_session_time, n_call)
            self.tb_formatter.writer.add_scalar('event/sess_index_sub_sim_index::decrease', dist_session_end, n_call)
            self.tb_formatter.writer.add_scalar('event/sim_incentive_index_sub_index_no_reward::increase', dist_incentive_sim_size, n_call)
            
            self.tb_formatter.writer.add_scalar('event/sess_index_sub_incentive_index', dist_incentive_session, n_call)
            self.tb_formatter.writer.add_scalar('event/sim_index_sub_incentive_index', dist_incentive_end, n_call)
            
            self.tb_formatter.writer.flush()
        return True

In [6]:
# %load environment
import torch
import gym
import numpy as np
from scipy.stats import norm 

METADATA_STAT_COLUMNS = [
    'session_size',
    'sim_size',
    'session_minutes',
    'ended',
    'incentive_index',
    'reward'
]

class CitizenScienceEnv(gym.Env):
    
    metadata = {'render.modes': ['human']}
    
    def __init__(self, dataset, n_sequences):
        """
        trajectories: dictionary of user_id to their respective trajectories.
        n_sequences: number of sequences used for preprocessing.
        n_features: number of features used for preprocessing.
        """
        super(CitizenScienceEnv, self).__init__()
        self.dataset = dataset
        self.n_sequences = n_sequences
        self.current_session = None
        self.current_session_index = 0
        self.reward = 0
        self.metadata_container = []

        self.action_space = gym.spaces.Discrete(2)
        self.observation_space = gym.spaces.Box(low=0, high=1, shape=(n_sequences + 1, 18), dtype=np.float32)
        self.n_sequences = n_sequences

    def reset(self):
        session_to_run = self.dataset.sample()
        if torch.cuda.is_available():
            session_to_run = session_to_run.to_pandas()
        self.current_session = self._get_events(session_to_run.to_dict('records')[0])
        self.metadata = self._metadata()
        self.current_session_index = 1
        self.reward = 0
        return self._state()

    def step(self, action):
        self._take_action(action)
        next_state, done, meta = self._calculate_next_state()
        if done:
            self.metadata['ended'] = self.current_session_index
            self.metadata['reward'] = self.reward
            self.metadata_container.append(self.metadata[METADATA_STAT_COLUMNS].values)
            return next_state, self.reward, done, meta
        self.reward += (self.current_session.iloc[self.current_session_index]['reward'] / 60)
        self.current_session_index += 1        
        return next_state, self.reward, done, meta
    
    def _metadata(self):
        session_metadata = self.current_session.iloc[0][['user_id', 'session_30_raw', 'session_size', 'sim_size', 'session_minutes']]
        session_metadata['ended'] = 0
        session_metadata['incentive_index'] = 0
        session_metadata['reward'] = 0
        return session_metadata
    
    
    def _calculate_next_state(self):
        
        if self.current_session_index == self.current_session.shape[0]:
            return None, True, {}
        
        if self._continuing_in_session():
            return self._state(), False, {}
      
        return None, True, {}
  
    def _continuing_in_session(self):
        sim_counts = self.metadata['sim_size']
        if self.current_session_index < sim_counts:
            return True
        
        extending_session = self._probability_extending_session()
        
        return all([extending_session >= .3, extending_session <= .8])
        
    
    def _probability_extending_session(self):
        if self.metadata['incentive_index'] == 0:
            return 0
        
        scale = max(5, int(self.metadata['session_size'] / 4))
        continue_session = norm(
            loc=self.metadata['incentive_index'],
            scale=scale
        ).cdf(self.current_session_index)
        
        return continue_session
        

    def _get_events(self, session):
        subset = self.dataset[
            (self.dataset['session_30_raw'] == session['session_30_raw']) & 
            (self.dataset['user_id'] == session['user_id'])
        ]

        if torch.cuda.is_available():
            subset = subset.to_pandas()
        
        subset = subset.astype(np.float16)
        return subset.sort_values('cum_session_event_raw').reset_index(drop=True)
    
    def _take_action(self, action):
        if action == 0 or self.metadata['incentive_index'] > 0:
            return
        
        self.metadata['incentive_index'] = self.current_session_index
        
    def _state(self):

        if self.current_session_index > self.n_sequences:
            events = self.current_session.iloc[self.current_session_index - (self.n_sequences + 1):self.current_session_index][OUT_FEATURE_COLUMNS].values
            
        else:
            delta = (self.n_sequences + 1)- self.current_session_index
            zero_cat = np.zeros((delta, len(OUT_FEATURE_COLUMNS)))
            events = self.current_session.iloc[:max(self.current_session_index, 1)][OUT_FEATURE_COLUMNS].values
            events = np.concatenate((zero_cat, events), axis=0)
            

        return events.astype(np.float16)
  
    
    def dists(self):
        metadata_container = self.metadata_container.copy()
        self.metadata_container = []
        return np.array(metadata_container)


In [7]:
# %load incentive_reinforcement_learning_cpu.py
import argparse
import numpy as np
import torch
torch.set_printoptions(precision=4, linewidth=200, sci_mode=False)
np.set_printoptions(precision=4, linewidth=200, suppress=True)
from stable_baselines3.common.callbacks import CallbackList, StopTrainingOnMaxEpisodes, CheckpointCallback
from stable_baselines3 import A2C
import logging
import pandas as pd
from stable_baselines3.common.vec_env import DummyVecEnv
from datetime import datetime
from stable_baselines3.common.vec_env import VecMonitor
from pprint import pformat
import os
if torch.cuda.is_available():
    import cudf as pd
    from cuml.preprocessing import MinMaxScaler
else:
    import pandas as pd
    from sklearn.preprocessing import MinMaxScaler


logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO)
np.set_printoptions(precision=4, linewidth=200, suppress=True)
torch.set_printoptions(precision=2, linewidth=200, sci_mode=False)

USER_INDEX = 1
SESSION_INDEX = 2
CUM_SESSION_EVENT_RAW = 3
TIMESTAMP_INDEX = 11
TRAIN_SPLIT = 0.7
EVAL_SPLIT = 0.15
S3_BASELINE_PATH = 's3://dissertation-data-dmiller'

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--read_path', type=str, default='datasets/calculated_features/files_used')
    parser.add_argument('--n_files', type=str, default=2)
    parser.add_argument('--n_sequences', type=int, default=10)
    parser.add_argument('--n_episodes', type=int, default=20)
    parser.add_argument('--n_envs', type=int, default=100)
    parser.add_argument('--device', type=str, default='cpu')
    
    args = parser.parse_args()
    return args

def train_eval_split(dataset, logger):
    train_split = int(dataset.shape[0] * TRAIN_SPLIT)
    eval_split = int(dataset.shape[0] * EVAL_SPLIT)
    test_split = dataset.shape[0] - train_split - eval_split
    logger.info(f'Train size: 0:{train_split}, eval size: {train_split}:{train_split+eval_split}: test size: {train_split + eval_split}:{dataset.shape[0]}')
    train_dataset, eval_dataset, test_split = dataset[:train_split], dataset[train_split:train_split+eval_split], dataset[train_split+eval_split:]
    
    return {
        'train': train_dataset,
        'eval': eval_dataset,
        'test': test_split
    }

def generate_metadata(dataset):
    
    session_size = dataset.groupby(['user_id', 'session_30_raw']).size().reset_index(name='session_size')
    session_minutes = dataset.groupby(['user_id', 'session_30_raw'])['cum_session_time_raw'].max().reset_index(name='session_minutes')
    session_size['sim_size'] = (session_size['session_size'] * .75).astype(int).apply(lambda x: x if x > 1 else 1)
    dataset = dataset.merge(session_size, on=['user_id', 'session_30_raw'])
    dataset = dataset.merge(session_minutes, on=['user_id', 'session_30_raw'])
    return dataset
    



def run_reinforcement_learning_incentives(environment, logger, n_episodes=1):
    for epoch in range(n_episodes):
        environment_comp = False
        state = environment.reset()
        i = 0
        while not environment_comp:
            next_action = (
                1 if np.random.uniform(low=0, high=1) > 0.8 else 0
            )
            state, rewards, environment_comp, meta = environment.step(next_action)
            i +=1
            if i % 100 == 0:
                logger.info(f'Step: {i} - Reward: {rewards}')
                
        logger.info(f'Epoch: {epoch} - Reward: {rewards}')
        print(environment.user_sessions.head(10))

    

def main(args):
    
    exec_time = datetime.now().strftime("%Y-%m-%d-%H-%M")
    logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO)
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    
    logger.info('Starting Incentive Reinforcement Learning')
    
    read_path, n_files, n_sequences, n_episodes, device, n_envs = (
        args.read_path, 
        args.n_files, 
        args.n_sequences, 
        args.n_episodes, 
        args.device,
        args.n_envs
    )
    
    logger.info(f'Reading data from {read_path}_{n_files}.parquet')
    df = pd.read_parquet(f'{read_path}_{n_files}.parquet', columns=TORCH_LOAD_COLS)
    df['date_time'] = pd.to_datetime(df['date_time'])
    df = df.sort_values(by=['date_time']).drop(columns=['date_time'])
    df = df.head(int(df.shape[0] * 0.7))
    logger.info('Data read: generating metadata')
    df['reward'] = df['delta_last_event']
    df = generate_metadata(df)
    df[OUT_FEATURE_COLUMNS] = MinMaxScaler().fit_transform(df[OUT_FEATURE_COLUMNS])
    unique_episodes = df[['user_id', 'session_30_raw']].drop_duplicates().shape[0]
    logger.info(f'Parralelizing environment with {n_envs} environments')

    citizen_science_vec = DummyVecEnv([lambda: CitizenScienceEnv(df, n_sequences) for _ in range(n_envs)])

    logger.info(f'Vectorized environments created, wrapping with monitor')

    base_path = os.path.join(
        S3_BASELINE_PATH,
        'reinforcement_learning_incentives',
        f'n_files_{n_files}',
        'results',
        exec_time,
    ) 
    
    tensorboard_dir, checkpoint_dir = (
        os.path.join(base_path, 'training_metrics'),
        os.path.join(base_path, 'checkpoints')
    )

    callback_max_episodes = StopTrainingOnMaxEpisodes(max_episodes=n_episodes, verbose=1)
    checkpoint_callback = CheckpointCallback(save_freq=1000 // n_envs, save_path=checkpoint_dir, name_prefix='rl_model')
    dist_callback = DistributionCallback()
    callback_list = CallbackList([callback_max_episodes, dist_callback, checkpoint_callback])
    monitor_train = VecMonitor(citizen_science_vec)
    
    model = A2C(
        "MlpPolicy", 
        monitor_train, 
        verbose=2, 
        tensorboard_log=tensorboard_dir,
        stats_window_size=1000)
            
    logger.info(pformat([
        'n_epochs: {}'.format(n_episodes),
        'read_path: {}'.format(read_path),
        'n_files: {}'.format(n_files),
        'n_sequences: {}'.format(n_sequences),
        'n_envs: {}'.format(n_envs),
        'total_timesteps: {}'.format(df.shape),
        f'unique_episodes: {unique_episodes}',
        'device: {}'.format(device),
        'tensorboard_dir: {}'.format(tensorboard_dir),
        'checkpoint_dir: {}'.format(checkpoint_dir)
    ]))


    model.learn(total_timesteps=100_000_000, progress_bar=True, log_interval=100, callback=callback_list)



In [8]:
class Argument:
    read_path = 'calculated_features/files_used'
    n_files = 30
    n_sequences = 10
    n_episodes = 250_000
    n_envs = 500
    device = 'cpu'

In [9]:

main(Argument)

04/20/2023 09:19:33 AM Starting Incentive Reinforcement Learning
04/20/2023 09:19:33 AM Reading data from calculated_features/files_used_30.parquet
04/20/2023 09:19:38 AM Data read: generating metadata
04/20/2023 09:19:43 AM Parralelizing environment with 500 environments
04/20/2023 09:19:43 AM Vectorized environments created, wrapping with monitor
04/20/2023 09:19:43 AM ['n_epochs: 250000',
 'read_path: calculated_features/files_used',
 'n_files: 30',
 'n_sequences: 10',
 'n_envs: 500',
 'total_timesteps: (26950693, 30)',
 'unique_episodes: 466103',
 'device: cpu',
 'tensorboard_dir: '
 's3://dissertation-data-dmiller/reinforcement_learning_incentives/n_files_30/results/2023-04-20-09-19/training_metrics',
 'checkpoint_dir: '
 's3://dissertation-data-dmiller/reinforcement_learning_incentives/n_files_30/results/2023-04-20-09-19/checkpoints']


Using cuda device


04/20/2023 09:20:10 AM Found credentials in environment variables.
04/20/2023 09:20:11 AM Found credentials in environment variables.


Logging to s3://dissertation-data-dmiller/reinforcement_learning_incentives/n_files_30/results/2023-04-20-09-19/training_metrics/A2C_1


Output()

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 126      |
|    ep_rew_mean        | 3601.674 |
| time/                 |          |
|    fps                | 1237     |
|    iterations         | 100      |
|    time_elapsed       | 201      |
|    total_timesteps    | 250000   |
| train/                |          |
|    entropy_loss       | -0.594   |
|    explained_variance | 0.00096  |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | 70.7     |
|    value_loss         | 3.39e+04 |
------------------------------------


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 238      |
|    ep_rew_mean        | 9342.826 |
| time/                 |          |
|    fps                | 1329     |
|    iterations         | 200      |
|    time_elapsed       | 376      |
|    total_timesteps    | 500000   |
| train/                |          |
|    entropy_loss       | -0.689   |
|    explained_variance | 0.000396 |
|    learning_rate      | 0.0007   |
|    n_updates          | 199      |
|    policy_loss        | 101      |
|    value_loss         | 5.19e+04 |
------------------------------------


-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 280       |
|    ep_rew_mean        | 12217.732 |
| time/                 |           |
|    fps                | 1365      |
|    iterations         | 300       |
|    time_elapsed       | 549       |
|    total_timesteps    | 750000    |
| train/                |           |
|    entropy_loss       | -0.691    |
|    explained_variance | 0.0008    |
|    learning_rate      | 0.0007    |
|    n_updates          | 299       |
|    policy_loss        | 109       |
|    value_loss         | 6.14e+04  |
-------------------------------------


-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 298       |
|    ep_rew_mean        | 15171.032 |
| time/                 |           |
|    fps                | 1390      |
|    iterations         | 400       |
|    time_elapsed       | 719       |
|    total_timesteps    | 1000000   |
| train/                |           |
|    entropy_loss       | -0.626    |
|    explained_variance | 0.000422  |
|    learning_rate      | 0.0007    |
|    n_updates          | 399       |
|    policy_loss        | 97.5      |
|    value_loss         | 6.23e+04  |
-------------------------------------


-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 316       |
|    ep_rew_mean        | 15936.054 |
| time/                 |           |
|    fps                | 1409      |
|    iterations         | 500       |
|    time_elapsed       | 887       |
|    total_timesteps    | 1250000   |
| train/                |           |
|    entropy_loss       | -0.364    |
|    explained_variance | 0.000272  |
|    learning_rate      | 0.0007    |
|    n_updates          | 499       |
|    policy_loss        | 58.7      |
|    value_loss         | 7.47e+04  |
-------------------------------------


-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 317       |
|    ep_rew_mean        | 17770.775 |
| time/                 |           |
|    fps                | 1419      |
|    iterations         | 600       |
|    time_elapsed       | 1056      |
|    total_timesteps    | 1500000   |
| train/                |           |
|    entropy_loss       | -0.326    |
|    explained_variance | 0.000206  |
|    learning_rate      | 0.0007    |
|    n_updates          | 599       |
|    policy_loss        | 49.7      |
|    value_loss         | 6.59e+04  |
-------------------------------------


-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 323       |
|    ep_rew_mean        | 17555.455 |
| time/                 |           |
|    fps                | 1428      |
|    iterations         | 700       |
|    time_elapsed       | 1225      |
|    total_timesteps    | 1750000   |
| train/                |           |
|    entropy_loss       | -0.419    |
|    explained_variance | 0.000145  |
|    learning_rate      | 0.0007    |
|    n_updates          | 699       |
|    policy_loss        | 71.1      |
|    value_loss         | 8e+04     |
-------------------------------------


-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 314       |
|    ep_rew_mean        | 16782.191 |
| time/                 |           |
|    fps                | 1433      |
|    iterations         | 800       |
|    time_elapsed       | 1394      |
|    total_timesteps    | 2000000   |
| train/                |           |
|    entropy_loss       | -0.653    |
|    explained_variance | 9.73e-05  |
|    learning_rate      | 0.0007    |
|    n_updates          | 799       |
|    policy_loss        | 121       |
|    value_loss         | 1.03e+05  |
-------------------------------------


-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 319       |
|    ep_rew_mean        | 19567.025 |
| time/                 |           |
|    fps                | 1439      |
|    iterations         | 900       |
|    time_elapsed       | 1563      |
|    total_timesteps    | 2250000   |
| train/                |           |
|    entropy_loss       | -0.592    |
|    explained_variance | 5.2e-05   |
|    learning_rate      | 0.0007    |
|    n_updates          | 899       |
|    policy_loss        | 103       |
|    value_loss         | 8.41e+04  |
-------------------------------------


-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 339       |
|    ep_rew_mean        | 20352.361 |
| time/                 |           |
|    fps                | 1445      |
|    iterations         | 1000      |
|    time_elapsed       | 1729      |
|    total_timesteps    | 2500000   |
| train/                |           |
|    entropy_loss       | -0.385    |
|    explained_variance | 4.82e-05  |
|    learning_rate      | 0.0007    |
|    n_updates          | 999       |
|    policy_loss        | 66.9      |
|    value_loss         | 8.42e+04  |
-------------------------------------


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 337      |
|    ep_rew_mean        | 20426.98 |
| time/                 |          |
|    fps                | 1446     |
|    iterations         | 1100     |
|    time_elapsed       | 1900     |
|    total_timesteps    | 2750000  |
| train/                |          |
|    entropy_loss       | -0.428   |
|    explained_variance | 7.78e-05 |
|    learning_rate      | 0.0007   |
|    n_updates          | 1099     |
|    policy_loss        | 74.9     |
|    value_loss         | 8.45e+04 |
------------------------------------


-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 325       |
|    ep_rew_mean        | 19694.783 |
| time/                 |           |
|    fps                | 1449      |
|    iterations         | 1300      |
|    time_elapsed       | 2242      |
|    total_timesteps    | 3250000   |
| train/                |           |
|    entropy_loss       | -0.529    |
|    explained_variance | 4.08e-05  |
|    learning_rate      | 0.0007    |
|    n_updates          | 1299      |
|    policy_loss        | 93.9      |
|    value_loss         | 8.59e+04  |
-------------------------------------
