In [1]:
# Install runtime dependencies. `%pip` installs into the environment of the
# running kernel, whereas a shell `!python` may resolve to another interpreter.
# Extras are quoted so the shell never treats the square brackets as a glob.
%pip install --quiet torch
%pip install --quiet gym "stable-baselines3[extra]" python-dotenv "fsspec[s3]" s3fs==2022.11.0

[0m

In [2]:
# Load the python-dotenv IPython extension, then read environment variables
# from the file named `env`.
# NOTE(review): presumably this supplies the S3 credentials the training run
# later reports finding in the environment -- confirm.
%load_ext dotenv
%dotenv env

In [3]:
# Column-name groups shared by the data loading code, the environment and the
# logging callback defined further down in this notebook.

# Target column; it is loaded with the rest of the data but is not part of the
# observation features.
LABEL = ["session_terminates_30_minutes"]

# Identifier / raw bookkeeping columns. `user_id` and `session_30_raw` are the
# keys used to pick out one session; `cum_session_event_raw` orders its events.
METADATA = [
    "user_id", "session_30_raw",
    "cum_session_event_raw", "cum_session_time_raw",
    "cum_platform_event_raw", "global_events_user", "global_session_time_minutes",
]

# Timestamp column used to order the dataset chronologically.
DATE_TIME = ["date_time"]

# Feature columns that are min-max scaled and exposed to the agent as
# observations (the environment appends 'prediction' to this list).
OUT_FEATURE_COLUMNS = [
    # calendar / location
    "country_count", "timestamp_raw", "date_hour_sin", "date_hour_cos",
    # session counters
    "session_5_count", "session_30_count", "cum_session_event_count",
    "delta_last_event", "cum_session_time_minutes",
    # platform-level aggregates
    "expanding_click_average", "cum_platform_time_minutes", "cum_platform_events",
    "cum_projects", "average_event_time",
    # rolling statistics
    "rolling_session_time", "rolling_session_events", "rolling_session_gap",
    "session_event_count",
]

# Per-episode statistics recorded by the environment when an episode ends; the
# callback uses the same list as DataFrame column names, so order matters.
METADATA_STAT_COLUMNS = [
    "session_size", "sim_size", "session_minutes",
    "ended", "incentive_index", "reward",
    "n_episodes", "time_in_session",
]


In [4]:
import numpy as np
import torch
# Compact, non-scientific printing for tensors and arrays.
np.set_printoptions(suppress=True, linewidth=200, precision=4)
torch.set_printoptions(sci_mode=False, linewidth=200, precision=4)

# Positional indices.
# NOTE(review): none of these is referenced by the cells visible in this
# notebook -- confirm whether anything still consumes them.
USER_INDEX, SESSION_INDEX, TIMESTAMP_INDEX = 1, 2, 11

# Fractions of the dataset assigned to the train and eval partitions.
TRAIN_SPLIT, EVAL_SPLIT = 0.7, 0.15

# Every column read from the parquet file, in load order.
TORCH_LOAD_COLS = [*LABEL, *METADATA, *DATE_TIME, *OUT_FEATURE_COLUMNS, 'prediction']

In [5]:
# %load callback
from stable_baselines3.common.callbacks import  BaseCallback
from stable_baselines3.common.logger import TensorBoardOutputFormat
import numpy as np
import pandas as pd

class DistributionCallback(BaseCallback):
    """Periodically log end-of-episode session statistics.

    Every ``_log_freq`` callback invocations the callback drains the statistics
    buffered by each environment (via their ``dists`` method), writes a handful
    of mean differences to TensorBoard and dumps the raw rows to a parquet file
    under ``_log_dir``.

    ``tensorboard_setup`` must be called before training starts, otherwise
    ``_log_dir`` / ``_log_freq`` are undefined.
    """

    # Not read by this class; kept unchanged for any external consumer.
    metadata_stat = METADATA_STAT_COLUMNS + ['time_in_session']

    @classmethod
    def tensorboard_setup(cls, log_dir, log_freq):
        """Configure, class-wide, where to write and how often to log.

        log_dir: directory (or URL) that receives ``dist_<n>.parquet`` files.
        log_freq: number of ``_on_step`` calls between two logging rounds.
        """
        cls._log_dir = log_dir
        cls._log_freq = log_freq

    def _on_training_start(self) -> None:
        """Grab the TensorBoard output format from the model's logger.

        Raises StopIteration if the logger has no TensorBoard output.
        """
        output_formats = self.logger.output_formats
        self.tb_formatter = next(f for f in output_formats if isinstance(f, TensorBoardOutputFormat))

    def _on_step(self) -> bool:
        """Log episode statistics every ``_log_freq`` calls; never stops training."""
        if self.n_calls % self._log_freq != 0:
            return True

        # One array per environment; environments with no finished episode
        # return an empty array, which cannot be concatenated with 2-D ones.
        dist_list = self.training_env.env_method('dists')
        finished = [d for d in dist_list if d.shape[0] > 0]
        if not finished:
            # Nothing finished since the last round: np.concatenate([]) would raise.
            return True

        values_df = pd.DataFrame(
            np.concatenate(finished),
            columns=METADATA_STAT_COLUMNS
        )

        dist_session_time = (values_df['session_minutes'] - values_df['time_in_session']).mean()
        # Previously this value was overwritten by `ended - incentive_index`
        # before being logged, so the 'sess_index_sub_sim_index' scalar did not
        # show what its tag says. The two quantities are now kept apart.
        dist_session_end = (values_df['session_size'] - values_df['ended']).mean()
        dist_end_inc = (values_df['ended'] - values_df['incentive_index']).mean()
        dist_inc_session = (values_df['session_size'] - values_df['incentive_index']).mean()
        dist_inc_sim_size = (values_df['ended'] - values_df['sim_size']).mean()
        dist_inc_sim_index = (values_df['incentive_index'] - values_df['sim_size']).mean()

        n_call = self.n_calls // self._log_freq
        writer = self.tb_formatter.writer

        writer.add_scalar('event/sess_time_sub_sime_time::decrease', dist_session_time, n_call)
        writer.add_scalar('event/sess_index_sub_sim_index::decrease', dist_session_end, n_call)
        writer.add_scalar('event/sim_incentive_index_sub_index_no_reward::increase', dist_inc_sim_size, n_call)

        writer.add_scalar('event/sess_index_sub_incentive_index', dist_inc_session, n_call)
        writer.add_scalar('event/sim_index_sub_incentive_index', dist_inc_sim_index, n_call)
        writer.add_scalar('event/sim_end_index_sub_incentive_index', dist_end_inc, n_call)
        writer.flush()

        values_df.to_parquet(f'{self._log_dir}/dist_{n_call}.parquet')

        return True

In [6]:
# %load environment
import numpy as np
from scipy.stats import norm 


import gym

import numpy as np
from scipy.stats import norm 


import gym



class CitizenScienceEnv(gym.Env):
    """Gym environment that replays recorded user sessions event by event.

    Each episode replays one (user, session) pair taken from ``dataset``. At
    every event the agent chooses action 0 (do nothing) or 1 (place an
    incentive). The first ``sim_size`` events of a session are always replayed;
    after that the session only continues while
    ``_continuing_in_session`` says so.

    Observations are float32 arrays of shape
    ``(n_sequences + 1, len(out_features))`` holding the most recent events,
    zero-padded at the top while the session is still short.
    """

    # NOTE(review): `reset` rebinds `self.metadata` to a pandas Series, which
    # shadows this gym-level class attribute on the instance.
    metadata = {'render.modes': ['human']}

    def __init__(self, dataset, unique_episodes, unique_sessions, n_sequences):
        """
        dataset: DataFrame of events. Must contain 'user_id', 'session_30_raw',
            'cum_session_event_raw', 'cum_session_time_minutes', 'reward',
            'session_size', 'sim_size', 'session_minutes' and every column in
            OUT_FEATURE_COLUMNS plus 'prediction'.
        unique_episodes: DataFrame with the distinct ('user_id', 'session_30_raw') pairs.
        unique_sessions: DataFrame with the distinct 'session_30_raw' values.
        n_sequences: number of past events in an observation (the observation
            has n_sequences + 1 rows).
        """
        super(CitizenScienceEnv, self).__init__()
        self.dataset = dataset
        self.unique_episodes = unique_episodes
        self.unique_sessions = unique_sessions
        self.n_sequences = n_sequences
        self.n_episodes = 0
        self.current_session = None
        self.current_session_index = 0
        self.reward = 0
        # Rows of end-of-episode statistics, drained by `dists`.
        self.metadata_container = []
        self.out_features = OUT_FEATURE_COLUMNS + ['prediction']

        self.action_space = gym.spaces.Discrete(2)
        self.observation_space = gym.spaces.Box(low=0, high=1, shape=(n_sequences + 1, len(self.out_features)), dtype=np.float32)

    def reset(self):
        """Start a new episode on a randomly sampled session and return its first observation."""
        self.n_episodes += 1
        # Sample a session id first, then one of the users that has that id.
        session_to_run = self.unique_sessions.sample()['session_30_raw'].values[0]
        user_to_run = self.unique_episodes[self.unique_episodes['session_30_raw'] == session_to_run].sample()['user_id'].values[0]
        self.current_session = self._get_events(user_to_run, session_to_run)
        self.metadata = self._metadata()
        self.current_session_index = 0
        self.reward = 0
        return self._state()

    def step(self, action):
        """Apply ``action`` and advance one event.

        Returns ``(observation, reward, done, info)``. The reward is the running
        total accumulated over the episode so far, not the increment of this step.
        NOTE(review): returning the cumulative total on every step means the
        episode return sums running totals -- confirm this is intended.
        """
        self._take_action(action)
        next_state, done, meta = self._calculate_next_state()
        if done:
            self.metadata['ended'] = self.current_session_index + 1
            self.metadata['reward'] = self.reward
            # The index equals the session length when the data ran out; clamp to the last row.
            last_row = min(self.current_session_index, self.current_session.shape[0] - 1)
            # NOTE(review): main() min-max scales OUT_FEATURE_COLUMNS (which
            # include 'cum_session_time_minutes') before building the env, while
            # 'session_minutes' comes from the unscaled 'cum_session_time_raw';
            # the two may not be on the same scale -- confirm.
            self.metadata['time_in_session'] = self.current_session.iloc[last_row]['cum_session_time_minutes']
            self.metadata_container.append(self.metadata[METADATA_STAT_COLUMNS].values)
            return next_state, self.reward, done, meta

        self.reward += (self.current_session.iloc[self.current_session_index]['reward'] / 60)
        self.current_session_index += 1
        return next_state, self.reward, done, meta

    def _metadata(self):
        """Build the per-episode statistics record from the session's first row."""
        # Copy so the assignments below never write into a view of the dataset.
        session_metadata = self.current_session.iloc[0][['user_id', 'session_30_raw', 'session_size', 'sim_size', 'session_minutes']].copy()
        session_metadata['ended'] = 0
        session_metadata['incentive_index'] = 0
        session_metadata['reward'] = 0
        session_metadata['n_episodes'] = self.n_episodes
        return session_metadata

    def _calculate_next_state(self):
        """Return ``(observation, done, info)`` for the current position.

        The episode is done when the recorded events are exhausted or the
        simulated user stops. A real observation is returned in both cases
        (the terminal step used to return ``None``).
        """
        exhausted = self.current_session_index == self.current_session.shape[0]
        if exhausted or not self._continuing_in_session():
            return self._state(), True, {}
        return self._state(), False, {}

    def _continuing_in_session(self):
        """Whether the simulated user stays for another event."""
        if self.current_session_index < self.metadata['sim_size']:
            return True

        extending_session = self._probability_extending_session()
        return bool(.3 <= extending_session <= .7)

    def _probability_extending_session(self):
        """Normal CDF, centred on the incentive index, evaluated at the current index.

        Returns 0 when no incentive has been recorded.
        """
        if self.metadata['incentive_index'] == 0:
            return 0

        scale = max(5, int(self.metadata['session_size'] / 4))
        continue_session = norm(
            loc=self.metadata['incentive_index'],
            scale=scale
        ).cdf(self.current_session_index)

        return continue_session

    def _get_events(self, user_id, session):
        """Return the events of one (user, session) pair ordered by 'cum_session_event_raw'."""
        subset = self.dataset[
            (self.dataset['user_id'] == user_id) &
            (self.dataset['session_30_raw'] == session)
        ]

        return subset.sort_values('cum_session_event_raw').reset_index(drop=True)

    def _take_action(self, action):
        """Record the index of the first incentive; later incentives are ignored.

        NOTE(review): 0 doubles as "no incentive yet", so an incentive placed at
        index 0 is indistinguishable from none and can be replaced later.
        """
        if action == 0 or self.metadata['incentive_index'] > 0:
            return

        self.metadata['incentive_index'] = self.current_session_index

    def _state(self):
        """Build the ``(n_sequences + 1, n_features)`` float32 observation."""
        window = self.n_sequences + 1
        index = self.current_session_index

        if index > self.n_sequences:
            events = self.current_session.iloc[index - window:index][self.out_features].values
        else:
            # At least the first event is always shown, including at index 0.
            events = self.current_session.iloc[:max(index, 1)][self.out_features].values
            # Pad up to the window size. This used a hard-coded cap of 10 rows,
            # which only produced the right shape for n_sequences == 10.
            padding = np.zeros((window - events.shape[0], len(self.out_features)))
            events = np.concatenate((padding, events), axis=0)

        return events.astype(np.float32)

    def dists(self):
        """Return the buffered end-of-episode statistics as an array and clear the buffer."""
        metadata_container = self.metadata_container.copy()
        self.metadata_container = []
        return np.array(metadata_container)

In [7]:
# %load incentive_reinforcement_learning_cpu.py
import argparse
import numpy as np
import torch
torch.set_printoptions(precision=4, linewidth=200, sci_mode=False)
np.set_printoptions(precision=4, linewidth=200, suppress=True)
from stable_baselines3.common.callbacks import CallbackList, StopTrainingOnMaxEpisodes, CheckpointCallback
from stable_baselines3 import A2C
from stable_baselines3.common.env_checker import check_env
import logging
USER_INDEX = 1
SESSION_INDEX = 2
CUM_SESSION_EVENT_RAW = 3
TIMESTAMP_INDEX = 11
TRAIN_SPLIT = 0.7
EVAL_SPLIT = 0.15
import pandas as pd
from stable_baselines3.common.vec_env import DummyVecEnv
from datetime import datetime
from stable_baselines3.common.vec_env import VecMonitor
from pprint import pformat
import os

# Pick the GPU dataframe / scaler implementations when CUDA is available and
# fall back to scikit-learn otherwise. `gpu_pd` is only defined on the CUDA path.
if torch.cuda.is_available():
    import cudf as gpu_pd
    from cuml.preprocessing import MinMaxScaler
else:
    from sklearn.preprocessing import MinMaxScaler

# Timestamped INFO logging; the print options below override the torch
# precision of 4 set earlier in this cell with 2.
logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO)
np.set_printoptions(precision=4, linewidth=200, suppress=True)
torch.set_printoptions(precision=2, linewidth=200, sci_mode=False)


# Root S3 location under which results and checkpoints are written.
S3_BASELINE_PATH = 's3://dissertation-data-dmiller'

def parse_args(argv=None):
    """Parse the command-line options of the incentive RL training script.

    argv: optional list of argument strings. ``None`` (the default) makes
        argparse read ``sys.argv[1:]``; pass ``[]`` to get the defaults from a
        notebook, where ``sys.argv`` holds the kernel's own arguments.

    Returns the populated ``argparse.Namespace``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--read_path', type=str, default='datasets/rl_ready_data')
    # Was type=str with an int default, so the value's type depended on whether
    # the flag was given on the command line.
    parser.add_argument('--n_files', type=int, default=2)
    parser.add_argument('--n_sequences', type=int, default=10)
    parser.add_argument('--n_episodes', type=int, default=20)
    parser.add_argument('--n_envs', type=int, default=100)
    parser.add_argument('--lstm', type=str, default='ordinal_10')
    parser.add_argument('--device', type=str, default='cpu')

    return parser.parse_args(argv)

# A second, verbatim copy of this cell's imports, constants, logging setup and
# S3_BASELINE_PATH used to sit here. It was removed because every statement
# repeated what the beginning of the cell already executes.


def train_eval_split(dataset, logger):
    """Cut ``dataset`` positionally into train, eval and test partitions.

    The first ``TRAIN_SPLIT`` share of the rows becomes 'train', the following
    ``EVAL_SPLIT`` share 'eval' and whatever is left 'test'. The row order of
    ``dataset`` is used as-is; the boundaries are reported through ``logger``.

    Returns a dict with the keys 'train', 'eval' and 'test'.
    """
    n_rows = dataset.shape[0]
    train_end = int(n_rows * TRAIN_SPLIT)
    eval_end = train_end + int(n_rows * EVAL_SPLIT)

    logger.info(f'Train size: 0:{train_end}, eval size: {train_end}:{eval_end}: test size: {eval_end}:{n_rows}')

    return {
        'train': dataset[:train_end],
        'eval': dataset[train_end:eval_end],
        'test': dataset[eval_end:],
    }

def generate_metadata(dataset):
    """Attach per-session statistics to every event row.

    For each ('user_id', 'session_30_raw') group three columns are added:
      session_size    -- number of rows in the group
      sim_size        -- 70% of session_size, truncated to int, at least 1
      session_minutes -- maximum of 'cum_session_time_raw' in the group

    Returns a new frame; ``dataset`` itself is not modified.
    """
    session_keys = ['user_id', 'session_30_raw']

    sizes = dataset.groupby(session_keys).size().reset_index(name='session_size')
    # Truncate first, then floor at one so every session replays at least one event.
    sizes['sim_size'] = (sizes['session_size'] * .7).astype(int).clip(lower=1)

    durations = (
        dataset.groupby(session_keys)['cum_session_time_raw']
        .max()
        .reset_index(name='session_minutes')
    )

    with_sizes = dataset.merge(sizes, on=session_keys)
    return with_sizes.merge(durations, on=session_keys)
    



def run_reinforcement_learning_incentives(environment, logger, n_episodes=1):
    """Roll out ``n_episodes`` episodes with a random policy, as a smoke test.

    At every step action 1 is chosen when a uniform draw exceeds 0.8 and
    action 0 otherwise. The reward is logged every 100 steps and at the end of
    each episode.

    environment: object exposing ``reset()`` and
        ``step(action) -> (state, reward, done, info)``.
    logger: object exposing ``info(message)``.
    n_episodes: number of episodes to run.
    """
    for epoch in range(n_episodes):
        environment_comp = False
        state = environment.reset()
        i = 0
        while not environment_comp:
            next_action = (
                1 if np.random.uniform(low=0, high=1) > 0.8 else 0
            )
            state, rewards, environment_comp, meta = environment.step(next_action)
            i += 1
            if i % 100 == 0:
                logger.info(f'Step: {i} - Reward: {rewards}')

        logger.info(f'Epoch: {epoch} - Reward: {rewards}')

        # `user_sessions` is not an attribute of CitizenScienceEnv, so the
        # unconditional access used to raise AttributeError after the first
        # episode. Only report it for environments that actually provide it.
        user_sessions = getattr(environment, 'user_sessions', None)
        if user_sessions is not None:
            print(user_sessions.head(10))

    
def main(args):
    """Train an A2C agent on the citizen-science session environment.

    Steps: read the prepared parquet file, keep the chronologically first
    ``TRAIN_SPLIT`` share of rows, derive per-session metadata, min-max scale
    the feature columns, build ``n_envs`` environments and run
    ``model.learn`` with episode-limit, distribution-logging and checkpoint
    callbacks. Metrics and checkpoints are written below ``S3_BASELINE_PATH``.

    args: object exposing ``read_path``, ``n_files``, ``n_sequences``,
        ``n_episodes``, ``device``, ``n_envs`` and ``lstm`` (an argparse
        namespace or a plain class with those attributes).
    """
    exec_time = datetime.now().strftime("%Y-%m-%d-%H-%M")
    logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO)
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)

    logger.info('Starting Incentive Reinforcement Learning')

    read_path, n_files, n_sequences, n_episodes, device, n_envs, lstm = (
        args.read_path,
        args.n_files,
        args.n_sequences,
        args.n_episodes,
        args.device,
        args.n_envs,
        args.lstm
    )

    # The GPU path reads 'rl_ready_data.parquet', the CPU path the '.gzip' variant.
    use_gpu = torch.cuda.is_available()
    file_ext = "" if use_gpu else '.gzip'

    read_path = os.path.join(
        read_path,
        f'files_used_{n_files}',
        f'rl_ready_data.parquet{file_ext}'
    )

    # Log the path that is actually read (the message used to append a
    # spurious '_<n_files>.parquet' suffix).
    logger.info(f'Reading data from {read_path}')
    # `gpu_pd` only exists when CUDA was available at import time.
    frame_lib = gpu_pd if use_gpu else pd
    df = frame_lib.read_parquet(read_path, columns=TORCH_LOAD_COLS)
    df['date_time'] = frame_lib.to_datetime(df['date_time'])

    # Keep only the chronologically first TRAIN_SPLIT share of the events.
    df = df.sort_values(by=['date_time'])
    df = df.head(int(df.shape[0] * TRAIN_SPLIT))
    logger.info('Data read: generating metadata')
    # The reward column is taken before scaling, so it keeps the unscaled values.
    df['reward'] = df['delta_last_event']
    df = generate_metadata(df)

    logger.info(f'Metadata generated: scaling features')
    df[OUT_FEATURE_COLUMNS] = MinMaxScaler().fit_transform(df[OUT_FEATURE_COLUMNS])
    logger.info(f'Features Scaled')

    unique_episodes = df[['user_id', 'session_30_raw']].drop_duplicates()
    unique_sessions = df[['session_30_raw']].drop_duplicates()
    logger.info(f'Parralelizing environment with {n_envs} environments')
    if use_gpu:
        # The environment indexes rows with pandas, so move the frames to host memory.
        df, unique_episodes, unique_sessions = df.to_pandas(), unique_episodes.to_pandas(), unique_sessions.to_pandas()

    citizen_science_vec = DummyVecEnv([lambda: CitizenScienceEnv(df, unique_episodes, unique_sessions, n_sequences) for _ in range(n_envs)])

    logger.info(f'Vectorized environments created, wrapping with monitor')

    base_path = os.path.join(
        S3_BASELINE_PATH,
        'reinforcement_learning_incentives',
        f'n_files_{n_files}',
        'results',
        f'lstm_{lstm}',
        exec_time,
    )

    tensorboard_dir, checkpoint_dir = (
        os.path.join(base_path, 'training_metrics'),
        os.path.join(base_path, 'checkpoints')
    )

    callback_max_episodes = StopTrainingOnMaxEpisodes(max_episodes=n_episodes, verbose=1)
    checkpoint_callback = CheckpointCallback(save_freq=100_000 // n_envs, save_path=checkpoint_dir, name_prefix='rl_model')
    dist_callback = DistributionCallback()
    DistributionCallback.tensorboard_setup(tensorboard_dir, 500)
    callback_list = CallbackList([callback_max_episodes, dist_callback, checkpoint_callback])
    monitor_train = VecMonitor(citizen_science_vec)

    # Honour the requested device; it used to be logged but never passed on.
    model = A2C("MlpPolicy", monitor_train, verbose=2, tensorboard_log=tensorboard_dir, device=device)

    logger.info(pformat([
        'n_episodes: {}'.format(n_episodes),
        'read_path: {}'.format(read_path),
        'n_files: {}'.format(n_files),
        'n_sequences: {}'.format(n_sequences),
        'n_envs: {}'.format(n_envs),
        # This entry prints the frame's shape; it was mislabelled 'total_timesteps'.
        'dataset_shape: {}'.format(df.shape),
        f'unique_episodes: {unique_episodes.shape[0]}',
        'device: {}'.format(device),
        'tensorboard_dir: {}'.format(tensorboard_dir),
        'checkpoint_dir: {}'.format(checkpoint_dir),
        'lstm_option: {}'.format(lstm)
    ]))

    model.learn(total_timesteps=100_000_000, progress_bar=True, log_interval=100, callback=callback_list)
    


In [8]:
class Argument:
    """Notebook stand-in for the namespace returned by ``parse_args``.

    ``main`` only reads attributes, so a plain class with class-level values
    is enough.
    """

    # Input data: directory and file-count suffix used to build the parquet path.
    read_path = 'rl_ready_data'
    n_files = 30

    # Environment setup.
    n_sequences = 10
    n_envs = 250

    # Training run.
    n_episodes = 500_000
    device = 'cuda'

    # Label used in the results path.
    lstm = 'ordinal_10'
    

In [9]:

# Launch training with the notebook-level configuration class; the log that
# follows this cell is the output of this call.
main(Argument)

04/25/2023 04:29:25 PM Starting Incentive Reinforcement Learning
04/25/2023 04:29:25 PM Reading data from rl_ready_data/files_used_30/rl_ready_data.parquet_30.parquet
04/25/2023 04:29:30 PM Data read: generating metadata
04/25/2023 04:29:31 PM Metadata generated: scaling features
04/25/2023 04:29:35 PM Features Scaled
04/25/2023 04:29:35 PM Parralelizing environment with 250 environments
04/25/2023 04:29:44 PM Vectorized environments created, wrapping with monitor
04/25/2023 04:29:44 PM ['n_episodes: 500000',
 'read_path: rl_ready_data/files_used_30/rl_ready_data.parquet',
 'n_files: 30',
 'n_sequences: 10',
 'n_envs: 250',
 'total_timesteps: (26950693, 32)',
 'unique_episodes: 457187',
 'device: cuda',
 'tensorboard_dir: '
 's3://dissertation-data-dmiller/reinforcement_learning_incentives/n_files_30/results/lstm_ordinal_10/2023-04-25-16-29/training_metrics',
 'checkpoint_dir: '
 's3://dissertation-data-dmiller/reinforcement_learning_incentives/n_files_30/results/lstm_ordinal_10/2023-0

Using cuda device


04/25/2023 04:30:07 PM Found credentials in environment variables.
04/25/2023 04:30:08 PM Found credentials in environment variables.


Logging to s3://dissertation-data-dmiller/reinforcement_learning_incentives/n_files_30/results/lstm_ordinal_10/2023-04-25-16-29/training_metrics/A2C_1


Output()

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 57.7      |
|    ep_rew_mean        | 1325.3723 |
| time/                 |           |
|    fps                | 365       |
|    iterations         | 100       |
|    time_elapsed       | 341       |
|    total_timesteps    | 125000    |
| train/                |           |
|    entropy_loss       | -0.293    |
|    explained_variance | 0.0026    |
|    learning_rate      | 0.0007    |
|    n_updates          | 99        |
|    policy_loss        | 17.6      |
|    value_loss         | 2.54e+04  |
-------------------------------------


-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 52.2      |
|    ep_rew_mean        | 1426.2798 |
| time/                 |           |
|    fps                | 396       |
|    iterations         | 200       |
|    time_elapsed       | 629       |
|    total_timesteps    | 250000    |
| train/                |           |
|    entropy_loss       | -0.237    |
|    explained_variance | 0.00171   |
|    learning_rate      | 0.0007    |
|    n_updates          | 199       |
|    policy_loss        | 12.6      |
|    value_loss         | 3.52e+04  |
-------------------------------------


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 41.9     |
|    ep_rew_mean        | 891.6065 |
| time/                 |          |
|    fps                | 407      |
|    iterations         | 300      |
|    time_elapsed       | 920      |
|    total_timesteps    | 375000   |
| train/                |          |
|    entropy_loss       | -0.347   |
|    explained_variance | 0.000954 |
|    learning_rate      | 0.0007   |
|    n_updates          | 299      |
|    policy_loss        | 33.6     |
|    value_loss         | 5.35e+04 |
------------------------------------


-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 45.3      |
|    ep_rew_mean        | 1591.6954 |
| time/                 |           |
|    fps                | 417       |
|    iterations         | 400       |
|    time_elapsed       | 1196      |
|    total_timesteps    | 500000    |
| train/                |           |
|    entropy_loss       | -0.537    |
|    explained_variance | 0.000611  |
|    learning_rate      | 0.0007    |
|    n_updates          | 399       |
|    policy_loss        | 55.1      |
|    value_loss         | 5.76e+04  |
-------------------------------------


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 72.8     |
|    ep_rew_mean        | 5386.509 |
| time/                 |          |
|    fps                | 420      |
|    iterations         | 500      |
|    time_elapsed       | 1487     |
|    total_timesteps    | 625000   |
| train/                |          |
|    entropy_loss       | -0.403   |
|    explained_variance | 0.000272 |
|    learning_rate      | 0.0007   |
|    n_updates          | 499      |
|    policy_loss        | 39.5     |
|    value_loss         | 4.49e+04 |
------------------------------------


-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 52        |
|    ep_rew_mean        | 1238.6245 |
| time/                 |           |
|    fps                | 425       |
|    iterations         | 600       |
|    time_elapsed       | 1761      |
|    total_timesteps    | 750000    |
| train/                |           |
|    entropy_loss       | -0.347    |
|    explained_variance | 0.000221  |
|    learning_rate      | 0.0007    |
|    n_updates          | 599       |
|    policy_loss        | 35.6      |
|    value_loss         | 5.1e+04   |
-------------------------------------
