In [1]:
!python -m pip install torch --quiet
!python -m pip install gym stable-baselines3[extra] python-dotenv fsspec["s3"] boto3 s3fs==2022.11.0 tensorboard --quiet

[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
dask-cudf 22.10.1+2.gca9a422da9 requires cupy-cuda115<12.0.0a0,>=9.5.0, which is not installed.
cudf 22.10.1+2.gca9a422da9 requires cupy-cuda115<12.0.0a0,>=9.5.0, which is not installed.
cudf 22.10.1+2.gca9a422da9 requires cuda-python<11.7.1,>=11.5, but you have cuda-python 11.7.1 which is incompatible.[0m[31m
[0m

In [2]:
%load_ext dotenv
%dotenv env

In [3]:
import numpy as np
import torch
torch.set_printoptions(precision=4, linewidth=200, sci_mode=False)
np.set_printoptions(precision=4, linewidth=200, suppress=True)

USER_INDEX = 1
SESSION_INDEX = 2
TIMESTAMP_INDEX = 11
TRAIN_SPLIT = 0.7
EVAL_SPLIT = 0.15


In [4]:
# %load npz_extractor.py
import logging
import os
import zipfile

import numpy as np
import logging 

class NPZExtractor:
    logger = logging.getLogger(__name__)
    def __init__(self, input_path, n_files, n_sequences, s3_client, data_partition) -> None:
        self.input_path = input_path
        self.n_files = n_files
        self.n_sequences = n_sequences
        self.s3_client = s3_client
        self.data_partition = data_partition


    def get_dataset_pointer(self):

        read_path = os.path.join(self.input_path, f'files_used_{self.n_files}')
        if not os.path.exists(read_path):
            print(f'Creating directory: {read_path}')
            os.makedirs(read_path)


        for _ in range(0, self.n_sequences +1, 10):
            key_zip, key_npy = (
                os.path.join(read_path, f'sequence_index_{_}.npz'),
                os.path.join(read_path, f'sequence_index_{_}')
            )

            self.logger.info(f'Loading pointer to dataset: {key_npy}: derived from {key_zip}')

            if not os.path.exists(key_npy):
                self.logger.info(f'Zip file to extract: {key_zip}: npy file to load: {key_npy}')
                # self.s3_client.download_file(
                #     'dissertation-data-dmiller',
                #     key_zip,
                #     key_zip
                # )
                self.logger.info(f'Zip file downloaded: {key_zip}')
                self._zip_extract(key_zip, key_npy)

        lz_concatenated_results = self._lazy_concatenate()

        if self.data_partition:
            return [p[:self.data_partition] for p in lz_concatenated_results]
        else:
            return lz_concatenated_results


    def _zip_extract(self, key_zip, key_npy):
        self.logger.info(f'Extracting file: {key_zip} -> {key_npy}')

        with zipfile.ZipFile(key_zip, 'r') as zip_ref:
            zip_ref.extractall(path=key_npy, members=['arr_0.npy'])


        self.logger.info(f'Zip file exracted: {key_zip} -> {key_npy}/arr_0.npy')

    def _lazy_concatenate(self):
        lz_concat = []
        for _ in range(0, self.n_sequences +1, 10):
            path_to_load = os.path.join(self.input_path, f'files_used_{self.n_files}', f'sequence_index_{_}', f'arr_0.npy')
            self.logger.info(f'Loading: {path_to_load}')
            lz_concat.append(np.load(path_to_load))
        return lz_concat

In [5]:
# %load callback
from stable_baselines3.common.callbacks import EveryNTimesteps, BaseCallback, EvalCallback
from stable_baselines3.common.logger import TensorBoardOutputFormat
from stable_baselines3.common.logger import Figure
import numpy as np
import pandas as pd
from torch.utils.tensorboard import SummaryWriter
class DistributionCallback(BaseCallback):

    def _on_training_start(self) -> None:
        self._dist_log_freq = 1000
        self._reward_log_freq = 100
        output_formats = self.logger.output_formats
        self.tb_formatter = next(f for f in output_formats if isinstance(f, TensorBoardOutputFormat))
    
    def _on_step(self) -> bool:
        if self.n_calls % self._dist_log_freq == 0:
            dist_list = self.training_env.env_method('dists')
            episode_list = self.training_env.get_attr('current_episode')
            try:
                for episode, dist in zip(episode_list, dist_list):
                    self.tb_formatter.writer.add_histogram('incentive_index', dist[:, 0], episode)
                    self.tb_formatter.writer.add_histogram('distance_session_end', dist[:, 1], episode)
                    self.tb_formatter.writer.add_histogram('distance_incentive_allocated', dist[:, 2], episode)
                self.tb_formatter.writer.flush()
            except Exception as e:
                raise Exception('Unable to log distributions: {}'.format(e))
            
        
        if self.n_calls % self._reward_log_freq == 0:
            episode_list = self.training_env.get_attr('current_episode')
            rewards = self.training_env.get_attr('user_sessions')
            reward_dict = {}
            for episode, reward in zip(episode_list, rewards):
                reward_dict[f'episode_{episode}'] = reward['total_reward'].sum()
           
            self.tb_formatter.writer.add_scalars(
                'cum_reward', reward_dict, (self.n_calls / np.max(episode_list)) // self._reward_log_freq
            )
            
            self.tb_formatter.writer.flush()
                
        return True


In [6]:
# %load environment
import gym
import argparse
import numpy as np
import torch
import pdb

USER_INDEX = 1
SESSION_INDEX = 2
TASK_INDEX = 3

N_EVENT_INDEX = -1

USER_IN_SESSION_INDEX = 0
SESSION_COUNT_INDEX = 1
TASK_IN_SESSION_INDEX = 2
REWARD_ALLOCATED_INDEX = 3

SESSION_FINISHED_INDEX = -1

CUM_PLATFORM_TIME_INDEX = 4
METADATA_INDEX = 12

import logging
from scipy.stats import norm 
from stable_baselines3.common.logger import TensorBoardOutputFormat


class CitizenScienceEnv(gym.Env):
    
    logger = logging.getLogger(__name__) 
    metadata = {'render.modes': ['human']}
    
    def __init__(self, user_sessions, experience_dataset, n_sequences, n_features) -> None:
        """
        trajectories: dictionary of user_id to their respective trajectories.
        n_sequences: number of sequences used for preprocessing.
        n_features: number of features used for preprocessing.
        """
        super(CitizenScienceEnv, self).__init__()
        self.user_sessions = user_sessions
        self.experience_dataset = experience_dataset

        self.action_space = gym.spaces.Discrete(2)
        self.observation_space = gym.spaces.Box(low=0, high=1, shape=(n_sequences + 1, n_features), dtype=np.float32)
        self.n_sequences = n_sequences
        self.n_features = n_features
        self.current_session = None
        self.current_episode = 0
        
    def _extract_features(self, feature_array):
        
        metadata, features = feature_array[:, :METADATA_INDEX], feature_array[:, METADATA_INDEX:]
        features = features.reshape((features.shape[0], self.n_sequences + 1, self.n_features))
        features = np.flip(features, axis=1).squeeze(0)
        return metadata.squeeze(0), features

    def _state(self, user, session, task_count):
        
        """
        get index of current state
        """ 
        current_state = self.experience_dataset[
            (self.experience_dataset[:, USER_INDEX] == user) &
            (self.experience_dataset[:, SESSION_INDEX] == session) &
            (self.experience_dataset[:, TASK_INDEX] == task_count)
        ]

        metadata, features = self._extract_features(current_state)
        cum_platform_time = metadata[CUM_PLATFORM_TIME_INDEX]
        return features, cum_platform_time

    
    def _seed_user_session(self):
        """
        find all users sessions that have not been completed
        select random user session from list
        """
        current_session = self.user_sessions[self.user_sessions['ended'] == 0].sample(1)
        current_session['task_index'] = 1
        self.current_session = current_session
        
    def step(self, action):
        
        self._take_action(action)
            
        state, rewards, done, meta = self._calculate_next_state() 
        if not done:
            self._update_session_metadata(self.current_session)
        
        return state, rewards, done, meta

    def _update_session_metadata(self, current_session):
        self.user_sessions.loc[current_session.index] = current_session 
        
    def _calculate_next_state(self):
        
        next_state = self.current_session['task_index'] + 1
        extending = self._extending()
        if not extending:
            self.logger.debug(f'User: {self.current_session} has completed their session')
            self._user_session_terminate()
            if self.user_sessions['ended'].all():
                self.logger.debug('All users have completed their sessions')
                return None, self.user_sessions['total_reward'].sum().astype(float), True, {}
            
            self._seed_user_session()
            user, session, count = self.current_session[['user_id', 'session_id', 'task_index']].values[0]
            return (
                self._state(user, session, count)[0], 
                self.user_sessions['total_reward'].sum().astype(float),
                False,
                {}
            )
        self.logger.debug(f'User: {self.current_session} has moving to next state: {next_state}')
        self.current_session['task_index'] = next_state
        user, session, count = self.current_session[['user_id', 'session_id', 'task_index']].values[0]
        state, cum_platform_time = self._state(user, session, count)
        self.current_session['total_reward'] = cum_platform_time
        return (
            state,
            self.user_sessions['total_reward'].sum().astype(float),
            False,
            {}
        )
    
    
    def _extending(self):
        current_session = self.current_session.to_dict('records')[0]
        if current_session['task_index'] == current_session['counts']:
            return False
    
        if current_session['task_index'] <= current_session['sim_counts']:
            return True

        continue_session = self._probability_extending(current_session)
        return all([continue_session >= 0.3, continue_session <= 0.9])
    
    
    def _probability_extending(self, current_session):
        if current_session['incentive_index'] == 0:
            return 0
        else:
            scale = min(5, current_session['counts'] // 4)
            continue_session = norm(
                loc=current_session['incentive_index'],
                scale=scale
            ).cdf(current_session['task_index']) + self._gaussian_noise()
       
        return continue_session
        
    def _gaussian_noise(self):
        return 0
        return np.clip(
            np.random.normal(0.01, 0.05, 1)[0],
            0.1,
            -0.1
        )
     
    def _user_session_terminate(self):
        self.current_session['ended'] = 1
        self._update_session_metadata(self.current_session)
    
    def _take_action(self, action):
        
        current_session = self.current_session.to_dict('records')[0]
        
        if current_session['incentive_index'] > 0 or action == 0:
            self.logger.debug(f'Incentive already allocation for session or no-op: {action}, {current_session}')
            return
        
    
        self.logger.debug('Taking action and allocating incentive')
        self.current_session['incentive_index'] = self.current_session['task_index']
        self.current_session['reward_allocated'] = action
        
        self.logger.debug('Taking action and allocating incentive: updating user session')
        self.logger.debug(f'User session: {self.current_session}')

    def reset(self):
        self.user_sessions = self.user_sessions.sample(frac=1)
        self.user_sessions['incentive_index'] = 0
        self.user_sessions['task_index'] = 0
        self.user_sessions['ended'] = 0
        self.user_sessions['total_reward'] = 0
        self.user_sessions['total_reward'] = self.user_sessions['total_reward'].astype(float)
        
        self._seed_user_session()
        self._update_session_metadata(self.current_session)
        user, session, count = self.current_session[['user_id', 'session_id', 'task_index']].values[0]
        self.current_episode += 1
        return self._state(user, session, count)[0]
        
    
    def render(self, mode='human'):
        print('rendering')
        
    def dists(self):
        incentive_index = self.user_sessions['incentive_index'].values
        distance_end = (self.user_sessions['counts'] - self.user_sessions['incentive_index']).values
        distance_reward = (self.user_sessions['total_reward'] - self.user_sessions['incentive_index']).values
        return np.array([incentive_index, distance_end, distance_reward])

In [7]:
import argparse
import numpy as np
import torch
torch.set_printoptions(precision=4, linewidth=200, sci_mode=False)
np.set_printoptions(precision=4, linewidth=200, suppress=True)
from stable_baselines3.common.callbacks import EvalCallback, CallbackList, StopTrainingOnMaxEpisodes, CheckpointCallback
from stable_baselines3 import PPO, A2C, DQN
import logging
USER_INDEX = 1
SESSION_INDEX = 2
CUM_SESSION_EVENT_RAW = 3
TIMESTAMP_INDEX = 11
TRAIN_SPLIT = 0.7
EVAL_SPLIT = 0.15
import pandas as pd
from stable_baselines3.common.vec_env import SubprocVecEnv, DummyVecEnv
from stable_baselines3.common.noise import NormalActionNoise
from datetime import datetime
from stable_baselines3.common.vec_env import VecMonitor
from pprint import pformat
import os

logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO)
np.set_printoptions(precision=4, linewidth=200, suppress=True)
torch.set_printoptions(precision=2, linewidth=200, sci_mode=False)


S3_BASELINE_PATH = 's3://dissertation-data-dmiller'

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--read_path', type=str, default='datasets/torch_ready_data')
    parser.add_argument('--n_files', type=str, default=2)
    parser.add_argument('--n_sequences', type=int, default=10)
    parser.add_argument('--n_features', type=int, default=18)
    parser.add_argument('--n_episodes', type=int, default=20)
    parser.add_argument('--return_distribution', type=str, default='stack_overflow_v1')
    parser.add_argument('--agent', type=str, default='constant_20')
    parser.add_argument('--device', type=str, default='cpu')
    parser.add_argument('--event_sample', type=float, default=0.25)
    
    args = parser.parse_args()
    return args

def train_eval_split(dataset, logger):
    train_split = int(dataset.shape[0] * TRAIN_SPLIT)
    eval_split = int(dataset.shape[0] * EVAL_SPLIT)
    test_split = dataset.shape[0] - train_split - eval_split
    logger.info(f'Train size: 0:{train_split}, eval size: {train_split}:{train_split+eval_split}: test size: {train_split + eval_split}:{dataset.shape[0]}')
    train_dataset, eval_dataset, test_split = dataset[:train_split], dataset[train_split:train_split+eval_split], dataset[train_split+eval_split:]
    
    return {
        'train': train_dataset,
        'eval': eval_dataset,
        'test': test_split
    }

def generate_metadata(dataset, logger):
     
    logger.info('Generating metadata tasks per session')
    sessions = pd.DataFrame(
        dataset[:, [USER_INDEX, SESSION_INDEX]],
        columns=['user_id', 'session_id']
    )
    
    event_in_session = pd.DataFrame(
        dataset[:, [USER_INDEX, SESSION_INDEX, CUM_SESSION_EVENT_RAW]],
        columns=['user_id', 'session_id', 'event_in_session']
    )
    
    event_in_session['event_in_session'] = event_in_session['event_in_session'].astype(int)
    
    event_in_session['event_in_session_resampled'] = event_in_session.groupby(['user_id', 'session_id']).cumcount() + 1
    
    sessions = sessions.groupby(['user_id', 'session_id']).size().reset_index(name='counts')
    sessions['sim_counts'] = (sessions['counts'] * 0.8).astype(int)
    sessions['sim_counts'] = sessions['sim_counts'].apply(lambda x: 1 if x == 0 else x)
    sessions['incentive_index'] = 0
    
    sessions['task_index'] = 0
    sessions['total_reward'] = 0
    sessions['total_reward'] = sessions['total_reward'].astype(float)
    sessions['ended'] = 0
    return sessions


def run_reinforcement_learning_incentives(environment, logger, n_episodes=1):
    for epoch in range(n_episodes):
        environment_comp = False
        state = environment.reset()
        i = 0
        while not environment_comp:
            next_action = (
                1 if np.random.uniform(low=0, high=1) > 0.8 else 0
            )
            state, rewards, environment_comp, meta = environment.step(next_action)
            i +=1
            if i % 100 == 0:
                logger.info(f'Step: {i} - Reward: {rewards}')
                
        logger.info(f'Epoch: {epoch} - Reward: {rewards}')
        print(environment.user_sessions.head(10))

    

def main(args):
    
    exec_time = datetime.now().strftime("%Y-%m-%d-%H-%M")
    logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO)
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    
    
    read_path, n_files, n_sequences, n_features, n_episodes, device, event_sample = (
        args.read_path, 
        args.n_files, 
        args.n_sequences, 
        args.n_features, 
        args.n_episodes, 
        args.device,
        args.event_sample
    )
    
    npz_extractor = NPZExtractor(
        read_path,
        n_files,
        n_sequences,
        None,
        None
    )
    
    cpu_count = int(os.cpu_count() * .8)
   
    logger.info(f'Starting experiment at {exec_time}') 
    logger.info(f'Extracting dataset from npz files to tensor' )
    dataset = np.concatenate(npz_extractor.get_dataset_pointer(), axis=1)
    logger.info('Dataset shape: {}'.format(dataset.shape))
    train_data = train_eval_split(dataset, logger)['train']
    logger.info(f'Following sampling: {train_data.shape}')
    train_data = train_data[:int(train_data.shape[0] * event_sample)]
    logger.info(f'Dataset shape: {train_data.shape}: generating metadata tensor')
    sessions_train = generate_metadata(train_data, logger)
    logger.info(f'Metadata train: {sessions_train.shape}')
    logger.info(f'resetting number of sessions to sample: {sessions_train.shape}')

    logger.info(f'Creating vectorized training environment: num envs: {cpu_count}')
   
    citizen_science_vec = SubprocVecEnv([lambda: CitizenScienceEnv(sessions_train, train_data, n_sequences, n_features) for _ in range(cpu_count)])

    """
    Eval environment is not used in training and is used after training to evaluate the agent
    """    
    logger.info(f'Vectorized environments created, wrapping with monitor')
    
    base_path = os.path.join(
        S3_BASELINE_PATH,
        'reinforcement_learning_incentives',
        f'n_files_{n_files}',
        'results',
        exec_time,
    ) 
 
    tensorboard_dir, checkpoint_dir = (
        os.path.join(base_path, 'training_metrics'),
        os.path.join(base_path, 'checkpoints'),
    )
 
    monitor_train = VecMonitor(citizen_science_vec)
    agent = DQN(
        'MlpPolicy',
        monitor_train,
        verbose=1,
        device=args.device,
        batch_size=4096,
        tensorboard_log=tensorboard_dir,
    )

    checkpoint_callback = CheckpointCallback(
        save_freq=10000 // 2,
        name_prefix='a2c',
        save_path=checkpoint_dir,
        verbose=1,
    )
        
    callback_max_episodes = StopTrainingOnMaxEpisodes(max_episodes=n_episodes, verbose=1)
    
    dist_callback = DistributionCallback()
    callback_list = CallbackList([dist_callback, callback_max_episodes, checkpoint_callback])

    logger.info(pformat([
        'n_episodes: {}'.format(n_episodes),
        'read_path: {}'.format(read_path),
        'n_files: {}'.format(n_files),
        'n_sequences: {}'.format(n_sequences),
        'n_features: {}'.format(n_features),
        'total_timesteps: {}'.format(dataset.shape[0] -1),
        'device: {}'.format(device),
        'tensorboard_dir: {}'.format(tensorboard_dir),
        'checkpoint_dir: {}'.format(checkpoint_dir),
        'event_sample: {}'.format(event_sample)
    ]))

    agent.learn(
        total_timesteps=int(10e7),
        log_interval=100, 
        progress_bar=True,
        callback=callback_list
    )




In [10]:
class Argument:
    read_path = 'torch_ready_data'
    n_files = 2
    n_sequences = 10
    n_features = 18
    n_episodes = 20
    device = 'cuda'
    event_sample = 0.5

In [11]:

main(Argument)

04/17/2023 06:26:30 PM Starting experiment at 2023-04-17-18-26
04/17/2023 06:26:30 PM Extracting dataset from npz files to tensor
04/17/2023 06:26:30 PM Loading pointer to dataset: torch_ready_data/files_used_2/sequence_index_0: derived from torch_ready_data/files_used_2/sequence_index_0.npz
04/17/2023 06:26:30 PM Loading pointer to dataset: torch_ready_data/files_used_2/sequence_index_10: derived from torch_ready_data/files_used_2/sequence_index_10.npz
04/17/2023 06:26:30 PM Loading: torch_ready_data/files_used_2/sequence_index_0/arr_0.npy
04/17/2023 06:26:30 PM Loading: torch_ready_data/files_used_2/sequence_index_10/arr_0.npy
04/17/2023 06:26:34 PM Dataset shape: (2566734, 210)
04/17/2023 06:26:34 PM Train size: 0:1796713, eval size: 1796713:2181723: test size: 2181723:2566734
04/17/2023 06:26:34 PM Following sampling: (1796713, 210)
04/17/2023 06:26:34 PM Dataset shape: (898356, 210): generating metadata tensor
04/17/2023 06:26:34 PM Generating metadata tasks per session
04/17/2023

Using cuda device


04/17/2023 06:27:01 PM ['n_episodes: 20',
 'read_path: torch_ready_data',
 'n_files: 2',
 'n_sequences: 10',
 'n_features: 18',
 'total_timesteps: 2566733',
 'device: cuda',
 'tensorboard_dir: '
 's3://dissertation-data-dmiller/reinforcement_learning_incentives/n_files_2/results/2023-04-17-18-26/training_metrics',
 'checkpoint_dir: '
 's3://dissertation-data-dmiller/reinforcement_learning_incentives/n_files_2/results/2023-04-17-18-26/checkpoints',
 'event_sample: 0.5']
04/17/2023 06:27:01 PM Found credentials in environment variables.


Logging to s3://dissertation-data-dmiller/reinforcement_learning_incentives/n_files_2/results/2023-04-17-18-26/training_metrics/DQN_1


Output()

KeyboardInterrupt: 