In [None]:
# default_exp reward

# Reward

> Rewards - non-differentiable scores for samples

## Overview

Rewards are non-differentiable score functions for evaluating samples. Rewards should generally follow the format `reward = f(sample)`

Rewards in MRL occupy five events in the fit loop:
- `before_compute_reward` - set up necessary values prior to reward calculation (if needed)
- `compute_reward` - compute reward
- `after_compute_reward` - compute metrics (if needed)
- `reward_modification` - adjust rewards
- `after_reward_modification` - compute metrics (if needed)

### Rewards vs Reward Modifications

MRL breaks rewards up into two phases - rewards and reward modifications. The difference between the two phases is that __reward__ values are saved in the batch log, while __reward_modifications__ are not.

In this framework, rewards are absolute scores for samples that are used to evaluate the sample relative to all other samples in the log. Reward modifications are transient scores that depend on the current training context.

A reward modification might be something like adding a score bonus to compounds the first time they are created during training to encourage diversity, or penalizing compounds if they appear more than 3 times in the last 5 batches. These types of reward modifications allow us to influence the behavior of the generative model without having these scores affect the true rewards we save in the log.

In [None]:
#hide
from nbdev.showdoc import *
%load_ext autoreload
%autoreload 2

In [None]:
# export

from mrl.imports import *
from mrl.core import *
from mrl.callbacks import *
from mrl.torch_imports import *
from mrl.torch_core import *

## Reward Callbacks

Rewards are used to generate non-differentiable scores from samples. Differentiable scores should be implemented as a `LossCallback`.

Rewards are calculated after sample creation but before generating model outputs. Use the `before_compute_reward` event to create any values needed for your reward calculation.


Rewards are called after sample creation. At this point in the fit loop, rewards have access to the samples themselves and any other attributes generated during the sample process. Rewards do not have access to model outputs

LossCallback

before_compute_reward


Loss function callbacks compute some loss value from the current batch state and add the resulting value to `BatchState.loss`.

`LossCallback` provides a simple hook for custom loss functions. Any object with a `from_batch_state` method that returns a scalar value can be passed to `LossCallback`. Ex:

```
class MyLoss():
    def from_batch_state(self, batch_state):
        loss = self.do_loss_calculation()
        return loss
```

In [None]:
# export

class Reward():
    """Wrap a scoring function with optional caching, chunking and weighting.

    Parameters
    ----------
    reward_function : callable
        Called with a collection of samples; expected to return one score
        per sample (indexable by position).
    weight : number, default 1
        Multiplier applied to the final reward tensor.
    bs : int or None, default None
        When not None, samples are split with `chunk_list(samples, bs)` and
        `reward_function` is called once per chunk.
    log : bool, default True
        When truthy, scores are cached in `score_log` keyed by sample and
        cached samples are not re-scored.
    """
    def __init__(self, reward_function, weight=1, bs=None, log=True):
        self.reward_function = reward_function
        self.weight = weight
        self.bs = bs
        self.log = log
        # Maps sample -> previously computed (unweighted) score.
        self.score_log = {}

    def load_data(self, samples, values):
        """Seed the cache so that `samples[k]` maps to `values[k]`."""
        self.score_log.update(
            (samples[k], values[k]) for k in range(len(samples))
        )

    def __call__(self, samples):
        """Score `samples`, reusing cached values when logging is enabled.

        Returns the scores as a float tensor (passed through `to_device`,
        squeezed, then multiplied by `self.weight`).
        """
        scores = np.zeros(len(samples))

        pending = []       # samples that still need to be scored
        pending_idxs = []  # their positions in the incoming batch

        for pos in range(len(samples)):
            item = samples[pos]
            if self.log and item in self.score_log:
                # Cache hit: reuse the stored score.
                scores[pos] = self.score_log[item]
                continue
            pending.append(item)
            pending_idxs.append(pos)

        if pending:
            fresh = self.compute_batched_reward(pending)

            for slot, (batch_idx, item) in enumerate(zip(pending_idxs, pending)):
                value = fresh[slot]
                scores[batch_idx] = value
                if self.log:
                    self.score_log[item] = value

        scored = to_device(torch.tensor(scores).float()).squeeze()
        return scored * self.weight

    def _compute_reward(self, samples):
        """Apply the wrapped reward function to `samples` directly."""
        return self.reward_function(samples)

    def compute_batched_reward(self, samples):
        """Score `samples`, in chunks of `self.bs` when a batch size is set.

        Tensor outputs are detached and moved to the CPU. With chunking the
        result is a flat list; without it, whatever the reward function
        returned (after the tensor clean-up).
        """
        if self.bs is None:
            result = self._compute_reward(samples)
            if isinstance(result, torch.Tensor):
                result = result.detach().cpu()
            return result

        collected = []
        for chunk in chunk_list(samples, self.bs):
            part = self._compute_reward(chunk)
            if isinstance(part, torch.Tensor):
                part = part.detach().cpu()
            collected.extend(list(part))

        return collected


In [None]:
# export

class RewardCallback(Callback):
    """Callback that scores a batch's samples during the `compute_reward` event.

    Parameters
    ----------
    reward : callable
        Called with the batch samples; its result is added to
        `batch_state.rewards` and stored under `batch_state[name]`.
    name : str
        Callback name; also used as the log and metric key.
    sample_name : str, default 'samples'
        Key used to read the samples from the batch state.
    order : int, default 10
        Forwarded to the `Callback` base class.
    track : bool, default True
        When truthy, the mean reward is reported as a metric.
    """
    def __init__(self, reward, name, sample_name='samples',
                order=10, track=True):
        super().__init__(name=name, order=order)

        self.reward = reward
        self.sample_name = sample_name
        self.track = track

    def setup(self):
        """Register this callback's log entry and, if tracking, its metric."""
        log = self.environment.log
        log.add_log(self.name)
        if self.track:
            log.add_metric(self.name)

    def compute_reward(self):
        """Compute rewards for the current batch and record them."""
        env = self.environment
        batch_state = env.batch_state
        samples = batch_state[self.sample_name]

        if samples:
            rewards = self.reward(samples)
        else:
            # Nothing to score: contribute a neutral reward.
            rewards = 0.

        batch_state.rewards += rewards
        batch_state[self.name] = rewards

        if self.track:
            if isinstance(rewards, torch.Tensor):
                metric = rewards.mean().detach().cpu().numpy()
            else:
                # The empty-batch fallback is a plain float, which has no
                # `.mean()`; np.mean handles scalars and array-likes alike.
                metric = np.mean(rewards)
            env.log.update_metric(self.name, metric)

In [None]:
# export

class RewardModification(Callback):
    """Callback that adjusts batch rewards during the `reward_modification` event.

    Parameters
    ----------
    reward : callable
        Called with the batch samples; its result is added to
        `batch_state.rewards` and stored under `batch_state[name]`.
    name : str
        Callback name; also used as the log and metric key.
    sample_name : str, default 'samples'
        Key used to read the samples from the batch state.
    order : int, default 10
        Forwarded to the `Callback` base class.
    track : bool, default True
        When truthy, the mean modification is reported as a metric.
    """
    def __init__(self, reward, name, sample_name='samples',
                order=10, track=True):
        super().__init__(name=name, order=order)

        self.reward = reward
        self.sample_name = sample_name
        self.track = track

    def setup(self):
        """Register this callback's log entry and, if tracking, its metric."""
        log = self.environment.log
        log.add_log(self.name)
        if self.track:
            log.add_metric(self.name)

    def reward_modification(self):
        """Compute the reward modification for the current batch and record it."""
        env = self.environment
        batch_state = env.batch_state
        samples = batch_state[self.sample_name]

        if samples:
            rewards = self.reward(samples)
        else:
            # Nothing to score: contribute a neutral modification.
            rewards = 0.

        batch_state.rewards += rewards
        batch_state[self.name] = rewards

        if self.track:
            if isinstance(rewards, torch.Tensor):
                metric = rewards.mean().detach().cpu().numpy()
            else:
                # The empty-batch fallback is a plain float, which has no
                # `.mean()`; np.mean handles scalars and array-likes alike.
                metric = np.mean(rewards)
            env.log.update_metric(self.name, metric)

In [None]:
# export

class NoveltyReward(Callback):
    """Reward modification that gives a bonus to samples not yet in the log.

    Parameters
    ----------
    weight : float, default 1.
        Bonus added for each sample absent from `env.log.df.samples`.
    track : bool, default True
        When truthy, the mean bonus is reported as the `novel` metric.
    """
    def __init__(self, weight=1., track=True):
        super().__init__(name='novel')

        self.track = track
        self.weight = weight

    def setup(self):
        """Register the `novel` log entry and, if tracking, its metric."""
        run_log = self.environment.log
        run_log.add_log(self.name)
        if not self.track:
            return
        run_log.add_metric(self.name)

    def reward_modification(self):
        """Add `weight` to the reward of every sample missing from the log."""
        env = self.environment
        state = env.batch_state
        current = state.samples

        history = env.log.df
        # True where the sample does not appear in the logged samples column.
        is_novel = (~pd.Series(current).isin(history.samples)).values

        bonus = is_novel.astype(float) * self.weight
        bonus = to_device(torch.from_numpy(bonus).float())

        state.rewards += bonus
        state[self.name] = bonus

        if self.track:
            mean_bonus = bonus.mean().detach().cpu().numpy()
            env.log.update_metric(self.name, mean_bonus)

In [None]:
# export

class ContrastiveReward(RewardCallback):
    """Reward the improvement of a target sample over its source sample.

    Wraps an existing reward callback. Each batch sample is indexed as a
    pair (`sample[0]` is the source, `sample[1]` the target); the reward is
    `target_score - source_score`, optionally divided by
    `max_score - source_score`.

    Parameters
    ----------
    base_reward : RewardCallback-like
        Supplies `reward`, `name`, `sample_name`, `order` and `track`, and
        receives every event except `compute_reward`.
    max_score : number or None, default None
        When given, the score difference is divided by the remaining
        headroom `max_score - source_score`.
    """
    def __init__(self, base_reward, max_score=None):
        super().__init__(reward = base_reward.reward,
                         name = base_reward.name,
                         sample_name = base_reward.sample_name,
                         order = base_reward.order,
                         track = base_reward.track)

        self.base_reward = base_reward
        self.max_score = max_score

    def setup(self):
        """Share this callback's environment with the wrapped callback."""
        self.base_reward.environment = self.environment

    def __call__(self, event_name):
        """Run this callback's handler, then forward the event to `base_reward`.

        `compute_reward` is not forwarded, so the wrapped callback never adds
        its own (non-contrastive) reward to the batch.
        """
        event = getattr(self, event_name, None)
        output = event() if event is not None else None

        if event_name != 'compute_reward':
            self.base_reward(event_name)

        return output

    def compute_and_clean(self, samples):
        """Score `samples` with the base reward and return a NumPy array."""
        rewards = self.base_reward.reward(samples)
        if isinstance(rewards, torch.Tensor):
            rewards = rewards.detach().cpu()

        return np.array(rewards)

    def _compute_reward(self, samples):
        """Return the contrastive reward tensor for (source, target) pairs."""
        source_samples = [pair[0] for pair in samples]
        target_samples = [pair[1] for pair in samples]

        source_rewards = self.compute_and_clean(source_samples)
        target_rewards = self.compute_and_clean(target_samples)

        rewards = target_rewards - source_rewards
        if self.max_score is not None:
            # NOTE(review): a source score equal to max_score makes the
            # denominator zero and yields inf/nan - confirm intended handling.
            rewards = rewards / (self.max_score-source_rewards)

        # Arithmetic on 0-d arrays (e.g. a squeezed single-pair batch) yields
        # a NumPy scalar, which torch.from_numpy rejects; coerce to ndarray.
        rewards = np.asarray(rewards)
        rewards = to_device(torch.from_numpy(rewards).float())

        return rewards

    def compute_reward(self):
        """Compute contrastive rewards for the current batch and record them."""
        env = self.environment
        batch_state = env.batch_state
        samples = batch_state[self.sample_name]

        rewards = self._compute_reward(samples)

        batch_state.rewards += rewards
        batch_state[self.name] = rewards

        if self.track:
            env.log.update_metric(self.name, rewards.mean().detach().cpu().numpy())
    