Compute importance measure in mini-batches #16

Merged
merged 3 commits on Feb 21, 2021
14 changes: 8 additions & 6 deletions comp550/dataset/babi.py
@@ -247,19 +247,21 @@ def uncollate(self, batch):
                        batch['hypothesis'], batch['hypothesis_mask'], batch['hypothesis_length'],
                        batch['label'], batch['index'])]
 
-    def train_dataloader(self, batch_size=None, num_workers=None):
+    def train_dataloader(self, batch_size=None, num_workers=None, shuffle=True):
         return DataLoader(self._train,
             batch_size=batch_size or self.batch_size, collate_fn=self.collate,
             num_workers=self._num_workers if num_workers is None else num_workers,
-            shuffle=True)
+            shuffle=shuffle)
 
-    def val_dataloader(self, batch_size=None, num_workers=None):
+    def val_dataloader(self, batch_size=None, num_workers=None, shuffle=False):
         return DataLoader(self._val,
             batch_size=batch_size or self.batch_size, collate_fn=self.collate,
-            num_workers=self._num_workers if num_workers is None else num_workers)
+            num_workers=self._num_workers if num_workers is None else num_workers,
+            shuffle=shuffle)
 
-    def test_dataloader(self, batch_size=None, num_workers=None):
+    def test_dataloader(self, batch_size=None, num_workers=None, shuffle=False):
         return DataLoader(self._test,
             batch_size=batch_size or self.batch_size, collate_fn=self.collate,
-            num_workers=self._num_workers if num_workers is None else num_workers)
+            num_workers=self._num_workers if num_workers is None else num_workers,
+            shuffle=shuffle)
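Note: the same two-line change repeats in every dataset module below. Each dataloader gains an explicit `shuffle` keyword that defaults to the previous behaviour (`True` for train, `False` for val/test), so that the ROAR dataset builder in roar.py (further down) can iterate the data deterministically in mini-batches. A minimal usage sketch; the `dataset` instance is illustrative and not part of the PR:

```python
# Default behaviour is unchanged: the training loader still shuffles.
train_loader = dataset.train_dataloader(batch_size=32)

# Building the masked dataset can now request a deterministic order in mini-batches.
ordered_loader = dataset.train_dataloader(batch_size=32, num_workers=0, shuffle=False)
```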

14 changes: 8 additions & 6 deletions comp550/dataset/imdb.py
@@ -186,18 +186,20 @@ def uncollate(self, batch):
                 in zip(batch['sentence'], batch['mask'], batch['length'],
                        batch['label'], batch['index'])]
 
-    def train_dataloader(self, batch_size=None, num_workers=None):
+    def train_dataloader(self, batch_size=None, num_workers=None, shuffle=True):
         return DataLoader(self._train,
             batch_size=batch_size or self.batch_size, collate_fn=self.collate,
             num_workers=self._num_workers if num_workers is None else num_workers,
-            shuffle=True)
+            shuffle=shuffle)
 
-    def val_dataloader(self, batch_size=None, num_workers=None):
+    def val_dataloader(self, batch_size=None, num_workers=None, shuffle=False):
         return DataLoader(self._val,
             batch_size=batch_size or self.batch_size, collate_fn=self.collate,
-            num_workers=self._num_workers if num_workers is None else num_workers)
+            num_workers=self._num_workers if num_workers is None else num_workers,
+            shuffle=shuffle)
 
-    def test_dataloader(self, batch_size=None, num_workers=None):
+    def test_dataloader(self, batch_size=None, num_workers=None, shuffle=False):
         return DataLoader(self._test,
             batch_size=batch_size or self.batch_size, collate_fn=self.collate,
-            num_workers=self._num_workers if num_workers is None else num_workers)
+            num_workers=self._num_workers if num_workers is None else num_workers,
+            shuffle=shuffle)
14 changes: 8 additions & 6 deletions comp550/dataset/mimic.py
@@ -297,18 +297,20 @@ def uncollate(self, batch):
                 in zip(batch['sentence'], batch['mask'], batch['length'],
                        batch['label'], batch['index'])]
 
-    def train_dataloader(self, batch_size=None, num_workers=None):
+    def train_dataloader(self, batch_size=None, num_workers=None, shuffle=True):
         return DataLoader(self._train,
             batch_size=batch_size or self.batch_size, collate_fn=self.collate,
             num_workers=self._num_workers if num_workers is None else num_workers,
-            shuffle=True)
+            shuffle=shuffle)
 
-    def val_dataloader(self, batch_size=None, num_workers=None):
+    def val_dataloader(self, batch_size=None, num_workers=None, shuffle=False):
         return DataLoader(self._val,
             batch_size=batch_size or self.batch_size, collate_fn=self.collate,
-            num_workers=self._num_workers if num_workers is None else num_workers)
+            num_workers=self._num_workers if num_workers is None else num_workers,
+            shuffle=shuffle)
 
-    def test_dataloader(self, batch_size=None, num_workers=None):
+    def test_dataloader(self, batch_size=None, num_workers=None, shuffle=False):
         return DataLoader(self._test,
             batch_size=batch_size or self.batch_size, collate_fn=self.collate,
-            num_workers=self._num_workers if num_workers is None else num_workers)
+            num_workers=self._num_workers if num_workers is None else num_workers,
+            shuffle=shuffle)
81 changes: 47 additions & 34 deletions comp550/dataset/roar.py
@@ -87,63 +87,74 @@ def collate(self, observations):
     def uncollate(self, observations):
         return self._base_dataset.uncollate(observations)
 
-    def _importance_measure_random(self, observation):
-        return torch.tensor(self._rng.rand(*observation['sentence'].shape))
+    def _importance_measure_random(self, batch):
+        return torch.tensor(self._rng.rand(*batch['sentence'].shape))
 
-    def _importance_measure_attention(self, observation):
+    def _importance_measure_attention(self, batch):
         with torch.no_grad():
-            _, alpha = self._model(self.collate([observation]))
-            return torch.squeeze(alpha, dim=0)
+            _, alpha = self._model(batch)
+            return alpha
 
-    def _importance_measure_gradient(self, observation):
+    def _importance_measure_gradient(self, batch):
+        # Make a shallow copy, because batch['sentence'] will be overwritten
+        batch = batch.copy()
+
         # TODO: Ensure that the padding is zero. In theory we don't need padding.
         # Setup batch to be a one-hot encoded float32 with require_grad. This is neccesary
        # as torch does not allow computing grad w.r.t. to an int-tensor.
-        batch = self.collate([observation])
         batch['sentence'] = torch.nn.functional.one_hot(batch['sentence'], len(self.vocabulary))
         batch['sentence'] = batch['sentence'].type(torch.float32)
         batch['sentence'].requires_grad = True
 
         # Compute model
         y, _ = self._model(batch)
 
         # Compute gradient
-        yc = y[0, observation['label']]
-        yc_wrt_x, = torch.autograd.grad(yc, (batch['sentence'], ))
+        # Select correct label, as we would like gradient of y[correct_label] w.r.t. x
+        yc = y[torch.arange(len(batch['label'])), batch['label']]
+        # autograd.grad must take a scalar, however we would like $d y_{i,c}/d x_i$
+        # to be computed as a batch, meaning for each $i$. To work around this,
+        # use that for $g(x) = \sum_i f(x_i)$, we have $d g(x)/d x_{x_i} = d f(x_i)/d x_{x_i}$.
+        # The gradient of the sum, is therefore equivalent to the batch_gradient.
+        yc_wrt_x, = torch.autograd.grad(torch.sum(yc, axis=0), (batch['sentence'], ))
 
         # Normalize the vector-gradient per token into one scalar
-        return torch.norm(torch.squeeze(yc_wrt_x, 0), 2, dim=1)
+        return torch.norm(yc_wrt_x, 2, dim=2)
 
     def _importance_measure_integrated_gradient(self, observation):
         # Implement as x .* (1/k) .* sum([f'((i/k) .* x) for i in range(1, k+1))
         pass
 
-    def _mask_observation(self, observation):
-        importance = self._importance_measure_fn(observation)
+    def _mask_batch(self, batch):
+        batch_importance = self._importance_measure_fn(batch)
 
+        masked_batch = []
         with torch.no_grad():
-            # Prevent masked tokens from being "removed"
-            importance[torch.logical_not(observation['mask'])] = -np.inf
+            for importance, observation in zip(batch_importance, self.uncollate(batch)):
+                # Trim importance to the observation length
+                importance = importance[0:len(observation['sentence'])]
 
-            # Ensure that already "removed" tokens continues to be "removed"
-            importance[observation['sentence'] == self.tokenizer.mask_token_id] = np.inf
+                # Prevent masked tokens from being "removed"
+                importance[torch.logical_not(observation['mask'])] = -np.inf
 
-            # Tokens to remove.
-            # Ensure that k does not exceed the number of un-masked tokens, if it does
-            # masked tokens will be "removed" too.
-            k = torch.minimum(torch.tensor(self._k), torch.sum(observation['mask']))
-            _, remove_indices = torch.topk(importance, k=k, sorted=False)
+                # Ensure that already "removed" tokens continues to be "removed"
+                importance[observation['sentence'] == self.tokenizer.mask_token_id] = np.inf
 
-            # "Remove" top-k important tokens
-            observation['sentence'][remove_indices] = self.tokenizer.mask_token_id
+                # Tokens to remove.
+                # Ensure that k does not exceed the number of un-masked tokens, if it does
+                # masked tokens will be "removed" too.
+                k = torch.minimum(torch.tensor(self._k), torch.sum(observation['mask']))
+                _, remove_indices = torch.topk(importance, k=k, sorted=False)
 
-        return observation
+                # "Remove" top-k important tokens
+                observation['sentence'][remove_indices] = self.tokenizer.mask_token_id
+                masked_batch.append(observation)
+
+        return masked_batch
 
     def _mask_dataset(self, dataloader, name):
         outputs = []
-        for batched_observation in tqdm(dataloader(batch_size=1, num_workers=0), desc=f'Building {name} dataset', leave=False):
-            outputs.append(self._mask_observation(self.uncollate(batched_observation)[0]))
+        for batch in tqdm(dataloader(batch_size=self.batch_size, num_workers=0, shuffle=False),
+                          desc=f'Building {name} dataset', leave=False):
+            outputs += self._mask_batch(batch)
         return outputs
 
     def prepare_data(self):
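The comment block in `_importance_measure_gradient` above justifies the single `autograd.grad` call over the summed scores: because the examples in a batch are independent, the gradient of g(x) = sum_i f(x_i) with respect to x_i equals the per-example gradient of f(x_i). A small, self-contained sketch verifying that identity (not part of this PR; the toy score function is illustrative):

```python
import torch

# Toy per-example score f(x_i) = sum_j x_ij**2, so df/dx_ij = 2 * x_ij.
x = torch.randn(4, 3, requires_grad=True)   # a "batch" of 4 examples
scores = (x ** 2).sum(dim=1)                # one scalar score per example

# One backward pass over the summed scores yields every per-example gradient,
# because d(sum_i f(x_i)) / d x_i = d f(x_i) / d x_i when examples are independent.
grads, = torch.autograd.grad(scores.sum(), (x,))

assert grads.shape == x.shape
assert torch.allclose(grads, 2 * x)         # matches the per-example gradient
```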
@@ -190,21 +201,23 @@ def setup(self, stage=None):
         else:
             raise ValueError(f'unexpected setup stage: {stage}')
 
-    def train_dataloader(self, batch_size=None, num_workers=None):
+    def train_dataloader(self, batch_size=None, num_workers=None, shuffle=True):
         return DataLoader(
             self._train,
             batch_size=batch_size or self.batch_size, collate_fn=self.collate,
             num_workers=self._num_workers if num_workers is None else num_workers,
-            shuffle=True)
+            shuffle=shuffle)
 
-    def val_dataloader(self, batch_size=None, num_workers=None):
+    def val_dataloader(self, batch_size=None, num_workers=None, shuffle=False):
         return DataLoader(
             self._val,
             batch_size=batch_size or self.batch_size, collate_fn=self.collate,
-            num_workers=self._num_workers if num_workers is None else num_workers)
+            num_workers=self._num_workers if num_workers is None else num_workers,
+            shuffle=shuffle)
 
-    def test_dataloader(self, batch_size=None, num_workers=None):
+    def test_dataloader(self, batch_size=None, num_workers=None, shuffle=False):
         return DataLoader(
             self._test,
             batch_size=batch_size or self.batch_size, collate_fn=self.collate,
-            num_workers=self._num_workers if num_workers is None else num_workers)
+            num_workers=self._num_workers if num_workers is None else num_workers,
+            shuffle=shuffle)
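`_importance_measure_integrated_gradient` is still a stub in roar.py above; its TODO spells out the approximation `x .* (1/k) .* sum([f'((i/k) .* x) for i in range(1, k+1)])`. A hedged, generic sketch of that Riemann sum (assuming a zero baseline and a callable `f` that returns per-example scores; this helper is illustrative and not part of the PR):

```python
import torch

def integrated_gradient(f, x, k=20):
    # Riemann-sum approximation of integrated gradients with a zero baseline:
    # IG(x) = x * (1/k) * sum_{i=1..k} f'((i/k) * x)
    total = torch.zeros_like(x)
    for i in range(1, k + 1):
        xi = ((i / k) * x).detach().requires_grad_(True)  # point on the zero-to-x path
        grad, = torch.autograd.grad(f(xi).sum(), (xi,))   # gradient of the score at that point
        total += grad
    return x * total / k

# Example with a toy score function:
attribution = integrated_gradient(lambda z: (z ** 2).sum(dim=1), torch.randn(4, 3))
```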
14 changes: 8 additions & 6 deletions comp550/dataset/stanford_nli.py
@@ -202,18 +202,20 @@ def uncollate(self, batch):
                        batch['hypothesis'], batch['hypothesis_mask'], batch['hypothesis_length'],
                        batch['label'], batch['index'])]
 
-    def train_dataloader(self, batch_size=None, num_workers=None):
+    def train_dataloader(self, batch_size=None, num_workers=None, shuffle=True):
         return DataLoader(self._train,
             batch_size=batch_size or self.batch_size, collate_fn=self.collate,
             num_workers=self._num_workers if num_workers is None else num_workers,
-            shuffle=True)
+            shuffle=shuffle)
 
-    def val_dataloader(self, batch_size=None, num_workers=None):
+    def val_dataloader(self, batch_size=None, num_workers=None, shuffle=False):
         return DataLoader(self._val,
             batch_size=batch_size or self.batch_size, collate_fn=self.collate,
-            num_workers=self._num_workers if num_workers is None else num_workers)
+            num_workers=self._num_workers if num_workers is None else num_workers,
+            shuffle=shuffle)
 
-    def test_dataloader(self, batch_size=None, num_workers=None):
+    def test_dataloader(self, batch_size=None, num_workers=None, shuffle=False):
         return DataLoader(self._test,
             batch_size=batch_size or self.batch_size, collate_fn=self.collate,
-            num_workers=self._num_workers if num_workers is None else num_workers)
+            num_workers=self._num_workers if num_workers is None else num_workers,
+            shuffle=shuffle)
14 changes: 8 additions & 6 deletions comp550/dataset/stanford_sentiment.py
@@ -164,18 +164,20 @@ def uncollate(self, batch):
                 in zip(batch['sentence'], batch['mask'], batch['length'],
                        batch['label'], batch['index'])]
 
-    def train_dataloader(self, batch_size=None, num_workers=None):
+    def train_dataloader(self, batch_size=None, num_workers=None, shuffle=True):
         return DataLoader(self._train,
             batch_size=batch_size or self.batch_size, collate_fn=self.collate,
             num_workers=self._num_workers if num_workers is None else num_workers,
-            shuffle=True)
+            shuffle=shuffle)
 
-    def val_dataloader(self, batch_size=None, num_workers=None):
+    def val_dataloader(self, batch_size=None, num_workers=None, shuffle=False):
         return DataLoader(self._val,
             batch_size=batch_size or self.batch_size, collate_fn=self.collate,
-            num_workers=self._num_workers if num_workers is None else num_workers)
+            num_workers=self._num_workers if num_workers is None else num_workers,
+            shuffle=shuffle)
 
-    def test_dataloader(self, batch_size=None, num_workers=None):
+    def test_dataloader(self, batch_size=None, num_workers=None, shuffle=False):
         return DataLoader(self._test,
             batch_size=batch_size or self.batch_size, collate_fn=self.collate,
-            num_workers=self._num_workers if num_workers is None else num_workers)
+            num_workers=self._num_workers if num_workers is None else num_workers,
+            shuffle=shuffle)