fix small bug where sequence length is not passed into attention class (#21)

* fix small bug where sequence length is not passed into attention class

* fix bug with the mask and half-precision values, as well as masking in dense attention

* make sure deepspeed is installed with pip sudo

This allows `gpt3small` to run but does not fix the problems with sparse attention. See #22
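
For context on the half-precision fix: a hard-coded constant like `MASK_VALUE = -1e7` lies outside the representable range of `float16` (whose largest finite value is about 65504), so casting it yields `-inf`, which can in turn produce `nan` in the softmax or its backward pass. Taking the largest finite value of the tensor's own dtype keeps the mask finite. A minimal sketch, not part of this commit, assuming only PyTorch:

```python
import torch

# fp16 cannot represent -1e7; the cast overflows to -inf
print(torch.finfo(torch.float16).max)           # 65504.0
print(torch.tensor(-1e7, dtype=torch.float16))  # tensor(-inf, dtype=torch.float16)

# dtype-aware mask value, matching the change in gpt_neox.py below
q = torch.randn(2, 8, 4, 64, dtype=torch.float16)
mask_value = -torch.finfo(q.dtype).max          # -65504.0: very negative, but still finite in fp16
print(mask_value)
```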
lucidrains committed Jan 1, 2021
1 parent ef9369f commit 7043aac
Showing 2 changed files with 7 additions and 10 deletions.
15 changes: 6 additions & 9 deletions gpt_neox/gpt_neox.py
@@ -4,10 +4,6 @@
 
 from einops import rearrange
 
-# constants
-
-MASK_VALUE = -1e7
-
 # helpers
 
 def exists(val):
@@ -49,12 +45,12 @@ def forward(self, x, **kwargs):
 
 # attention
 
-def dense_attn(q, k, v, key_padding_mask = None, dropout_fn = None):
+def dense_attn(q, k, v, attn_mask = None, dropout_fn = None):
     scale = q.shape[-1] ** -0.5
     sim = einsum('b h i d, b h j d -> b h i j', q, k) * scale
 
-    if exists(key_padding_mask):
-        sim = sim + key_padding_mask[:, None, :, :]
+    if exists(attn_mask):
+        sim = sim + attn_mask[None, None, :, :]
 
     attn = sim.softmax(dim=-1)
 
@@ -103,7 +99,8 @@ def forward(self, x, **kwargs):
         i, j = q.shape[-2], k.shape[-2]
         bool_mask = torch.ones(i, j, device=device).triu_(j - i + 1).bool()
         mask = torch.zeros(i, j, device=device).to(q)
-        mask.masked_fill_(bool_mask, MASK_VALUE)
+        mask_value = -torch.finfo(q.dtype).max
+        mask.masked_fill_(bool_mask, mask_value)
 
         out = self.attn_fn(q, k, v, attn_mask=mask)
         out = rearrange(out, 'b h n d -> b n (h d)')
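
The offset `j - i + 1` in the boolean mask above keeps the causal pattern aligned when the key length `j` exceeds the query length `i`; for the square case `i == j` it reduces to the usual strictly-upper-triangular mask. A sketch with illustrative shapes, assuming only PyTorch:

```python
import torch

i, j = 3, 5  # e.g. 3 new queries attending over 5 keys; shapes are illustrative only
bool_mask = torch.ones(i, j).triu_(j - i + 1).bool()
print(bool_mask)
# tensor([[False, False, False,  True,  True],
#         [False, False, False, False,  True],
#         [False, False, False, False, False]])
# row k corresponds to absolute position (j - i) + k and may attend to keys 0..(j - i) + k
```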
@@ -132,7 +129,7 @@ def __init__(self, *, num_tokens, dim, seq_len, depth, heads=8, dim_head=64, att
 
         for _, layer_sparse_attn in zip(range(depth), layers_sparse_attn):
             self.layers.append(nn.ModuleList([
-                PreNorm(dim, norm_class, Attention(dim=dim, heads=heads, dim_head=dim_head, dropout=attn_dropout, sparse_attn=layer_sparse_attn)),
+                PreNorm(dim, norm_class, Attention(dim=dim, heads=heads, seq_len=seq_len, dim_head=dim_head, dropout=attn_dropout, sparse_attn=layer_sparse_attn)),
                 PreNorm(dim, norm_class, FeedForward(dim=dim, dropout=ff_dropout)),
             ]))
 
2 changes: 1 addition & 1 deletion install_deepspeed.sh
@@ -1,3 +1,3 @@
 sudo apt-get -y install llvm-9-dev cmake
 git clone https://github.com/microsoft/DeepSpeed.git /tmp/Deepspeed
-cd /tmp/Deepspeed && DS_BUILD_SPARSE_ATTN=1 ./install.sh
+cd /tmp/Deepspeed && DS_BUILD_SPARSE_ATTN=1 ./install.sh -s