fix small bug where sequence length is not passed into attention class (#21)

* fix small bug where sequence length is not passed into attention class

* fix bug with the mask and half-precision values, as well as masking in dense attention

* make sure deepspeed is installed with pip sudo

This allows `gpt3small` to run but does not fix the problems with sparse attention. See #22
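
For context on the half-precision fix: a hard-coded constant like `MASK_VALUE = -1e7` lies outside the representable range of `float16` (whose largest finite value is about 65504), so casting it yields `-inf`, which can in turn produce `nan` in the softmax or its backward pass. Taking the largest finite value of the tensor's own dtype keeps the mask finite. A minimal sketch, not part of this commit, assuming only PyTorch:

```python
import torch

# fp16 cannot represent -1e7; the cast overflows to -inf
print(torch.finfo(torch.float16).max)           # 65504.0
print(torch.tensor(-1e7, dtype=torch.float16))  # tensor(-inf, dtype=torch.float16)

# dtype-aware mask value, matching the change in gpt_neox.py below
q = torch.randn(2, 8, 4, 64, dtype=torch.float16)
mask_value = -torch.finfo(q.dtype).max          # -65504.0: very negative, but still finite in fp16
print(mask_value)
```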
lucidrains committed Jan 1, 2021
1 parent ef9369f commit 7043aac
Showing 2 changed files with 7 additions and 10 deletions.
15 changes: 6 additions & 9 deletions gpt_neox/gpt_neox.py
@@ -4,10 +4,6 @@
 
 from einops import rearrange
 
-# constants
-
-MASK_VALUE = -1e7
-
 # helpers
 
 def exists(val):
@@ -49,12 +45,12 @@ def forward(self, x, **kwargs):
 
 # attention
 
-def dense_attn(q, k, v, key_padding_mask = None, dropout_fn = None):
+def dense_attn(q, k, v, attn_mask = None, dropout_fn = None):
     scale = q.shape[-1] ** -0.5
     sim = einsum('b h i d, b h j d -> b h i j', q, k) * scale
 
-    if exists(key_padding_mask):
-        sim = sim + key_padding_mask[:, None, :, :]
+    if exists(attn_mask):
+        sim = sim + attn_mask[None, None, :, :]
 
     attn = sim.softmax(dim=-1)
 
@@ -103,7 +99,8 @@ def forward(self, x, **kwargs):
         i, j = q.shape[-2], k.shape[-2]
         bool_mask = torch.ones(i, j, device=device).triu_(j - i + 1).bool()
         mask = torch.zeros(i, j, device=device).to(q)
-        mask.masked_fill_(bool_mask, MASK_VALUE)
+        mask_value = -torch.finfo(q.dtype).max
+        mask.masked_fill_(bool_mask, mask_value)
 
         out = self.attn_fn(q, k, v, attn_mask=mask)
         out = rearrange(out, 'b h n d -> b n (h d)')
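
The offset `j - i + 1` in the boolean mask above keeps the causal pattern aligned when the key length `j` exceeds the query length `i`; for the square case `i == j` it reduces to the usual strictly-upper-triangular mask. A sketch with illustrative shapes, assuming only PyTorch:

```python
import torch

i, j = 3, 5  # e.g. 3 new queries attending over 5 keys; shapes are illustrative only
bool_mask = torch.ones(i, j).triu_(j - i + 1).bool()
print(bool_mask)
# tensor([[False, False, False,  True,  True],
#         [False, False, False, False,  True],
#         [False, False, False, False, False]])
# row k corresponds to absolute position (j - i) + k and may attend to keys 0..(j - i) + k
```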
@@ -132,7 +129,7 @@ def __init__(self, *, num_tokens, dim, seq_len, depth, heads=8, dim_head=64, att
 
         for _, layer_sparse_attn in zip(range(depth), layers_sparse_attn):
             self.layers.append(nn.ModuleList([
-                PreNorm(dim, norm_class, Attention(dim=dim, heads=heads, dim_head=dim_head, dropout=attn_dropout, sparse_attn=layer_sparse_attn)),
+                PreNorm(dim, norm_class, Attention(dim=dim, heads=heads, seq_len=seq_len, dim_head=dim_head, dropout=attn_dropout, sparse_attn=layer_sparse_attn)),
                 PreNorm(dim, norm_class, FeedForward(dim=dim, dropout=ff_dropout)),
             ]))
 
2 changes: 1 addition & 1 deletion install_deepspeed.sh
@@ -1,3 +1,3 @@
 sudo apt-get -y install llvm-9-dev cmake
 git clone https://github.com/microsoft/DeepSpeed.git /tmp/Deepspeed
-cd /tmp/Deepspeed && DS_BUILD_SPARSE_ATTN=1 ./install.sh
+cd /tmp/Deepspeed && DS_BUILD_SPARSE_ATTN=1 ./install.sh -s