Fix #8 again

Beomi · Apr 17, 2024 · c2f0746 · c2f0746
1 parent ca0b9ea
commit c2f0746
Showing 1 changed file with 4 additions and 2 deletions.
diff --git a/modeling_gemma.py b/modeling_gemma.py
@@ -836,8 +836,10 @@ def _retrieve_from_memory(self, query_states):
         debug_print("[Retrieve] self.norm_term.shape", self.norm_term.shape)
 
         # Compute the normalization term as query_states @ norm_term^T (norm_term transposed over its last two dims)
-        norm_term_broadcastable = self.norm_term.expand_as(query_states).sum(
-            dim=3, keepdim=True
+        norm_term_broadcastable = torch.matmul(
+            query_states,
+            self.norm_term
+            .transpose(-2, -1),
         )
         debug_print(
             "[Broadcast] norm_term_broadcastable.shape", norm_term_broadcastable.shape