@@ -168,6 +168,7 @@ def forward(
         deterministic=False,
         num_heads_q=None,
         sm_margin=0,
+        return_softmax=False,
     ):
         if softmax_scale is None:
             softmax_scale = qkv.shape[-1] ** (-0.5)
@@ -210,8 +211,7 @@ def forward(
         ctx.deterministic = deterministic
         ctx.ndim = qkv.dim()
         ctx.sm_margin = sm_margin
-        # return out, softmax_lse
-        return out
+        return (out, softmax_lse) if return_softmax else out
 
     @staticmethod
     def backward(ctx, dout, *args):
@@ -270,6 +270,7 @@ def forward(
         pack_gqa=None,
         deterministic=False,
         sm_margin=0,
+        return_softmax=False,
     ):
         if softmax_scale is None:
             softmax_scale = (q.shape[-1] + (qv.shape[-1] if qv is not None else 0)) ** (-0.5)
@@ -305,7 +306,7 @@ def forward(
         ctx.softcap = softcap
         ctx.deterministic = deterministic
         ctx.sm_margin = sm_margin
-        return out
+        return (out, softmax_lse) if return_softmax else out
 
     @staticmethod
     def backward(ctx, dout, *args):
@@ -363,6 +364,7 @@ def forward(
         pack_gqa=None,
         deterministic=False,
         sm_margin=0,
+        return_softmax=False,
     ):
         if softmax_scale is None:
             softmax_scale = (q.shape[-1] + (qv.shape[-1] if qv is not None else 0)) ** (-0.5)
@@ -404,7 +406,7 @@ def forward(
         ctx.softcap = softcap
         ctx.deterministic = deterministic
         ctx.sm_margin = sm_margin
-        return out
+        return (out, softmax_lse) if return_softmax else out
 
     @staticmethod
     def backward(ctx, dout, *args):
@@ -451,6 +453,7 @@ def flash_attn_qkvpacked_func(
     deterministic=False,
     num_heads_q=None,
     sm_margin=0,
+    return_attn_probs=False,
 ):
     """dropout_p should be set to 0.0 during evaluation
     If Q, K, V are already stacked into 1 tensor, this function will be faster than
@@ -497,6 +500,7 @@ def flash_attn_qkvpacked_func(
         deterministic,
         num_heads_q,
         sm_margin,
+        return_attn_probs,
     )
 
 
@@ -515,6 +519,7 @@ def flash_attn_func(
     pack_gqa=None,
     deterministic=False,
     sm_margin=0,
+    return_attn_probs=False,
 ):
     """dropout_p should be set to 0.0 during evaluation
     Supports multi-query and grouped-query attention (MQA/GQA) by passing in KV with fewer heads
@@ -576,6 +581,7 @@ def flash_attn_func(
         pack_gqa,
         deterministic,
         sm_margin,
+        return_attn_probs,
     )
 
 
@@ -600,6 +606,7 @@ def flash_attn_varlen_func(
     pack_gqa=None,
     deterministic=False,
     sm_margin=0,
+    return_attn_probs=False,
 ):
     return FlashAttnVarlenFunc.apply(
         q,
@@ -622,6 +629,7 @@ def flash_attn_varlen_func(
         pack_gqa,
         deterministic,
         sm_margin,
+        return_attn_probs,
     )
 
 