Description
The following code
```python
import torch
from nvfuser import FusionDefinition, DataType

def nvfuser_fusion_id9(fd : FusionDefinition) -> None :
    T0 = fd.define_tensor(shape=[1, -1, -1], contiguity=[None, True, True], dtype=DataType.Float, is_cpu=False, stride_order=[2, 1, 0])
    T1 = fd.define_tensor(shape=[-1, -1], contiguity=[True, True], dtype=DataType.Float, is_cpu=False, stride_order=[1, 0])
    T2 = fd.ops.sum(T1, axes=[1], keepdim=False, dtype=DataType.Null)
    T3 = fd.ops.sum(T0, axes=[1, 0], keepdim=False, dtype=DataType.Null)
    S4 = fd.define_scalar(4, dtype=DataType.Int)
    V5 = fd.define_vector([S4], dtype=DataType.Int)
    T6 = fd.ops.reshape(T2, new_shape=V5)
    S7 = fd.define_scalar(4, dtype=DataType.Int)
    V8 = fd.define_vector([S7], dtype=DataType.Int)
    T9 = fd.ops.reshape(T3, new_shape=V8)
    T10 = fd.ops.mul(T6, T9)
    T11 = fd.ops.sum(T10, axes=[0], keepdim=False, dtype=DataType.Null)
    fd.add_output(T11)

with FusionDefinition() as fd:
    nvfuser_fusion_id9(fd)

inputs = [
    torch.randn((12,), dtype=torch.float32, device='cuda:0').as_strided((1, 3, 4), (12, 4, 1)),
    torch.randn((12,), dtype=torch.float32, device='cuda:0').as_strided((4, 3), (3, 1)),
]

fd.execute(inputs)
```
fails with
```
RuntimeError: old_rfactor.size() == new_rfactor.size() INTERNAL ASSERT FAILED at "/workspace/Fuser/csrc/dynamic_transform.cpp":652, please report a bug with repro script to NVFuser at https://github.com/NVIDIA/Fuser/issues. Concretized reshape rfactor size does not match symbolic rfactor
```
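For reference, my reading of the fused computation as plain eager-mode PyTorch (an assumed equivalent of the fusion definition above, not something the failing path produces) runs without error on the same inputs:

```python
# Assumed eager-mode equivalent of the fusion, for reference only.
# T2 = sum(T1, dim=1) -> (4,); T3 = sum(T0, dims=(1, 0)) -> (4,);
# both reshaped to (4,), multiplied elementwise, then reduced to a scalar.
t0, t1 = inputs
ref = (t1.sum(dim=1).reshape(4) * t0.sum(dim=(1, 0)).reshape(4)).sum(dim=0)
print(ref)
```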