Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Generate assignment loops for large preintegration tables #25

Closed
wants to merge 31 commits into from
Closed
Show file tree
Hide file tree
Changes from 22 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
5f6d990
Generate assignment loops for large preintegration tables
w1th0utnam3 Mar 27, 2018
2da258b
Update reference data pointer to fd47851ba0790b5fad5e97d3fa0dee39bcc4…
w1th0utnam3 Mar 27, 2018
a84bc43
Add Fabian to Authors
blechta May 3, 2018
0b37f6b
Fixes for flake8 errors
w1th0utnam3 May 9, 2018
954e047
Merge pull request #28 from w1th0utnam3/fabian/fix-issue-173-rebase
blechta May 26, 2018
6bc1126
Merge branch 'master' into fabian/fix-issue-173-rebase
blechta Jul 6, 2018
b08263e
Remove remnant of conflict resolution
blechta Jul 6, 2018
125b2bf
Add back cnodes.MemZero using memset
blechta Jul 6, 2018
8812964
Sort unrolling in preintegratom
blechta Jul 6, 2018
2948e39
Revert change in demo
blechta Jul 6, 2018
f05e44a
flake8 fix
blechta Jul 6, 2018
58a203d
Fix zeroing preintegrated unrolled/looped mixed case
blechta Jul 6, 2018
4556fe2
Fix uninitialized variable in preintegration representation
blechta Jul 6, 2018
18f6bd3
Fix bug in MemZero
blechta Jul 6, 2018
4e5de24
Merge remote-tracking branch 'upstream/fabian/fix-issue-173-rebase' i…
w1th0utnam3 Jul 7, 2018
9d577d7
Use finalization_blocks for non-unrolled preintegrated blocks
w1th0utnam3 Jul 7, 2018
3681cb6
Removed blank line
w1th0utnam3 Jul 7, 2018
dd67f34
Merge pull request #39 from w1th0utnam3/fabian/fix-issue-173-rebase
blechta Jul 7, 2018
69e21b1
Remove some redundant code
w1th0utnam3 Jul 7, 2018
42a69a8
Remove code duplication for non-unrolled preintegrated blocks
w1th0utnam3 Jul 7, 2018
b41e97f
Fix flake8 error
w1th0utnam3 Jul 7, 2018
5006c67
Merge pull request #40 from w1th0utnam3/fabian/fix-issue-173-rebase
blechta Jul 8, 2018
7df243f
Issue copyout comment
blechta Jul 8, 2018
ec64ccb
Skip non-unrolled preintegrated blocks in unrolled code generation
w1th0utnam3 Jul 8, 2018
1444670
Merge pull request #41 from w1th0utnam3/fabian/fix-issue-173-rebase
w1th0utnam3 Jul 8, 2018
b8e51a7
Remove unnecessary line break
w1th0utnam3 Jul 8, 2018
e97e618
Merge pull request #42 from w1th0utnam3/fabian/fix-issue-173-rebase
w1th0utnam3 Jul 8, 2018
3a99317
Revert "Fix bug in MemZero"
blechta Jul 6, 2018
6565742
Revert "Add back cnodes.MemZero using memset"
blechta Jul 6, 2018
9550678
Revert "Update reference data pointer to fd47851ba0790b5fad5e97d3fa0d…
w1th0utnam3 Mar 27, 2018
45a7db1
Merge pull request #50 from w1th0utnam3/fabian/fix-issue-173-rebase
w1th0utnam3 Jul 20, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions AUTHORS
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,10 @@ Contributors:
email: dag@f.kth.se
www: http://www.f.kth.se/~dag/

Fabian Löschner
email: fabian.loeschner@rwth-aachen.de
www: https://w1th0utnam3.github.io/

Ola Skavhaug
email: skavhaug@simula.no
www: http://home.simula.no/~skavhaug/
Expand Down
15 changes: 13 additions & 2 deletions ffc/uflacs/build_uflacs_ir.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def get_common_block_data(blockdata):


preintegrated_block_data_t = namedtuple("preintegrated_block_data_t",
common_block_data_fields + ["is_uniform", "name"])
common_block_data_fields + ["is_uniform", "name", "unroll", "inline"])

premultiplied_block_data_t = namedtuple("premultiplied_block_data_t",
common_block_data_fields + ["is_uniform", "name"])
Expand Down Expand Up @@ -195,6 +195,7 @@ def uflacs_default_parameters(optimize):
"enable_sum_factorization": False,
"enable_block_transpose_reuse": False,
"enable_table_zero_compression": False,
"max_preintegrated_unrolled_table_size": 1024,

# Code generation parameters
"vectorize": False,
Expand Down Expand Up @@ -543,8 +544,18 @@ def build_uflacs_ir(cell, integral_type, entitytype, integrands, tensor_shape,
unique_table_num_dofs)
ptable = clamp_table_small_numbers(
ptable, rtol=p["table_rtol"], atol=p["table_atol"])
else:
ptable = unique_tables[pname]

# Decide whether to unroll dofblock assignment
max_unroll_size = ir["params"]["max_preintegrated_unrolled_table_size"]
unroll = numpy.prod(ptable.shape[1:]) <= max_unroll_size # First dimension is entity
inline = unroll and integral_type == "cell"

if pname is None:
# Store the table on the cache miss
pname = "PI%d" % (len(cache, ))
pname += "_inline" if inline else ""
cache[unames] = pname
unique_tables[pname] = ptable
unique_table_types[pname] = "preintegrated"
Expand All @@ -553,7 +564,7 @@ def build_uflacs_ir(cell, integral_type, entitytype, integrands, tensor_shape,
block_unames = (pname, )
blockdata = preintegrated_block_data_t(
block_mode, ttypes, factor_index, factor_is_piecewise, block_unames,
block_restrictions, block_is_transposed, block_is_uniform, pname)
block_restrictions, block_is_transposed, block_is_uniform, pname, unroll, inline)
block_is_piecewise = True

elif block_mode == "premultiplied":
Expand Down
97 changes: 51 additions & 46 deletions ffc/uflacs/integralgenerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from ufl import product
from ufl.classes import Condition
from ufl.measure import custom_integral_types, point_integral_types
from ufl.utils.indexflattening import shape_to_strides

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -289,7 +290,6 @@ def generate_element_tables(self):

tables = self.ir["unique_tables"]
table_types = self.ir["unique_table_types"]
inline_tables = self.ir["integral_type"] == "cell"

alignas = self.ir["params"]["alignas"]
padlen = self.ir["params"]["padlen"]
Expand All @@ -304,14 +304,14 @@ def generate_element_tables(self):
for name in table_names:
table = tables[name]

# Don't pad preintegrated tables
# Don't pad preintegrated tables. FIXME: Why?!
if name[0] == "P":
p = 1
else:
p = padlen

# Skip tables that are inlined in code generation
if inline_tables and name[:2] == "PI":
if "inline" in name:
continue

decl = L.ArrayDecl(
Expand Down Expand Up @@ -591,9 +591,13 @@ def generate_dofblock_partition(self, num_points):

blocks = [(blockmap, blockdata)
for blockmap, contributions in sorted(block_contributions.items())
for blockdata in contributions if blockdata.block_mode != "preintegrated"]
for blockdata in contributions]

for blockmap, blockdata in blocks:
# Skip unrolled preintegration blocks
if blockdata.block_mode == "preintegrated" and blockdata.unroll:
continue

# Get symbol for already defined block B if it exists
common_block_data = get_common_block_data(blockdata)
B = self.shared_blocks.get(common_block_data)
Expand Down Expand Up @@ -793,7 +797,7 @@ def generate_block_parts(self, num_points, blockmap, blockdata):
# Plan for vectorization of coefficient evaluation over iq:
# 1) Define w0_c1 etc as arrays e.g. "double w0_c1[nq] = {};" outside quadloop
# 2) Access as w0_c1[iq] of course
# 3) Splitquadrature loops, coefficients before fw computation
# 3) Split quadrature loops, coefficients before fw computation
# 4) Possibly swap loops over iq and ic:
# for(ic) for(iq) w0_c1[iq] = w[0][ic] * FE[iq][ic];

Expand Down Expand Up @@ -904,6 +908,10 @@ def generate_block_parts(self, num_points, blockmap, blockdata):
# Preintegrated should never get into quadloops
assert num_points is None

# Inlining is only possible with unrolled blocks, which should not be passed to this function
assert not blockdata.unroll
assert not blockdata.inline

# Define B = B_rhs = f * PI where PI = sum_q weight * u * v
PI = L.Symbol(blockdata.name)[P_ii]
B_rhs = L.float_product([f, PI])
Expand Down Expand Up @@ -945,22 +953,18 @@ def generate_preintegrated_dofblock_partition(self):
# Get symbol, dimensions, and loop index symbols for A
A_shape = self.ir["tensor_shape"]
A_size = product(A_shape)
A_rank = len(A_shape)

# TODO: there's something like shape2strides(A_shape) somewhere
A_strides = [1] * A_rank
for i in reversed(range(0, A_rank - 1)):
A_strides[i] = A_strides[i + 1] * A_shape[i + 1]
A_strides = shape_to_strides(A_shape)

# List for unrolled assignments
A_values = [0.0] * A_size

for blockmap, blockdata in blocks:
# Generate code for unrolled blocks, non-unrolled blocks are treated together with premultiplied blocks
for block_id, (blockmap, blockdata) in enumerate(blocks):
# Accumulate A[blockmap[...]] += f*PI[...]

# Get table for inlining
tables = self.ir["unique_tables"]
table = tables[blockdata.name]
inline_table = self.ir["integral_type"] == "cell"

# Get factor expression
v = self.ir["piecewise_ir"]["V"][blockdata.factor_index]
Expand All @@ -969,45 +973,46 @@ def generate_preintegrated_dofblock_partition(self):
# Define rhs expression for A[blockmap[arg_indices]] += A_rhs
# A_rhs = f * PI where PI = sum_q weight * u * v
PI = L.Symbol(blockdata.name)
# block_rank = len(blockmap)

# # Override dof index with quadrature loop index for arguments with
# # quadrature element, to index B like B[iq*num_dofs + iq]
# arg_indices = tuple(
# self.backend.symbols.argument_loop_index(i) for i in range(block_rank))

# Define indices into preintegrated block
P_entity_indices = self.get_entities(blockdata)
if inline_table:
if blockdata.inline:
assert P_entity_indices == (L.LiteralInt(0), )
assert table.shape[0] == 1
assert ("inline" in blockdata.name) == blockdata.inline

# Unroll loop
blockshape = [len(DM) for DM in blockmap]
blockrange = [range(d) for d in blockshape]

for ii in itertools.product(*blockrange):
A_ii = sum(A_strides[i] * blockmap[i][ii[i]] for i in range(len(ii)))
if blockdata.transposed:
P_arg_indices = (ii[1], ii[0])
else:
P_arg_indices = ii

if inline_table:
# Extract float value of PI[P_ii]
Pval = table[0] # always entity 0
for i in P_arg_indices:
Pval = Pval[i]
A_rhs = Pval * f
else:
# Index the static preintegrated table:
P_ii = P_entity_indices + P_arg_indices
A_rhs = f * PI[P_ii]
if blockdata.unroll:
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So we could have just if blockdata.unroll: continue at the beginning of the loop?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, good catch.

# Generate unrolled assignments for the current block
for ii in itertools.product(*blockrange):
A_ii = sum(A_strides[i] * blockmap[i][ii[i]]
for i in range(len(ii)))
if blockdata.transposed:
P_arg_indices = (ii[1], ii[0])
else:
P_arg_indices = ii

if blockdata.inline:
# Extract float value of PI[P_ii]
Pval = table[0] # always entity 0
for i in P_arg_indices:
Pval = Pval[i]
A_rhs = Pval * f
else:
# Index the static preintegrated table:
P_ii = P_entity_indices + P_arg_indices
A_rhs = f * PI[P_ii]

A_values[A_ii] += A_rhs

A_values[A_ii] = A_values[A_ii] + A_rhs
# Generate unrolled code zeroing whole tensor
code_unroll = self.generate_tensor_value_initialization(A_values)
code_unroll = L.commented_code_list(code_unroll, "UFLACS block mode: preintegrated unroll")

code = self.generate_tensor_value_initialization(A_values)
return L.commented_code_list(code, "UFLACS block mode: preintegrated")
return code_unroll

def generate_tensor_value_initialization(self, A_values):
parts = []
Expand All @@ -1016,9 +1021,14 @@ def generate_tensor_value_initialization(self, A_values):
A = self.backend.symbols.element_tensor()
A_size = len(A_values)

init_mode = self.ir["params"]["tensor_init_mode"]
z = L.LiteralFloat(0.0)

if all(A[j] in [0.0, z] for j in range(A_size)):
# We are just zeroing the tensor
init_mode = "upfront"
else:
init_mode = self.ir["params"]["tensor_init_mode"]

k = L.Symbol("k") # Index for zeroing arrays

if init_mode == "direct":
Expand Down Expand Up @@ -1099,11 +1109,6 @@ def generate_tensor_copyout_statements(self):
A_shape = self.ir["tensor_shape"]
A_rank = len(A_shape)

# TODO: there's something like shape2strides(A_shape) somewhere
A_strides = [1] * A_rank
for i in reversed(range(0, A_rank - 1)):
A_strides[i] = A_strides[i + 1] * A_shape[i + 1]

Asym = self.backend.symbols.element_tensor()
A = L.FlattenedArray(Asym, dims=A_shape)

Expand Down
16 changes: 4 additions & 12 deletions ffc/uflacs/language/cnodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,18 +93,10 @@ def float_product(factors):
return Product(factors)


# Note: removed as part of C++ -> C transition. Not using memset because
# it is not safe for floats
# def MemZeroRange(name, begin, end):
# name = as_cexpr_or_string_symbol(name)
# return Call("std::fill", (name + begin, name + end, LiteralFloat(0.0)))

# Note: removed as part of C++ -> C transition. Not using memset because
# it is not safe for floats
# def MemZero(name, size):
# name = as_cexpr_or_string_symbol(name)
# size = as_cexpr(size)
# return Call("std::fill_n", (name, size, LiteralFloat(0.0)))
def MemZero(name, size):
    """Build a C ``memset(name, 0, size * sizeof(*name))`` call node.

    Zeroes ``size`` elements of the array named ``name``.
    NOTE(review): assumes all-zero bytes represent 0.0 for floating-point
    arrays (true on IEEE 754 platforms) — confirm acceptable for all targets.
    """
    name = as_cexpr_or_string_symbol(name)
    # Express the byte count as C source text so sizeof(*name) is resolved
    # by the C compiler against the pointee type, not guessed here.
    size = as_cexpr_or_string_symbol("{} * sizeof(*{})".format(size, name))
    return Call("memset", (name, LiteralInt(0), size))


def MemCopy(src, dst, size, type):
Expand Down
2 changes: 1 addition & 1 deletion test/regression/ffc-reference-data-id
Original file line number Diff line number Diff line change
@@ -1 +1 @@
e9e9321dfaf9251e159dc7263505299a8a59035f
fd47851ba0790b5fad5e97d3fa0dee39bcc49d0f