diff --git a/AUTHORS b/AUTHORS
index 57d62f031..43359304a 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -49,6 +49,10 @@ Contributors:
     email: dag@f.kth.se
     www:   http://www.f.kth.se/~dag/
 
+    Fabian Löschner
+    email: fabian.loeschner@rwth-aachen.de
+    www:   https://w1th0utnam3.github.io/
+
     Ola Skavhaug
     email: skavhaug@simula.no
     www:   http://home.simula.no/~skavhaug/
diff --git a/ffc/uflacs/build_uflacs_ir.py b/ffc/uflacs/build_uflacs_ir.py
index 7d9e26404..3610ebc3f 100644
--- a/ffc/uflacs/build_uflacs_ir.py
+++ b/ffc/uflacs/build_uflacs_ir.py
@@ -59,7 +59,7 @@ def get_common_block_data(blockdata):
 
 
 preintegrated_block_data_t = namedtuple("preintegrated_block_data_t",
-                                        common_block_data_fields + ["is_uniform", "name"])
+                                        common_block_data_fields + ["is_uniform", "name", "unroll", "inline"])
 
 premultiplied_block_data_t = namedtuple("premultiplied_block_data_t",
                                         common_block_data_fields + ["is_uniform", "name"])
@@ -195,6 +195,7 @@ def uflacs_default_parameters(optimize):
         "enable_sum_factorization": False,
         "enable_block_transpose_reuse": False,
         "enable_table_zero_compression": False,
+        "max_preintegrated_unrolled_table_size": 1024,
 
         # Code generation parameters
         "vectorize": False,
@@ -543,8 +544,18 @@ def build_uflacs_ir(cell, integral_type, entitytype, integrands, tensor_shape,
                                                  unique_table_num_dofs)
                     ptable = clamp_table_small_numbers(
                         ptable, rtol=p["table_rtol"], atol=p["table_atol"])
+                else:
+                    ptable = unique_tables[pname]
+
+                # Decide whether to unroll dofblock assignment
+                max_unroll_size = ir["params"]["max_preintegrated_unrolled_table_size"]
+                unroll = numpy.prod(ptable.shape[1:]) <= max_unroll_size  # First dimension is entity
+                inline = unroll and integral_type == "cell"
 
+                if pname is None:
+                    # Store the table on the cache miss
                     pname = "PI%d" % (len(cache, ))
+                    pname += "_inline" if inline else ""
                     cache[unames] = pname
                     unique_tables[pname] = ptable
                     unique_table_types[pname] = "preintegrated"
@@ -553,7 +564,7 @@ def build_uflacs_ir(cell, integral_type, entitytype, integrands, tensor_shape,
                 block_unames = (pname, )
                 blockdata = preintegrated_block_data_t(
                     block_mode, ttypes, factor_index, factor_is_piecewise, block_unames,
-                    block_restrictions, block_is_transposed, block_is_uniform, pname)
+                    block_restrictions, block_is_transposed, block_is_uniform, pname, unroll, inline)
                 block_is_piecewise = True
 
             elif block_mode == "premultiplied":
diff --git a/ffc/uflacs/integralgenerator.py b/ffc/uflacs/integralgenerator.py
index 112400d8a..a73a52ed4 100644
--- a/ffc/uflacs/integralgenerator.py
+++ b/ffc/uflacs/integralgenerator.py
@@ -17,6 +17,7 @@
 from ufl import product
 from ufl.classes import Condition
 from ufl.measure import custom_integral_types, point_integral_types
+from ufl.utils.indexflattening import shape_to_strides
 
 logger = logging.getLogger(__name__)
 
@@ -289,7 +290,6 @@ def generate_element_tables(self):
 
         tables = self.ir["unique_tables"]
         table_types = self.ir["unique_table_types"]
-        inline_tables = self.ir["integral_type"] == "cell"
 
         alignas = self.ir["params"]["alignas"]
         padlen = self.ir["params"]["padlen"]
@@ -304,14 +304,14 @@ def generate_element_tables(self):
         for name in table_names:
             table = tables[name]
 
-            # Don't pad preintegrated tables
+            # Don't pad preintegrated tables. FIXME: Why?!
             if name[0] == "P":
                 p = 1
             else:
                 p = padlen
 
             # Skip tables that are inlined in code generation
-            if inline_tables and name[:2] == "PI":
+            if "inline" in name:
                 continue
 
             decl = L.ArrayDecl(
@@ -591,9 +591,13 @@ def generate_dofblock_partition(self, num_points):
 
         blocks = [(blockmap, blockdata)
                   for blockmap, contributions in sorted(block_contributions.items())
-                  for blockdata in contributions if blockdata.block_mode != "preintegrated"]
+                  for blockdata in contributions]
 
         for blockmap, blockdata in blocks:
+            # Skip unrolled preintegration blocks
+            if blockdata.block_mode == "preintegrated" and blockdata.unroll:
+                continue
+
             # Get symbol for already defined block B if it exists
             common_block_data = get_common_block_data(blockdata)
             B = self.shared_blocks.get(common_block_data)
@@ -793,7 +797,7 @@ def generate_block_parts(self, num_points, blockmap, blockdata):
                 # Plan for vectorization of coefficient evaluation over iq:
                 # 1) Define w0_c1 etc as arrays e.g. "double w0_c1[nq] = {};" outside quadloop
                 # 2) Access as w0_c1[iq] of course
-                # 3) Splitquadrature loops, coefficients before fw computation
+                # 3) Split quadrature loops, coefficients before fw computation
                 # 4) Possibly swap loops over iq and ic:
                 #    for(ic) for(iq) w0_c1[iq] = w[0][ic] * FE[iq][ic];
 
@@ -904,6 +908,10 @@ def generate_block_parts(self, num_points, blockmap, blockdata):
                 # Preintegrated should never get into quadloops
                 assert num_points is None
 
+                # Inlining is only possible with unrolled blocks, which should not be passed to this function
+                assert not blockdata.unroll
+                assert not blockdata.inline
+
                 # Define B = B_rhs = f * PI where PI = sum_q weight * u * v
                 PI = L.Symbol(blockdata.name)[P_ii]
                 B_rhs = L.float_product([f, PI])
@@ -945,22 +953,22 @@ def generate_preintegrated_dofblock_partition(self):
         # Get symbol, dimensions, and loop index symbols for A
         A_shape = self.ir["tensor_shape"]
         A_size = product(A_shape)
-        A_rank = len(A_shape)
-
-        # TODO: there's something like shape2strides(A_shape) somewhere
-        A_strides = [1] * A_rank
-        for i in reversed(range(0, A_rank - 1)):
-            A_strides[i] = A_strides[i + 1] * A_shape[i + 1]
+        A_strides = shape_to_strides(A_shape)
 
+        # List for unrolled assignments
         A_values = [0.0] * A_size
 
+        # Generate block-wise assignments
         for blockmap, blockdata in blocks:
-            # Accumulate A[blockmap[...]] += f*PI[...]
+            # Non-unrolled blocks are handled by generate_dofblock_partition (similar to premultiplied blocks)
+            if not blockdata.unroll:
+                continue
+
+            # Accumulate A[blockmap[...]] += f*PI[...] for unrolled blocks
 
             # Get table for inlining
             tables = self.ir["unique_tables"]
             table = tables[blockdata.name]
-            inline_table = self.ir["integral_type"] == "cell"
 
             # Get factor expression
             v = self.ir["piecewise_ir"]["V"][blockdata.factor_index]
@@ -969,23 +977,19 @@ def generate_preintegrated_dofblock_partition(self):
             # Define rhs expression for A[blockmap[arg_indices]] += A_rhs
             # A_rhs = f * PI where PI = sum_q weight * u * v
             PI = L.Symbol(blockdata.name)
-            # block_rank = len(blockmap)
-
-            # # Override dof index with quadrature loop index for arguments with
-            # # quadrature element, to index B like B[iq*num_dofs + iq]
-            # arg_indices = tuple(
-            #     self.backend.symbols.argument_loop_index(i) for i in range(block_rank))
 
             # Define indices into preintegrated block
             P_entity_indices = self.get_entities(blockdata)
-            if inline_table:
+            if blockdata.inline:
                 assert P_entity_indices == (L.LiteralInt(0), )
                 assert table.shape[0] == 1
+            assert ("inline" in blockdata.name) == blockdata.inline
 
             # Unroll loop
             blockshape = [len(DM) for DM in blockmap]
             blockrange = [range(d) for d in blockshape]
 
+            # Generate unrolled assignments for the current block
             for ii in itertools.product(*blockrange):
                 A_ii = sum(A_strides[i] * blockmap[i][ii[i]] for i in range(len(ii)))
                 if blockdata.transposed:
@@ -993,7 +997,7 @@ def generate_preintegrated_dofblock_partition(self):
                 else:
                     P_arg_indices = ii
 
-                if inline_table:
+                if blockdata.inline:
                     # Extract float value of PI[P_ii]
                     Pval = table[0]  # always entity 0
                     for i in P_arg_indices:
@@ -1004,10 +1008,13 @@ def generate_preintegrated_dofblock_partition(self):
                     P_ii = P_entity_indices + P_arg_indices
                     A_rhs = f * PI[P_ii]
 
-                A_values[A_ii] = A_values[A_ii] + A_rhs
+                A_values[A_ii] += A_rhs
+
+        # Generate unrolled code zeroing whole tensor
+        code_unroll = self.generate_tensor_value_initialization(A_values)
+        code_unroll = L.commented_code_list(code_unroll, "UFLACS block mode: preintegrated unroll")
 
-        code = self.generate_tensor_value_initialization(A_values)
-        return L.commented_code_list(code, "UFLACS block mode: preintegrated")
+        return code_unroll
 
     def generate_tensor_value_initialization(self, A_values):
         parts = []
@@ -1016,9 +1023,14 @@ def generate_tensor_value_initialization(self, A_values):
         A = self.backend.symbols.element_tensor()
         A_size = len(A_values)
 
-        init_mode = self.ir["params"]["tensor_init_mode"]
         z = L.LiteralFloat(0.0)
 
+        if all(A[j] in [0.0, z] for j in range(A_size)):
+            # We are just zeroing the tensor
+            init_mode = "upfront"
+        else:
+            init_mode = self.ir["params"]["tensor_init_mode"]
+
         k = L.Symbol("k")  # Index for zeroing arrays
 
         if init_mode == "direct":
@@ -1089,7 +1101,7 @@ def generate_expr_copyout_statements(self):
         for i, fi in enumerate(V_targets):
             parts.append(L.Assign(values[i], self.get_var(None, V[fi])))
 
-        return parts
+        return L.commented_code_list(parts, "Expression copyout")
 
     def generate_tensor_copyout_statements(self):
         L = self.backend.language
@@ -1099,11 +1111,6 @@ def generate_tensor_copyout_statements(self):
         A_shape = self.ir["tensor_shape"]
         A_rank = len(A_shape)
 
-        # TODO: there's something like shape2strides(A_shape) somewhere
-        A_strides = [1] * A_rank
-        for i in reversed(range(0, A_rank - 1)):
-            A_strides[i] = A_strides[i + 1] * A_shape[i + 1]
-
         Asym = self.backend.symbols.element_tensor()
         A = L.FlattenedArray(Asym, dims=A_shape)
 
@@ -1154,7 +1161,7 @@ def generate_tensor_copyout_statements(self):
         # Place static dofmap tables first
         parts = dofmap_parts + parts
 
-        return parts
+        return L.commented_code_list(parts, "Tensor copyout")
 
     def generate_copyout_statements(self):
         """Generate statements copying results to output array."""