diff --git a/AUTHORS b/AUTHORS index 57d62f031..43359304a 100644 --- a/AUTHORS +++ b/AUTHORS @@ -49,6 +49,10 @@ Contributors: email: dag@f.kth.se www: http://www.f.kth.se/~dag/ + Fabian Löschner + email: fabian.loeschner@rwth-aachen.de + www: https://w1th0utnam3.github.io/ + Ola Skavhaug email: skavhaug@simula.no www: http://home.simula.no/~skavhaug/ diff --git a/ffc/uflacs/build_uflacs_ir.py b/ffc/uflacs/build_uflacs_ir.py index 7d9e26404..3610ebc3f 100644 --- a/ffc/uflacs/build_uflacs_ir.py +++ b/ffc/uflacs/build_uflacs_ir.py @@ -59,7 +59,7 @@ def get_common_block_data(blockdata): preintegrated_block_data_t = namedtuple("preintegrated_block_data_t", - common_block_data_fields + ["is_uniform", "name"]) + common_block_data_fields + ["is_uniform", "name", "unroll", "inline"]) premultiplied_block_data_t = namedtuple("premultiplied_block_data_t", common_block_data_fields + ["is_uniform", "name"]) @@ -195,6 +195,7 @@ def uflacs_default_parameters(optimize): "enable_sum_factorization": False, "enable_block_transpose_reuse": False, "enable_table_zero_compression": False, + "max_preintegrated_unrolled_table_size": 1024, # Code generation parameters "vectorize": False, @@ -543,8 +544,18 @@ def build_uflacs_ir(cell, integral_type, entitytype, integrands, tensor_shape, unique_table_num_dofs) ptable = clamp_table_small_numbers( ptable, rtol=p["table_rtol"], atol=p["table_atol"]) + else: + ptable = unique_tables[pname] + + # Decide whether to unroll dofblock assignment + max_unroll_size = ir["params"]["max_preintegrated_unrolled_table_size"] + unroll = numpy.prod(ptable.shape[1:]) <= max_unroll_size # First dimension is entity + inline = unroll and integral_type == "cell" + if pname is None: + # Store the table on the cache miss pname = "PI%d" % (len(cache, )) + pname += "_inline" if inline else "" cache[unames] = pname unique_tables[pname] = ptable unique_table_types[pname] = "preintegrated" @@ -553,7 +564,7 @@ def build_uflacs_ir(cell, integral_type, entitytype, integrands, tensor_shape, block_unames = (pname, ) blockdata = preintegrated_block_data_t( block_mode, ttypes, factor_index, factor_is_piecewise, block_unames, - block_restrictions, block_is_transposed, block_is_uniform, pname) + block_restrictions, block_is_transposed, block_is_uniform, pname, unroll, inline) block_is_piecewise = True elif block_mode == "premultiplied": diff --git a/ffc/uflacs/integralgenerator.py b/ffc/uflacs/integralgenerator.py index 112400d8a..a73a52ed4 100644 --- a/ffc/uflacs/integralgenerator.py +++ b/ffc/uflacs/integralgenerator.py @@ -17,6 +17,7 @@ from ufl import product from ufl.classes import Condition from ufl.measure import custom_integral_types, point_integral_types +from ufl.utils.indexflattening import shape_to_strides logger = logging.getLogger(__name__) @@ -289,7 +290,6 @@ def generate_element_tables(self): tables = self.ir["unique_tables"] table_types = self.ir["unique_table_types"] - inline_tables = self.ir["integral_type"] == "cell" alignas = self.ir["params"]["alignas"] padlen = self.ir["params"]["padlen"] @@ -304,14 +304,14 @@ def generate_element_tables(self): for name in table_names: table = tables[name] - # Don't pad preintegrated tables + # Don't pad preintegrated tables. FIXME: Why?! if name[0] == "P": p = 1 else: p = padlen # Skip tables that are inlined in code generation - if inline_tables and name[:2] == "PI": + if "inline" in name: continue decl = L.ArrayDecl( @@ -591,9 +591,13 @@ def generate_dofblock_partition(self, num_points): blocks = [(blockmap, blockdata) for blockmap, contributions in sorted(block_contributions.items()) - for blockdata in contributions if blockdata.block_mode != "preintegrated"] + for blockdata in contributions] for blockmap, blockdata in blocks: + # Skip unrolled preintegration blocks + if blockdata.block_mode == "preintegrated" and blockdata.unroll: + continue + # Get symbol for already defined block B if it exists common_block_data = get_common_block_data(blockdata) B = self.shared_blocks.get(common_block_data) @@ -793,7 +797,7 @@ def generate_block_parts(self, num_points, blockmap, blockdata): # Plan for vectorization of coefficient evaluation over iq: # 1) Define w0_c1 etc as arrays e.g. "double w0_c1[nq] = {};" outside quadloop # 2) Access as w0_c1[iq] of course - # 3) Splitquadrature loops, coefficients before fw computation + # 3) Split quadrature loops, coefficients before fw computation # 4) Possibly swap loops over iq and ic: # for(ic) for(iq) w0_c1[iq] = w[0][ic] * FE[iq][ic]; @@ -904,6 +908,10 @@ def generate_block_parts(self, num_points, blockmap, blockdata): # Preintegrated should never get into quadloops assert num_points is None + # Inlining is only possible with unrolled blocks, which should not be passed to this function + assert not blockdata.unroll + assert not blockdata.inline + # Define B = B_rhs = f * PI where PI = sum_q weight * u * v PI = L.Symbol(blockdata.name)[P_ii] B_rhs = L.float_product([f, PI]) @@ -945,22 +953,22 @@ def generate_preintegrated_dofblock_partition(self): # Get symbol, dimensions, and loop index symbols for A A_shape = self.ir["tensor_shape"] A_size = product(A_shape) - A_rank = len(A_shape) - - # TODO: there's something like shape2strides(A_shape) somewhere - A_strides = [1] * A_rank - for i in reversed(range(0, A_rank - 1)): - A_strides[i] = A_strides[i + 1] * A_shape[i + 1] + A_strides = shape_to_strides(A_shape) + # List for unrolled assignments A_values = [0.0] * A_size + # Generate block-wise assignments for blockmap, blockdata in blocks: - # Accumulate A[blockmap[...]] += f*PI[...] + # Non-unrolled blocks are handled by generate_dofblock_partition (similar to premultiplied blocks) + if not blockdata.unroll: + continue + + # Accumulate A[blockmap[...]] += f*PI[...] for unrolled blocks # Get table for inlining tables = self.ir["unique_tables"] table = tables[blockdata.name] - inline_table = self.ir["integral_type"] == "cell" # Get factor expression v = self.ir["piecewise_ir"]["V"][blockdata.factor_index] @@ -969,23 +977,19 @@ def generate_preintegrated_dofblock_partition(self): # Define rhs expression for A[blockmap[arg_indices]] += A_rhs # A_rhs = f * PI where PI = sum_q weight * u * v PI = L.Symbol(blockdata.name) - # block_rank = len(blockmap) - - # # Override dof index with quadrature loop index for arguments with - # # quadrature element, to index B like B[iq*num_dofs + iq] - # arg_indices = tuple( - # self.backend.symbols.argument_loop_index(i) for i in range(block_rank)) # Define indices into preintegrated block P_entity_indices = self.get_entities(blockdata) - if inline_table: + if blockdata.inline: assert P_entity_indices == (L.LiteralInt(0), ) assert table.shape[0] == 1 + assert ("inline" in blockdata.name) == blockdata.inline # Unroll loop blockshape = [len(DM) for DM in blockmap] blockrange = [range(d) for d in blockshape] + # Generate unrolled assignments for the current block for ii in itertools.product(*blockrange): A_ii = sum(A_strides[i] * blockmap[i][ii[i]] for i in range(len(ii))) if blockdata.transposed: @@ -993,7 +997,7 @@ def generate_preintegrated_dofblock_partition(self): else: P_arg_indices = ii - if inline_table: + if blockdata.inline: # Extract float value of PI[P_ii] Pval = table[0] # always entity 0 for i in P_arg_indices: @@ -1004,10 +1008,13 @@ def generate_preintegrated_dofblock_partition(self): P_ii = P_entity_indices + P_arg_indices A_rhs = f * PI[P_ii] - A_values[A_ii] = A_values[A_ii] + A_rhs + A_values[A_ii] += A_rhs + + # Generate unrolled code zeroing whole tensor + code_unroll = self.generate_tensor_value_initialization(A_values) + code_unroll = L.commented_code_list(code_unroll, "UFLACS block mode: preintegrated unroll") - code = self.generate_tensor_value_initialization(A_values) - return L.commented_code_list(code, "UFLACS block mode: preintegrated") + return code_unroll def generate_tensor_value_initialization(self, A_values): parts = [] @@ -1016,9 +1023,14 @@ def generate_tensor_value_initialization(self, A_values): A = self.backend.symbols.element_tensor() A_size = len(A_values) - init_mode = self.ir["params"]["tensor_init_mode"] z = L.LiteralFloat(0.0) + if all(A[j] in [0.0, z] for j in range(A_size)): + # We are just zeroing the tensor + init_mode = "upfront" + else: + init_mode = self.ir["params"]["tensor_init_mode"] + k = L.Symbol("k") # Index for zeroing arrays if init_mode == "direct": @@ -1089,7 +1101,7 @@ def generate_expr_copyout_statements(self): for i, fi in enumerate(V_targets): parts.append(L.Assign(values[i], self.get_var(None, V[fi]))) - return parts + return L.commented_code_list(parts, "Expression copyout") def generate_tensor_copyout_statements(self): L = self.backend.language @@ -1099,11 +1111,6 @@ def generate_tensor_copyout_statements(self): A_shape = self.ir["tensor_shape"] A_rank = len(A_shape) - # TODO: there's something like shape2strides(A_shape) somewhere - A_strides = [1] * A_rank - for i in reversed(range(0, A_rank - 1)): - A_strides[i] = A_strides[i + 1] * A_shape[i + 1] - Asym = self.backend.symbols.element_tensor() A = L.FlattenedArray(Asym, dims=A_shape) @@ -1154,7 +1161,7 @@ def generate_tensor_copyout_statements(self): # Place static dofmap tables first parts = dofmap_parts + parts - return parts + return L.commented_code_list(parts, "Tensor copyout") def generate_copyout_statements(self): """Generate statements copying results to output array."""