Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Generate assignment loops for large preintegration tables #25

Closed
wants to merge 31 commits into from
Closed
Show file tree
Hide file tree
Changes from 22 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
5f6d990
Generate assignment loops for large preintegration tables
w1th0utnam3 Mar 27, 2018
2da258b
Update reference data pointer to fd47851ba0790b5fad5e97d3fa0dee39bcc4…
w1th0utnam3 Mar 27, 2018
a84bc43
Add Fabian to Authors
blechta May 3, 2018
0b37f6b
Fixes for flake8 errors
w1th0utnam3 May 9, 2018
954e047
Merge pull request #28 from w1th0utnam3/fabian/fix-issue-173-rebase
blechta May 26, 2018
6bc1126
Merge branch 'master' into fabian/fix-issue-173-rebase
blechta Jul 6, 2018
b08263e
Remove remnant of conflict resolution
blechta Jul 6, 2018
125b2bf
Add back cnodes.MemZero using memset
blechta Jul 6, 2018
8812964
Sort unrolling in preintegratom
blechta Jul 6, 2018
2948e39
Revert change in demo
blechta Jul 6, 2018
f05e44a
flake8 fix
blechta Jul 6, 2018
58a203d
Fix zeroing preintegrated unrolled/looped mixed case
blechta Jul 6, 2018
4556fe2
Fix uninitialized variable in preintegration representation
blechta Jul 6, 2018
18f6bd3
Fix bug in MemZero
blechta Jul 6, 2018
4e5de24
Merge remote-tracking branch 'upstream/fabian/fix-issue-173-rebase' i…
w1th0utnam3 Jul 7, 2018
9d577d7
Use finalization_blocks for non-unrolled preintegrated blocks
w1th0utnam3 Jul 7, 2018
3681cb6
Removed blank line
w1th0utnam3 Jul 7, 2018
dd67f34
Merge pull request #39 from w1th0utnam3/fabian/fix-issue-173-rebase
blechta Jul 7, 2018
69e21b1
Remove some redundant code
w1th0utnam3 Jul 7, 2018
42a69a8
Remove code duplication for non-unrolled preintegrated blocks
w1th0utnam3 Jul 7, 2018
b41e97f
Fix flake8 error
w1th0utnam3 Jul 7, 2018
5006c67
Merge pull request #40 from w1th0utnam3/fabian/fix-issue-173-rebase
blechta Jul 8, 2018
7df243f
Issue copyout comment
blechta Jul 8, 2018
ec64ccb
Skip non-unrolled preintegrated blocks in unrolled code generation
w1th0utnam3 Jul 8, 2018
1444670
Merge pull request #41 from w1th0utnam3/fabian/fix-issue-173-rebase
w1th0utnam3 Jul 8, 2018
b8e51a7
Remove unnecessary line break
w1th0utnam3 Jul 8, 2018
e97e618
Merge pull request #42 from w1th0utnam3/fabian/fix-issue-173-rebase
w1th0utnam3 Jul 8, 2018
3a99317
Revert "Fix bug in MemZero"
blechta Jul 6, 2018
6565742
Revert "Add back cnodes.MemZero using memset"
blechta Jul 6, 2018
9550678
Revert "Update reference data pointer to fd47851ba0790b5fad5e97d3fa0d…
w1th0utnam3 Mar 27, 2018
45a7db1
Merge pull request #50 from w1th0utnam3/fabian/fix-issue-173-rebase
w1th0utnam3 Jul 20, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions AUTHORS
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,10 @@ Contributors:
email: dag@f.kth.se
www: http://www.f.kth.se/~dag/

Fabian Löschner
email: fabian.loeschner@rwth-aachen.de
www: https://w1th0utnam3.github.io/

Ola Skavhaug
email: skavhaug@simula.no
www: http://home.simula.no/~skavhaug/
Expand Down
15 changes: 13 additions & 2 deletions ffc/uflacs/build_uflacs_ir.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def get_common_block_data(blockdata):


preintegrated_block_data_t = namedtuple("preintegrated_block_data_t",
common_block_data_fields + ["is_uniform", "name"])
common_block_data_fields + ["is_uniform", "name", "unroll", "inline"])

premultiplied_block_data_t = namedtuple("premultiplied_block_data_t",
common_block_data_fields + ["is_uniform", "name"])
Expand Down Expand Up @@ -195,6 +195,7 @@ def uflacs_default_parameters(optimize):
"enable_sum_factorization": False,
"enable_block_transpose_reuse": False,
"enable_table_zero_compression": False,
"max_preintegrated_unrolled_table_size": 1024,

# Code generation parameters
"vectorize": False,
Expand Down Expand Up @@ -543,8 +544,18 @@ def build_uflacs_ir(cell, integral_type, entitytype, integrands, tensor_shape,
unique_table_num_dofs)
ptable = clamp_table_small_numbers(
ptable, rtol=p["table_rtol"], atol=p["table_atol"])
else:
ptable = unique_tables[pname]

# Decide whether to unroll dofblock assignment
max_unroll_size = ir["params"]["max_preintegrated_unrolled_table_size"]
unroll = numpy.prod(ptable.shape[1:]) <= max_unroll_size # First dimension is entity
inline = unroll and integral_type == "cell"

if pname is None:
# Store the table on the cache miss
pname = "PI%d" % (len(cache, ))
pname += "_inline" if inline else ""
cache[unames] = pname
unique_tables[pname] = ptable
unique_table_types[pname] = "preintegrated"
Expand All @@ -553,7 +564,7 @@ def build_uflacs_ir(cell, integral_type, entitytype, integrands, tensor_shape,
block_unames = (pname, )
blockdata = preintegrated_block_data_t(
block_mode, ttypes, factor_index, factor_is_piecewise, block_unames,
block_restrictions, block_is_transposed, block_is_uniform, pname)
block_restrictions, block_is_transposed, block_is_uniform, pname, unroll, inline)
block_is_piecewise = True

elif block_mode == "premultiplied":
Expand Down
97 changes: 51 additions & 46 deletions ffc/uflacs/integralgenerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from ufl import product
from ufl.classes import Condition
from ufl.measure import custom_integral_types, point_integral_types
from ufl.utils.indexflattening import shape_to_strides

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -289,7 +290,6 @@ def generate_element_tables(self):

tables = self.ir["unique_tables"]
table_types = self.ir["unique_table_types"]
inline_tables = self.ir["integral_type"] == "cell"

alignas = self.ir["params"]["alignas"]
padlen = self.ir["params"]["padlen"]
Expand All @@ -304,14 +304,14 @@ def generate_element_tables(self):
for name in table_names:
table = tables[name]

# Don't pad preintegrated tables
# Don't pad preintegrated tables. FIXME: Why?!
if name[0] == "P":
p = 1
else:
p = padlen

# Skip tables that are inlined in code generation
if inline_tables and name[:2] == "PI":
if "inline" in name:
continue

decl = L.ArrayDecl(
Expand Down Expand Up @@ -591,9 +591,13 @@ def generate_dofblock_partition(self, num_points):

blocks = [(blockmap, blockdata)
for blockmap, contributions in sorted(block_contributions.items())
for blockdata in contributions if blockdata.block_mode != "preintegrated"]
for blockdata in contributions]

for blockmap, blockdata in blocks:
# Skip unrolled preintegration blocks
if blockdata.block_mode == "preintegrated" and blockdata.unroll:
continue

# Get symbol for already defined block B if it exists
common_block_data = get_common_block_data(blockdata)
B = self.shared_blocks.get(common_block_data)
Expand Down Expand Up @@ -793,7 +797,7 @@ def generate_block_parts(self, num_points, blockmap, blockdata):
# Plan for vectorization of coefficient evaluation over iq:
# 1) Define w0_c1 etc as arrays e.g. "double w0_c1[nq] = {};" outside quadloop
# 2) Access as w0_c1[iq] of course
# 3) Splitquadrature loops, coefficients before fw computation
# 3) Split quadrature loops, coefficients before fw computation
# 4) Possibly swap loops over iq and ic:
# for(ic) for(iq) w0_c1[iq] = w[0][ic] * FE[iq][ic];

Expand Down Expand Up @@ -904,6 +908,10 @@ def generate_block_parts(self, num_points, blockmap, blockdata):
# Preintegrated should never get into quadloops
assert num_points is None

# Inlining is only possible with unrolled blocks, which should not be passed to this function
assert not blockdata.unroll
assert not blockdata.inline

# Define B = B_rhs = f * PI where PI = sum_q weight * u * v
PI = L.Symbol(blockdata.name)[P_ii]
B_rhs = L.float_product([f, PI])
Expand Down Expand Up @@ -945,22 +953,18 @@ def generate_preintegrated_dofblock_partition(self):
# Get symbol, dimensions, and loop index symbols for A
A_shape = self.ir["tensor_shape"]
A_size = product(A_shape)
A_rank = len(A_shape)

# TODO: there's something like shape2strides(A_shape) somewhere
A_strides = [1] * A_rank
for i in reversed(range(0, A_rank - 1)):
A_strides[i] = A_strides[i + 1] * A_shape[i + 1]
A_strides = shape_to_strides(A_shape)

# List for unrolled assignments
A_values = [0.0] * A_size

for blockmap, blockdata in blocks:
# Generate code for unrolled blocks, non-unrolled blocks are treated together with premultiplied blocks
for block_id, (blockmap, blockdata) in enumerate(blocks):
# Accumulate A[blockmap[...]] += f*PI[...]

# Get table for inlining
tables = self.ir["unique_tables"]
table = tables[blockdata.name]
inline_table = self.ir["integral_type"] == "cell"

# Get factor expression
v = self.ir["piecewise_ir"]["V"][blockdata.factor_index]
Expand All @@ -969,45 +973,46 @@ def generate_preintegrated_dofblock_partition(self):
# Define rhs expression for A[blockmap[arg_indices]] += A_rhs
# A_rhs = f * PI where PI = sum_q weight * u * v
PI = L.Symbol(blockdata.name)
# block_rank = len(blockmap)

# # Override dof index with quadrature loop index for arguments with
# # quadrature element, to index B like B[iq*num_dofs + iq]
# arg_indices = tuple(
# self.backend.symbols.argument_loop_index(i) for i in range(block_rank))

# Define indices into preintegrated block
P_entity_indices = self.get_entities(blockdata)
if inline_table:
if blockdata.inline:
assert P_entity_indices == (L.LiteralInt(0), )
assert table.shape[0] == 1
assert ("inline" in blockdata.name) == blockdata.inline

# Unroll loop
blockshape = [len(DM) for DM in blockmap]
blockrange = [range(d) for d in blockshape]

for ii in itertools.product(*blockrange):
A_ii = sum(A_strides[i] * blockmap[i][ii[i]] for i in range(len(ii)))
if blockdata.transposed:
P_arg_indices = (ii[1], ii[0])
else:
P_arg_indices = ii

if inline_table:
# Extract float value of PI[P_ii]
Pval = table[0] # always entity 0
for i in P_arg_indices:
Pval = Pval[i]
A_rhs = Pval * f
else:
# Index the static preintegrated table:
P_ii = P_entity_indices + P_arg_indices
A_rhs = f * PI[P_ii]
if blockdata.unroll:
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So we could have just if blockdata.unroll: continue at the beginning of the loop?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, good catch.

# Generate unrolled assignments for the current block
for ii in itertools.product(*blockrange):
A_ii = sum(A_strides[i] * blockmap[i][ii[i]]
for i in range(len(ii)))
if blockdata.transposed:
P_arg_indices = (ii[1], ii[0])
else:
P_arg_indices = ii

if blockdata.inline:
# Extract float value of PI[P_ii]
Pval = table[0] # always entity 0
for i in P_arg_indices:
Pval = Pval[i]
A_rhs = Pval * f
else:
# Index the static preintegrated table:
P_ii = P_entity_indices + P_arg_indices
A_rhs = f * PI[P_ii]

A_values[A_ii] += A_rhs

A_values[A_ii] = A_values[A_ii] + A_rhs
# Generate unrolled code zeroing whole tensor
code_unroll = self.generate_tensor_value_initialization(A_values)
code_unroll = L.commented_code_list(code_unroll, "UFLACS block mode: preintegrated unroll")

code = self.generate_tensor_value_initialization(A_values)
return L.commented_code_list(code, "UFLACS block mode: preintegrated")
return code_unroll

def generate_tensor_value_initialization(self, A_values):
parts = []
Expand All @@ -1016,9 +1021,14 @@ def generate_tensor_value_initialization(self, A_values):
A = self.backend.symbols.element_tensor()
A_size = len(A_values)

init_mode = self.ir["params"]["tensor_init_mode"]
z = L.LiteralFloat(0.0)

if all(A[j] in [0.0, z] for j in range(A_size)):
# We are just zeroing the tensor
init_mode = "upfront"
else:
init_mode = self.ir["params"]["tensor_init_mode"]

k = L.Symbol("k") # Index for zeroing arrays

if init_mode == "direct":
Expand Down Expand Up @@ -1099,11 +1109,6 @@ def generate_tensor_copyout_statements(self):
A_shape = self.ir["tensor_shape"]
A_rank = len(A_shape)

# TODO: there's something like shape2strides(A_shape) somewhere
A_strides = [1] * A_rank
for i in reversed(range(0, A_rank - 1)):
A_strides[i] = A_strides[i + 1] * A_shape[i + 1]

Asym = self.backend.symbols.element_tensor()
A = L.FlattenedArray(Asym, dims=A_shape)

Expand Down
16 changes: 4 additions & 12 deletions ffc/uflacs/language/cnodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,18 +93,10 @@ def float_product(factors):
return Product(factors)


# Note: removed as part of C++ -> C transition. Not using memset because
# it is not safe for floats
# def MemZeroRange(name, begin, end):
# name = as_cexpr_or_string_symbol(name)
# return Call("std::fill", (name + begin, name + end, LiteralFloat(0.0)))

# Note: removed as part of C++ -> C transition. Not using memset because
# it is not safe for floats
# def MemZero(name, size):
# name = as_cexpr_or_string_symbol(name)
# size = as_cexpr(size)
# return Call("std::fill_n", (name, size, LiteralFloat(0.0)))
def MemZero(name, size):
    """Build a C ``memset(name, 0, size * sizeof(*name))`` call node.

    Zeroes ``size`` elements of the array named ``name``.
    NOTE(review): assumes all-zero bytes represent 0.0 for floating-point
    arrays (true on IEEE 754 platforms) — confirm acceptable for all targets.
    """
    name = as_cexpr_or_string_symbol(name)
    # Express the byte count as C source text so sizeof(*name) is resolved
    # by the C compiler against the pointee type, not guessed here.
    size = as_cexpr_or_string_symbol("{} * sizeof(*{})".format(size, name))
    return Call("memset", (name, LiteralInt(0), size))


def MemCopy(src, dst, size, type):
Expand Down
2 changes: 1 addition & 1 deletion test/regression/ffc-reference-data-id
Original file line number Diff line number Diff line change
@@ -1 +1 @@
e9e9321dfaf9251e159dc7263505299a8a59035f
fd47851ba0790b5fad5e97d3fa0dee39bcc49d0f