Finalise CFLAGS: Explain why chosen. Perhaps, Win32 and OSX might be …

…interested in the rationale and implement something similar for their packages
skyjake · Oct 9, 2006 · 985caf5 · 985caf5
1 parent 09de61b
commit 985caf5
Showing 1 changed file with 154 additions and 8 deletions.
diff --git a/distrib/ubuntu/deng/debian/rules b/distrib/ubuntu/deng/debian/rules
@@ -13,20 +13,166 @@ include /usr/share/dpatch/dpatch.make
 DEBIAN_ARCH = $(shell dpkg-architecture -qDEB_BUILD_ARCH)
 
 
-CFLAGS = -Wall -Wextra -Wundef -Wpointer-arith -Wunreachable-code -Wstack-protector -ggdb3 -fPIC -fstack-protector
+CFLAGS = -Wall -Wextra -Wundef -Wpointer-arith -Wunreachable-code -Wstack-protector -ggdb3 -fPIC -fstack-protector -pipe
 
 ifneq (,$(findstring noopt,$(DEB_BUILD_OPTIONS)))
 	CFLAGS += -O0
 else
 #	CFLAGS += -fgcse-after-reload -fomit-frame-pointer -ffast-math -ftree-vectorize -ftree-vectorizer-verbose=5 -Q
-	CFLAGS += -fgcse-after-reload -fgcse-sm -fgcse-las -fweb -frename-registers -funswitch-loops -fomit-frame-pointer -ffast-math -ftree-vectorize -ftree-vectorizer-verbose=5
+	CFLAGS += -Os -funit-at-a-time -combine -ftree-pre -fgcse-after-reload -fgcse-sm -fgcse-las -fweb -frename-registers -fomit-frame-pointer -ffast-math -ftree-vectorize -ftree-vectorizer-verbose=5 -freorder-blocks -freorder-blocks-and-partition
 endif
 
-
-
-
-#These cflags below break gcc :)
-#	CFLAGS += -O2 -fPIC -fweb -ffast-math -funswitch-loops -fgcse-after-reload -fomit-frame-pointer -fgcse-lm -fgcse-sm -fgcse-las -ftree-loop-linear -ftree-loop-im -ftree-loop-ivcanon -fivopts -ftree-vectorize -ftracer -fvariable-expansion-in-unroller
+# Explanation of non-standard CFLAGS and why they were chosen:
+# Or why you should not change them without a GOOD reason.
+# 
+# Rationale for the changes:
+# The primary strategy here is to take advantage of size optimization:
+# L1 and L2 cache are FAR FAR faster than main memory, and raw CPU speed runs
+# circles around even cache speeds. Thus, optimizing for CPU speed at the
+# expense of size makes little sense, because all those saved cycles and more
+# are likely to be spent waiting for memory to return code that *would* have fit
+# in the cache were it size optimized.
+# 
+# For example, traditional optimizations unroll loops into flat code where
+# possible, to avoid the expense of the jump back to the top of the loop. That
+# spreads the loop out to several times its original code size, taking far
+# more room in fast cache and forcing the CPU to wait far more often for code
+# to be fetched from main memory. I prefer to keep the loops, making the code
+# smaller and thus allowing more of it to fit in faster cache.
+#
+# If possible while processing those loops, activating SIMD features of modern
+# CPUs will further increase the apparent execution speed.
+# 
+# -freorder-blocks-and-partition can make code slightly larger, but the effect
+# is the same as the above, increasing execution speed. What this optimization
+# does is separate code that is used often from that which is seldom used, so
+# the "hot" code is smaller and fits better in high speed cache, while the
+# "cold" code ends up in slower main memory most of the time. While a lower
+# percentage of the code may be in cache due to the larger size, cache will be
+# used far more effectively, as more "hot" code will be retained therein, with
+# the cold code that's not used so often allowed to drop out of cache into main
+# memory.
+# 
+# The secondary strategy here is to make as full a use as possible of the
+# registers available on systems that have plenty of them (eg amd64 in 64-bit
+# mode or powerpc). Due to the lack of registers on x86 these optimizations
+# have little effect there. Registers operate at the speed of the CPU, with no
+# wait at all, unlike even L1 cache, so it pays to use them as efficiently as
+# possible. Several of the flags (-frename-registers of course, -fweb, etc)
+# are therefore designed to encourage gcc to do this.
+# 
+# Our tertiary strategy is to allow gcc to optimize over as wide a scope as
+# possible (whole units with -funit-at-a-time, or even multiple units with
+# -combine).
+#
+# We do have a potential counter-optimization with -fstack-protector (on x86
+# it uses an extra register on protected functions and x86 has a very limited
+# set of registers) but the benefits of run time buffer overflow protection far
+# outweigh any potential slowdowns caused by this option.
+# 
+# Detailed summary of the CFLAGS used:
+# 
+# -fPIC
+# If supported for the target machine, emit position-independent code, suitable
+# for dynamic linking and avoiding any limit on the size of the global offset
+# table.
+# 
+# -fstack-protector
+# Emit extra code to check for buffer overflows, such as stack smashing attacks.
+# This is done by adding a guard variable to functions with vulnerable objects.
+# This includes functions that call alloca, and functions with buffers larger
+# than 8 bytes. The guards are initialized when a function is entered and then
+# checked when the function exits. If a guard check fails, an error message is
+# printed and the program exits.
+# 
+# -pipe
+# Use pipes rather than temporary files for communication between the various
+# stages of compilation. 
+# 
+# -Os
+# Optimize for size. -Os enables all -O2 optimizations that do not typically
+# increase code size. It also performs further optimizations designed to reduce
+# code size. -Os disables the following optimization flags: -falign-functions
+# -falign-jumps -falign-loops -falign-labels -freorder-blocks
+# -freorder-blocks-and-partition -fprefetch-loop-arrays -ftree-vect-loop-version
+# 
+# -funit-at-a-time
+# Parse the whole compilation unit before starting to produce code. This allows
+# some extra optimizations to take place but consumes more memory (in general)
+# 
+# -combine
+# If you are compiling multiple source files, this option tells the driver to
+# pass all the source files to the compiler at once (for those languages for
+# which the compiler can handle this). This will allow intermodule analysis
+# (IMA) to be performed by the compiler. Currently the only language for which
+# this is supported is C. If you pass source files for multiple languages to the
+# driver, using this option, the driver will invoke the compiler(s) that support
+# IMA once each, passing each compiler all the source files appropriate for it. 
+# 
+# -ftree-pre
+# Perform Partial Redundancy Elimination (PRE) on trees. This flag is enabled by
+# default at -O2 and -O3.
+# 
+# -fgcse-after-reload
+# When -fgcse-after-reload is enabled, a redundant load elimination pass is
+# performed after reload. The purpose of this pass is to clean up redundant
+# spilling.
+# 
+# -fgcse-sm
+# When -fgcse-sm is enabled, a store motion pass is run after global common
+# subexpression elimination. This pass will attempt to move stores out of loops.
+# When used in conjunction with -fgcse-lm, loops containing a load/store
+# sequence can be changed to a load before the loop and a store after the loop.
+# 
+# -fgcse-las
+# When -fgcse-las is enabled, the global common subexpression elimination pass
+# eliminates redundant loads that come after stores to the same memory location
+# (both partial and full redundancies).
+# 
+# -fweb
+# Constructs webs as commonly used for register allocation purposes and assigns
+# each web an individual pseudo register. This allows the register allocation
+# pass to operate on pseudos directly, but also strengthens several other
+# optimization passes, such as CSE, loop optimizer and trivial dead code
+# remover. It can, however, make debugging impossible, since variables will no
+# longer stay in a "home register".
+# 
+# -frename-registers
+# Attempt to avoid false dependencies in scheduled code by making use of
+# registers left over after register allocation. This optimization will most
+# benefit processors with lots of registers. Depending on the debug information
+# format adopted by the target, however, it can make debugging impossible, since
+# variables will no longer stay in a "home register".
+# 
+# -fomit-frame-pointer
+# Don't keep the frame pointer in a register for functions that don't need one.
+# This avoids the instructions to save, set up and restore frame pointers; it
+# also makes an extra register available in many functions. It also makes
+# debugging impossible on some machines.
+# 
+# -ffast-math
+# Sets -fno-math-errno, -funsafe-math-optimizations, -fno-trapping-math,
+# -ffinite-math-only, -fno-rounding-math, -fno-signaling-nans and
+# -fcx-limited-range. This option causes the preprocessor macro __FAST_MATH__ to
+# be defined. This option should never be turned on by any -O option since it
+# can result in incorrect output for programs which depend on an exact
+# implementation of IEEE or ISO rules/specifications for math functions. 
+# 
+# -ftree-vectorize
+# Perform loop vectorization on trees. 
+# 
+# -ftree-vectorizer-verbose=5
+# Show results of vectorization attempts in the build logs
+# 
+# -freorder-blocks
+# Reorder basic blocks in the compiled function in order to reduce number of
+# taken branches and improve code locality.
+# 
+# -freorder-blocks-and-partition
+# In addition to reordering basic blocks in the compiled function, in order to
+# reduce number of taken branches, partitions hot and cold basic blocks into
+# separate sections of the assembly and .o files, to improve paging and cache
+# locality performance.
 
 ifeq (,$(findstring nostrip,$(DEB_BUILD_OPTIONS)))
 	INSTALL_PROGRAM += -s
@@ -35,7 +181,7 @@ endif
 
 # Now tune the engine based on our arch
 ifeq "$(DEBIAN_ARCH)" "i386"
-	CFLAGS += -march=i486 -mtune=pentium4 -O3
+	CFLAGS += -march=i486 -mtune=pentium4
 endif