In [50]:
# Please execute/shift-return this cell every time you run the notebook.  Don't edit it. 
# `%autoreload 2` makes IPython re-import changed modules before each cell executes,
# so edits to the helper module take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2
# NOTE(review): this star import is presumably what puts pd, math, render_code,
# render_csv and display_df_mono into scope for the cells below -- confirm in notebook.py.
from notebook import * 

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Case study: matrix multiplications

GEMM (general matrix multiplication), which computes $C = A \times B$, is at the core of many AI/ML applications. The most naive implementation of GEMM takes $O(n^3)$ time. Assume it takes 1 second to perform GEMM on 1,024$\times$1,024$\times$1,024 matrices. How much time do you expect it to take for 2,048$\times$2,048$\times$2,048 matrices?

In [2]:
# Show the naive GEMM source.
# NOTE(review): `show` presumably restricts the listing to the region between the
# //START and //END markers in the file -- confirm in notebook.py.
render_code("matrix_mul/mm.c", show=["//START","//END"])

In [3]:
# Remove old binaries and rebuild the naive GEMM executable (mm) from scratch.
! cd matrix_mul; make clean; make mm

rm -f blockmm mm blockmm_transpose cachegrind.* mm_dump
gcc -DHAVE_LINUX_PERF_EVENT_H -O3 mm.c perfstats.c -o mm 


In [4]:
! cd matrix_mul; echo "IC,Cycles,CPI,CT_ns,ET_s,DL1_miss_rate,DL1_misses,DL1_accesses" > mm.csv
! ./matrix_mul/mm 512 >> ./matrix_mul/mm.csv ;./matrix_mul/mm 1024 >> ./matrix_mul/mm.csv ; ./matrix_mul/mm 2048 >> ./matrix_mul/mm.csv
#! cs203 job memory "./matrix_mul/mm 1024 >> ./matrix_mul/mm.csv ; ./matrix_mul/mm 2048 >> ./matrix_mul/mm.csv"

234410496.000000,1406510080.000000,10521102336.000000,

In [51]:
# Load the naive GEMM measurements and show them as a table.
display_df_mono(render_csv("matrix_mul/mm.csv"))

Unnamed: 0,index,IC,Cycles,CPI,CT_ns,ET_s,DL1_miss_rate,DL1_misses,DL1_accesses
0,512,1077432003,661148573,0.613634,0.217316,0.143678,0.243293,130824267,537722442
1,1024,8586764082,10945327062,1.274674,0.193525,2.118194,0.233284,1000159900,4287311885
2,2048,69005155569,122125385954,1.769801,0.192823,23.548576,0.308093,10616080516,34457335150


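WOW! Computational complexity breaks down again! Doubling $n$ from 1,024 to 2,048 should cost $2^3 = 8\times$ more under $O(n^3)$, yet the measured execution time grew about $11\times$ (2.12 s $\rightarrow$ 23.55 s). GEMM performance goes wild because of cache misses!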

What kind of misses are we seeing?

In [6]:
! make -C matrix_mul mm_dump; ./matrix_mul/mm_dump 256 >& mm_dump_address.csv

make: Entering directory '/nfshome/htseng/courses/CS203/demo/memory/matrix_mul'
gcc -DHAVE_LINUX_PERF_EVENT_H -DDUMP -O3 mm.c perfstats.c -o mm_dump 
make: Leaving directory '/nfshome/htseng/courses/CS203/demo/memory/matrix_mul'


In [7]:
# Start the digest with a CSV header, then append only the lines that mention an
# element of b (they contain "b[") from the first 101 lines of the address dump.
! echo "element,address" > mm_dump_addresses_digest.csv 
! head -n 101 mm_dump_address.csv | grep "b\[" >> mm_dump_addresses_digest.csv
# Load the digest. skipfooter=1 discards the final digest line and is only
# supported by the python parser engine.
df = pd.read_csv("mm_dump_addresses_digest.csv", skipfooter=1, engine='python')

# Parse the hexadecimal address strings once. int(..., 16) already accepts the
# "0x" prefix, so the text does not need to be stripped first.
addresses = df["address"].map(lambda text: int(str(text), 16))

# Cache geometry used to split every address into tag / set index / block offset.
# NOTE(review): presumably the L1 data cache of the machine running the demo -- confirm.
C = 49152   # total capacity in bytes
B = 64      # block (line) size in bytes
A = 12      # associativity (ways per set)
offset_bits = int(math.log2(B))
S = C // (B * A)   # number of sets; integer division keeps the count exact
index_bits = int(math.log2(S))

# tag   = address bits above the offset and set-index fields
# index = set number selected by the bits just above the block offset
# All three columns are rendered in hex for display.
df["tag"] = addresses.map(lambda addr: hex(addr >> (offset_bits + index_bits)))
df["index"] = addresses.map(lambda addr: hex((addr >> offset_bits) % S))
df["address"] = addresses.map(hex)
display_df_mono(df)

Unnamed: 0,element,address,tag,index
0,b[0][0],0x7b7c52eff000,0x7b7c52eff,0x0
1,b[1][0],0x7b7c52eff800,0x7b7c52eff,0x20
2,b[2][0],0x7b7c52f00000,0x7b7c52f00,0x0
3,b[3][0],0x7b7c52f00800,0x7b7c52f00,0x20
4,b[4][0],0x7b7c52f01000,0x7b7c52f01,0x0
5,b[5][0],0x7b7c52f01800,0x7b7c52f01,0x20
6,b[6][0],0x7b7c52f02000,0x7b7c52f02,0x0
7,b[7][0],0x7b7c52f02800,0x7b7c52f02,0x20
8,b[8][0],0x7b7c52f03000,0x7b7c52f03,0x0
9,b[9][0],0x7b7c52f03800,0x7b7c52f03,0x20


### Matrix tiling algorithm

Let's try to partition GEMM into smaller tiles!

In [52]:
# Show the tiled GEMM source.
# NOTE(review): `show` presumably restricts the listing to the region between the
# //START and //END markers in the file -- confirm in notebook.py.
render_code("matrix_mul/blockmm.c", show=["//START","//END"])

In [53]:
# Remove old binaries, then rebuild only the tiled GEMM executable (blockmm).
! cd matrix_mul/; make clean blockmm

rm -f blockmm mm blockmm_transpose cachegrind.* mm_dump rect_blockmm_trans blockmm_transpose_reg blockmm_reg
gcc -O4 -DHAVE_LINUX_PERF_EVENT_H blockmm.c perfstats.c -o blockmm 
[01m[Kblockmm.c:[m[K In function ‘[01m[Kmain[m[K’:
   48 |   printf("%d,[01;35m[K%lu[m[K,",ARRAY_SIZE,[32m[Ktile_size[m[K);
      |              [01;35m[K~~^[m[K              [32m[K~~~~~~~~~[m[K
      |                [01;35m[K|[m[K              [32m[K|[m[K
      |                [01;35m[K|[m[K              [32m[Kint[m[K
      |                [01;35m[Klong unsigned int[m[K
      |              [32m[K%u[m[K


In [58]:
# Start a fresh CSV for the tiled kernel. The header names the two leading fields
# (size, tile_size) that each run prints ahead of the performance counters.
! cd matrix_mul; echo "size,tile_size,IC,Cycles,CPI,CT_ns,ET_s,DL1_miss_rate,DL1_misses,DL1_accesses" > blockmm.csv
# Fixed tile size of 8 across four matrix sizes; every run appends one row.
! ./matrix_mul/blockmm 512 8 >> ./matrix_mul/blockmm.csv ;./matrix_mul/blockmm 1024 8 >> ./matrix_mul/blockmm.csv ; ./matrix_mul/blockmm 2048 8 >> ./matrix_mul/blockmm.csv; ./matrix_mul/blockmm 4096 8 >> ./matrix_mul/blockmm.csv

In [55]:
# Show the naive results followed by the tiled results so they can be compared row by row.
display_df_mono(render_csv("matrix_mul/mm.csv"))
display_df_mono(render_csv("matrix_mul/blockmm.csv"))

Unnamed: 0,index,IC,Cycles,CPI,CT_ns,ET_s,DL1_miss_rate,DL1_misses,DL1_accesses
0,512,1077432003,661148573,0.613634,0.217316,0.143678,0.243293,130824267,537722442
1,1024,8586764082,10945327062,1.274674,0.193525,2.118194,0.233284,1000159900,4287311885
2,2048,69005155569,122125385954,1.769801,0.192823,23.548576,0.308093,10616080516,34457335150


Unnamed: 0,index,size,tile_size,IC,Cycles,CPI,CT_ns,ET_s,DL1_miss_rate,DL1_misses,DL1_accesses
0,0,512,8,1266427680,275297453,0.217381,0.239359,0.065895,0.009266,5524063,596155020
1,1,1024,8,10110546358,2167746299,0.214404,0.19399,0.420521,0.010271,48883460,4759532739
2,2,2048,8,81046235957,20743036584,0.255941,0.193622,4.016311,0.010452,398769644,38151767767
3,3,4096,8,648460832126,213888968362,0.329841,0.193504,41.388287,0.012343,3767687760,305247199626


In [56]:
! ./matrix_mul/blockmm 2048 4 >> ./matrix_mul/blockmm.csv
! ./matrix_mul/blockmm 2048 16 >> ./matrix_mul/blockmm.csv 
! ./matrix_mul/blockmm 2048 32 >> ./matrix_mul/blockmm.csv 
! ./matrix_mul/blockmm 2048 64 >> ./matrix_mul/blockmm.csv
! ./matrix_mul/blockmm 2048 128 >> ./matrix_mul/blockmm.csv 
! ./matrix_mul/blockmm 2048 256 >> ./matrix_mul/blockmm.csv 
! ./matrix_mul/blockmm 2048 512 >> ./matrix_mul/blockmm.csv 
display_df_mono(render_csv("matrix_mul/blockmm.csv"))

Unnamed: 0,index,size,tile_size,IC,Cycles,CPI,CT_ns,ET_s,DL1_miss_rate,DL1_misses,DL1_accesses
0,0,512,8,1266427680,275297453,0.217381,0.239359,0.065895,0.009266,5524063,596155020
1,1,1024,8,10110546358,2167746299,0.214404,0.19399,0.420521,0.010271,48883460,4759532739
2,2,2048,8,81046235957,20743036584,0.255941,0.193622,4.016311,0.010452,398769644,38151767767
3,3,4096,8,648460832126,213888968362,0.329841,0.193504,41.388287,0.012343,3767687760,305247199626
4,4,2048,4,97766577033,24266623994,0.24821,0.193454,4.694471,0.015388,671554278,43641889498
5,5,2048,16,74456479565,19402082089,0.260583,0.193419,3.752731,0.071686,2587663085,36097068988
6,6,2048,32,71543283827,27680759444,0.386909,0.193423,5.354089,0.217035,7642744129,35214419318
7,7,2048,64,70149888874,32524432480,0.463642,0.193504,6.293604,0.242865,8450386960,34794514286
8,8,2048,128,69439545012,34585003248,0.498059,0.193594,6.695465,0.24571,8495523262,34575390032
9,9,2048,256,69131617403,35231568706,0.50963,0.193647,6.822498,0.246595,8504781737,34488922208


In [34]:
# Show the blockmm_reg variant of the tiled kernel.
# NOTE(review): `show` presumably restricts the listing to the region between the
# //START and //END markers in the file -- confirm in notebook.py.
render_code("matrix_mul/blockmm_reg.c", show=["//START","//END"])

In [35]:
! cd matrix_mul; make blockmm_reg; echo "size,tile_size,IC,Cycles,CPI,CT_ns,ET_s,DL1_miss_rate,DL1_misses,DL1_accesses" > blockmm_reg.csv
! ./matrix_mul/blockmm_reg 2048 4 >> ./matrix_mul/blockmm_reg.csv
! ./matrix_mul/blockmm_reg 2048 8 >> ./matrix_mul/blockmm_reg.csv
! ./matrix_mul/blockmm_reg 2048 16 >> ./matrix_mul/blockmm_reg.csv 
! ./matrix_mul/blockmm_reg 2048 32 >> ./matrix_mul/blockmm_reg.csv 
! ./matrix_mul/blockmm_reg 2048 64 >> ./matrix_mul/blockmm_reg.csv
! ./matrix_mul/blockmm_reg 2048 128 >> ./matrix_mul/blockmm_reg.csv 
! ./matrix_mul/blockmm_reg 2048 256 >> ./matrix_mul/blockmm_reg.csv 
! ./matrix_mul/blockmm_reg 2048 512 >> ./matrix_mul/blockmm_reg.csv 
display_df_mono(render_csv("matrix_mul/blockmm_reg.csv"))

make: 'blockmm_reg' is up to date.


Unnamed: 0,index,size,tile_size,IC,Cycles,CPI,CT_ns,ET_s,DL1_miss_rate,DL1_misses,DL1_accesses
0,0,2048,4,97765852086,23935328110,0.244823,0.193369,4.628342,0.015254,665707401,43641550931
1,1,2048,8,81043215218,19497284769,0.240579,0.193077,3.764484,0.010866,414560248,38150553421
2,2,2048,16,74473187428,19453782025,0.261219,0.193252,3.759487,0.071905,2596152829,36105160972
3,3,2048,32,71543835031,27672273100,0.386788,0.193202,5.346344,0.216953,7639893404,35214571476
4,4,2048,64,70149438937,32733840399,0.46663,0.19314,6.322198,0.242104,8423826403,34794275094
5,5,2048,128,69446081686,34562306757,0.497685,0.193374,6.68345,0.245759,8498030900,34578653057
6,6,2048,256,69137240527,35237950453,0.509681,0.193224,6.808813,0.246519,8502666840,34490982288
7,7,2048,512,68986823075,46197787531,0.669661,0.193256,8.927977,0.240498,8284441214,34447065090


In [59]:
# Show the blockmm_transpose variant of the tiled kernel.
# NOTE(review): `show` presumably restricts the listing to the region between the
# //START and //END markers in the file -- confirm in notebook.py.
render_code("matrix_mul/blockmm_transpose.c", show=["//START","//END"])

### Matrix transpose

In [61]:
! cd matrix_mul; rm blockmm_transpose; make blockmm_transpose; echo "size,tile_size,IC,Cycles,CPI,CT_ns,ET_s,DL1_miss_rate,DL1_misses,DL1_accesses" > blockmm_transpose.csv
! ./matrix_mul/blockmm_transpose 512 8 >> ./matrix_mul/blockmm_transpose.csv ;./matrix_mul/blockmm_transpose 1024 8 >> ./matrix_mul/blockmm_transpose.csv ; ./matrix_mul/blockmm_transpose 2048 8 >> ./matrix_mul/blockmm_transpose.csv; ./matrix_mul/blockmm_transpose 4096 8 >> ./matrix_mul/blockmm_transpose.csv

gcc -O4 -DHAVE_LINUX_PERF_EVENT_H blockmm_transpose.c perfstats.c -o blockmm_transpose
234410496.000000,1406510080.000000,10521102336.000000,48070299648.000000,

In [62]:
! ./matrix_mul/blockmm_transpose 2048 8 >> ./matrix_mul/blockmm_transpose.csv 
! ./matrix_mul/blockmm_transpose 2048 16 >> ./matrix_mul/blockmm_transpose.csv 
! ./matrix_mul/blockmm_transpose 2048 32 >> ./matrix_mul/blockmm_transpose.csv 
! ./matrix_mul/blockmm_transpose 2048 64 >> ./matrix_mul/blockmm_transpose.csv
! ./matrix_mul/blockmm_transpose 2048 128 >> ./matrix_mul/blockmm_transpose.csv
! ./matrix_mul/blockmm_transpose 2048 256 >> ./matrix_mul/blockmm_transpose.csv

10521102336.000000,10521102336.000000,10521102336.000000,10521102336.000000,10521102336.000000,10521102336.000000,

In [65]:
# Load the blockmm_transpose measurements and show them as a table.
display_df_mono(render_csv("matrix_mul/blockmm_transpose.csv"))

Unnamed: 0,index,size,tile_size,IC,Cycles,CPI,CT_ns,ET_s,DL1_miss_rate,DL1_misses,DL1_accesses
0,0,512,8,1070281542,242439252,0.226519,0.245084,0.059418,0.005558,2432375,437667454
1,1,1024,8,8793396438,1993091552,0.226658,0.194256,0.38717,0.005302,19065101,3596013701
2,2,2048,8,70351792875,16020026428,0.227713,0.193448,3.099048,0.001958,56320441,28770086175
3,3,4096,8,562821740727,130085419941,0.231131,0.19344,25.163769,0.002008,462141552,230165298682
4,4,2048,8,70351901088,15875339719,0.225656,0.193362,3.06969,0.001873,53886617,28770137331
5,5,2048,16,64810538294,15149020262,0.233743,0.193439,2.930407,0.026082,705401364,27045543543
6,6,2048,32,62393733101,14902080307,0.238839,0.193426,2.882445,0.041199,1086176368,26364362607
7,7,2048,64,61255063916,13644796739,0.222754,0.193406,2.638986,0.022979,598866013,26061029869
8,8,2048,128,60709876879,16660750922,0.274432,0.19341,3.222355,0.015464,400852771,25921119076
9,9,2048,256,60438343834,17009191579,0.28143,0.193569,3.292457,0.010075,260448373,25852185272


In [64]:
# Show the tiled-kernel measurements again, for comparison with the transpose variant.
display_df_mono(render_csv("matrix_mul/blockmm.csv"))

Unnamed: 0,index,size,tile_size,IC,Cycles,CPI,CT_ns,ET_s,DL1_miss_rate,DL1_misses,DL1_accesses
0,0,512,8,1266294705,269303354,0.21267,0.201334,0.05422,0.009066,5404332,596102953
1,1,1024,8,9984901040,2149867747,0.215312,0.198968,0.427754,0.009588,45065792,4700389835
2,2,2048,8,81044308244,20255402179,0.24993,0.193509,3.9196,0.010229,390251728,38151003119
3,3,4096,8,648432613190,204886033336,0.315971,0.193437,39.632588,0.012329,3763338967,305236100590


In [40]:
# Show the blockmm_transpose_reg variant of the tiled kernel.
# NOTE(review): `show` presumably restricts the listing to the region between the
# //START and //END markers in the file -- confirm in notebook.py.
render_code("matrix_mul/blockmm_transpose_reg.c", show=["//START","//END"])

In [41]:
! cd matrix_mul; rm blockmm_transpose_reg; make blockmm_transpose_reg; echo "size,tile_size,IC,Cycles,CPI,CT_ns,ET_s,DL1_miss_rate,DL1_misses,DL1_accesses" > blockmm_transpose_reg.csv 
! ./matrix_mul/blockmm_transpose_reg 2048 8 >> ./matrix_mul/blockmm_transpose_reg.csv 
! ./matrix_mul/blockmm_transpose_reg 2048 16 >> ./matrix_mul/blockmm_transpose_reg.csv 
! ./matrix_mul/blockmm_transpose_reg 2048 32 >> ./matrix_mul/blockmm_transpose_reg.csv 
! ./matrix_mul/blockmm_transpose_reg 2048 64 >> ./matrix_mul/blockmm_transpose_reg.csv
! ./matrix_mul/blockmm_transpose_reg 2048 128 >> ./matrix_mul/blockmm_transpose_reg.csv
! ./matrix_mul/blockmm_transpose_reg 2048 256 >> ./matrix_mul/blockmm_transpose_reg.csv

rm: cannot remove 'blockmm_transpose_reg': No such file or directory
gcc -O4 -DHAVE_LINUX_PERF_EVENT_H blockmm_transpose_reg.c perfstats.c -o blockmm_transpose_reg
10521102336.000000,10521102336.000000,10521102336.000000,10521102336.000000,10521102336.000000,10521102336.000000,

In [42]:
# Load the blockmm_transpose_reg measurements and show them as a table.
display_df_mono(render_csv("matrix_mul/blockmm_transpose_reg.csv"))

Unnamed: 0,index,size,tile_size,IC,Cycles,CPI,CT_ns,ET_s,DL1_miss_rate,DL1_misses,DL1_accesses
0,0,2048,8,60667686406,11604601609,0.191281,0.193985,2.251117,0.003363,53294469,15849319789
1,1,2048,16,49205927530,10032628049,0.203891,0.193373,1.94004,0.066542,797626617,11986874817
2,2,2048,32,43854347828,9736190510,0.222012,0.193008,1.879164,0.09636,987422134,10247265803
3,3,2048,64,41246516681,10038942768,0.243389,0.193141,1.938936,0.069853,657531122,9413044702
4,4,2048,128,39962465083,11887280358,0.297461,0.193436,2.299423,0.045847,412871820,9005357520
5,5,2048,256,39326412762,14051616706,0.357307,0.193021,2.712263,0.032001,281741221,8804127506


In [69]:
# Show the rect_blockmm_trans kernel, which takes separate x and y tile sizes.
# NOTE(review): `show` presumably restricts the listing to the region between the
# //START and //END markers in the file -- confirm in notebook.py.
render_code("matrix_mul/rect_blockmm_trans.c", show=["//START","//END"])

In [70]:
! cd matrix_mul; make rect_blockmm_trans; echo "size,tile_size_x,tile_size_y,IC,Cycles,CPI,CT_ns,ET_s,DL1_miss_rate,DL1_misses,DL1_accesses" > rect_blockmm_trans.csv
! taskset -c 8 ./matrix_mul/rect_blockmm_trans 2048 8 8 >> ./matrix_mul/rect_blockmm_trans.csv 
! taskset -c 8 ./matrix_mul/rect_blockmm_trans 2048 8 16 >> ./matrix_mul/rect_blockmm_trans.csv 
! taskset -c 8 ./matrix_mul/rect_blockmm_trans 2048 16 8 >> ./matrix_mul/rect_blockmm_trans.csv
! taskset -c 8 ./matrix_mul/rect_blockmm_trans 2048 16 16 >> ./matrix_mul/rect_blockmm_trans.csv
! taskset -c 8 ./matrix_mul/rect_blockmm_trans 2048 32 8 >> ./matrix_mul/rect_blockmm_trans.csv 
! taskset -c 8 ./matrix_mul/rect_blockmm_trans 2048 32 16 >> ./matrix_mul/rect_blockmm_trans.csv 
! taskset -c 8 ./matrix_mul/rect_blockmm_trans 2048 64 8 >> ./matrix_mul/rect_blockmm_trans.csv
! taskset -c 8 ./matrix_mul/rect_blockmm_trans 2048 128 8 >> ./matrix_mul/rect_blockmm_trans.csv
! taskset -c 8 ./matrix_mul/rect_blockmm_trans 2048 256 8 >> ./matrix_mul/rect_blockmm_trans.csv
display_df_mono(render_csv("matrix_mul/rect_blockmm_trans.csv"))

gcc -O4 -DHAVE_LINUX_PERF_EVENT_H rect_blockmm_trans.c perfstats.c -o rect_blockmm_trans
10521102336.000000,10521102336.000000,10521102336.000000,10521102336.000000,10521102336.000000,10521102336.000000,10521102336.000000,10521102336.000000,10521102336.000000,

Unnamed: 0,index,size,tile_size_x,tile_size_y,IC,Cycles,CPI,CT_ns,ET_s,DL1_miss_rate,DL1_misses,DL1_accesses
0,0,2048,8,8,60696048824,11648145236,0.191909,0.193631,2.255442,0.00364,57631686,15832143536
1,1,2048,8,16,59757217120,12559842308,0.210181,0.193581,2.431349,0.02185,335952048,15375120532
2,2,2048,16,8,49689553829,11438657060,0.230202,0.193368,2.211867,0.00402,49108352,12216407413
3,3,2048,16,16,49215689298,10068305671,0.204575,0.19351,1.948321,0.06711,804396675,11986190409
4,4,2048,32,8,44182439997,9418862226,0.213181,0.193463,1.822202,0.005665,58957446,10407108672
5,5,2048,32,16,43946919900,9736766008,0.221557,0.19334,1.882503,0.06652,684653454,10292465017
6,6,2048,64,8,41431376817,9605874603,0.23185,0.193431,1.858076,0.005764,54775117,9503383681
7,7,2048,128,8,40059753243,11627416594,0.290252,0.193339,2.248035,0.006747,61082659,9052969176
8,8,2048,256,8,39377061980,13821513823,0.351004,0.193481,2.674196,0.004036,35632020,8828931122


## Prefetch

x86 provides prefetch instructions. As a programmer, you may insert ```_mm_prefetch``` in x86 programs to perform software prefetching in your code. The gcc compiler also has a flag, ```-fprefetch-loop-arrays```, that automatically inserts software prefetch instructions.

### Using prefetch in matrix transpose code

The following example is a highly optimized matrix transpose code. In the example, we try to prefetch the next row.

In [18]:
# Show the C++ transpose source used for the prefetch experiment.
# NOTE(review): `show` presumably restricts the listing to the region between the
# //START and //END markers in the file -- confirm in notebook.py.
render_code("./prefetch/transpose.cpp", lang="c++", show=["//START", "//END"])

Now, let's take a look at what's happening!

In [19]:
# Rebuild both transpose binaries: `transpose` (baseline) and `transpose_prefetch`
# (same source compiled with -DENABLE_PREFETCH).
! cd prefetch; make clean; make
# Kept for reference: the same baseline run on a remote host over ssh.
# ! echo "Without prefetch -- the baseline"; ssh htseng@celebi "lscpu | grep Model; cd courses/CS203/demo/memory/prefetch/; ./transpose"
! echo "Without prefetch -- the baseline"
# Record which CPU the timings below belong to.
! lscpu | grep Model
! ./prefetch/transpose
! echo "With prefetch"
! ./prefetch/transpose_prefetch

rm -f blockmm_sse blockmm blockmm_sse_prefetch transpose transpose_prefetch
g++ -msse4.1 -mavx -O3 transpose.cpp -o transpose 
g++ -msse4.1 -mavx -O3 -DENABLE_PREFETCH transpose.cpp -o transpose_prefetch 
Without prefetch -- the baseline
Model name:                           13th Gen Intel(R) Core(TM) i7-13700
Model:                                183
bytes = 4294967296
Starting Data Transpose...   Done
Time: 0.510648 seconds
With prefetch
bytes = 4294967296
Starting Data Transpose...   Done
Time: 0.430467 seconds


Let's try a different machine now.

In [20]:
# Rebuild the prefetch demo on the remote host xerneas, print its CPU model, then
# time the baseline and the prefetch build there.
# NOTE(review): these commands use the CSE142 tree on the remote host, whereas the local
# matrix_mul build logged a CS203 path, and the recorded outputs report different
# `bytes` values locally and remotely. Compare with/without prefetch on the same
# machine rather than absolute times across machines -- confirm the sources match.
! ssh htseng@xerneas "cd /nfshome/htseng/courses/CSE142/demo/memory/; make -C ./prefetch clean; make -C ./prefetch ; lscpu | grep Model"
! echo "Without prefetch -- the baseline"; ssh htseng@xerneas  "/nfshome/htseng/courses/CSE142/demo/memory/prefetch/transpose"
! echo "With prefetch";  ssh htseng@xerneas  "/nfshome/htseng/courses/CSE142/demo/memory/prefetch/transpose_prefetch"

make: Entering directory '/nfshome/htseng/courses/CSE142/demo/memory/prefetch'
rm -f blockmm_sse blockmm blockmm_sse_prefetch transpose transpose_prefetch
make: Leaving directory '/nfshome/htseng/courses/CSE142/demo/memory/prefetch'
make: Entering directory '/nfshome/htseng/courses/CSE142/demo/memory/prefetch'
g++ -msse4.1 -mavx -O3 transpose.cpp -o transpose 
g++ -msse4.1 -mavx -O3 -DENABLE_PREFETCH transpose.cpp -o transpose_prefetch 
make: Leaving directory '/nfshome/htseng/courses/CSE142/demo/memory/prefetch'
Model name:                           AMD Ryzen 9 5950X 16-Core Processor
Model:                                33
Without prefetch -- the baseline
bytes = 1073741824
Starting Data Transpose...   Done
Time: 0.115764 seconds
With prefetch
bytes = 1073741824
Starting Data Transpose...   Done
Time: 0.108927 seconds


In [21]:
# Same experiment on the remote host blissey: rebuild, print the CPU model, then
# time the baseline and the prefetch build.
! ssh htseng@blissey "cd /nfshome/htseng/courses/CSE142/demo/memory/; make -C ./prefetch clean; make -C ./prefetch ; lscpu | grep Model"
! echo "Without prefetch -- the baseline"; ssh htseng@blissey  "/nfshome/htseng/courses/CSE142/demo/memory/prefetch/transpose"
! echo "With prefetch";  ssh htseng@blissey  "/nfshome/htseng/courses/CSE142/demo/memory/prefetch/transpose_prefetch"

make: Entering directory '/nfshome/htseng/courses/CSE142/demo/memory/prefetch'
rm -f blockmm_sse blockmm blockmm_sse_prefetch transpose transpose_prefetch
make: Leaving directory '/nfshome/htseng/courses/CSE142/demo/memory/prefetch'
make: Entering directory '/nfshome/htseng/courses/CSE142/demo/memory/prefetch'
g++ -msse4.1 -mavx -O3 transpose.cpp -o transpose 
g++ -msse4.1 -mavx -O3 -DENABLE_PREFETCH transpose.cpp -o transpose_prefetch 
make: Leaving directory '/nfshome/htseng/courses/CSE142/demo/memory/prefetch'
Model name:                           AMD Ryzen 7 5700X 8-Core Processor
Model:                                33
Without prefetch -- the baseline
bytes = 1073741824
Starting Data Transpose...   Done
Time: 0.103637 seconds
With prefetch
bytes = 1073741824
Starting Data Transpose...   Done
Time: 0.096679 seconds


In [22]:
# Same experiment on the remote host eevee: rebuild, print the CPU model, then
# time the baseline and the prefetch build.
! ssh htseng@eevee "cd /nfshome/htseng/courses/CSE142/demo/memory/; make -C ./prefetch clean; make -C ./prefetch ; lscpu | grep Model"
! echo "Without prefetch -- the baseline"; ssh htseng@eevee  "/nfshome/htseng/courses/CSE142/demo/memory/prefetch/transpose"
! echo "With prefetch";  ssh htseng@eevee  "/nfshome/htseng/courses/CSE142/demo/memory/prefetch/transpose_prefetch"

make: Entering directory '/nfshome/htseng/courses/CSE142/demo/memory/prefetch'
rm -f blockmm_sse blockmm blockmm_sse_prefetch transpose transpose_prefetch
make: Leaving directory '/nfshome/htseng/courses/CSE142/demo/memory/prefetch'
make: Entering directory '/nfshome/htseng/courses/CSE142/demo/memory/prefetch'
g++ -msse4.1 -mavx -O3 transpose.cpp -o transpose 
g++ -msse4.1 -mavx -O3 -DENABLE_PREFETCH transpose.cpp -o transpose_prefetch 
make: Leaving directory '/nfshome/htseng/courses/CSE142/demo/memory/prefetch'
Model:                              85
Model name:                         Intel(R) Xeon(R) Silver 4108 CPU @ 1.80GHz
Without prefetch -- the baseline
bytes = 1073741824
Starting Data Transpose...   Done
Time: 0.270896 seconds
With prefetch
bytes = 1073741824
Starting Data Transpose...   Done
Time: 0.238537 seconds



-- It doesn't always work!

In [23]:
# Show the blockmm_interchange variant of the tiled kernel.
# NOTE(review): `show` presumably restricts the listing to the region between the
# //START and //END markers in the file -- confirm in notebook.py.
render_code("matrix_mul/blockmm_interchange.c", show=["//START","//END"])

In [9]:
# Force a rebuild of blockmm_interchange and start a fresh CSV with the column header.
! cd matrix_mul; rm -f blockmm_interchange; make blockmm_interchange; echo "size,tile_size,IC,Cycles,CPI,CT_ns,ET_s,DL1_miss_rate,DL1_misses,DL1_accesses" > blockmm_interchange.csv
# Tile-size sweep on the 2048x2048 problem; each run appends one row.
! ./matrix_mul/blockmm_interchange 2048 8 >> ./matrix_mul/blockmm_interchange.csv 
! ./matrix_mul/blockmm_interchange 2048 16 >> ./matrix_mul/blockmm_interchange.csv 
! ./matrix_mul/blockmm_interchange 2048 32 >> ./matrix_mul/blockmm_interchange.csv 
! ./matrix_mul/blockmm_interchange 2048 64 >> ./matrix_mul/blockmm_interchange.csv
! ./matrix_mul/blockmm_interchange 2048 128 >> ./matrix_mul/blockmm_interchange.csv
! ./matrix_mul/blockmm_interchange 2048 256 >> ./matrix_mul/blockmm_interchange.csv
# NOTE: `>` recreates blockmm.csv, so rows collected by earlier cells are discarded
# and the baseline is re-measured here for a side-by-side comparison.
! cd matrix_mul; echo "size,tile_size,IC,Cycles,CPI,CT_ns,ET_s,DL1_miss_rate,DL1_misses,DL1_accesses" > blockmm.csv
! ./matrix_mul/blockmm 2048 16 >> ./matrix_mul/blockmm.csv 
! ./matrix_mul/blockmm 2048 32 >> ./matrix_mul/blockmm.csv 
! ./matrix_mul/blockmm 2048 64 >> ./matrix_mul/blockmm.csv
! ./matrix_mul/blockmm 2048 128 >> ./matrix_mul/blockmm.csv 
! ./matrix_mul/blockmm 2048 256 >> ./matrix_mul/blockmm.csv 
! ./matrix_mul/blockmm 2048 512 >> ./matrix_mul/blockmm.csv 
# Baseline tiled kernel first, loop-interchanged kernel second.
display_df_mono(render_csv("matrix_mul/blockmm.csv"))
display_df_mono(render_csv("matrix_mul/blockmm_interchange.csv"))


gcc -O3 -DHAVE_LINUX_PERF_EVENT_H blockmm_interchange.c perfstats.c -o blockmm_interchange
[01m[Kblockmm_interchange.c:[m[K In function ‘[01m[Kmain[m[K’:
   48 |   printf("%d,[01;35m[K%lu[m[K,",ARRAY_SIZE,[32m[Ktile_size[m[K);
      |              [01;35m[K~~^[m[K              [32m[K~~~~~~~~~[m[K
      |                [01;35m[K|[m[K              [32m[K|[m[K
      |                [01;35m[K|[m[K              [32m[Kint[m[K
      |                [01;35m[Klong unsigned int[m[K
      |              [32m[K%u[m[K
10521102336.000000,10521102336.000000,10521102336.000000,10521102336.000000,10521102336.000000,10521102336.000000,

Unnamed: 0,index,size,tile_size,IC,Cycles,CPI,CT_ns,ET_s,DL1_miss_rate,DL1_misses,DL1_accesses
0,0,2048,16,74473386911,19237819138,0.258318,0.193622,3.72486,0.07129,2573932385,36105251964
1,1,2048,32,71544344489,28028089435,0.391758,0.193641,5.427381,0.217403,7655790077,35214730065
2,2,2048,64,70028007243,32675835340,0.466611,0.194037,6.340316,0.242686,8429345312,34733602612
3,3,2048,128,69474313293,37149820875,0.534727,0.193606,7.192432,0.244877,8470773775,34591994955
4,4,2048,256,69120040189,53298747450,0.771104,0.193899,10.334574,0.241203,8316237916,34478129179
5,5,2048,512,69044315168,72573739373,1.051118,0.193706,14.057958,0.235176,8106169032,34468477710


Unnamed: 0,index,size,tile_size,IC,Cycles,CPI,CT_ns,ET_s,DL1_miss_rate,DL1_misses,DL1_accesses
0,0,2048,8,68604448245,15718358160,0.229116,0.193676,3.044266,0.013131,316092777,24071954131
1,1,2048,16,50678840944,13624971615,0.268849,0.19375,2.639845,0.072926,1318845843,18084631718
2,2,2048,32,42341050047,10245513297,0.241976,0.193869,1.986286,0.075526,1162781546,15395852107
3,3,2048,64,38315079204,8809988526,0.229935,0.193705,1.706541,0.07667,1082713591,14121753798
4,4,2048,128,36334188565,8223440647,0.226328,0.193631,1.592312,0.076101,1027416171,13500753876
5,5,2048,256,35355990869,9389590026,0.265573,0.193904,1.82068,0.03085,407096195,13195857929
