In [1]:
# Please execute (shift-return) this cell every time you run the notebook.  Don't edit it.
# autoreload mode 2 re-imports modules before each cell runs, so edits to the helper
# module are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# NOTE(review): the star import is presumably where render_code, render_csv,
# display_df_mono, pd and math used by later cells come from — confirm against the
# local `notebook` module.
from notebook import * 

## Case study: matrix multiplications

GEMM, which computes $C = A \times B$, is the core of many AI/ML applications. The most naive implementation of GEMM takes $O(n^3)$ time. Assume it takes 1 second to perform GEMM on 1,024$\times$1,024$\times$1,024 matrices. How much time do you expect it would take for 2,048$\times$2,048$\times$2,048 matrices?

In [2]:
# Display the baseline GEMM source. The show= markers presumably restrict the listing to
# the region between //START and //END — confirm against the render_code helper.
render_code("matrix_mul/mm.c", show=["//START","//END"])

In [3]:
# Rebuild the baseline binary from scratch: remove old build products, then build `mm`.
! cd matrix_mul; make clean; make mm

rm -f blockmm mm blockmm_transpose cachegrind.* mm_dump rect_blockmm_trans blockmm_transpose_reg blockmm_reg
gcc -DHAVE_LINUX_PERF_EVENT_H -O3 mm.c perfstats.c -o mm 


In [4]:
! cd matrix_mul; echo "IC,Cycles,CPI,CT_ns,ET_s,DL1_miss_rate,DL1_misses,DL1_accesses" > mm.csv
! ./matrix_mul/mm 512 >> ./matrix_mul/mm.csv ;./matrix_mul/mm 1024 >> ./matrix_mul/mm.csv ; ./matrix_mul/mm 2048 >> ./matrix_mul/mm.csv
#! cs203 job memory "./matrix_mul/mm 1024 >> ./matrix_mul/mm.csv ; ./matrix_mul/mm 2048 >> ./matrix_mul/mm.csv"

234410496.000000,1406510080.000000,10521102336.000000,

In [5]:
# Show the baseline measurements collected in mm.csv as a table.
display_df_mono(render_csv("matrix_mul/mm.csv"))

Unnamed: 0,index,IC,Cycles,CPI,CT_ns,ET_s,DL1_miss_rate,DL1_misses,DL1_accesses
0,512,1071043567,698501006,0.652169,0.195353,0.136454,0.249307,133270751,534563975
1,1024,8601742532,11428001262,1.328568,0.179488,2.05119,0.234866,1008623039,4294461854
2,2048,69008688454,120902692467,1.751992,0.179027,21.644796,0.307707,10602706748,34457192646


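WOW! Computational complexity breaks again! An $O(n^3)$ algorithm should take about 8$\times$ longer when $n$ doubles, yet in the run above going from 1,024 to 2,048 took more than 10$\times$ longer. GEMM performance goes wild because of cache misses!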

What kind of misses are we seeing?

In [6]:
! make -C matrix_mul mm_dump; ./matrix_mul/mm_dump 256 >& mm_dump_address.csv

make: Entering directory '/nfshome/htseng/courses/CSE142/demo/matrix_mul/matrix_mul'
gcc -DHAVE_LINUX_PERF_EVENT_H -DDUMP -O3 mm.c perfstats.c -o mm_dump 
make: Leaving directory '/nfshome/htseng/courses/CSE142/demo/matrix_mul/matrix_mul'


In [7]:
! echo "element,address" > mm_dump_addresses_digest.csv 
! head -n 101 mm_dump_address.csv | grep "b\[" >> mm_dump_addresses_digest.csv
df = pd.read_csv("mm_dump_addresses_digest.csv",skipfooter=1,engine='python')
df["address"] = df["address"].str.replace('0x','')
df["address"]=df[["address"]].apply(lambda x: x.astype(str).map(lambda x: int(x, base=16)))
# only show the first N addresses 
#N = 32
#df2 = df2.iloc[:N]
C = 49152
B = 64
A = 12
offset_bits = int(math.log2(B))
S = int(C/(B*A))
index_bits = int(math.log2(S))
df["tag"]=(df["address"].apply(lambda x: x >> (offset_bits+index_bits)))
df["tag"] = df["tag"].apply(lambda x: hex(x))
df["index"] = df["address"].apply(lambda x: hex((x>>offset_bits)%S))
df["address"] = df["address"].apply(lambda x: hex(x))
display_df_mono(df)

Unnamed: 0,element,address,tag,index
0,b[0][0],0x7a0e4f8ff000,0x7a0e4f8ff,0x0
1,b[1][0],0x7a0e4f8ff800,0x7a0e4f8ff,0x20
2,b[2][0],0x7a0e4f900000,0x7a0e4f900,0x0
3,b[3][0],0x7a0e4f900800,0x7a0e4f900,0x20
4,b[4][0],0x7a0e4f901000,0x7a0e4f901,0x0
5,b[5][0],0x7a0e4f901800,0x7a0e4f901,0x20
6,b[6][0],0x7a0e4f902000,0x7a0e4f902,0x0
7,b[7][0],0x7a0e4f902800,0x7a0e4f902,0x20
8,b[8][0],0x7a0e4f903000,0x7a0e4f903,0x0
9,b[9][0],0x7a0e4f903800,0x7a0e4f903,0x20


### Matrix tiling algorithm

Let's try to partition GEMM into smaller tiles!

In [8]:
# Display the tiled GEMM source (presumably only the //START..//END region — confirm).
render_code("matrix_mul/blockmm.c", show=["//START","//END"])

In [9]:
# Remove old build products and build the tiled version in a single make invocation.
# The compiler warning in the output comes from a printf format in blockmm.c, not from this cell.
! cd matrix_mul/; make clean blockmm

rm -f blockmm mm blockmm_transpose cachegrind.* mm_dump rect_blockmm_trans blockmm_transpose_reg blockmm_reg
gcc -O4 -DHAVE_LINUX_PERF_EVENT_H blockmm.c perfstats.c -o blockmm 
[01m[Kblockmm.c:[m[K In function ‘[01m[Kmain[m[K’:
   48 |   printf("%d,[01;35m[K%lu[m[K,",ARRAY_SIZE,[32m[Ktile_size[m[K);
      |              [01;35m[K~~^[m[K              [32m[K~~~~~~~~~[m[K
      |                [01;35m[K|[m[K              [32m[K|[m[K
      |                [01;35m[K|[m[K              [32m[Kint[m[K
      |                [01;35m[Klong unsigned int[m[K
      |              [32m[K%u[m[K


## Try with tile size == 32

In [10]:
! cd matrix_mul; echo "size,tile_size,IC,Cycles,CPI,CT_ns,ET_s,DL1_miss_rate,DL1_misses,DL1_accesses" > blockmm.csv
! ./matrix_mul/blockmm 512 32 >> ./matrix_mul/blockmm.csv ;./matrix_mul/blockmm 1024 32 >> ./matrix_mul/blockmm.csv ; ./matrix_mul/blockmm 2048 32 >> ./matrix_mul/blockmm.csv; ./matrix_mul/blockmm 4096 32 >> ./matrix_mul/blockmm.csv

In [11]:
# Baseline results (mm.csv) followed by the tiled results (blockmm.csv) for comparison.
display_df_mono(render_csv("matrix_mul/mm.csv"))
display_df_mono(render_csv("matrix_mul/blockmm.csv"))

Unnamed: 0,index,IC,Cycles,CPI,CT_ns,ET_s,DL1_miss_rate,DL1_misses,DL1_accesses
0,512,1061960818,653553684,0.615422,0.196614,0.128498,0.251323,133211134,530039511
1,1024,8620012508,10646618738,1.235105,0.192918,2.053924,0.241849,1040907527,4303962559
2,2048,69005319250,120917729906,1.752296,0.192898,23.324737,0.319863,11021555745,34457110660


Unnamed: 0,index,size,tile_size,IC,Cycles,CPI,CT_ns,ET_s,DL1_miss_rate,DL1_misses,DL1_accesses
0,0,512,32,1068930233,376067010,0.351816,0.209489,0.078782,0.214591,112907449,526152080
1,1,1024,32,8922608387,3172290478,0.355534,0.193754,0.614643,0.214054,940105005,4391898722
2,2,2048,32,71544146772,27920098656,0.39025,0.192857,5.384582,0.21827,7686299255,35214585736
3,3,4096,32,572363257950,232157754011,0.405613,0.192884,44.77955,0.222682,62734171796,281720589313


## Try with tile size == 8

In [12]:
! cd matrix_mul; echo "size,tile_size,IC,Cycles,CPI,CT_ns,ET_s,DL1_miss_rate,DL1_misses,DL1_accesses" > blockmm.csv
! ./matrix_mul/blockmm 512 8 >> ./matrix_mul/blockmm.csv ;./matrix_mul/blockmm 1024 8 >> ./matrix_mul/blockmm.csv ; ./matrix_mul/blockmm 2048 8 >> ./matrix_mul/blockmm.csv; ./matrix_mul/blockmm 4096 8 >> ./matrix_mul/blockmm.csv

In [13]:
# Baseline results (mm.csv) followed by the tile-size-8 results now held in blockmm.csv.
display_df_mono(render_csv("matrix_mul/mm.csv"))
display_df_mono(render_csv("matrix_mul/blockmm.csv"))

Unnamed: 0,index,IC,Cycles,CPI,CT_ns,ET_s,DL1_miss_rate,DL1_misses,DL1_accesses
0,512,1061960818,653553684,0.615422,0.196614,0.128498,0.251323,133211134,530039511
1,1024,8620012508,10646618738,1.235105,0.192918,2.053924,0.241849,1040907527,4303962559
2,2048,69005319250,120917729906,1.752296,0.192898,23.324737,0.319863,11021555745,34457110660


Unnamed: 0,index,size,tile_size,IC,Cycles,CPI,CT_ns,ET_s,DL1_miss_rate,DL1_misses,DL1_accesses
0,0,512,8,1252162667,258442843,0.206397,0.198036,0.051181,0.008962,5282549,589466798
1,1,1024,8,10129952586,2191608567,0.216349,0.193188,0.423392,0.011171,53272742,4768679525
2,2,2048,8,81050488696,22954883140,0.283217,0.19288,4.427539,0.01236,471582948,38153229833
3,3,4096,8,648278071412,220189925522,0.339654,0.192983,42.492996,0.012539,3826267780,305159418539


In [14]:
! ./matrix_mul/blockmm 2048 4 >> ./matrix_mul/blockmm.csv
! ./matrix_mul/blockmm 2048 16 >> ./matrix_mul/blockmm.csv 
! ./matrix_mul/blockmm 2048 32 >> ./matrix_mul/blockmm.csv 
! ./matrix_mul/blockmm 2048 64 >> ./matrix_mul/blockmm.csv
! ./matrix_mul/blockmm 2048 128 >> ./matrix_mul/blockmm.csv 
! ./matrix_mul/blockmm 2048 256 >> ./matrix_mul/blockmm.csv 
! ./matrix_mul/blockmm 2048 512 >> ./matrix_mul/blockmm.csv 
display_df_mono(render_csv("matrix_mul/blockmm.csv"))

Unnamed: 0,index,size,tile_size,IC,Cycles,CPI,CT_ns,ET_s,DL1_miss_rate,DL1_misses,DL1_accesses
0,0,512,8,1252162667,258442843,0.206397,0.198036,0.051181,0.008962,5282549,589466798
1,1,1024,8,10129952586,2191608567,0.216349,0.193188,0.423392,0.011171,53272742,4768679525
2,2,2048,8,81050488696,22954883140,0.283217,0.19288,4.427539,0.01236,471582948,38153229833
3,3,4096,8,648278071412,220189925522,0.339654,0.192983,42.492996,0.012539,3826267780,305159418539
4,4,2048,4,97765155630,23910911547,0.244575,0.192876,4.611848,0.014824,646946564,43641302837
5,5,2048,16,74473053838,19039961765,0.255662,0.192883,3.672485,0.071692,2588440121,36105056797
6,6,2048,32,71543789979,27793699711,0.388485,0.192828,5.359415,0.21755,7660926965,35214525518
7,7,2048,64,70150334907,32656079820,0.465516,0.192876,6.298579,0.24232,8431415862,34794600845
8,8,2048,128,69480805743,40163611929,0.578053,0.19289,7.747152,0.24253,8390172850,34594313325
9,9,2048,256,69218509291,76718573795,1.108353,0.192835,14.794016,0.234369,8090660159,34520978225


In [15]:
# Display the blockmm_reg.c source (presumably only the //START..//END region — confirm).
render_code("matrix_mul/blockmm_reg.c", show=["//START","//END"])

In [16]:
! cd matrix_mul; make blockmm_reg; echo "size,tile_size,IC,Cycles,CPI,CT_ns,ET_s,DL1_miss_rate,DL1_misses,DL1_accesses" > blockmm_reg.csv
! ./matrix_mul/blockmm_reg 2048 4 >> ./matrix_mul/blockmm_reg.csv
! ./matrix_mul/blockmm_reg 2048 8 >> ./matrix_mul/blockmm_reg.csv
! ./matrix_mul/blockmm_reg 2048 16 >> ./matrix_mul/blockmm_reg.csv 
! ./matrix_mul/blockmm_reg 2048 32 >> ./matrix_mul/blockmm_reg.csv 
! ./matrix_mul/blockmm_reg 2048 64 >> ./matrix_mul/blockmm_reg.csv
! ./matrix_mul/blockmm_reg 2048 128 >> ./matrix_mul/blockmm_reg.csv 
! ./matrix_mul/blockmm_reg 2048 256 >> ./matrix_mul/blockmm_reg.csv 
! ./matrix_mul/blockmm_reg 2048 512 >> ./matrix_mul/blockmm_reg.csv 
display_df_mono(render_csv("matrix_mul/blockmm_reg.csv"))

gcc -O4 -DHAVE_LINUX_PERF_EVENT_H blockmm_reg.c perfstats.c -o blockmm_reg 
[01m[Kblockmm_reg.c:[m[K In function ‘[01m[Kmain[m[K’:
   48 |   printf("%d,[01;35m[K%lu[m[K,",ARRAY_SIZE,[32m[Ktile_size[m[K);
      |              [01;35m[K~~^[m[K              [32m[K~~~~~~~~~[m[K
      |                [01;35m[K|[m[K              [32m[K|[m[K
      |                [01;35m[K|[m[K              [32m[Kint[m[K
      |                [01;35m[Klong unsigned int[m[K
      |              [32m[K%u[m[K


Unnamed: 0,index,size,tile_size,IC,Cycles,CPI,CT_ns,ET_s,DL1_miss_rate,DL1_misses,DL1_accesses
0,0,2048,4,106234829210,29526138843,0.277933,0.192884,5.695115,0.018454,778230102,42170805666
1,1,2048,8,78811673347,19143288700,0.242899,0.192887,3.692499,0.013183,405848337,30785445740
2,2,2048,16,66919450423,14370023632,0.214736,0.192868,2.771518,0.095698,2482068985,25936448800
3,3,2048,32,61315393202,18319999035,0.298783,0.192919,3.534284,0.296296,7014187464,23672933056
4,4,2048,64,58580590342,20780125693,0.354727,0.192873,4.007935,0.378729,8549296454,22573640158
5,5,2048,128,57252021061,33321631102,0.582017,0.192816,6.424938,0.394414,8692931716,22040138973
6,6,2048,256,56663842521,73825505310,1.302868,0.192843,14.236737,0.397473,8665689821,21801983484
7,7,2048,512,56332523319,76439494941,1.356934,0.192926,14.747173,0.398939,8644744220,21669335382


In [17]:
# Display the blockmm_transpose.c source (presumably only the //START..//END region — confirm).
render_code("matrix_mul/blockmm_transpose.c", show=["//START","//END"])

### Matrix transpose

In [18]:
! cd matrix_mul; rm blockmm_transpose; make blockmm_transpose; echo "size,tile_size,IC,Cycles,CPI,CT_ns,ET_s,DL1_miss_rate,DL1_misses,DL1_accesses" > blockmm_transpose.csv
! ./matrix_mul/blockmm_transpose 512 8 >> ./matrix_mul/blockmm_transpose.csv ;./matrix_mul/blockmm_transpose 1024 8 >> ./matrix_mul/blockmm_transpose.csv ; ./matrix_mul/blockmm_transpose 2048 8 >> ./matrix_mul/blockmm_transpose.csv; ./matrix_mul/blockmm_transpose 4096 8 >> ./matrix_mul/blockmm_transpose.csv

rm: cannot remove 'blockmm_transpose': No such file or directory
gcc -O4 -DHAVE_LINUX_PERF_EVENT_H blockmm_transpose.c perfstats.c -o blockmm_transpose
234410496.000000,1406510080.000000,10521102336.000000,48070299648.000000,

In [19]:
! ./matrix_mul/blockmm_transpose 2048 8 >> ./matrix_mul/blockmm_transpose.csv 
! ./matrix_mul/blockmm_transpose 2048 16 >> ./matrix_mul/blockmm_transpose.csv 
! ./matrix_mul/blockmm_transpose 2048 32 >> ./matrix_mul/blockmm_transpose.csv 
! ./matrix_mul/blockmm_transpose 2048 64 >> ./matrix_mul/blockmm_transpose.csv
! ./matrix_mul/blockmm_transpose 2048 128 >> ./matrix_mul/blockmm_transpose.csv
! ./matrix_mul/blockmm_transpose 2048 256 >> ./matrix_mul/blockmm_transpose.csv

10521102336.000000,10521102336.000000,10521102336.000000,10521102336.000000,10521102336.000000,10521102336.000000,

In [20]:
# Show every row collected so far in blockmm_transpose.csv (size sweep, then tile sweep).
display_df_mono(render_csv("matrix_mul/blockmm_transpose.csv"))

Unnamed: 0,index,size,tile_size,IC,Cycles,CPI,CT_ns,ET_s,DL1_miss_rate,DL1_misses,DL1_accesses
0,0,512,8,901885031,209155472,0.231909,0.278343,0.058217,0.00918,3385907,368822436
1,1,1024,8,8800047471,2033323256,0.231058,0.194499,0.395479,0.003077,11072198,3598925275
2,2,2048,8,70303735042,16604778639,0.236186,0.193792,3.217876,0.004851,139480008,28752644677
3,3,4096,8,563051814291,133174944654,0.236523,0.19295,25.696122,0.002712,624514129,230267469789
4,4,2048,8,70409639271,16428244610,0.233324,0.192921,3.169353,0.002275,65515759,28795768436
5,5,2048,16,64863212169,14089119040,0.217213,0.192914,2.717984,0.02945,797186418,27069231006
6,6,2048,32,62449859212,14948381355,0.239366,0.19305,2.885786,0.041332,1090736945,26389380429
7,7,2048,64,61312079112,13869624434,0.226214,0.19291,2.675586,0.026448,689926161,26086450627
8,8,2048,128,60765854546,17115197516,0.281658,0.193008,3.303366,0.04077,1057810020,25946043190
9,9,2048,256,60495829530,17574446045,0.290507,0.193,3.391866,0.040118,1038173674,25877735239


In [21]:
# Re-display the non-transposed tiled results so they can be compared with the
# transposed results shown by the previous cell.
display_df_mono(render_csv("matrix_mul/blockmm.csv"))

Unnamed: 0,index,size,tile_size,IC,Cycles,CPI,CT_ns,ET_s,DL1_miss_rate,DL1_misses,DL1_accesses
0,0,512,8,1252162667,258442843,0.206397,0.198036,0.051181,0.008962,5282549,589466798
1,1,1024,8,10129952586,2191608567,0.216349,0.193188,0.423392,0.011171,53272742,4768679525
2,2,2048,8,81050488696,22954883140,0.283217,0.19288,4.427539,0.01236,471582948,38153229833
3,3,4096,8,648278071412,220189925522,0.339654,0.192983,42.492996,0.012539,3826267780,305159418539
4,4,2048,4,97765155630,23910911547,0.244575,0.192876,4.611848,0.014824,646946564,43641302837
5,5,2048,16,74473053838,19039961765,0.255662,0.192883,3.672485,0.071692,2588440121,36105056797
6,6,2048,32,71543789979,27793699711,0.388485,0.192828,5.359415,0.21755,7660926965,35214525518
7,7,2048,64,70150334907,32656079820,0.465516,0.192876,6.298579,0.24232,8431415862,34794600845
8,8,2048,128,69480805743,40163611929,0.578053,0.19289,7.747152,0.24253,8390172850,34594313325
9,9,2048,256,69218509291,76718573795,1.108353,0.192835,14.794016,0.234369,8090660159,34520978225


In [22]:
# Display the blockmm_transpose_reg.c source (presumably only the //START..//END region — confirm).
render_code("matrix_mul/blockmm_transpose_reg.c", show=["//START","//END"])

In [23]:
! cd matrix_mul; rm blockmm_transpose_reg; make blockmm_transpose_reg; echo "size,tile_size,IC,Cycles,CPI,CT_ns,ET_s,DL1_miss_rate,DL1_misses,DL1_accesses" > blockmm_transpose_reg.csv 
! ./matrix_mul/blockmm_transpose_reg 2048 8 >> ./matrix_mul/blockmm_transpose_reg.csv 
! ./matrix_mul/blockmm_transpose_reg 2048 16 >> ./matrix_mul/blockmm_transpose_reg.csv 
! ./matrix_mul/blockmm_transpose_reg 2048 32 >> ./matrix_mul/blockmm_transpose_reg.csv 
! ./matrix_mul/blockmm_transpose_reg 2048 64 >> ./matrix_mul/blockmm_transpose_reg.csv
! ./matrix_mul/blockmm_transpose_reg 2048 128 >> ./matrix_mul/blockmm_transpose_reg.csv
! ./matrix_mul/blockmm_transpose_reg 2048 256 >> ./matrix_mul/blockmm_transpose_reg.csv

rm: cannot remove 'blockmm_transpose_reg': No such file or directory
gcc -O4 -DHAVE_LINUX_PERF_EVENT_H blockmm_transpose_reg.c perfstats.c -o blockmm_transpose_reg
10521102336.000000,10521102336.000000,10521102336.000000,10521102336.000000,10521102336.000000,10521102336.000000,

In [24]:
# Show the tile-size sweep collected in blockmm_transpose_reg.csv.
display_df_mono(render_csv("matrix_mul/blockmm_transpose_reg.csv"))

Unnamed: 0,index,size,tile_size,IC,Cycles,CPI,CT_ns,ET_s,DL1_miss_rate,DL1_misses,DL1_accesses
0,0,2048,8,60795543210,16443740560,0.270476,0.19288,3.171676,0.006049,96147899,15894291438
1,1,2048,16,49276413789,10179848642,0.206587,0.193034,1.96506,0.056391,677564182,12015560891
2,2,2048,32,43911609431,9876367294,0.224915,0.192843,1.904593,0.100447,1031873933,10272779421
3,3,2048,64,41304283600,10173252759,0.2463,0.192753,1.960928,0.078061,736795649,9438766894
4,4,2048,128,40018901395,11931305098,0.298142,0.192799,2.300348,0.098078,885689501,9030502548
5,5,2048,256,39380371191,13993515768,0.355342,0.193364,2.705837,0.030068,265453438,8828441916


In [25]:
# Display the rect_blockmm_trans.c source (presumably only the //START..//END region — confirm).
render_code("matrix_mul/rect_blockmm_trans.c", show=["//START","//END"])

In [26]:
! cd matrix_mul; make rect_blockmm_trans; echo "size,tile_size_x,tile_size_y,IC,Cycles,CPI,CT_ns,ET_s,DL1_miss_rate,DL1_misses,DL1_accesses" > rect_blockmm_trans.csv
! taskset -c 8 ./matrix_mul/rect_blockmm_trans 2048 8 8 >> ./matrix_mul/rect_blockmm_trans.csv 
! taskset -c 8 ./matrix_mul/rect_blockmm_trans 2048 8 16 >> ./matrix_mul/rect_blockmm_trans.csv 
! taskset -c 8 ./matrix_mul/rect_blockmm_trans 2048 16 8 >> ./matrix_mul/rect_blockmm_trans.csv
! taskset -c 8 ./matrix_mul/rect_blockmm_trans 2048 16 16 >> ./matrix_mul/rect_blockmm_trans.csv
! taskset -c 8 ./matrix_mul/rect_blockmm_trans 2048 32 8 >> ./matrix_mul/rect_blockmm_trans.csv 
! taskset -c 8 ./matrix_mul/rect_blockmm_trans 2048 32 16 >> ./matrix_mul/rect_blockmm_trans.csv 
! taskset -c 8 ./matrix_mul/rect_blockmm_trans 2048 64 8 >> ./matrix_mul/rect_blockmm_trans.csv
! taskset -c 8 ./matrix_mul/rect_blockmm_trans 2048 128 8 >> ./matrix_mul/rect_blockmm_trans.csv
! taskset -c 8 ./matrix_mul/rect_blockmm_trans 2048 256 8 >> ./matrix_mul/rect_blockmm_trans.csv
display_df_mono(render_csv("matrix_mul/rect_blockmm_trans.csv"))

gcc -O4 -DHAVE_LINUX_PERF_EVENT_H rect_blockmm_trans.c perfstats.c -o rect_blockmm_trans
10521102336.000000,10521102336.000000,10521102336.000000,10521102336.000000,10521102336.000000,10521102336.000000,10521102336.000000,10521102336.000000,10521102336.000000,

Unnamed: 0,index,size,tile_size_x,tile_size_y,IC,Cycles,CPI,CT_ns,ET_s,DL1_miss_rate,DL1_misses,DL1_accesses
0,0,2048,8,8,60695508358,11646471503,0.191884,0.192843,2.24594,0.003428,54266256,15831879470
1,1,2048,8,16,59757045310,12521277199,0.209536,0.192818,2.414323,0.021325,327878249,15374976428
2,2,2048,16,8,49689647394,11465942089,0.230751,0.192805,2.210694,0.004005,48921926,12216373103
3,3,2048,16,16,49215549640,10044967262,0.204101,0.192842,1.937096,0.067944,814387168,11986095597
4,4,2048,32,8,44182703585,9440025834,0.213659,0.193171,1.823542,0.005901,61416027,10407158320
5,5,2048,32,16,43946922547,9738680134,0.221601,0.192848,1.878089,0.064052,659246052,10292421786
6,6,2048,64,8,41431323211,9667380023,0.233335,0.192797,1.863844,0.005345,50791726,9503290404
7,7,2048,128,8,40059756476,11667601845,0.291255,0.192817,2.249716,0.006489,58747559,9052905729
8,8,2048,256,8,39376619199,13782097723,0.350007,0.193104,2.661376,0.004125,36420846,8828676057


## Prefetch

x86 provides prefetch instructions. As a programmer, you may insert ```_mm_prefetch``` in x86 programs to perform software prefetching in your code. The gcc compiler also has a flag, ```-fprefetch-loop-arrays```, that automatically inserts software prefetch instructions.

### Using prefetch in matrix transpose code

The following example is a highly optimized matrix transpose code. In the example, we try to prefetch the next row.

In [27]:
# Display the C++ transpose example (presumably only the //START..//END region — confirm).
# NOTE(review): the saved output is a FileNotFoundError, so ./prefetch/transpose.cpp was
# not present relative to the notebook's working directory when this was last run.
render_code("./prefetch/transpose.cpp", lang="c++", show=["//START", "//END"])

FileNotFoundError: [Errno 2] No such file or directory: './prefetch/transpose.cpp'

Now, let's take a look at what's happening!

In [None]:
# Rebuild the prefetch examples, then run both variants on the local machine.
! cd prefetch; make clean; make
# ! echo "Without prefetch -- the baseline"; ssh htseng@celebi "lscpu | grep Model; cd courses/CS203/demo/memory/prefetch/; ./transpose"
! echo "Without prefetch -- the baseline"
# Print the CPU model lines so the timings can be tied to the hardware they ran on.
! lscpu | grep Model
! ./prefetch/transpose
! echo "With prefetch"
! ./prefetch/transpose_prefetch

Let's try a different machine now.

In [None]:
# Rebuild and run both transpose variants on the remote host xerneas over ssh.
# NOTE(review): user name, host and absolute paths are hard-coded, and this host uses the
# .../demo/software_optimizations_memory/ directory while the other remote cells use
# .../demo/memory/ — confirm the path is still correct.
! ssh htseng@xerneas "cd /nfshome/htseng/courses/CSE142/demo/software_optimizations_memory/; make -C ./prefetch clean; make -C ./prefetch ; lscpu | grep Model"
! echo "Without prefetch -- the baseline"; ssh htseng@xerneas  "/nfshome/htseng/courses/CSE142/demo/software_optimizations_memory/prefetch/transpose"
! echo "With prefetch";  ssh htseng@xerneas  "/nfshome/htseng/courses/CSE142/demo/software_optimizations_memory/prefetch/transpose_prefetch"

In [None]:
# Rebuild and run both transpose variants on the remote host blissey over ssh.
# NOTE(review): user name, host and absolute paths are hard-coded.
! ssh htseng@blissey "cd /nfshome/htseng/courses/CSE142/demo/memory/; make -C ./prefetch clean; make -C ./prefetch ; lscpu | grep Model"
! echo "Without prefetch -- the baseline"; ssh htseng@blissey  "/nfshome/htseng/courses/CSE142/demo/memory/prefetch/transpose"
! echo "With prefetch";  ssh htseng@blissey  "/nfshome/htseng/courses/CSE142/demo/memory/prefetch/transpose_prefetch"

In [None]:
# Rebuild and run both transpose variants on the remote host eevee over ssh.
# NOTE(review): user name, host and absolute paths are hard-coded.
! ssh htseng@eevee "cd /nfshome/htseng/courses/CSE142/demo/memory/; make -C ./prefetch clean; make -C ./prefetch ; lscpu | grep Model"
! echo "Without prefetch -- the baseline"; ssh htseng@eevee  "/nfshome/htseng/courses/CSE142/demo/memory/prefetch/transpose"
! echo "With prefetch";  ssh htseng@eevee  "/nfshome/htseng/courses/CSE142/demo/memory/prefetch/transpose_prefetch"


-- It doesn't always work!

In [None]:
# Display the blockmm_interchange.c source (presumably only the //START..//END region — confirm).
render_code("matrix_mul/blockmm_interchange.c", show=["//START","//END"])

In [None]:
! cd matrix_mul; rm -f blockmm_interchange; make blockmm_interchange; echo "size,tile_size,IC,Cycles,CPI,CT_ns,ET_s,DL1_miss_rate,DL1_misses,DL1_accesses" > blockmm_interchange.csv
! ./matrix_mul/blockmm_interchange 2048 8 >> ./matrix_mul/blockmm_interchange.csv 
! ./matrix_mul/blockmm_interchange 2048 16 >> ./matrix_mul/blockmm_interchange.csv 
! ./matrix_mul/blockmm_interchange 2048 32 >> ./matrix_mul/blockmm_interchange.csv 
! ./matrix_mul/blockmm_interchange 2048 64 >> ./matrix_mul/blockmm_interchange.csv
! ./matrix_mul/blockmm_interchange 2048 128 >> ./matrix_mul/blockmm_interchange.csv
! ./matrix_mul/blockmm_interchange 2048 256 >> ./matrix_mul/blockmm_interchange.csv
! cd matrix_mul; echo "size,tile_size,IC,Cycles,CPI,CT_ns,ET_s,DL1_miss_rate,DL1_misses,DL1_accesses" > blockmm.csv
! ./matrix_mul/blockmm 2048 16 >> ./matrix_mul/blockmm.csv 
! ./matrix_mul/blockmm 2048 32 >> ./matrix_mul/blockmm.csv 
! ./matrix_mul/blockmm 2048 64 >> ./matrix_mul/blockmm.csv
! ./matrix_mul/blockmm 2048 128 >> ./matrix_mul/blockmm.csv 
! ./matrix_mul/blockmm 2048 256 >> ./matrix_mul/blockmm.csv 
! ./matrix_mul/blockmm 2048 512 >> ./matrix_mul/blockmm.csv 
display_df_mono(render_csv("matrix_mul/blockmm.csv"))
display_df_mono(render_csv("matrix_mul/blockmm_interchange.csv"))
