In [5]:
# Please execute/shift-return this cell everytime you run the notebook.  Don't edit it. 
%load_ext autoreload
%autoreload 2
from notebook import * 

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Real case studies -- Tegra Orin

## Cache performance of code on "real machines"

### NVIDIA Jetson Nano -- Tegra Orin

In [6]:
# Show the benchmark source this case study measures.
# NOTE(review): `show` presumably limits the listing to the region between the
# //START and //END markers in madd.c -- confirm against render_code.
render_code(
    "4way_madd/madd.c",
    show=["//START", "//END"],
)

Let's first run the program on the Jetson Nano without the loop code above to establish the baseline number of memory accesses.

#### Run without the 4-way matrix add loop code.

In [7]:
# Baseline run on the remote board: print the CPU description with lscpu,
# rebuild madd_nano from scratch, then profile it under valgrind's cachegrind
# tool with a second argument of 0, i.e. "without" the loop code shown above.
# NOTE(review): the two program arguments are presumably the array size and
# the amount of work done by the loop -- confirm against madd.c.
! ssh htseng@orin-1 "lscpu; cd courses/CSE142/demo/memory/4way_madd/; make clean madd_nano; valgrind --tool=cachegrind ./madd_nano 16384 0 "

Architecture:                    aarch64
CPU op-mode(s):                  32-bit, 64-bit
Byte Order:                      Little Endian
CPU(s):                          6
On-line CPU(s) list:             0-5
Thread(s) per core:              1
Core(s) per socket:              6
Socket(s):                       1
Vendor ID:                       ARM
Model:                           1
Model name:                      ARMv8 Processor rev 1 (v8l)
Stepping:                        r0p1
CPU max MHz:                     1971.2000
CPU min MHz:                     115.2000
BogoMIPS:                        62.50
L1d cache:                       384 KiB
L1i cache:                       384 KiB
L2 cache:                        1.5 MiB
L3 cache:                        2 MiB
Vulnerability Itlb multihit:     Not affected
Vulnerability L1tf:              Not affected
Vulnerability Mds:               Not affected
Vulnerability Meltdown:          Not affected
Vulnerability Spec store bypass: Mitigation; S

Too much detail! Let's use grep to narrow down the outputs.

In [8]:
# Baseline run again ("without" the loop code, second argument 0), but this
# time the whole cachegrind report (stdout and stderr) is saved on the remote
# side in nano_without_loop.perf, and grep prints only the data-reference
# ("D   refs") and L1 data-cache ("D1") lines from it.
! ssh htseng@orin-1 "cd courses/CSE142/demo/memory/4way_madd/; valgrind --tool=cachegrind ./madd_nano 8192 0 >& nano_without_loop.perf; grep 'D   refs\|D1' nano_without_loop.perf"

==148649== D   refs:      1,699,882  (1,069,822 rd   + 630,060 wr)
==148649== D1  misses:        8,697  (    2,854 rd   +   5,843 wr)
==148649== D1  miss rate:       0.5% (      0.3%     +     0.9%  )


Let's run it again, this time with the loop code above, and observe how the L1 cache accesses and misses change.

#### Run with the 4-way matrix add loop code.

In [10]:
# Same measurement with the loop enabled (second argument 8192): the cachegrind
# report is saved on the remote side in nano_with_loop.perf and only the
# "D   refs" and "D1" lines are printed, for comparison with the baseline run.
! ssh htseng@orin-1 "cd courses/CSE142/demo/memory/4way_madd/;valgrind --tool=cachegrind ./madd_nano 8192 8192 >& nano_with_loop.perf; grep 'D   refs\|D1' nano_with_loop.perf"

==116178== D   refs:      1,739,709  (1,101,912 rd   + 637,797 wr)
==116178== D1  misses:       49,672  (   35,642 rd   +  14,030 wr)
==116178== D1  miss rate:       2.9% (      3.2%     +     2.2%  )


In [11]:
# Estimate the L1 data-cache miss rate of the loop alone: subtract the
# baseline run (no loop) from the run that includes the loop, then divide the
# extra misses by the extra accesses.
#
# The counts are transcribed by hand from cachegrind's "D refs" and
# "D1 misses" lines.
# NOTE(review): the baseline values below differ slightly from the baseline
# output shown earlier in this notebook (1,699,882 refs / 8,697 misses) --
# presumably they were copied from a different run; confirm and refresh.
total_number_of_accesses_before_the_loop = 1698681
total_number_of_accesses_after_the_loop = 1739709
total_number_of_accesses_in_the_loop = (
    total_number_of_accesses_after_the_loop
    - total_number_of_accesses_before_the_loop
)

total_number_of_misses_before_the_loop = 8702
total_number_of_misses_after_the_loop = 49672
total_number_of_misses_in_the_loop = (
    total_number_of_misses_after_the_loop
    - total_number_of_misses_before_the_loop
)

# Guard the division: if both runs report the same number of accesses there
# is no loop traffic to attribute misses to, so the rate is undefined.
if total_number_of_accesses_in_the_loop:
    miss_rate_of_the_loop = (
        total_number_of_misses_in_the_loop / total_number_of_accesses_in_the_loop
    )
else:
    miss_rate_of_the_loop = float("nan")

# A real f-string replaces the original mix of an unused `f` prefix with
# %-formatting; the printed text is unchanged.
print(
    f"access in the loop: {total_number_of_accesses_in_the_loop:d} "
    f"misses in the loop {total_number_of_misses_in_the_loop:d} "
    f"miss_rate {miss_rate_of_the_loop:f}"
)

access in the loop: 41028 misses in the loop 40970 miss_rate 0.998586


In [12]:
! echo "CPU, IC, Cycles, CPI, CT, ET, miss_rate, misses, accesses" >& ./4way_madd/4way_add.csv
! echo -n "Jetson Orin," >> ./4way_madd/4way_add.csv
! ssh htseng@orin-1 "cd courses/CSE142/demo/memory/4way_madd/; ./madd_nano 16384 16384 >> 4way_add.csv"

In [13]:
! cd 4way_madd; make clean madd_dump; cd ..; 
!echo "element,address"> addresses_madd.csv; 
!./4way_madd/madd_dump 8192 8192 2>> addresses_madd.csv
! head -n 101 addresses_madd.csv > addresses_digest.csv
df = pd.read_csv("addresses_digest.csv",skipfooter=1,engine='python')
df["address"] = df["address"].str.replace('0x','')
df["address"]=df[["address"]].apply(lambda x: x.astype(str).map(lambda x: int(x, base=16)))
# only show the first N addresses 
#N = 32
#df2 = df2.iloc[:N]
C = 32768
B = 64
A = 4
offset_bits = int(math.log2(B))
S = int(C/(B*A))
index_bits = int(math.log2(S))
df["tag"]=(df["address"].apply(lambda x: x >> (offset_bits+index_bits)))
df["tag"] = df["tag"].apply(lambda x: hex(x))
df["index"] = df["address"].apply(lambda x: hex((x>>offset_bits)%S))
df["address"] = df["address"].apply(lambda x: hex(x))
display_df_mono(df)

rm -f madd_intel madd_nano *_O3 *~ madd_A_fission cachegrind* *.perf madd_dump
cc -O3 -DHAVE_LINUX_PERF_EVENT_H -g  -DDUMP perfstats.c madd.c -o madd_dump
180694192,75908718,0.420095,0.221134,0.016786,0.000158,12195,77128166


Unnamed: 0,element,address,tag,index
0,a[0],0x7cc3fdb38000,0x3e61fed9c,0x0
1,b[0],0x7cc3fdb48000,0x3e61feda4,0x0
2,c[0],0x7cc3fdb58000,0x3e61fedac,0x0
3,d[0],0x7cc3fdb68000,0x3e61fedb4,0x0
4,e[0],0x7cc3fdb78000,0x3e61fedbc,0x0
5,a[1],0x7cc3fdb38008,0x3e61fed9c,0x0
6,b[1],0x7cc3fdb48008,0x3e61feda4,0x0
7,c[1],0x7cc3fdb58008,0x3e61fedac,0x0
8,d[1],0x7cc3fdb68008,0x3e61fedb4,0x0
9,e[1],0x7cc3fdb78008,0x3e61fedbc,0x0


In [3]:
# Load the collected benchmark results and show them as a monospaced table.
madd_results_table = render_csv("./4way_madd/4way_add.csv")
display_df_mono(madd_results_table)

Unnamed: 0,index,CPU,IC,Cycles,CPI,CT,ET,miss_rate,misses,accesses
0,0,Jetson Nano,0,0,,inf,0.000308,,0,0
1,1,Intel Core i7 13700,0,0,,inf,3.2e-05,,0,0


In [25]:
# Print the raw results CSV to check exactly what each run appended.
! cat ./4way_madd/4way_add.csv

CPU, IC, Cycles, CPI, CT, ET, miss_rate, misses, accesses
Jetson Nano,198632,487539,2.454484,0.758914,0.000370,0.117226,21095,179951
Intel Core i7 13700,0,0,-nan,inf,0.000165,-nan,0,0
