In [1]:
# Please execute/shift-return this cell everytime you run the notebook.  Don't edit it. 
%load_ext autoreload
%autoreload 2
from notebook import * 

 # Cache Simulation -- 2-way

What if we have a 2-way, 16-byte blocked, 16-block cache?

In [1]:
df = pd.read_csv("addresses.csv",skipfooter=1,engine='python')
df["address"] = df["address"].str.replace('0x','')
df["address"]=df[["address"]].apply(lambda x: x.astype(str).map(lambda x: int(x, base=16)))
# only show the first N addresses 
#N = 32
#df2 = df2.iloc[:N]
block_size = 16
offset_bits = int(math.log2(block_size))
number_of_blocks = 16
way_assoc=2
number_of_sets = number_of_blocks/way_assoc
index_bits = int(math.log2(number_of_sets))
df["tag"]=(df["address"].apply(lambda x: x >> (offset_bits+index_bits)))
df["tag"] = df["tag"].apply(lambda x: hex(x))
df["index"] = df["address"].apply(lambda x: hex((x>>offset_bits) % number_of_blocks))
df["address"] = df["address"].apply(lambda x: hex(x))
display_df_mono(df)

NameError: name 'pd' is not defined

## Cache performance of code on "real machines"

### NVIDIA Jetson Nano -- Tegra X1

In [13]:
render_code("4way_madd/madd.c", show=["//START","//END"])

Let's run it without the above loop code to figure the baseline memory accesses without running the loop on a Jetson nano.

#### Run without the 4-way matrix add loop code.

In [14]:
# Run it "without" the above code.
! ssh htseng@nano-2 "lscpu; cd courses/CSE142/demo/memory/4way_madd/; make clean madd_nano; valgrind --tool=cachegrind ./madd_nano 16384 0 "

Architecture:        aarch64
Byte Order:          Little Endian
CPU(s):              4
On-line CPU(s) list: 0-3
Thread(s) per core:  1
Core(s) per socket:  4
Socket(s):           1
Vendor ID:           ARM
Model:               1
Model name:          Cortex-A57
Stepping:            r1p1
CPU max MHz:         1479.0000
CPU min MHz:         102.0000
BogoMIPS:            38.40
L1d cache:           32K
L1i cache:           48K
L2 cache:            2048K
Flags:               fp asimd evtstrm aes pmull sha1 sha2 crc32
rm -f madd_intel madd_nano *_O3 *~ madd_A_fission cachegrind* *.perf madd_dump
cc -O1 -DHAVE_LINUX_PERF_EVENT_H -g  -DNANO perfstats.c madd.c -o madd_nano
==15683== Cachegrind, a cache and branch-prediction profiler
==15683== Copyright (C) 2002-2017, and GNU GPL'd, by Nicholas Nethercote et al.
==15683== Using Valgrind-3.13.0 and LibVEX; rerun with -h for copyright info
==15683== Command: ./madd_nano 16384 0
==15683== 
--15683--          Run with -v to see.
2260175,2567551,1.1359

Too much detail! Let's use grep to narrow down the outputs.

In [15]:
# Run it "without" the above code.
! ssh htseng@nano-2 "cd courses/CSE142/demo/memory/4way_madd/; valgrind --tool=cachegrind ./madd_nano 8192 0 >& nano_without_loop.perf; grep 'D   refs\|D1' nano_without_loop.perf"

==15722== D   refs:      1,535,330  (946,607 rd   + 588,723 wr)
==15722== D1  misses:        8,672  (  2,865 rd   +   5,807 wr)
==15722== D1  miss rate:       0.6% (    0.3%     +     1.0%  )


Let's run it with the above loop code again and observe the changes in L1 cache misses/accesses

#### Run with the 4-way matrix add loop code.

In [16]:
! ssh htseng@nano-2 "cd courses/CSE142/demo/memory/4way_madd/;valgrind --tool=cachegrind ./madd_nano 8192 8192 >& nano_with_loop.perf; grep 'D   refs\|D1' nano_with_loop.perf"

==15762== D   refs:      1,576,375  (979,438 rd   + 596,937 wr)
==15762== D1  misses:       49,650  ( 35,649 rd   +  14,001 wr)
==15762== D1  miss rate:       3.1% (    3.6%     +     2.3%  )


In [17]:
# Let's do some math here
total_number_of_accesses_before_the_loop =    1527773
total_number_of_accesses_after_the_loop =  1548265
total_number_of_accesses_in_the_loop = total_number_of_accesses_after_the_loop-total_number_of_accesses_before_the_loop
total_number_of_misses_before_the_loop =  8538
total_number_of_misses_after_the_loop = 29031
total_number_of_misses_in_the_loop = total_number_of_misses_after_the_loop-total_number_of_misses_before_the_loop
miss_rate_of_the_loop = total_number_of_misses_in_the_loop/total_number_of_accesses_in_the_loop

print(f"access in the loop: %d misses in the loop %d miss_rate %lf" % (total_number_of_accesses_in_the_loop, total_number_of_misses_in_the_loop, miss_rate_of_the_loop))

access in the loop: 20492 misses in the loop 20493 miss_rate 1.000049


In [18]:
! echo "CPU, IC, Cycles, CPI, CT, ET, miss_rate, misses, accesses" >& ./4way_madd/4way_add.csv
! echo -n "Jetson Nano," >> ./4way_madd/4way_add.csv
! ssh htseng@nano-2 "cd courses/CSE142/demo/memory/4way_madd/; ./madd_nano 16384 16384 >> 4way_add.csv"

In [19]:
! cd 4way_madd; make clean madd_dump; cd ..; 
!echo "element,address"> addresses_madd.csv; 
!./4way_madd/madd_dump 8192 8192 2>> addresses_madd.csv
! head -n 101 addresses_madd.csv > addresses_digest.csv
df = pd.read_csv("addresses_digest.csv",skipfooter=1,engine='python')
df["address"] = df["address"].str.replace('0x','')
df["address"]=df[["address"]].apply(lambda x: x.astype(str).map(lambda x: int(x, base=16)))
# only show the first N addresses 
#N = 32
#df2 = df2.iloc[:N]
C = 32768
B = 64
A = 4
offset_bits = int(math.log2(B))
S = int(C/(B*A))
index_bits = int(math.log2(S))
df["tag"]=(df["address"].apply(lambda x: x >> (offset_bits+index_bits)))
df["tag"] = df["tag"].apply(lambda x: hex(x))
df["index"] = df["address"].apply(lambda x: hex((x>>offset_bits)%S))
df["address"] = df["address"].apply(lambda x: hex(x))
display_df_mono(df)

rm -f madd_intel madd_nano *_O3 *~ madd_A_fission cachegrind* *.perf madd_dump
cc -O3 -DHAVE_LINUX_PERF_EVENT_H -g  -DDUMP perfstats.c madd.c -o madd_dump
191526710,77226859,0.403217,0.266047,0.020546,0.000138,11331,82080984


Unnamed: 0,element,address,tag,index
0,a[0],0x7d102bad1000,0x3e8815d68,0x40
1,b[0],0x7d102bae1000,0x3e8815d70,0x40
2,c[0],0x7d102baf1000,0x3e8815d78,0x40
3,d[0],0x7d102bb01000,0x3e8815d80,0x40
4,e[0],0x7d102bb11000,0x3e8815d88,0x40
5,a[1],0x7d102bad1008,0x3e8815d68,0x40
6,b[1],0x7d102bae1008,0x3e8815d70,0x40
7,c[1],0x7d102baf1008,0x3e8815d78,0x40
8,d[1],0x7d102bb01008,0x3e8815d80,0x40
9,e[1],0x7d102bb11008,0x3e8815d88,0x40


### Intel Core i7 13700 -- 12-way L1, 64B-blocked, 48KB cache

Let's run it without the above loop code to figure the baseline memory accesses without running the loop on a Jetson nano.

Let's again dump, parse and simulation the address sequence.

In [26]:
! cd 4way_madd; make madd_dump; cd ..; 
!echo "element,address"> addresses_madd.csv; 
!./4way_madd/madd_dump 8192 8192 2>> addresses_madd.csv
! head -n 101 addresses_madd.csv > addresses_digest.csv
df = pd.read_csv("addresses_digest.csv",skipfooter=1,engine='python')
df["address"] = df["address"].str.replace('0x','')
df["address"]=df[["address"]].apply(lambda x: x.astype(str).map(lambda x: int(x, base=16)))
# only show the first N addresses 
#N = 32
#df2 = df2.iloc[:N]
C = 49152
B = 64
A = 12
offset_bits = int(math.log2(B))
S = int(C/(B*A))
index_bits = int(math.log2(S))
df["tag"]=(df["address"].apply(lambda x: x >> (offset_bits+index_bits)))
df["tag"] = df["tag"].apply(lambda x: hex(x))
df["index"] = df["address"].apply(lambda x: hex((x>>offset_bits) % S))
df["address"] = df["address"].apply(lambda x: hex(x))
display_df_mono(df)

make: 'madd_dump' is up to date.
191512690,77884910,0.406683,0.263684,0.020537,0.000133,10891,82076806


Unnamed: 0,element,address,tag,index
0,a[0],0x71a54aee6000,0x71a54aee6,0x0
1,b[0],0x71a54aef6000,0x71a54aef6,0x0
2,c[0],0x71a54af06000,0x71a54af06,0x0
3,d[0],0x71a54af16000,0x71a54af16,0x0
4,e[0],0x71a54af26000,0x71a54af26,0x0
5,a[1],0x71a54aee6008,0x71a54aee6,0x0
6,b[1],0x71a54aef6008,0x71a54aef6,0x0
7,c[1],0x71a54af06008,0x71a54af06,0x0
8,d[1],0x71a54af16008,0x71a54af16,0x0
9,e[1],0x71a54af26008,0x71a54af26,0x0


#### Run with performance counters

In [27]:
# Run it "without" the above code.
! make -C ./4way_madd/ madd_intel 
! echo -n "Intel Core i7 13700," >> ./4way_madd/4way_add.csv
! lscpu; cd ~/courses/CSE142/demo/memory/4way_madd/; ./madd_intel 16384 16384 >> 4way_add.csv

make: Entering directory '/nfshome/htseng/courses/CSE142/demo/memory/4way_madd'
make: 'madd_intel' is up to date.
make: Leaving directory '/nfshome/htseng/courses/CSE142/demo/memory/4way_madd'
Architecture:             x86_64
  CPU op-mode(s):         32-bit, 64-bit
  Address sizes:          46 bits physical, 48 bits virtual
  Byte Order:             Little Endian
CPU(s):                   24
  On-line CPU(s) list:    0-23
Vendor ID:                GenuineIntel
  Model name:             13th Gen Intel(R) Core(TM) i7-13700
    CPU family:           6
    Model:                183
    Thread(s) per core:   2
    Core(s) per socket:   16
    Socket(s):            1
    Stepping:             1
    CPU(s) scaling MHz:   18%
    CPU max MHz:          5200.0000
    CPU min MHz:          800.0000
    BogoMIPS:             4224.00
    Flags:                fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge m
                          ca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 s
    

Let's run it with the above loop code again and observe the changes in L1 cache misses/accesses

#### Run with the 4-way matrix add loop code.

In [28]:
# Run it "with" the above code.
! cd ~/courses/CSE142/demo/memory/4way_madd/; valgrind --tool=cachegrind ./madd_intel 16384 16384  >& intel_with_loop.perf; grep 'D   refs\|D1' intel_with_loop.perf

In [23]:
# Let's do some math here
total_number_of_accesses_before_the_loop =2285911
total_number_of_accesses_after_the_loop = 2326877
total_number_of_accesses_in_the_loop = total_number_of_accesses_after_the_loop-total_number_of_accesses_before_the_loop
total_number_of_misses_before_the_loop = 13447
total_number_of_misses_after_the_loop = 23694
total_number_of_misses_in_the_loop = total_number_of_misses_after_the_loop-total_number_of_misses_before_the_loop
miss_rate_of_the_loop = total_number_of_misses_in_the_loop/total_number_of_accesses_in_the_loop

print(f"access in the loop: %d misses in the loop %d miss_rate %lf" % (total_number_of_accesses_in_the_loop, total_number_of_misses_in_the_loop, miss_rate_of_the_loop))

access in the loop: 40966 misses in the loop 10247 miss_rate 0.250134


In [29]:
display_df_mono(render_csv("./4way_madd/4way_add.csv"))

Unnamed: 0,index,CPU,IC,Cycles,CPI,CT,ET,miss_rate,misses,accesses
0,0,Jetson Nano,198632,487539,2.454484,0.758914,0.00037,0.117226,21095,179951
1,1,Intel Core i7 13700,0,0,,inf,0.000165,,0,0
2,2,Intel Core i7 13700,102678,94163,0.917071,0.477895,4.5e-05,0.134357,6162,45863


In [25]:
! cat ./4way_madd/4way_add.csv

CPU, IC, Cycles, CPI, CT, ET, miss_rate, misses, accesses
Jetson Nano,198632,487539,2.454484,0.758914,0.000370,0.117226,21095,179951
Intel Core i7 13700,0,0,-nan,inf,0.000165,-nan,0,0
