In [1]:
# Please execute/shift-return this cell everytime you run the notebook.  Don't edit it. 
%load_ext autoreload
%autoreload 2
from notebook import * 

 # Cache Simulation

## How blocks are stored in a cache?

How these address are stored "if" they're in a direct-mapped, 16B-sized blocks, 16-block cache?

In [2]:
!echo "element,address"> addresses.csv; make -C mv; ./mv/matvec 16 1 >> addresses.csv
df = pd.read_csv("addresses.csv",skipfooter=1,engine='python')
df["address"] = df["address"].str.replace('0x','')
df["address"]=df[["address"]].apply(lambda x: x.astype(str).map(lambda x: int(x, base=16)))
# only show the first N addresses 
#N = 32
#df2 = df2.iloc[:N]
block_size = 16
offset_bits = int(math.log2(block_size))
number_of_blocks = 16
index_bits = int(math.log2(number_of_blocks))
df["tag"]=(df["address"].apply(lambda x: x >> (offset_bits+index_bits)))
df["tag"] = df["tag"].apply(lambda x: hex(x))
df["index"] = df["address"].apply(lambda x: hex((x>>offset_bits) % number_of_blocks))
df["address"] = df["address"].apply(lambda x: hex(x))
display_df_mono(df)

make: Entering directory '/nfshome/htseng/courses/CSE142/demo/memory/mv'
make: Nothing to be done for 'all'.
make: Leaving directory '/nfshome/htseng/courses/CSE142/demo/memory/mv'


Unnamed: 0,element,address,tag,index
0,&a[0][0],0x5af49f578330,0x5af49f5783,0x3
1,&b[0],0x5af49f578c30,0x5af49f578c,0x3
2,&a[0][1],0x5af49f578338,0x5af49f5783,0x3
3,&b[1],0x5af49f578c38,0x5af49f578c,0x3
4,&a[0][2],0x5af49f578340,0x5af49f5783,0x4
5,&b[2],0x5af49f578c40,0x5af49f578c,0x4
6,&a[0][3],0x5af49f578348,0x5af49f5783,0x4
7,&b[3],0x5af49f578c48,0x5af49f578c,0x4
8,&a[0][4],0x5af49f578350,0x5af49f5783,0x5
9,&b[4],0x5af49f578c50,0x5af49f578c,0x5


What if we have a 2-way, 16-byte blocked, 16-block cache?

In [3]:
df = pd.read_csv("addresses.csv",skipfooter=1,engine='python')
df["address"] = df["address"].str.replace('0x','')
df["address"]=df[["address"]].apply(lambda x: x.astype(str).map(lambda x: int(x, base=16)))
# only show the first N addresses 
#N = 32
#df2 = df2.iloc[:N]
block_size = 16
offset_bits = int(math.log2(block_size))
number_of_blocks = 16
way_assoc=2
number_of_sets = number_of_blocks/way_assoc
index_bits = int(math.log2(number_of_sets))
df["tag"]=(df["address"].apply(lambda x: x >> (offset_bits+index_bits)))
df["tag"] = df["tag"].apply(lambda x: hex(x))
df["index"] = df["address"].apply(lambda x: hex((x>>offset_bits) % number_of_blocks))
df["address"] = df["address"].apply(lambda x: hex(x))
display_df_mono(df)

Unnamed: 0,element,address,tag,index
0,&a[0][0],0x5af49f578330,0xb5e93eaf06,0x3
1,&b[0],0x5af49f578c30,0xb5e93eaf18,0x3
2,&a[0][1],0x5af49f578338,0xb5e93eaf06,0x3
3,&b[1],0x5af49f578c38,0xb5e93eaf18,0x3
4,&a[0][2],0x5af49f578340,0xb5e93eaf06,0x4
5,&b[2],0x5af49f578c40,0xb5e93eaf18,0x4
6,&a[0][3],0x5af49f578348,0xb5e93eaf06,0x4
7,&b[3],0x5af49f578c48,0xb5e93eaf18,0x4
8,&a[0][4],0x5af49f578350,0xb5e93eaf06,0x5
9,&b[4],0x5af49f578c50,0xb5e93eaf18,0x5


## Cache performance of code on "real machines"

### NVIDIA Jetson Nano -- Tegra X1

In [4]:
render_code("4way_madd/madd.c", show=["//START","//END"])

Let's run it without the above loop code to figure the baseline memory accesses without running the loop on a Jetson nano.

#### Run without the 4-way matrix add loop code.

In [5]:
# Run it "without" the above code.
! ssh htseng@nano-2 "lscpu; cd courses/CSE142/demo/memory/4way_madd/; make clean madd_nano; valgrind --tool=cachegrind ./madd_nano 16384 0 "

Architecture:        aarch64
Byte Order:          Little Endian
CPU(s):              4
On-line CPU(s) list: 0-3
Thread(s) per core:  1
Core(s) per socket:  4
Socket(s):           1
Vendor ID:           ARM
Model:               1
Model name:          Cortex-A57
Stepping:            r1p1
CPU max MHz:         1479.0000
CPU min MHz:         102.0000
BogoMIPS:            38.40
L1d cache:           32K
L1i cache:           48K
L2 cache:            2048K
Flags:               fp asimd evtstrm aes pmull sha1 sha2 crc32
rm -f madd_intel madd_nano *_O3 *~ madd_A_fission cachegrind* *.perf madd_dump
cc -O1 -DHAVE_LINUX_PERF_EVENT_H -g  -DNANO perfstats.c madd.c -o madd_nano
==9395== Cachegrind, a cache and branch-prediction profiler
==9395== Copyright (C) 2002-2017, and GNU GPL'd, by Nicholas Nethercote et al.
==9395== Using Valgrind-3.13.0 and LibVEX; rerun with -h for copyright info
==9395== Command: ./madd_nano 16384 0
==9395== 
--9395--          Run with -v to see.
0,0,nan,inf,0.000602,nan,0,0

Too much detail! Let's use grep to narrow down the outputs.

In [6]:
# Run it "without" the above code.
! ssh htseng@nano-2 "cd courses/CSE142/demo/memory/4way_madd/; valgrind --tool=cachegrind ./madd_nano 8192 0 >& nano_without_loop.perf; grep 'D   refs\|D1' nano_without_loop.perf"

==9456== D   refs:      1,534,199  (945,919 rd   + 588,280 wr)
==9456== D1  misses:        8,675  (  2,869 rd   +   5,806 wr)
==9456== D1  miss rate:       0.6% (    0.3%     +     1.0%  )


Let's run it with the above loop code again and observe the changes in L1 cache misses/accesses

#### Run with the 4-way matrix add loop code.

In [7]:
! ssh htseng@nano-2 "cd courses/CSE142/demo/memory/4way_madd/;valgrind --tool=cachegrind ./madd_nano 8192 8192 >& nano_with_loop.perf; grep 'D   refs\|D1' nano_with_loop.perf"

==9513== D   refs:      1,575,218  (978,727 rd   + 596,491 wr)
==9513== D1  misses:       49,651  ( 35,651 rd   +  14,000 wr)
==9513== D1  miss rate:       3.2% (    3.6%     +     2.3%  )


In [8]:
# Let's do some math here
total_number_of_accesses_before_the_loop =    1527773
total_number_of_accesses_after_the_loop =  1548265
total_number_of_accesses_in_the_loop = total_number_of_accesses_after_the_loop-total_number_of_accesses_before_the_loop
total_number_of_misses_before_the_loop =  8538
total_number_of_misses_after_the_loop = 29031
total_number_of_misses_in_the_loop = total_number_of_misses_after_the_loop-total_number_of_misses_before_the_loop
miss_rate_of_the_loop = total_number_of_misses_in_the_loop/total_number_of_accesses_in_the_loop

print(f"access in the loop: %d misses in the loop %d miss_rate %lf" % (total_number_of_accesses_in_the_loop, total_number_of_misses_in_the_loop, miss_rate_of_the_loop))

access in the loop: 20492 misses in the loop 20493 miss_rate 1.000049


In [9]:
! echo "CPU, IC, Cycles, CPI, CT, ET, miss_rate, misses, accesses" >& ./4way_madd/4way_add.csv
! echo -n "Jetson Nano," >> ./4way_madd/4way_add.csv
! ssh htseng@nano-2 "cd courses/CSE142/demo/memory/4way_madd/; ./madd_nano 16384 16384 >> 4way_add.csv"

In [10]:
! cd 4way_madd; make clean madd_dump; cd ..; 
!echo "element,address"> addresses_madd.csv; 
!./4way_madd/madd_dump 8192 8192 2>> addresses_madd.csv
! head -n 101 addresses_madd.csv > addresses_digest.csv
df = pd.read_csv("addresses_digest.csv",skipfooter=1,engine='python')
df["address"] = df["address"].str.replace('0x','')
df["address"]=df[["address"]].apply(lambda x: x.astype(str).map(lambda x: int(x, base=16)))
# only show the first N addresses 
#N = 32
#df2 = df2.iloc[:N]
C = 32768
B = 64
A = 4
offset_bits = int(math.log2(B))
S = int(C/(B*A))
index_bits = int(math.log2(S))
df["tag"]=(df["address"].apply(lambda x: x >> (offset_bits+index_bits)))
df["tag"] = df["tag"].apply(lambda x: hex(x))
df["index"] = df["address"].apply(lambda x: hex((x>>offset_bits)%S))
df["address"] = df["address"].apply(lambda x: hex(x))
display_df_mono(df)

rm -f madd_intel madd_nano *_O3 *~ madd_A_fission cachegrind* *.perf madd_dump
cc -O3 -DHAVE_LINUX_PERF_EVENT_H -g  -DDUMP perfstats.c madd.c -o madd_dump
124281208,52365171,0.421344,0.368508,0.019297,0.000194,10293,53037154


Unnamed: 0,element,address,tag,index
0,a[0],0x7ff76e5a2000,0x3ffbb72d1,0x0
1,b[0],0x7ff76e5b2000,0x3ffbb72d9,0x0
2,c[0],0x7ff76e5c2000,0x3ffbb72e1,0x0
3,d[0],0x7ff76e5d2000,0x3ffbb72e9,0x0
4,e[0],0x7ff76e5e2000,0x3ffbb72f1,0x0
5,a[1],0x7ff76e5a2008,0x3ffbb72d1,0x0
6,b[1],0x7ff76e5b2008,0x3ffbb72d9,0x0
7,c[1],0x7ff76e5c2008,0x3ffbb72e1,0x0
8,d[1],0x7ff76e5d2008,0x3ffbb72e9,0x0
9,e[1],0x7ff76e5e2008,0x3ffbb72f1,0x0


### Intel Core i7 13700 -- 12-way L1, 64B-blocked, 48KB cache

Let's run it without the above loop code to figure the baseline memory accesses without running the loop on a Jetson nano.

Let's again dump, parse and simulation the address sequence.

In [11]:
! cd 4way_madd; make madd_dump; cd ..; 
!echo "element,address"> addresses_madd.csv; 
!./4way_madd/madd_dump 8192 8192 2>> addresses_madd.csv
! head -n 101 addresses_madd.csv > addresses_digest.csv
df = pd.read_csv("addresses_digest.csv",skipfooter=1,engine='python')
df["address"] = df["address"].str.replace('0x','')
df["address"]=df[["address"]].apply(lambda x: x.astype(str).map(lambda x: int(x, base=16)))
# only show the first N addresses 
#N = 32
#df2 = df2.iloc[:N]
C = 49152
B = 64
A = 12
offset_bits = int(math.log2(B))
S = int(C/(B*A))
index_bits = int(math.log2(S))
df["tag"]=(df["address"].apply(lambda x: x >> (offset_bits+index_bits)))
df["tag"] = df["tag"].apply(lambda x: hex(x))
df["index"] = df["address"].apply(lambda x: hex((x>>offset_bits) % S))
df["address"] = df["address"].apply(lambda x: hex(x))
display_df_mono(df)

make: 'madd_dump' is up to date.
74057370,31153227,0.420663,0.702431,0.021883,0.000233,7359,31602946


Unnamed: 0,element,address,tag,index
0,a[0],0x7329bfca7000,0x7329bfca7,0x0
1,b[0],0x7329bfcb7000,0x7329bfcb7,0x0
2,c[0],0x7329bfcc7000,0x7329bfcc7,0x0
3,d[0],0x7329bfcd7000,0x7329bfcd7,0x0
4,e[0],0x7329bfce7000,0x7329bfce7,0x0
5,a[1],0x7329bfca7008,0x7329bfca7,0x0
6,b[1],0x7329bfcb7008,0x7329bfcb7,0x0
7,c[1],0x7329bfcc7008,0x7329bfcc7,0x0
8,d[1],0x7329bfcd7008,0x7329bfcd7,0x0
9,e[1],0x7329bfce7008,0x7329bfce7,0x0


#### Run with performance counters

In [12]:
# Run it "without" the above code.
! make -C ./4way_madd/ madd_intel 
! echo -n "Intel Core i7 13700," >> ./4way_madd/4way_add.csv
! lscpu; cd ~/courses/CSE142/demo/memory/4way_madd/; ./madd_intel 16384 16384 >> 4way_add.csv

make: Entering directory '/nfshome/htseng/courses/CSE142/demo/memory/4way_madd'
cc -O3 -DHAVE_LINUX_PERF_EVENT_H -g  -mno-avx perfstats.c madd.c -o madd_intel
make: Leaving directory '/nfshome/htseng/courses/CSE142/demo/memory/4way_madd'
Architecture:             x86_64
  CPU op-mode(s):         32-bit, 64-bit
  Address sizes:          39 bits physical, 48 bits virtual
  Byte Order:             Little Endian
CPU(s):                   24
  On-line CPU(s) list:    0-23
Vendor ID:                GenuineIntel
  Model name:             13th Gen Intel(R) Core(TM) i7-13700F
    CPU family:           6
    Model:                183
    Thread(s) per core:   2
    Core(s) per socket:   16
    Socket(s):            1
    Stepping:             1
    CPU(s) scaling MHz:   23%
    CPU max MHz:          5200.0000
    CPU min MHz:          800.0000
    BogoMIPS:             4224.00
    Flags:                fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge m
                          ca cmov pat p

Let's run it with the above loop code again and observe the changes in L1 cache misses/accesses

#### Run with the 4-way matrix add loop code.

In [13]:
# Run it "with" the above code.
! cd ~/courses/CSE142/demo/memory/4way_madd/; valgrind --tool=cachegrind ./madd_intel 16384 16384  >& intel_with_loop.perf; grep 'D   refs\|D1' intel_with_loop.perf

In [14]:
# Let's do some math here
total_number_of_accesses_before_the_loop =2285911
total_number_of_accesses_after_the_loop = 2326877
total_number_of_accesses_in_the_loop = total_number_of_accesses_after_the_loop-total_number_of_accesses_before_the_loop
total_number_of_misses_before_the_loop = 13447
total_number_of_misses_after_the_loop = 23694
total_number_of_misses_in_the_loop = total_number_of_misses_after_the_loop-total_number_of_misses_before_the_loop
miss_rate_of_the_loop = total_number_of_misses_in_the_loop/total_number_of_accesses_in_the_loop

print(f"access in the loop: %d misses in the loop %d miss_rate %lf" % (total_number_of_accesses_in_the_loop, total_number_of_misses_in_the_loop, miss_rate_of_the_loop))

access in the loop: 40966 misses in the loop 10247 miss_rate 0.250134


In [15]:
display_df_mono(render_csv("./4way_madd/4way_add.csv"))

Unnamed: 0,index,CPU,IC,Cycles,CPI,CT,ET,miss_rate,misses,accesses
0,0,Jetson Nano,0,0,,inf,0.000308,,0,0
1,1,Intel Core i7 13700,0,0,,inf,3.2e-05,,0,0


In [16]:
! cat ./4way_madd/4way_add.csv

CPU, IC, Cycles, CPI, CT, ET, miss_rate, misses, accesses
Jetson Nano,0,0,nan,inf,0.000308,nan,0,0
Intel Core i7 13700,0,0,-nan,inf,0.000032,-nan,0,0
