In [None]:
# Please execute (shift-return) this cell every time you run the notebook. Don't edit it.
# autoreload mode 2 re-imports modules before each cell runs, so edits to helper
# modules are picked up without restarting the kernel.
# NOTE(review): the star import presumably supplies the helpers used by later
# cells (render_code, render_csv, display_df_mono, plotPE) -- confirm in notebook.py.
%load_ext autoreload
%autoreload 2
from notebook import * 

# Amdahl's Law

## The starting example -- sorting

In [None]:
# Show the `main` portion of the sorting program's source.
# NOTE(review): render_code is a project helper; the `show=` argument presumably
# selects the function to display -- confirm. Unlike the later bitonic_sort cell,
# no `lang=` is passed here.
render_code("./gpusort/main.cu", show="main")

## Where is the most time-critical part of my program?

In [None]:
# Rebuild the sorting demo from scratch. `&&` guards the cd so that nothing runs
# in the notebook's own directory if gpusort/ is missing; `make clean; make` is
# kept in a subshell so the build still runs even when `clean` reports an error.
! cd gpusort && (make clean; make)
! echo "File on H.D.D.; Sorting on CPU"
# Start sort.csv afresh with the header row, then append whatever run_CPU writes
# to stderr (presumably one CSV row of timings -- confirm against run_CPU).
! cd ./gpusort && echo "ET,FileInput,CPU_Kernel,GPU_Kernel,Host2GPU,GPU2Host" > sort.csv && source ./run_CPU 2>> sort.csv

In [None]:
# Display only the columns that are relevant to the CPU-only run.
# NOTE(review): render_csv / display_df_mono are project helpers; presumably they
# load the CSV into a table and render it in a monospace font -- confirm.
display_df_mono(render_csv("./gpusort/sort.csv", columns=["ET","FileInput","CPU_Kernel"]))

In [None]:
# Print the CPU architecture details of the machine the timings were taken on.
! lscpu

### Use gprof to figure out the timing breakdown

In [None]:
# Rebuild with -pg so the binary emits gprof profiling data (gmon.out) when run.
# EXTRA_FLAGS is presumably forwarded to the compiler by the Makefile -- confirm.
# `&&` guards the cd so make never runs in the wrong directory.
! cd gpusort && (make clean; make EXTRA_FLAGS=-pg)
# Run the instrumented CPU sort; stderr is not redirected here, so any timing
# output shows up in the cell instead of sort.csv.
! cd ./gpusort && source ./run_CPU

In [None]:
# Print the gprof report from the profile data written by the previous cell's run.
# `&&` stops gprof from running in the wrong directory if the cd fails.
! cd gpusort && gprof ./hybridsort_cpu ./gmon.out

## Amdahl's Law -- optimizing is a moving target

In [None]:
# Show the `bitonic_sort` portion of the source, highlighted as C++.
# NOTE(review): render_code is a project helper; argument semantics are inferred
# from the keyword names -- confirm.
render_code("./gpusort/main.cu", lang="c++", show="bitonic_sort")

In [None]:
# Print the full nvidia-smi report for the GPU(s) in this machine.
! nvidia-smi -a

In [None]:
# Rebuild without profiling flags. `&&` guards the cd so make never runs in the
# notebook's own directory.
! cd gpusort && (make clean; make)
# ! ssh htseng@azelf "source ./courses/CS203/demo/amdahlslaw/gpusort/run_CPU"
! echo "File on H.D.D.; Sorting on GPU"
# Append this run's stderr output to sort.csv, which the CPU-run cell created.
# Re-running this cell appends another row, so re-run the CPU cell first to reset.
! cd gpusort && source ./run 2>> sort.csv

In [None]:
# Re-read sort.csv and display all columns, now including the GPU run appended above.
display_df_mono(render_csv("./gpusort/sort.csv"))

In [None]:
! echo "File on S.S.D.; Sorting on GPU"
# Append the SSD run's stderr output to the same sort.csv. `&&` keeps the run
# (and the append) from happening in the wrong directory if the cd fails.
# Re-running this cell appends another row.
! cd gpusort && source ./run_SSD 2>> sort.csv

In [None]:
# Re-read sort.csv and display all columns, now including the SSD run appended above.
display_df_mono(render_csv("./gpusort/sort.csv"))

## Amdahl's Law on parallel programming

In [None]:
# Build the vmul demo, then collect timing rows in ./vmul/vmul.csv.
# `&&` guards the cd so make never runs in the notebook's own directory.
! cd vmul && (make clean; make)
# Start vmul.csv afresh with the header row; each run below appends its stderr
# (presumably one CSV row per run -- confirm against vmul's source).
! echo "THREADS,CPUTIME,HOST2GPU,GPUTIME,GPU2HOST" > ./vmul/vmul.csv
! echo "CPU based vmul"
# NOTE(review): the third argument (0 here, 1 below) presumably selects CPU vs GPU -- confirm.
! time ./vmul/vmul 33554432 1 0 30 2>> ./vmul/vmul.csv
! echo "GPU based vmul"
# i is how many iterations each thread performs:
# the larger the number, the less parallelism there is.
# `$$i` makes IPython pass a literal `$i` to the shell; a bare `$i` would be
# replaced by the value of a Python variable `i` if one exists in the kernel.
! for i in 1 2 4 8 16 32 64 128 256 512 1024 2048 4096 8192; do time ./vmul/vmul 33554432 $$i 1 30 2>> ./vmul/vmul.csv ; done

In [None]:
# Load the vmul timing rows, add a TOTAL column (the sum of the four timing
# columns), and order the rows by THREADS before showing the table and plots.
df = (
    render_csv("./vmul/vmul.csv")
    .assign(
        TOTAL=lambda rows: rows["CPUTIME"] + rows["HOST2GPU"] + rows["GPUTIME"] + rows["GPU2HOST"]
    )
    .sort_values(by=["THREADS"], ascending=True)
)
display_df_mono(df)
# Two plots against THREADS: total time and GPU kernel time.
series_to_plot = [("THREADS", "TOTAL"), ("THREADS", "GPUTIME")]
plotPE(df=df, lines=True, what=series_to_plot, columns=2)

# Choose the "right" metrics

## Throughput and Latency

### SSD vs. HDD

You may use hdparm (it needs root permission to execute). The /dev/sda on this machine is a SATA SSD that has around 450-500 MB/sec of bandwidth. The /dev/md0 is a RAID that contains two HDDs in a RAID-0 configuration and also achieves 450-500 MB/sec of bandwidth. Let's examine the bandwidth using the following command.

In [None]:
import os

from IPython.display import IFrame

# Embed the first terminal of this Jupyter server in the notebook (presumably so
# the hdparm commands can be run interactively -- the terminal must already exist).
# JupyterHub exports JUPYTERHUB_SERVICE_PREFIX (e.g. "/user/<name>/") to every
# single-user server, so each user gets their own terminal; when the variable is
# not set, fall back to the original hard-coded prefix.
terminal_prefix = os.environ.get("JUPYTERHUB_SERVICE_PREFIX", "/user/htseng/").rstrip("/")
IFrame(f"https://hub.escalab.org:8000{terminal_prefix}/terminals/1", width="100%", height="400")

Now, let's revisit the optimized gpusort on this machine with different array size...

In [None]:
# Start sort_small.csv afresh with the header row.
! echo "Configuration,Size,ET,FileInput,CPU_Kernel,GPU_Kernel,Host2GPU,GPU2Host" > sort_small.csv
# For each array size, run the sort once with the file on the H.D.D. and once on
# the S.S.D., appending each run's stderr output to sort_small.csv.
# Every `!` line runs in its own fresh shell; `&&` keeps a run (and its append)
# from happening in the wrong directory if the cd fails.
for array_size in (512, 32768, 262144):
    ! echo "File on H.D.D"
    ! cd gpusort && source ./run_small {array_size} 2>> ../sort_small.csv
    ! echo "File on S.S.D"
    ! cd gpusort && source ./run_small_SSD {array_size} 2>> ../sort_small.csv
display_df_mono((render_csv("sort_small.csv")))

What can we observe here?

## FLOPs

In [None]:
# Build the matrix-multiplication demos. `&&` guards the cd so make never runs
# in the notebook's own directory.
! cd metrics && make
# CPU matrix multiply. NOTE(review): the first argument is presumably the matrix
# dimension; the meaning of the second is not visible here -- confirm.
! cd ./metrics && ./cpumm 2048 512

In [None]:
# CUDA matrix multiply (first argument presumably the matrix dimension -- confirm).
# `&&` skips the run, rather than failing confusingly, if the cd fails.
! cd ./metrics && ./cudamm 2048 1

In [None]:
# Same CUDA run with the first argument doubled to 4096.
# `&&` skips the run if the cd fails.
! cd ./metrics && ./cudamm 4096 1

In [None]:
# Same CUDA run with the first argument doubled again to 8192.
# `&&` skips the run if the cd fails.
! cd metrics && ./cudamm 8192 1

In [None]:
# cudamm_double is presumably the double-precision build of cudamm -- confirm in metrics/.
# `&&` skips the run if the cd fails.
! cd metrics && ./cudamm_double 2048 1

In [None]:
# Same as the previous cudamm_double run but with the second argument set to 0
# (its meaning is not visible here -- confirm against the program's source).
# `&&` skips the run if the cd fails.
! cd metrics && ./cudamm_double 2048 0

In [None]:
# cpumm_double is presumably the double-precision build of cpumm -- confirm in metrics/.
# `&&` skips the run if the cd fails.
! cd metrics && ./cpumm_double 2048 512

In [None]:
# cudamm at 2048 again, with the second argument set to 0 instead of 1
# (its meaning is not visible here -- confirm against the program's source).
# `&&` skips the run if the cd fails.
! cd metrics && ./cudamm 2048 0