# Experiment Setup 

## Goal:
    1. Modify source code if needed
    2. Compile all benchmarks
    3. Profile all benchmarks
    4. Process all the performance counters
    5. Visualize the benchmarks' characteristics

In [1]:
#!/usr/bin/python3
import sys
import subprocess
import os
import time
import re

## Entering A Directory

This class is a context manager that mimics the shell's `cd` command: it enters a directory on entry and returns to the previous directory on exit.

In [2]:
# enter dir
class cd:
    """Context manager that temporarily switches the working directory.

    On entry the process changes into ``newPath`` (a leading ``~`` is
    expanded); on exit it changes back to whichever directory was current
    when the ``with`` block was entered.
    """

    def __init__(self, newPath):
        # Resolve "~" once, at construction time.
        target = os.path.expanduser(newPath)
        self.newPath = target

    def __enter__(self):
        # Remember where we came from before moving.
        origin = os.getcwd()
        self.savedPath = origin
        os.chdir(self.newPath)

    def __exit__(self, etype, value, traceback):
        # Always go back; the implicit None return lets exceptions propagate.
        os.chdir(self.savedPath)


## Kernel Rewrite

This function rewrites the source code around each kernel launch so that the kernel runs `NUM_ITER` times, which gives an average kernel running time. This is useful when using the power sampler to capture a trace of a whole kernel.

In [3]:
def _wrap_kernel_launches(lines):
    """Wrap every line containing a kernel launch (``<<<``) in the repeat macros.

    Returns ``(new_lines, wrapped)`` where ``wrapped`` is True when at least
    one launch was found.
    """
    # wrapper_kernel_begin/end are expected to come from kernel_copy.h
    # (per the original note: installed as /usr/include/kernel_copy.h so the
    # benchmark makefiles can stay unchanged).
    include    = '    #include <kernel_copy.h>   \n'
    wrap_begin = '    wrapper_kernel_begin    \n'
    wrap_end   = '    wrapper_kernel_end    \n'

    new_lines = []
    wrapped = False
    for line in lines:
        if '<<<' in line:
            # A launch on the last line may lack a newline; add one so the
            # closing macro does not end up glued to the launch statement.
            if not line.endswith('\n'):
                line += '\n'
            new_lines.append(include + wrap_begin + line + wrap_end + '\n')
            wrapped = True
        else:
            new_lines.append(line)
    return new_lines, wrapped


def copy_cuda_kernels():
    """Wrap the kernel launches of every C/C++/CUDA source in the current directory.

    For each regular file ending in ``.c``, ``.cu`` or ``.cpp`` that does not
    already contain ``#include <kernel_copy.h>`` and has at least one kernel
    launch (``<<<``):

    * the original is renamed to a backup whose extension has ``.c`` replaced
      by ``.bp`` (``x.c`` -> ``x.bp``, ``x.cu`` -> ``x.bpu``,
      ``x.cpp`` -> ``x.bppp``);
    * a new file with the original name is written in which every launch line
      is surrounded by ``wrapper_kernel_begin`` / ``wrapper_kernel_end``.

    Files that are already patched, or contain no launch, are left untouched,
    so calling the function twice is harmless.
    """
    source_exts = ('.c', '.cu', '.cpp')

    for ori_file_name in sorted(os.listdir('.')):
        root, ext = os.path.splitext(ori_file_name)
        # Match on the real extension only, and skip directories.
        if ext not in source_exts or not os.path.isfile(ori_file_name):
            continue

        with open(ori_file_name, "r") as srcs:
            lines = srcs.readlines()

        # Decide per file, looking at the whole file, whether it was already
        # modified; the marker is the include inserted by a previous run.
        if any("#include <kernel_copy.h>" in line for line in lines):
            continue

        nlines, wrapped = _wrap_kernel_launches(lines)
        if not wrapped:
            # Nothing to rewrite: no backup, no new file.
            continue

        # The original file is shadowed by a backup; only the extension is
        # rewritten so a ".c" elsewhere in the name is not touched.
        new_file_name = root + ext.replace(".c", ".bp", 1)
        os.rename(ori_file_name, new_file_name)
        with open(ori_file_name, "w") as newf:
            newf.writelines(nlines)

## High Level APIs

Thin wrappers that describe the various jobs that need to be done.

In [6]:
def modifySource():
    """Apply the source-level rewrite to the current directory.

    At the moment the only rewrite step is ``copy_cuda_kernels``.
    """
    copy_cuda_kernels()

    
def do_shell_command(cmd):
    """Echo ``cmd`` and run it through the shell.

    Args:
        cmd: the command line; it is converted with ``str()`` before use.

    Returns:
        The exit status reported by ``subprocess.call`` so that callers can
        detect a failed step (previously the status was silently dropped).
    """
    command = str(cmd)
    print(command)
    # shell=True: commands are full shell lines built by this notebook.
    return subprocess.call(command, shell=True)
    

## Setting DVFS

The RTX 3060 supports changing the SM frequency.

In [7]:
def change_sm_freq(f):
    """Run ``sudo nvidia-smi -lgc <f>`` through ``do_shell_command``.

    Args:
        f: value appended to the command after ``str()`` conversion.
    """
    do_shell_command(" ".join(("sudo nvidia-smi -lgc", str(f))))

## Profile Kernels Using Nsight-Compute or Sampler

An example of running Nsight Compute (it must be run under sudo in order to have access to the GPU performance counters):

sudo **/usr/local/NVIDIA-Nsight-Compute/ncu** --export **2DConvolution** --force-overwrite --target-processes application-only --replay-mode kernel --kernel-name-base function --launch-skip-before-match 0 --section ComputeWorkloadAnalysis --section InstructionStats --section LaunchStats --section MemoryWorkloadAnalysis --section MemoryWorkloadAnalysis_Tables --section SpeedOfLight --sampling-interval auto --sampling-max-passes 5 --sampling-buffer-size 33554432 --profile-from-start 1 --cache-control all --clock-control none --apply-rules yes  --import-source yes  --check-exit-code yes  **./2DConvolution.out**


In [4]:
# where to store profile data
PROF = "/home/wwr/Desktop/EXEs/PROFs/"


def gen_ncu_cmd(output_file_name,run_prog_cmd):
    """Build the Nsight Compute command line for one profiling run.

    Args:
        output_file_name: suffix of the report name; the report is exported to
            ``PROF + "output_ncu_" + output_file_name``.
        run_prog_cmd: the command that launches the program to profile; it is
            appended verbatim after the ncu options.

    Returns:
        The complete command string (prefixed with ``sudo``).
    """
    launcher = "sudo /usr/local/NVIDIA-Nsight-Compute/ncu --export "
    report_path = "".join((PROF, "output_ncu_", output_file_name))
    # Adjacent literals are concatenated by the parser; spacing is significant.
    options = (
        " --force-overwrite --target-processes application-only"
        " --replay-mode kernel --kernel-name-base function"
        " --launch-skip-before-match 0"
        " --section ComputeWorkloadAnalysis --section InstructionStats"
        " --section LaunchStats --section MemoryWorkloadAnalysis"
        " --section MemoryWorkloadAnalysis_Tables --section SpeedOfLight"
        " --sampling-interval auto --sampling-max-passes 5"
        " --sampling-buffer-size 33554432 --profile-from-start 1"
        " --cache-control all --clock-control none"
        " --apply-rules yes "
        " --import-source yes "
        " --check-exit-code yes   "
    )
    return "".join((launcher, report_path, options, run_prog_cmd))

def gen_sample_cmd(output_file_name,run_prog_cmd):
    """Placeholder for the sampler command builder.

    Not implemented yet: both arguments are ignored and ``None`` is returned.
    """
    return None


def profile_sample_cuda_bin(binary_dir,prof=True,sample=True,runfile=False,freqs=None,repeats=10):
    """Profile (and eventually sample) every binary found in ``binary_dir``.

    Args:
        binary_dir: directory holding the binaries; it is entered with ``cd``.
        prof: when True, run each binary under Nsight Compute.
        sample: reserved for the power sampler; currently does nothing.
        runfile: when True the binaries need an explicit run command with
            input (Rodinia); that path is not implemented yet and does nothing.
        freqs: iterable of SM frequencies to sweep. When None, the
            notebook-level name ``freq`` is used.
            NOTE(review): ``freq`` is not defined in the visible cells; pass
            ``freqs`` explicitly or define ``freq`` first. TODO confirm.
        repeats: number of runs per (binary, frequency) pair (default 10).
    """
    # into the directory where stores all the binary
    with cd(binary_dir):

        # `ls` output ends with a newline, so a plain split leaves an empty
        # name that would be launched as "./"; drop empty entries.
        listing = subprocess.check_output(['ls']).decode('utf-8')
        bins = [name for name in listing.split('\n') if name]

        # run file will specify exact run command with input
        # Rodinia
        if runfile:
            # cmds = read(run_xxx)
            # for cmd in cmds:
            #     1.get_raw_ptx_sass()
            #     2.get_trace_ptx_sass()
            #     change_sm_freq(f)
            #     for i in range(10):
            #         do_shell_command(xxx)
            #         3.time_power_sampler()
            #
            if prof:
                pass
            if sample:
                pass

        else:
            # for those don't need extra input data binarys
            # Polybench
            for exe in bins:
                # The fallback to the global is evaluated lazily, only when a
                # binary is actually going to be profiled.
                for f in (freq if freqs is None else freqs):
                    change_sm_freq(f)
                    # prof or sample `repeats` times
                    for i in range(repeats):
                        if prof:
                            # report name: <exe>_<freq>_<run index>
                            cmd=gen_ncu_cmd(str(exe)+"_"+str(f)+"_"+str(i),"./"+str(exe))
                            do_shell_command(cmd)
                        if sample:
                            pass
                        
            
            
        


## Dump The Kernel
Get the kernel SASS and PTX code directly from the binary. PPT-GPU can also be used to get PTX and SASS traces.

In [None]:
def gen_dump_ptx_cmd():
    """Placeholder for the PTX dump command builder; returns ``None`` for now."""
    return None

def gen_dump_sass_cmd():
    """Placeholder for the SASS dump command builder; returns ``None`` for now."""
    return None

def gen_dump_trace_cmd():
    """Placeholder for the trace dump command builder; returns ``None`` for now."""
    return None


def dump_kernels(kernel_bin,ptx=True,sass=True,trace=False):
    """Dump the PTX, SASS and/or trace of ``kernel_bin``.

    Args:
        kernel_bin: name of the binary; used only in the progress messages.
        ptx, sass, trace: select which dumps to produce.

    A dump is run and reported as generated only when its command builder
    returns a command. While a builder is still a stub (returns ``None``) the
    step is skipped with a notice instead of executing the literal shell
    command ``None`` and claiming success.
    """
    jobs = (
        (ptx,   gen_dump_ptx_cmd,   "ptx generated."),
        (sass,  gen_dump_sass_cmd,  "sass generated."),
        (trace, gen_dump_trace_cmd, "trace generated."),
    )
    for enabled, make_cmd, done_msg in jobs:
        if not enabled:
            continue
        cmd = make_cmd()
        if cmd is None:
            print(kernel_bin+": "+make_cmd.__name__+" is not implemented, skipped.")
            continue
        do_shell_command(cmd)
        print(kernel_bin+done_msg)

## Sample The Kernel
Use GPU-Power-Sample to plot the curve of a whole kernel launch.

In [None]:
def sample_kernels(binary_dir,runfile=False):
    """Placeholder for sampling the kernels found in ``binary_dir``.

    Not implemented yet: both arguments are ignored and ``None`` is returned,
    like the other stubs in this notebook.
    """
    return None

## Make All Benchmarks

`bench_dir` is the main directory of a benchmark suite such as `Rodinia/` or `CUDASDK/`. Depending on its parameters, the function does the following:

     1. cd bench_dir
     2. modify source code if needed
     3. make clean
     4. cd sub_bench_dir
     5. for bench in sub_bench_dir: 
        cd bench ; make
    
for example, the directory structure of CUDASDK is :

         CUDASDK
            |-----0_Simple
            |            |--------asyncAPI/
            |            |--------...
            |-----1_Utilities
                         |--------...
    
so `sub_bench_dir` is `0_Simple/` here , this can be read from the `bench_file` (hand-crafted).

In [9]:

def make_benchmarks(bench_file,bench_dir,bench_name,modify=False):
    """Build every benchmark listed in ``bench_file``.

    Args:
        bench_file: hand-written text file. A line of the form ``<digit>_xxx``
            names a parent directory; every following line names a benchmark
            directory inside it. Blank lines are ignored.
        bench_dir: root directory of the benchmark suite; ``make clean`` is
            run there before anything is built.
        bench_name: label used in the start/done messages.
        modify: when True, ``modifySource()`` is run in each benchmark
            directory before ``make``.

    Raises:
        KeyError: if a benchmark line appears before any parent-directory line.
    """
    print(bench_name+" Start")

    # how many programs are maked(or other jobs be done)
    done_num = 0
    # total job nums
    tot_num = 0

    # parent dir -> list of benchmark dirs inside it
    dir_pair = {}

    # Identification lines whose real directory name differs from the tag:
    # Rodinia -> 'cd cuda', Polybench -> 'cd CUDA', gm_src as-is.
    header_alias = {
        '0_cuda_rodinia': 'cuda',
        '0_poly_CUDA': 'CUDA',
        '0_gm_src': 'gm_src',
    }

    # a file include all the bench-dir in a benchmark
    with open(bench_file, "r") as dirs:
        lines = dirs.readlines()

    main_dir = ""

    # for each bench
    for line in lines:
        # strip() also removes a '\r' left by CRLF line endings
        line = line.strip().replace(' ', '')
        if not line:
            # blank (e.g. trailing) lines carry no information
            continue

        # an identification line has the format '\d'_xxxx
        if len(line) > 1 and line[0].isdigit() and line[1] == '_':
            # the main_dir is the parent dir to all benchs
            # main_dir
            #    |----bench1/
            #            |-----xxx.cu
            #            |-----makefile
            #    |----bench2/
            # ......
            main_dir = header_alias.get(line, line)
            dir_pair[main_dir] = []
        else:
            # store the bench dirs
            dir_pair[main_dir].append(line)
            tot_num += 1

    # do jobs to all benchmarks
    with cd(bench_dir):
        # use make clean from the toppest makefile
        command = 'make clean'
        print('\n' + command)
        subprocess.call(command, shell=True)

        # deep into each bench dir
        for key in dir_pair:
            print('<<enter>> {}'.format(key))
            with cd('{}'.format(key)):
                for bench in dir_pair[key]:
                    with cd('{}'.format(bench)):
                        # 1.something todo with the source file
                        if modify:
                            modifySource()
                        # 2.make the program
                        do_shell_command('make')
                        # how many are done
                        done_num += 1
                        print('['+str(done_num)+'/'+str(tot_num)+']')
            print('<<leave>> {}'.format(key))
    #============================================================
    print(bench_name+" Done")

## Start The Jobs

In [10]:
def compile_benchmarks():
    """Build the Polybench suite listed in ``./Polybench.txt``."""
    make_benchmarks(
        bench_file="./Polybench.txt",
        bench_dir="Polybench/",
        bench_name="Polybench",
    )
    

In [None]:
# Script entry point: currently a no-op, so nothing is launched automatically.
# Call a job function (e.g. compile_benchmarks()) explicitly to start it.
if __name__ == "__main__":
    pass

