In [None]:
def compile_and_run_kernel(kernel_number, test_case, analytics=False):
    import os
    
    # Get current working directory
    cwd = os.getcwd()
    
    # Create paths
    kernel_src = os.path.join(cwd, f"cuda_kernels/kernel{kernel_number}.cu")
    kernel_exe = os.path.join(cwd, f"cuda_kernels/bin/kernel{kernel_number}.exe")
    input_file = os.path.join(cwd, f"Input_TestCases/{test_case}.txt")
    output_file = os.path.join(cwd, f"Output_TestCases/{test_case}_k{kernel_number}_o.txt")
    
    # Compile
    !nvcc "{kernel_src}" -o "{kernel_exe}"
    
     # Run with analytics if requested, otherwise run normally
    if analytics:
        # Create analytics_Bin directory if it doesn't exist
        analytics_dir = os.path.join(cwd, "analytics_Bin")
        os.makedirs(analytics_dir, exist_ok=True)
        
        # Set profile output path inside analytics_Bin folder
        profile_output = os.path.join(analytics_dir, f"profile_k{kernel_number}_{test_case}")
        
        # Run with nsys profiling
        !nsys profile --sample=none --trace=cuda --force-overwrite=true --stats=true --output="{profile_output}" "{kernel_exe}" "{input_file}" "{output_file}"
        print(f"Analytics data saved to {profile_output}")
    else:
        # Run normally
        !"{kernel_exe}" "{input_file}" "{output_file}"
    
    return output_file

### **Requriment - 1**
- Use only 1 block for your kernel and let the CPU handle the final sum.


In [32]:
compile_and_run_kernel(1, "t2_50", analytics=True)

kernel1.cu
tmpxft_00004260_00000000-10_kernel1.cudafe1.cpp
   Creating library e:\02_Learn\01_University\Senior-1 Spring\Current\Parallel Computing\Labs\Lab_3\Solution\cuda_kernels\bin\kernel1.lib and object e:\02_Learn\01_University\Senior-1 Spring\Current\Parallel Computing\Labs\Lab_3\Solution\cuda_kernels\bin\kernel1.exp
Collecting data...
Generating 'C:\Users\basim\AppData\Local\Temp\nsys-report-b633.qdstrm'

[1/6] [0%                          ] profile_k1_t2_50.nsys-rep
[1/6] [0%                          ] profile_k1_t2_50.nsys-rep
[1/6] [12%                         ] profile_k1_t2_50.nsys-rep
[1/6] [11%                         ] profile_k1_t2_50.nsys-rep
[1/6] [12%                         ] profile_k1_t2_50.nsys-rep
[1/6] [=====32%                    ] profile_k1_t2_50.nsys-rep
[1/6] [=====31%                    ] profile_k1_t2_50.nsys-rep
[1/6] [=====30%                    ] profile_k1_t2_50.nsys-rep
[1/6] [=====29%                    ] profile_k1_t2_50.nsys-rep
[1/6] [====28%  

'e:\\02_Learn\\01_University\\Senior-1 Spring\\Current\\Parallel Computing\\Labs\\Lab_3\\Solution\\Output_TestCases/t2_50_k1_o.txt'

### **Requriment - 2**
- Use only 1 block for your kernal and let one thread to handle the final sum.


In [None]:

compile_and_run_kernel(2, "t2_50", analytics=True)

kernel2.cu
tmpxft_00003494_00000000-10_kernel2.cudafe1.cpp
   Creating library e:\02_Learn\01_University\Senior-1 Spring\Current\Parallel Computing\Labs\Lab_3\Solution\cuda_kernels\bin\kernel2.lib and object e:\02_Learn\01_University\Senior-1 Spring\Current\Parallel Computing\Labs\Lab_3\Solution\cuda_kernels\bin\kernel2.exp
^C
Analytics data saved to profile_k2_t2_50


'e:\\02_Learn\\01_University\\Senior-1 Spring\\Current\\Parallel Computing\\Labs\\Lab_3\\Solution\\Output_TestCases/t2_50_k2_o.txt'

Collecting data...
Generating 'C:\Users\basim\AppData\Local\Temp\nsys-report-5cd4.qdstrm'

[1/6] [0%                          ] profile_k2_t2_50.nsys-rep
[1/6] [0%                          ] profile_k2_t2_50.nsys-rep
[1/6] [10%                         ] profile_k2_t2_50.nsys-rep
[1/6] [9%                          ] profile_k2_t2_50.nsys-rep
[1/6] [8%                          ] profile_k2_t2_50.nsys-rep
[1/6] [7%                          ] profile_k2_t2_50.nsys-rep
[1/6] [6%                          ] profile_k2_t2_50.nsys-rep
[1/6] [5%                          ] profile_k2_t2_50.nsys-rep
[1/6] [0%                          ] profile_k2_t2_50.nsys-rep
[1/6] [7%                          ] profile_k2_t2_50.nsys-rep
[1/6] [10%                         ] profile_k2_t2_50.nsys-rep
[1/6] [14%                         ] profile_k2_t2_50.nsys-rep
[1/6] [==18%                       ] profile_k2_t2_50.nsys-rep
[1/6] [==21%                       ] profile_k2_t2_50.nsys-rep
[1/6] [====25%             

### **Requriment - 3**
- Use multiple blocks for your kernal and let the CPU handle the final sum.


In [None]:

compile_and_run_kernel(3, "t2_50", analytics=True)

^C
Block 5: sharedSum[0] = -51364.735000
Block 2: sharedSum[0] = 224510.394000
Block 4: sharedSum[0] = 34191.373000
Block 1: sharedSum[0] = 72056.736000
Block 3: sharedSum[0] = 27260.010000
Block 0: sharedSum[0] = 126479.878000
Block 6: sharedSum[0] = -55551.030000
Kernel sumZDimension Started
Kernel reduce2DTo1D Started


Error opening file ../Output_TestCases/1d_kernel3.txt


'e:\\02_Learn\\01_University\\Senior-1 Spring\\Current\\Parallel Computing\\Labs\\Lab_3\\Solution\\Output_TestCases/t2_50_k3_o.txt'

kernel3.cu
tmpxft_00003b34_00000000-10_kernel3.cudafe1.cpp
   Creating library e:\02_Learn\01_University\Senior-1 Spring\Current\Parallel Computing\Labs\Lab_3\Solution\cuda_kernels\bin\kernel3.lib and object e:\02_Learn\01_University\Senior-1 Spring\Current\Parallel Computing\Labs\Lab_3\Solution\cuda_kernels\bin\kernel3.exp
