# Prefix Sum Algorithm

Helper functions:
- Compile and run a CUDA kernel on a test input
- Check that two files are identical (within a numeric tolerance)

In [1]:
def compile_and_run_kernel(kernel_number, input_file_name, analytics=False):
    """
    Compile cuda_kernels/kernel<kernel_number>.cu with nvcc and run it on a test input.

    All paths are resolved relative to the current working directory.

    Parameters:
    - kernel_number: Number N selecting cuda_kernels/kernelN.cu.
    - input_file_name: Input file name without the ".txt" extension, looked up in
      Generator_TestCases/PrefixSum.
    - analytics: If True, run the executable under "nsys profile" and store the
      report in analytics_Bin; otherwise run the executable directly.

    Returns:
    - Path of the output file handed to the executable
      (Output_TestCases/<input_file_name>_k<kernel_number>_o.txt).

    Raises:
    - RuntimeError if a command cannot be started or exits with a non-zero status.
      Stopping here prevents a stale executable or a stale output file from being
      mistaken for the result of the current run.
    """
    import os
    import subprocess

    def _run(command, step):
        # Run one external command, echo its output, and fail loudly on error.
        # An argument list bypasses the shell, so paths containing spaces or
        # shell metacharacters need no quoting.
        try:
            completed = subprocess.run(
                command, capture_output=True, text=True, errors="replace"
            )
        except OSError as e:
            raise RuntimeError(f"{step} failed: could not start '{command[0]}': {e}") from e

        # Echo the captured streams so the tool output still appears under the cell.
        if completed.stdout:
            print(completed.stdout, end="")
        if completed.stderr:
            print(completed.stderr, end="")

        if completed.returncode != 0:
            raise RuntimeError(f"{step} failed with exit code {completed.returncode}")

    cwd = os.getcwd()

    # Source and executable locations for the requested kernel
    kernel_src = os.path.join(cwd, f"cuda_kernels/kernel{kernel_number}.cu")
    kernel_exe = os.path.join(cwd, f"cuda_kernels/bin/kernel{kernel_number}.exe")

    # Test input and the file the executable is asked to write
    input_file = os.path.join(cwd, f"Generator_TestCases/PrefixSum/{input_file_name}.txt")
    output_file = os.path.join(cwd, f"Output_TestCases/{input_file_name}_k{kernel_number}_o.txt")

    # Make sure both destination directories exist before anything writes to them
    os.makedirs(os.path.dirname(kernel_exe), exist_ok=True)
    os.makedirs(os.path.dirname(output_file), exist_ok=True)

    # Compile the CUDA kernel; abort on failure instead of running an old binary
    _run(["nvcc", kernel_src, "-o", kernel_exe], f"Compilation of kernel {kernel_number}")

    if analytics:
        # Profiling reports are collected in analytics_Bin
        analytics_dir = os.path.join(cwd, "analytics_Bin")
        os.makedirs(analytics_dir, exist_ok=True)
        profile_output = os.path.join(analytics_dir, f"profile_k{kernel_number}_{input_file_name}")

        # Run under nsys profiling
        _run(
            [
                "nsys", "profile",
                "--sample=none",
                "--trace=cuda",
                "--force-overwrite=true",
                "--stats=true",
                f"--output={profile_output}",
                kernel_exe, input_file, output_file,
            ],
            f"Profiling of kernel {kernel_number}",
        )
        print(f"Analytics data saved to {profile_output}")
    else:
        # Run normally
        _run([kernel_exe, input_file, output_file], f"Execution of kernel {kernel_number}")

    return output_file

# Example usage:
# output = compile_and_run_kernel(1, "vector1")
# output = compile_and_run_kernel(2, "vector2", analytics=True)

In [3]:
def compare_files(input_file_name, output_file_name, tolerance=1):
    """
    Compare two files line by line and token by token, allowing a numeric tolerance for floats.

    Leading and trailing whitespace of each file is ignored. Tokens are split on
    whitespace. Identical tokens always match; tokens that both parse as floats
    match when their absolute difference is at most `tolerance`; any other pair
    is a mismatch. A NaN on only one side is always a mismatch.

    Parameters:
    - input_file_name: Expected output file in Generator_TestCases/PrefixSum.
    - output_file_name: Actual output file in Output_TestCases.
    - tolerance: Allowed numeric difference between corresponding float values.

    Returns:
    - True if the files match within the given tolerance, False otherwise
      (including when either file does not exist).
    """
    import os

    cwd = os.getcwd()
    input_file_path = os.path.join(cwd, "Generator_TestCases/PrefixSum", input_file_name)
    output_file_path = os.path.join(cwd, "Output_TestCases", output_file_name)

    try:
        with open(input_file_path, 'r') as input_file, open(output_file_path, 'r') as output_file:
            input_lines = input_file.read().strip().splitlines()
            output_lines = output_file.read().strip().splitlines()
    except FileNotFoundError as e:
        print(f"❌ Error: {e}")
        return False

    if len(input_lines) != len(output_lines):
        print(f"❌ Line count mismatch: {len(input_lines)} vs {len(output_lines)}")
        return False

    for i, (in_line, out_line) in enumerate(zip(input_lines, output_lines), start=1):
        in_tokens = in_line.split()
        out_tokens = out_line.split()

        if len(in_tokens) != len(out_tokens):
            print(f"❌ Token count mismatch on line {i}: {in_tokens} vs {out_tokens}")
            return False

        for j, (a, b) in enumerate(zip(in_tokens, out_tokens)):
            # Textually identical tokens match regardless of what they represent.
            if a == b:
                continue

            try:
                a_num = float(a)
                b_num = float(b)
            except ValueError:
                # At least one token is not a number and the strings differ.
                print(f"❌ Non-numeric mismatch on line {i}, token {j+1}: '{a}' vs '{b}'")
                return False

            # Equal values (including equal infinities, whose difference is NaN) match.
            if a_num == b_num:
                continue

            # "not (diff <= tolerance)" instead of "diff > tolerance": every
            # comparison with NaN is False, so the latter would let a NaN pass.
            if not abs(a_num - b_num) <= tolerance:
                print(f"❌ Value mismatch on line {i}, token {j+1}: {a_num} vs {b_num}")
                return False

    print(f"✅ Test case passed: {input_file_name} matches {output_file_name} within tolerance {tolerance}")
    return True


## Requirement 1: Compare Work-Efficient and Work-Inefficient Implementations
- Implement both work-efficient and work-inefficient versions of the prefix sum algorithm using CUDA.

In [37]:
# Work-efficient prefix sum (kernel 1): build, run, then validate against the expected output
compile_and_run_kernel(
    kernel_number=1,
    input_file_name="prefixsum_v1000000_input",
    analytics=False,
)
compare_files(
    input_file_name="prefixsum_v1000000_expected_output.txt",
    output_file_name="prefixsum_v1000000_input_k1_o.txt",
    tolerance=1,
)

# Work-inefficient prefix sum (kernel 2) -- currently disabled
# compile_and_run_kernel(2, "prefixsum_v1000000_input", analytics=False)
# compare_files("prefixsum_v1000000_expected_output.txt", "prefixsum_v1000000_input_k2_o.txt",tolerance=1)

kernel1.cu
tmpxft_000020b8_00000000-10_kernel1.cudafe1.cpp
   Creating library e:\02_Learn\01_University\Senior-1 Spring\Current\Parallel Computing\Labs\Lab_6_PrefixSum\cuda_kernels\bin\kernel1.lib and object e:\02_Learn\01_University\Senior-1 Spring\Current\Parallel Computing\Labs\Lab_6_PrefixSum\cuda_kernels\bin\kernel1.exp
✅ Test case passed: prefixsum_v1000000_expected_output.txt matches prefixsum_v1000000_input_k1_o.txt within tolerance 1


True

## Requirement 2: Compare Work-Efficient Algorithm Using Different Memory Types
- Evaluate the performance of the work-efficient algorithm with the following memory types:
  - Pageable Memory
  - Unified Memory
  - Zero-Copy Memory
  - Pinned Memory

In [54]:
# Requirement 2: compile and run kernel 3, which benchmarks the prefix sum with
# pageable, pinned, unified and zero-copy memory, then validate its output file.
compile_and_run_kernel(
    kernel_number=3,
    input_file_name="prefixsum_v1000000_input",
    analytics=False,
)
compare_files(
    input_file_name="prefixsum_v1000000_expected_output.txt",
    output_file_name="prefixsum_v1000000_input_k3_o.txt",
    tolerance=1,
)

kernel3.cu
tmpxft_00002cb0_00000000-10_kernel3.cudafe1.cpp
   Creating library e:\02_Learn\01_University\Senior-1 Spring\Current\Parallel Computing\Labs\Lab_6_PrefixSum\cuda_kernels\bin\kernel3.lib and object e:\02_Learn\01_University\Senior-1 Spring\Current\Parallel Computing\Labs\Lab_6_PrefixSum\cuda_kernels\bin\kernel3.exp
=== Memory Type Performance Comparison ===
Input size: 1000000 elements

Running benchmarks (1 runs each)...

Results (average time in milliseconds):
Pageable memory: 0.000 ms
Pinned memory:   0.000 ms
Unified memory:  231.704 ms
Zero-copy memory: 0.000 ms

Speedups relative to pageable memory:
Pinned memory:   -nan(ind)x
Unified memory:  0.00x
Zero-copy memory: -nan(ind)x
✅ Test case passed: prefixsum_v1000000_expected_output.txt matches prefixsum_v1000000_input_k3_o.txt within tolerance 1


True

## Requirement 3: Use Streams to Distribute Data
- Implement the prefix sum algorithm so that the input data is distributed across multiple CUDA streams.

In [None]:
# Requirement 3: compile and run kernel 4, then validate its output file.
# NOTE(review): kernel 4 is presumably the CUDA-streams implementation; its
# source is not part of this notebook -- confirm.
compile_and_run_kernel(
    kernel_number=4,
    input_file_name="prefixsum_v1000000_input",
    analytics=False,
)
compare_files(
    input_file_name="prefixsum_v1000000_expected_output.txt",
    output_file_name="prefixsum_v1000000_input_k4_o.txt",
    tolerance=1,
)

## Bonus: Compare Thread Coarsening vs. Streams
- Compare the performance of thread coarsening and using CUDA streams.

In [62]:
# Bonus: compile and run kernel 5 (the thread-coarsened implementation), then
# validate its output file.
compile_and_run_kernel(
    kernel_number=5,
    input_file_name="prefixsum_v1000000_input",
    analytics=False,
)
compare_files(
    input_file_name="prefixsum_v1000000_expected_output.txt",
    output_file_name="prefixsum_v1000000_input_k5_o.txt",
    tolerance=1,
)

kernel5.cu
tmpxft_00000ce4_00000000-10_kernel5.cudafe1.cpp
   Creating library e:\02_Learn\01_University\Senior-1 Spring\Current\Parallel Computing\Labs\Lab_6_PrefixSum\cuda_kernels\bin\kernel5.lib and object e:\02_Learn\01_University\Senior-1 Spring\Current\Parallel Computing\Labs\Lab_6_PrefixSum\cuda_kernels\bin\kernel5.exp
Thread coarsened execution time (factor=4): 1 ms
✅ Test case passed: prefixsum_v1000000_expected_output.txt matches prefixsum_v1000000_input_k5_o.txt within tolerance 1


True