# Prefix Sum Algorithm

Helper functions:
- Compile and run a CUDA kernel
- Check that two files are identical

In [None]:
def _run_and_echo(command):
    """
    Run a command and echo its combined stdout/stderr line by line.

    Parameters:
    - command: Argument list; the first item is the program to launch.

    Returns:
    - The exit code of the finished process.

    Raises:
    - RuntimeError if the program cannot be launched (e.g. it is not on PATH).
    """
    import subprocess

    try:
        # An argument list (no shell) passes paths containing spaces through
        # intact, so no manual quoting is needed.
        process = subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            errors="replace",
        )
    except OSError as err:
        raise RuntimeError(f"Could not launch {command[0]!r}: {err}") from err

    # Print explicitly so the tool output is shown through Python's stdout
    # rather than depending on where the child process's own stdout goes.
    with process:
        for line in process.stdout:
            print(line, end="")
    return process.returncode


def compile_and_run_kernel(kernel_number, input_file_name, analytics=False):
    """
    Compile cuda_kernels/kernel<kernel_number>.cu with nvcc and run the result.

    All paths are resolved against the current working directory. The
    executable is written to cuda_kernels/bin/ and is invoked with two
    arguments: the input file and the output file.

    Parameters:
    - kernel_number: Number used to select kernel<kernel_number>.cu.
    - input_file_name: Input file name without the ".txt" extension, looked up
      in Generator_TestCases/PrefixSum.
    - analytics: When True, run the executable under "nsys profile" and store
      the profile in analytics_Bin/; otherwise run the executable directly.

    Returns:
    - Path of the output file:
      Output_TestCases/<input_file_name>_k<kernel_number>_o.txt

    Raises:
    - RuntimeError if nvcc (or nsys / the executable) cannot be launched, or
      if nvcc exits with a non-zero status.
    """
    import os

    cwd = os.getcwd()

    # Source and executable locations for the selected kernel.
    kernel_src = os.path.join(cwd, f"cuda_kernels/kernel{kernel_number}.cu")
    kernel_exe = os.path.join(cwd, f"cuda_kernels/bin/kernel{kernel_number}.exe")

    # Input read by the kernel and output file it is asked to write.
    input_file = os.path.join(cwd, f"Generator_TestCases/PrefixSum/{input_file_name}.txt")
    output_file = os.path.join(cwd, f"Output_TestCases/{input_file_name}_k{kernel_number}_o.txt")

    # Make sure both destination directories exist before any tool writes to them.
    os.makedirs(os.path.dirname(kernel_exe), exist_ok=True)
    os.makedirs(os.path.dirname(output_file), exist_ok=True)

    # Stop on a failed compile: otherwise an executable left over from an
    # earlier build would be run and its results mistaken for this source.
    if _run_and_echo(["nvcc", kernel_src, "-o", kernel_exe]) != 0:
        raise RuntimeError(f"nvcc failed to compile {kernel_src}")

    if analytics:
        analytics_dir = os.path.join(cwd, "analytics_Bin")
        os.makedirs(analytics_dir, exist_ok=True)

        # Profile output path inside the analytics_Bin folder.
        profile_output = os.path.join(analytics_dir, f"profile_k{kernel_number}_{input_file_name}")

        exit_code = _run_and_echo([
            "nsys", "profile",
            "--sample=none",
            "--trace=cuda",
            "--force-overwrite=true",
            "--stats=true",
            f"--output={profile_output}",
            kernel_exe, input_file, output_file,
        ])
        if exit_code == 0:
            print(f"Analytics data saved to {profile_output}")
    else:
        exit_code = _run_and_echo([kernel_exe, input_file, output_file])

    # A non-zero exit from the run is reported but not fatal; the caller still
    # gets the output path and can inspect whatever was written.
    if exit_code != 0:
        print(f"Warning: kernel {kernel_number} run exited with code {exit_code}")

    return output_file

# Example usage:
# output = compile_and_run_kernel(1, "vector1")
# output = compile_and_run_kernel(2, "vector2", analytics=True)

In [None]:
def compare_files(input_file_name, output_file_name):
    """
    Report whether an expected-output file and an actual-output file match.

    Both files are read in full and compared as text after stripping leading
    and trailing whitespace. A pass/fail message is printed either way.

    Parameters:
    - input_file_name: File name of the expected output, looked up in
      Generator_TestCases/PrefixSum under the current working directory.
    - output_file_name: File name of the actual output, looked up in
      Output_TestCases under the current working directory.

    Returns:
    - True if the stripped contents are equal; False if they differ or if
      either file does not exist.
    """
    import os

    base_dir = os.getcwd()
    expected_path = os.path.join(base_dir, "Generator_TestCases/PrefixSum", input_file_name)
    actual_path = os.path.join(base_dir, "Output_TestCases", output_file_name)

    # Read the expected file first, then the actual one; a missing file is
    # reported and counted as a failed comparison.
    texts = []
    try:
        for path in (expected_path, actual_path):
            with open(path, 'r') as handle:
                texts.append(handle.read().strip())
    except FileNotFoundError as e:
        print(f"❌ Error: {e}")
        return False

    expected_text, actual_text = texts
    if expected_text != actual_text:
        print(f"❌ Test case failed: {input_file_name} does not match {output_file_name}")
        return False

    print(f"✅ Test case passed: {input_file_name} matches {output_file_name}")
    return True

# Example usage:
# compare_files("prefixsum_v1000_expected_output.txt", "prefixsum_v1000_k1_o.txt")

## Requirement 1: Compare Work-Efficient and Work-Inefficient Implementations
- Implement both work-efficient and work-inefficient versions of the prefix sum algorithm using CUDA.

In [22]:
# Work-efficient prefix sum (kernel 1) -- uncomment these two lines to run it
# instead of kernel 2.
# compile_and_run_kernel(1, "prefixsum_v10000_input", analytics=False)
# compare_files("prefixsum_v10000_expected_output.txt", "prefixsum_v10000_input_k1_o.txt")

# Work-inefficient prefix sum (kernel 2): compile and run it on the
# prefixsum_v10000 input without profiling, then compare the file it wrote
# against the expected output. The comparison result is the cell's value.
compile_and_run_kernel(2, "prefixsum_v10000_input", analytics=False)
compare_files("prefixsum_v10000_expected_output.txt", "prefixsum_v10000_input_k2_o.txt")

Current working directory: e:\02_Learn\01_University\Senior-1 Spring\Current\Parallel Computing\Labs\Lab_6_PrefixSum
kernel2.cu
tmpxft_00000a8c_00000000-10_kernel2.cudafe1.cpp
   Creating library e:\02_Learn\01_University\Senior-1 Spring\Current\Parallel Computing\Labs\Lab_6_PrefixSum\cuda_kernels\bin\kernel2.lib and object e:\02_Learn\01_University\Senior-1 Spring\Current\Parallel Computing\Labs\Lab_6_PrefixSum\cuda_kernels\bin\kernel2.exp
Current working directory: e:\02_Learn\01_University\Senior-1 Spring\Current\Parallel Computing\Labs\Lab_6_PrefixSum
✅ Test case passed: prefixsum_v10000_expected_output.txt matches prefixsum_v10000_input_k2_o.txt


True

## Requirement 2: Compare Work-Efficient Algorithm Using Different Memory Types
- Evaluate the performance of the work-efficient algorithm with the following memory types:
  - Pageable Memory
  - Unified Memory
  - Zero-Copy Memory
  - Pinned Memory

In [None]:
# Example: Compile and run kernel 2 for work-efficient implementation with pageable memory
# compile_and_run_kernel takes only (kernel_number, input_file_name, analytics);
# a second positional file name would collide with analytics=True and raise TypeError.
compile_and_run_kernel(2, "input_large", analytics=True)

## Requirement 3: Use Streams to Distribute Data
- Implement the prefix sum algorithm using CUDA streams to distribute data among multiple streams.

In [None]:
# Example: Compile and run kernel 3 for stream-based implementation
# compile_and_run_kernel takes only (kernel_number, input_file_name, analytics);
# a second positional file name would collide with analytics=True and raise TypeError.
compile_and_run_kernel(3, "input_large", analytics=True)

## Bonus: Compare Thread Coarsening vs. Streams
- Compare the performance of thread coarsening and using CUDA streams.

In [None]:
# Example: Compile and run kernel 4 for thread coarsening
# compile_and_run_kernel takes only (kernel_number, input_file_name, analytics);
# a second positional file name would collide with analytics=True and raise TypeError.
compile_and_run_kernel(4, "input_large", analytics=True)