# 1D Convolution Kernels and Report

## Task Overview
You are required to write three CUDA kernels that perform 1D convolution, along with a report analyzing their performance.

### Kernel Implementations:
1. **Kernel 1**: No tiling.
2. **Kernel 2**: Uses output tiling.
3. **Kernel 3**: Uses input tiling.

## Report Requirements
The report should compare the performance of the three kernels, highlighting differences in execution time and efficiency.

## Command-Line Arguments
Each kernel should accept three command-line arguments:
- **Input file** (vector data)
- **Mask file** (convolution mask)
- **Output file** (resulting vector)

### Example Usage:
```bash
./kernel1 inputfile.txt mask.txt outputfile.txt
```

## Input File Format
The input file contains:
1. An integer `N` (size of the vector).
2. `N` float numbers representing the vector.

**Example:**
```
5
1 2 3 4 5
```

## Mask File Format
The mask file contains:
1. An integer `M` (size of the mask).
2. `M` float numbers representing the mask.

**Example:**
```
3
0 1 0
```

## Output File Format
The output file should contain `N` float numbers representing the resulting vector after convolution.

**Example Output:**
```
1 2 3 4 5
```

## Submission
Submit a compressed file named with your student code. The archive should contain:
- The three CUDA kernel implementations.
- The report analyzing their performance.
---

In [70]:
def compile_and_run_kernel(kernel_number, input_file_name, mask_file_name, analytics=False):
    import os
    
    # Get current working directory
    cwd = os.getcwd()
    print(f"Current working directory: {cwd}")
    # Create paths
    kernel_src = os.path.join(cwd, f"cuda_kernels/kernel{kernel_number}.cu")
    kernel_exe = os.path.join(cwd, f"cuda_kernels/bin/kernel{kernel_number}.exe")
    
    # Ensure input and mask file paths are correct
    input_file = os.path.join(cwd, f"Generator_TestCases/Convolution/{input_file_name}.txt")
    mask_file = os.path.join(cwd, f"Generator_TestCases/Convolution/{mask_file_name}.txt")
    
    # Create output file path
    output_file = os.path.join(cwd, f"Output_TestCases/{input_file_name}_mask{mask_file_name}_k{kernel_number}_o.txt")
    
    # Create bin directory if it doesn't exist
    os.makedirs(os.path.dirname(kernel_exe), exist_ok=True)
    
    # Compile
    !nvcc "{kernel_src}" -o "{kernel_exe}"
    
    # Run with analytics if requested, otherwise run normally
    if analytics:
        # Create analytics_Bin directory if it doesn't exist
        analytics_dir = os.path.join(cwd, "analytics_Bin")
        os.makedirs(analytics_dir, exist_ok=True)
        
        # Set profile output path inside analytics_Bin folder
        profile_output = os.path.join(analytics_dir, f"profile_k{kernel_number}_{input_file_name}_mask{mask_file_name}")
        
        # Run with nsys profiling
        !nsys profile --sample=none --trace=cuda --force-overwrite=true --stats=true --output="{profile_output}" "{kernel_exe}" "{input_file}" "{mask_file}" "{output_file}"
        print(f"Analytics data saved to {profile_output}")
    else:
        # Run normally
        !"{kernel_exe}" "{input_file}" "{mask_file}" "{output_file}"
    
    return output_file

# Example usage:
# output = compile_and_run_kernel(1, "vector1", "mask1")
# output = compile_and_run_kernel(2, "vector2", "mask2", analytics=True)

In [71]:
import numpy as np
import os
import time

def compare_output_files(expected_file, actual_file, tolerance=1e-5, verbose=False):
    """
    Check whether two whitespace-separated numeric text files hold the same values.

    Both files are parsed into float arrays and compared element by element
    using an absolute tolerance. A PASS/FAIL line (plus optional statistics)
    is printed; any error raised while reading or comparing is reported as a
    failure rather than propagated.

    Parameters:
    - expected_file: Path to the file with the reference values
    - actual_file: Path to the file with the values produced by the implementation
    - tolerance: Largest absolute difference accepted between paired values
    - verbose: Print difference statistics on success and the first
      mismatching entries on failure

    Returns:
    - True if the files match within tolerance, False otherwise
    """
    def _load_values(path):
        # Parse every whitespace-separated token of the file as a float
        with open(path, 'r') as handle:
            tokens = handle.read().strip().split()
        return np.array([float(tok) for tok in tokens])

    def _print_stats(worst, average):
        print(f"  - Maximum difference: {worst:.8e}")
        print(f"  - Average difference: {average:.8e}")

    try:
        # Expected file is read first, then the actual one
        reference = _load_values(expected_file)
        produced = _load_values(actual_file)

        # A length mismatch is an immediate failure
        if len(reference) != len(produced):
            print(f"❌ FAIL: Output files have different lengths! Expected: {len(reference)}, Actual: {len(produced)}")
            return False

        # Element-wise absolute error and its summary statistics
        deltas = np.abs(reference - produced)
        worst = np.max(deltas)
        average = np.mean(deltas)

        # Purely absolute comparison (relative tolerance disabled)
        within = np.allclose(reference, produced, rtol=0, atol=tolerance)

        if within:
            print(f"✅ PASS: Output values match within tolerance {tolerance}")
            if verbose:
                _print_stats(worst, average)
            return within

        print(f"❌ FAIL: Output values differ by more than tolerance {tolerance}")
        _print_stats(worst, average)

        if verbose:
            # Show at most the first five offending positions
            offenders = np.where(deltas > tolerance)[0]
            print(f"  - Found {len(offenders)} mismatched values")
            for idx in offenders[:5]:
                print(f"  - Index {idx}: Expected {reference[idx]:.8f}, Actual {produced[idx]:.8f}, Diff {deltas[idx]:.8e}")

        return within

    except Exception as e:
        print(f"❌ FAIL: Error during comparison: {str(e)}")
        return False

# Helper that checks a kernel's output file against the expected output:
def verify_kernel_output(kernel_num, input_base, mask_base, tolerance=1e-1, verbose=True):
    """
    Verify the output of a kernel against the expected output.
    
    Parameters:
    - kernel_num: Kernel number (1, 2, or 3)
    - input_base: Test-case base name WITHOUT the "_input" suffix or extension
      (e.g. "conv_v1000_m3"); "_expected_output.txt" and "_input_mask..." are
      appended to it when building the two file paths
    - mask_base: Base filename for mask (without extension), e.g. "conv_v1000_m3_mask"
    - tolerance: Maximum allowed absolute difference, forwarded to
      compare_output_files (default keeps the previous hard-coded 1e-1)
    - verbose: Forwarded to compare_output_files (default keeps the previous
      hard-coded True)

    Returns:
    - The result of compare_output_files: True if the files match within
      tolerance, False otherwise
    """

    # Construct file paths
    base_path = "./Generator_TestCases/Convolution/"
    expected_file = f"{base_path}{input_base}_expected_output.txt"
    actual_file = f"./Output_TestCases/{input_base}_input_mask{mask_base}_k{kernel_num}_o.txt"

    print(f"Verifying kernel {kernel_num} output...")
    # Run comparison
    result = compare_output_files(expected_file, actual_file, tolerance=tolerance, verbose=verbose)
    return result

# Example: verify_kernel_output(1, "conv_v1000_m3", "conv_v1000_m3_mask")

### **Requirement - 1**
- kernel 1 should have no tiling


In [72]:
# Kernel 1: compile and run under nsys profiling (analytics=True) for each of
# the four input/mask pairs. The verify_kernel_output calls are left disabled.
# Test pair 1: vector=1000, mask=3
compile_and_run_kernel(1, "conv_v1000_m3_input", "conv_v1000_m3_mask", analytics=True)
# verify_kernel_output(1, "conv_v1000_m3", "conv_v1000_m3_mask")

# Test pair 2: vector=10000, mask=5
compile_and_run_kernel(1, "conv_v10000_m5_input", "conv_v10000_m5_mask", analytics=True)
# verify_kernel_output(1, "conv_v10000_m5", "conv_v10000_m5_mask")

# Test pair 3: vector=100000, mask=9
compile_and_run_kernel(1, "conv_v100000_m9_input", "conv_v100000_m9_mask", analytics=True)
# verify_kernel_output(1, "conv_v100000_m9", "conv_v100000_m9_mask")

# Test pair 4: vector=1000000, mask=15
compile_and_run_kernel(1, "conv_v1000000_m15_input", "conv_v1000000_m15_mask", analytics=True)
# verify_kernel_output(1, "conv_v1000000_m15", "conv_v1000000_m15_mask")

Current working directory: e:\02_Learn\01_University\Senior-1 Spring\Current\Parallel Computing\Labs\Lab_4\Solution
kernel1.cu
tmpxft_00003194_00000000-10_kernel1.cudafe1.cpp
   Creating library e:\02_Learn\01_University\Senior-1 Spring\Current\Parallel Computing\Labs\Lab_4\Solution\cuda_kernels\bin\kernel1.lib and object e:\02_Learn\01_University\Senior-1 Spring\Current\Parallel Computing\Labs\Lab_4\Solution\cuda_kernels\bin\kernel1.exp
Collecting data...
Generating 'C:\Users\basim\AppData\Local\Temp\nsys-report-ed06.qdstrm'

[1/6] [0%                          ] profile_k1_conv_v1000_m3_input_maskconv_v1000_m3_mask.nsys-rep
[1/6] [0%                          ] profile_k1_conv_v1000_m3_input_maskconv_v1000_m3_mask.nsys-rep

[2/6] [0%                          ] profile_k1_conv_v1000_m3_input_maskconv_v1000_m3_mask.sqlite
[2/6] [1%                          ] profile_k1_conv_v1000_m3_input_maskconv_v1000_m3_mask.sqlite
[2/6] [3%                          ] profile_k1_conv_v1000_m3_input_ma



kernel1.cu
tmpxft_00005718_00000000-10_kernel1.cudafe1.cpp
   Creating library e:\02_Learn\01_University\Senior-1 Spring\Current\Parallel Computing\Labs\Lab_4\Solution\cuda_kernels\bin\kernel1.lib and object e:\02_Learn\01_University\Senior-1 Spring\Current\Parallel Computing\Labs\Lab_4\Solution\cuda_kernels\bin\kernel1.exp
Collecting data...
Generating 'C:\Users\basim\AppData\Local\Temp\nsys-report-edce.qdstrm'

[1/6] [0%                          ] profile_k1_conv_v10000_m5_input_maskconv_v10000_m5_mask.nsys-rep
[1/6] [0%                          ] profile_k1_conv_v10000_m5_input_maskconv_v10000_m5_mask.nsys-rep

[2/6] [0%                          ] profile_k1_conv_v10000_m5_input_maskconv_v10000_m5_mask.sqlite
[2/6] [1%                          ] profile_k1_conv_v10000_m5_input_maskconv_v10000_m5_mask.sqlite
[2/6] [3%                          ] profile_k1_conv_v10000_m5_input_maskconv_v10000_m5_mask.sqlite
[2/6] [4%                          ] profile_k1_conv_v10000_m5_input_maskconv_



kernel1.cu
tmpxft_0000462c_00000000-10_kernel1.cudafe1.cpp
   Creating library e:\02_Learn\01_University\Senior-1 Spring\Current\Parallel Computing\Labs\Lab_4\Solution\cuda_kernels\bin\kernel1.lib and object e:\02_Learn\01_University\Senior-1 Spring\Current\Parallel Computing\Labs\Lab_4\Solution\cuda_kernels\bin\kernel1.exp
Collecting data...
Generating 'C:\Users\basim\AppData\Local\Temp\nsys-report-3e26.qdstrm'

[1/6] [0%                          ] profile_k1_conv_v100000_m9_input_maskconv_v100000_m9_mask.nsys-rep
[1/6] [0%                          ] profile_k1_conv_v100000_m9_input_maskconv_v100000_m9_mask.nsys-rep

[2/6] [0%                          ] profile_k1_conv_v100000_m9_input_maskconv_v100000_m9_mask.sqlite
[2/6] [1%                          ] profile_k1_conv_v100000_m9_input_maskconv_v100000_m9_mask.sqlite
[2/6] [3%                          ] profile_k1_conv_v100000_m9_input_maskconv_v100000_m9_mask.sqlite
[2/6] [4%                          ] profile_k1_conv_v100000_m9_inpu



kernel1.cu
tmpxft_00005724_00000000-10_kernel1.cudafe1.cpp
   Creating library e:\02_Learn\01_University\Senior-1 Spring\Current\Parallel Computing\Labs\Lab_4\Solution\cuda_kernels\bin\kernel1.lib and object e:\02_Learn\01_University\Senior-1 Spring\Current\Parallel Computing\Labs\Lab_4\Solution\cuda_kernels\bin\kernel1.exp
Collecting data...
Generating 'C:\Users\basim\AppData\Local\Temp\nsys-report-9695.qdstrm'

[1/6] [0%                          ] profile_k1_conv_v1000000_m15_input_maskconv_v1000000_m15_mask.nsys-rep
[1/6] [0%                          ] profile_k1_conv_v1000000_m15_input_maskconv_v1000000_m15_mask.nsys-rep

[2/6] [0%                          ] profile_k1_conv_v1000000_m15_input_maskconv_v1000000_m15_mask.sqlite
[2/6] [1%                          ] profile_k1_conv_v1000000_m15_input_maskconv_v1000000_m15_mask.sqlite
[2/6] [3%                          ] profile_k1_conv_v1000000_m15_input_maskconv_v1000000_m15_mask.sqlite
[2/6] [4%                          ] profile_k1_



'e:\\02_Learn\\01_University\\Senior-1 Spring\\Current\\Parallel Computing\\Labs\\Lab_4\\Solution\\Output_TestCases/conv_v1000000_m15_input_maskconv_v1000000_m15_mask_k1_o.txt'

### **Requirement - 2**
- kernel 2 should have output tiling 


In [73]:
# Kernel 2: compile and run under nsys profiling (analytics=True) for each of
# the four input/mask pairs. The verify_kernel_output calls are left disabled.
# Test pair 1: vector=1000, mask=3
compile_and_run_kernel(2, "conv_v1000_m3_input", "conv_v1000_m3_mask", analytics=True)
# verify_kernel_output(2, "conv_v1000_m3", "conv_v1000_m3_mask")

# Test pair 2: vector=10000, mask=5
compile_and_run_kernel(2, "conv_v10000_m5_input", "conv_v10000_m5_mask", analytics=True)
# verify_kernel_output(2, "conv_v10000_m5", "conv_v10000_m5_mask")

# Test pair 3: vector=100000, mask=9
compile_and_run_kernel(2, "conv_v100000_m9_input", "conv_v100000_m9_mask", analytics=True)
# verify_kernel_output(2, "conv_v100000_m9", "conv_v100000_m9_mask")

# Test pair 4: vector=1000000, mask=15
compile_and_run_kernel(2, "conv_v1000000_m15_input", "conv_v1000000_m15_mask", analytics=True)
# verify_kernel_output(2, "conv_v1000000_m15", "conv_v1000000_m15_mask")

Current working directory: e:\02_Learn\01_University\Senior-1 Spring\Current\Parallel Computing\Labs\Lab_4\Solution
kernel2.cu
tmpxft_000039a8_00000000-10_kernel2.cudafe1.cpp
   Creating library e:\02_Learn\01_University\Senior-1 Spring\Current\Parallel Computing\Labs\Lab_4\Solution\cuda_kernels\bin\kernel2.lib and object e:\02_Learn\01_University\Senior-1 Spring\Current\Parallel Computing\Labs\Lab_4\Solution\cuda_kernels\bin\kernel2.exp
Analytics data saved to e:\02_Learn\01_University\Senior-1 Spring\Current\Parallel Computing\Labs\Lab_4\Solution\analytics_Bin\profile_k2_conv_v1000_m3_input_maskconv_v1000_m3_maskCollecting data...
Generating 'C:\Users\basim\AppData\Local\Temp\nsys-report-fa45.qdstrm'

[1/6] [0%                          ] profile_k2_conv_v1000_m3_input_maskconv_v1000_m3_mask.nsys-rep
[1/6] [0%                          ] profile_k2_conv_v1000_m3_input_maskconv_v1000_m3_mask.nsys-rep

[2/6] [0%                          ] profile_k2_conv_v1000_m3_input_maskconv_v1000_m3_



kernel2.cu
tmpxft_00005c10_00000000-10_kernel2.cudafe1.cpp
   Creating library e:\02_Learn\01_University\Senior-1 Spring\Current\Parallel Computing\Labs\Lab_4\Solution\cuda_kernels\bin\kernel2.lib and object e:\02_Learn\01_University\Senior-1 Spring\Current\Parallel Computing\Labs\Lab_4\Solution\cuda_kernels\bin\kernel2.exp
Collecting data...
Generating 'C:\Users\basim\AppData\Local\Temp\nsys-report-ea47.qdstrm'

[1/6] [0%                          ] profile_k2_conv_v10000_m5_input_maskconv_v10000_m5_mask.nsys-rep
[1/6] [0%                          ] profile_k2_conv_v10000_m5_input_maskconv_v10000_m5_mask.nsys-rep

[2/6] [0%                          ] profile_k2_conv_v10000_m5_input_maskconv_v10000_m5_mask.sqlite
[2/6] [1%                          ] profile_k2_conv_v10000_m5_input_maskconv_v10000_m5_mask.sqlite
[2/6] [3%                          ] profile_k2_conv_v10000_m5_input_maskconv_v10000_m5_mask.sqlite
[2/6] [4%                          ] profile_k2_conv_v10000_m5_input_maskconv_



kernel2.cu
tmpxft_000018b0_00000000-10_kernel2.cudafe1.cpp
   Creating library e:\02_Learn\01_University\Senior-1 Spring\Current\Parallel Computing\Labs\Lab_4\Solution\cuda_kernels\bin\kernel2.lib and object e:\02_Learn\01_University\Senior-1 Spring\Current\Parallel Computing\Labs\Lab_4\Solution\cuda_kernels\bin\kernel2.exp
Collecting data...
Generating 'C:\Users\basim\AppData\Local\Temp\nsys-report-51e0.qdstrm'

[1/6] [0%                          ] profile_k2_conv_v100000_m9_input_maskconv_v100000_m9_mask.nsys-rep
[1/6] [0%                          ] profile_k2_conv_v100000_m9_input_maskconv_v100000_m9_mask.nsys-rep

[2/6] [0%                          ] profile_k2_conv_v100000_m9_input_maskconv_v100000_m9_mask.sqlite
[2/6] [1%                          ] profile_k2_conv_v100000_m9_input_maskconv_v100000_m9_mask.sqlite
[2/6] [3%                          ] profile_k2_conv_v100000_m9_input_maskconv_v100000_m9_mask.sqlite
[2/6] [4%                          ] profile_k2_conv_v100000_m9_inpu



kernel2.cu
tmpxft_00005cd8_00000000-10_kernel2.cudafe1.cpp
   Creating library e:\02_Learn\01_University\Senior-1 Spring\Current\Parallel Computing\Labs\Lab_4\Solution\cuda_kernels\bin\kernel2.lib and object e:\02_Learn\01_University\Senior-1 Spring\Current\Parallel Computing\Labs\Lab_4\Solution\cuda_kernels\bin\kernel2.exp
Collecting data...
Generating 'C:\Users\basim\AppData\Local\Temp\nsys-report-24ec.qdstrm'

[1/6] [0%                          ] profile_k2_conv_v1000000_m15_input_maskconv_v1000000_m15_mask.nsys-rep
[1/6] [0%                          ] profile_k2_conv_v1000000_m15_input_maskconv_v1000000_m15_mask.nsys-rep

[2/6] [0%                          ] profile_k2_conv_v1000000_m15_input_maskconv_v1000000_m15_mask.sqlite
[2/6] [1%                          ] profile_k2_conv_v1000000_m15_input_maskconv_v1000000_m15_mask.sqlite
[2/6] [3%                          ] profile_k2_conv_v1000000_m15_input_maskconv_v1000000_m15_mask.sqlite
[2/6] [4%                          ] profile_k2_



'e:\\02_Learn\\01_University\\Senior-1 Spring\\Current\\Parallel Computing\\Labs\\Lab_4\\Solution\\Output_TestCases/conv_v1000000_m15_input_maskconv_v1000000_m15_mask_k2_o.txt'

### **Requirement - 3**
- kernel 3 should have input tiling


In [74]:
# Kernel 3: compile and run under nsys profiling (analytics=True) for each of
# the four input/mask pairs. The verify_kernel_output calls are left disabled.
# Test pair 1: vector=1000, mask=3
compile_and_run_kernel(3, "conv_v1000_m3_input", "conv_v1000_m3_mask", analytics=True)
# verify_kernel_output(3, "conv_v1000_m3", "conv_v1000_m3_mask")

# Test pair 2: vector=10000, mask=5
compile_and_run_kernel(3, "conv_v10000_m5_input", "conv_v10000_m5_mask", analytics=True)
# verify_kernel_output(3, "conv_v10000_m5", "conv_v10000_m5_mask")

# Test pair 3: vector=100000, mask=9
compile_and_run_kernel(3, "conv_v100000_m9_input", "conv_v100000_m9_mask", analytics=True)
# verify_kernel_output(3, "conv_v100000_m9", "conv_v100000_m9_mask")

# Test pair 4: vector=1000000, mask=15
compile_and_run_kernel(3, "conv_v1000000_m15_input", "conv_v1000000_m15_mask", analytics=True)
# verify_kernel_output(3, "conv_v1000000_m15", "conv_v1000000_m15_mask")

Current working directory: e:\02_Learn\01_University\Senior-1 Spring\Current\Parallel Computing\Labs\Lab_4\Solution
kernel3.cu
tmpxft_000047d0_00000000-10_kernel3.cudafe1.cpp
   Creating library e:\02_Learn\01_University\Senior-1 Spring\Current\Parallel Computing\Labs\Lab_4\Solution\cuda_kernels\bin\kernel3.lib and object e:\02_Learn\01_University\Senior-1 Spring\Current\Parallel Computing\Labs\Lab_4\Solution\cuda_kernels\bin\kernel3.exp
Collecting data...Analytics data saved to e:\02_Learn\01_University\Senior-1 Spring\Current\Parallel Computing\Labs\Lab_4\Solution\analytics_Bin\profile_k3_conv_v1000_m3_input_maskconv_v1000_m3_mask
Current working directory: e:\02_Learn\01_University\Senior-1 Spring\Current\Parallel Computing\Labs\Lab_4\Solution

Generating 'C:\Users\basim\AppData\Local\Temp\nsys-report-93a8.qdstrm'

[1/6] [0%                          ] profile_k3_conv_v1000_m3_input_maskconv_v1000_m3_mask.nsys-rep
[1/6] [0%                          ] profile_k3_conv_v1000_m3_input_ma



kernel3.cu
tmpxft_00006394_00000000-10_kernel3.cudafe1.cpp
   Creating library e:\02_Learn\01_University\Senior-1 Spring\Current\Parallel Computing\Labs\Lab_4\Solution\cuda_kernels\bin\kernel3.lib and object e:\02_Learn\01_University\Senior-1 Spring\Current\Parallel Computing\Labs\Lab_4\Solution\cuda_kernels\bin\kernel3.exp
Collecting data...
Generating 'C:\Users\basim\AppData\Local\Temp\nsys-report-bd43.qdstrm'

[1/6] [0%                          ] profile_k3_conv_v10000_m5_input_maskconv_v10000_m5_mask.nsys-rep
[1/6] [0%                          ] profile_k3_conv_v10000_m5_input_maskconv_v10000_m5_mask.nsys-rep

[2/6] [0%                          ] profile_k3_conv_v10000_m5_input_maskconv_v10000_m5_mask.sqlite
[2/6] [1%                          ] profile_k3_conv_v10000_m5_input_maskconv_v10000_m5_mask.sqlite
[2/6] [3%                          ] profile_k3_conv_v10000_m5_input_maskconv_v10000_m5_mask.sqlite
[2/6] [4%                          ] profile_k3_conv_v10000_m5_input_maskconv_



kernel3.cu
tmpxft_000057a0_00000000-10_kernel3.cudafe1.cpp
   Creating library e:\02_Learn\01_University\Senior-1 Spring\Current\Parallel Computing\Labs\Lab_4\Solution\cuda_kernels\bin\kernel3.lib and object e:\02_Learn\01_University\Senior-1 Spring\Current\Parallel Computing\Labs\Lab_4\Solution\cuda_kernels\bin\kernel3.exp
Analytics data saved to e:\02_Learn\01_University\Senior-1 Spring\Current\Parallel Computing\Labs\Lab_4\Solution\analytics_Bin\profile_k3_conv_v100000_m9_input_maskconv_v100000_m9_maskCollecting data...
Generating 'C:\Users\basim\AppData\Local\Temp\nsys-report-5f9f.qdstrm'

[1/6] [0%                          ] profile_k3_conv_v100000_m9_input_maskconv_v100000_m9_mask.nsys-rep
[1/6] [0%                          ] profile_k3_conv_v100000_m9_input_maskconv_v100000_m9_mask.nsys-rep

[2/6] [0%                          ] profile_k3_conv_v100000_m9_input_maskconv_v100000_m9_mask.sqlite
[2/6] [1%                          ] profile_k3_conv_v100000_m9_input_maskconv_v100000_m



kernel3.cu
tmpxft_00004a84_00000000-10_kernel3.cudafe1.cpp
   Creating library e:\02_Learn\01_University\Senior-1 Spring\Current\Parallel Computing\Labs\Lab_4\Solution\cuda_kernels\bin\kernel3.lib and object e:\02_Learn\01_University\Senior-1 Spring\Current\Parallel Computing\Labs\Lab_4\Solution\cuda_kernels\bin\kernel3.exp
^C
Analytics data saved to e:\02_Learn\01_University\Senior-1 Spring\Current\Parallel Computing\Labs\Lab_4\Solution\analytics_Bin\profile_k3_conv_v1000000_m15_input_maskconv_v1000000_m15_mask


'e:\\02_Learn\\01_University\\Senior-1 Spring\\Current\\Parallel Computing\\Labs\\Lab_4\\Solution\\Output_TestCases/conv_v1000000_m15_input_maskconv_v1000000_m15_mask_k3_o.txt'



Collecting data...
Generating 'C:\Users\basim\AppData\Local\Temp\nsys-report-0f70.qdstrm'

[1/6] [0%                          ] profile_k3_conv_v1000000_m15_input_maskconv_v1000000_m15_mask.nsys-rep
[1/6] [0%                          ] profile_k3_conv_v1000000_m15_input_maskconv_v1000000_m15_mask.nsys-rep

[2/6] [0%                          ] profile_k3_conv_v1000000_m15_input_maskconv_v1000000_m15_mask.sqlite
[2/6] [1%                          ] profile_k3_conv_v1000000_m15_input_maskconv_v1000000_m15_mask.sqlite
[2/6] [3%                          ] profile_k3_conv_v1000000_m15_input_maskconv_v1000000_m15_mask.sqlite
[2/6] [4%                          ] profile_k3_conv_v1000000_m15_input_maskconv_v1000000_m15_mask.sqlite
[2/6] [6%                          ] profile_k3_conv_v1000000_m15_input_maskconv_v1000000_m15_mask.sqlite
[2/6] [7%                          ] profile_k3_conv_v1000000_m15_input_maskconv_v1000000_m15_mask.sqlite
[2/6] [9%                          ] profile_k3_conv_v10