# **Lab 2 Basim Sherief 1210207**


### **Requirement 2**

1) Complete the provided matrix addition example, following these cases:
        
        A.   kernel1: each thread produces one output matrix element
        B.   kernel2: each thread produces one output matrix row
        C.   kernel3: each thread produces one output matrix column
  Analyze the pros and cons of each of the kernels above by using nvprof with large matrix sizes to validate your points. Collect your insights in a PDF report and explain them.

2) Implement a matrix–vector multiplication kernel. Use one thread to calculate an output vector element.

Let both programs read test cases from a .txt file and print the output to another. Their paths are to be provided as command-line arguments. A sample test file and invoking command are to be attached to the e-learning page.



In [194]:
# Setup cuda environment
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc4jupyter

Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-2wds6s2_
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-2wds6s2_
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 28f872a2f99a1b201bcd0db14fdbc5a496b9bfd7
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
The nvcc4jupyter extension is already loaded. To reload it, use:
  %reload_ext nvcc4jupyter


**Generate Testcases**

In [195]:
import numpy as np

def generate_matrix_test_cases(num_tests, min_dim, max_dim, min_val, max_val, output_file, seed=None):
    """
    Generate test cases for matrix addition and write them to a text file.

    File layout: the first line holds the number of test cases. Each test case
    is a "rows cols" line followed by `rows` lines for the first matrix and
    `rows` lines for the second, values space-separated with three decimals.

    Parameters:
    - num_tests: number of test cases
    - min_dim: minimum dimension (rows/cols), inclusive
    - max_dim: maximum dimension (rows/cols), inclusive
    - min_val: minimum value in matrices
    - max_val: maximum value in matrices
    - output_file: path to output file (overwritten if it exists)
    - seed: optional integer seed; the same seed always produces the same
      file. When None (default) values are drawn from NumPy's global random
      state, exactly as before this parameter existed.
    """
    # A private RandomState makes seeded runs independent of the global state;
    # the np.random module itself exposes the same randint/uniform functions.
    rng = np.random if seed is None else np.random.RandomState(seed)

    def write_matrix(handle, matrix):
        # One text line per matrix row, three decimals per value
        for row in matrix:
            handle.write(" ".join(f"{x:.3f}" for x in row) + "\n")

    with open(output_file, 'w') as f:
        # Write number of test cases
        f.write(f"{num_tests}\n")

        for _ in range(num_tests):
            # Generate random dimensions (randint's upper bound is exclusive)
            rows = rng.randint(min_dim, max_dim + 1)
            cols = rng.randint(min_dim, max_dim + 1)

            # Write dimensions
            f.write(f"{rows} {cols}\n")

            # Generate and write the first matrix, then the second; the order
            # of random draws matches the original implementation.
            write_matrix(f, rng.uniform(min_val, max_val, (rows, cols)))
            write_matrix(f, rng.uniform(min_val, max_val, (rows, cols)))

# Set your parameters here
params = {
    'num_tests': 5,                   # Number of test cases
    'min_dim': 2,                     # Minimum matrix dimension
    'max_dim': 10,                    # Maximum matrix dimension
    'min_val': -50000000.0,           # Minimum value in matrices
    'max_val': 5000000000.0,          # Maximum value in matrices
    'output_file': './inputfile.txt'  # Output file name
}

# Fixed seed so that re-running this cell regenerates the same input file
SEED = 42

# Run the generator with the specified parameters
if __name__ == "__main__":
    # The generator draws from NumPy's global random state, so seeding it here
    # makes inputfile.txt (and every kernel output below) reproducible.
    np.random.seed(SEED)
    generate_matrix_test_cases(**params)

# **CPU Only**
# Vector addition in pure C (CPU-only execution)

In [196]:
%%writefile kernel0.cu   
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#include <time.h>

#define MAX_ERR 1e-6

// Element-wise sum of two arrays: out[k] = a[k] + b[k] for every k in [0, n).
// A matrix stored contiguously is added by passing n = rows * cols.
void vector_add(double *out, double *a, double *b, size_t  n) {
    size_t k = 0;
    while (k < n) {
        out[k] = a[k] + b[k];
        ++k;
    }
}

/*
 * Entry point: reads matrix-addition test cases from argv[1], adds each pair
 * on the CPU with vector_add, checks the result, and writes every result
 * matrix to argv[2] (each matrix is also echoed to stdout).
 *
 * Input format: a test count, then for each test "rows cols" followed by
 * rows*cols values of A and rows*cols values of B.
 *
 * Returns 0 on success and 1 on a usage, file, parse or allocation error.
 */
int main(int argc, char* argv[]) {
    if (argc < 3) {
        printf("Usage: %s <input_file> <output_file>\n", argc > 0 ? argv[0] : "program");
        return 1;
    }

    // Open the file in read mode
    FILE *file_reading = fopen(argv[1], "r");
    if (file_reading == NULL) {
        printf("Error opening file!\n");
        return 1;
    }

    // Open the output once, before the loop: reopening it with "w" for every
    // test case truncated the file and kept only the last result.
    FILE *file_writing = fopen(argv[2], "w");
    if (file_writing == NULL) {
        perror("Error opening file");
        fclose(file_reading);
        return 1;
    }

    int status = 0;

    // Read number of tests
    int numberOfTests = 0;
    if (fscanf(file_reading, "%d", &numberOfTests) != 1 || numberOfTests < 0) {
        printf("Invalid number of tests!\n");
        status = 1;
    }

    for (int test = 0; status == 0 && test < numberOfTests; test++) {
        // Read matrix dimensions
        size_t rows = 0, cols = 0;
        if (fscanf(file_reading, "%zu %zu", &rows, &cols) != 2) {
            printf("Invalid matrix dimensions!\n");
            status = 1;
            break;
        }
        const size_t count = rows * cols;
        // Request at least one element: malloc(0) is allowed to return NULL.
        const size_t bytes = sizeof(double) * (count > 0 ? count : 1);

        // Allocate host matrices
        double* A = (double*)malloc(bytes);
        double* B = (double*)malloc(bytes);
        double* C = (double*)malloc(bytes);

        if (A == NULL || B == NULL || C == NULL) {
            printf("Memory allocation failed!\n");
            status = 1;
        } else {
            // Read matrices A and B, stopping at the first value that fails to parse
            int parsed = 1;
            for (size_t k = 0; parsed && k < count; k++) {
                parsed = (fscanf(file_reading, "%lf", &A[k]) == 1);
            }
            for (size_t k = 0; parsed && k < count; k++) {
                parsed = (fscanf(file_reading, "%lf", &B[k]) == 1);
            }
            if (!parsed) {
                printf("Error reading matrix data!\n");
                status = 1;
            }
        }

        if (status == 0) {
            // Time only the addition itself, not file I/O
            clock_t start = clock();
            vector_add(C, A, B, count);
            clock_t end = clock();

            // Elapsed CPU time converted from seconds to milliseconds
            double time_spent = (double)(end - start) / CLOCKS_PER_SEC * 1000.0;
            printf("Time elapsed: %f ms\n", time_spent);

            // Verification
            for (size_t k = 0; k < count; k++) {
                assert(fabs(C[k] - A[k] - B[k]) < MAX_ERR);
            }

            printf("Vector addition completed successfully!\n");

            // Write matrix C to stdout and to the output file
            for (size_t r = 0; r < rows; r++) {
                for (size_t c = 0; c < cols; c++) {
                    printf("%.3lf ", C[r * cols + c]);
                    fprintf(file_writing, "%.3lf ", C[r * cols + c]); // three decimal places
                }
                printf("\n");
                fprintf(file_writing, "\n"); // New line after each row
            }
        }

        // Free allocated memory (free(NULL) is a no-op)
        free(A);
        free(B);
        free(C);
    }

    fclose(file_writing);
    fclose(file_reading);
    return status;
}

Overwriting kernel0.cu


In [197]:
!nvcc kernel0.cu -o kernel0
!nvprof ./kernel0 inputfile.txt outputfile_cpu.txt

Time elapsed: 0.002000 ms
Vector addition completed successfully!
7682593151.425 9307621257.798 8854249266.964 4087549386.149 3439707194.127 
8803873489.240 1909966970.166 1397633034.193 7068590033.049 3491590656.183 
7948104694.498 4313069925.916 4640805840.438 4960592514.210 711771544.461 
Time elapsed: 0.001000 ms
Vector addition completed successfully!
4258394483.178 3383874166.151 
5002761596.659 3018116653.388 
Time elapsed: 0.001000 ms
Vector addition completed successfully!
7151195548.145 3552891609.456 2462738634.944 1605793866.303 6614305951.769 6019915126.339 
7535842869.965 5249547090.566 537876028.792 1643638077.207 4910906587.083 3822890372.163 
6929633903.204 5232428985.481 6466724766.609 3930741405.756 3848236662.653 3429254488.717 
5307709439.581 5313845795.434 4157938493.647 4175912989.887 5350863888.465 4186056914.385 
6584231625.400 7177001344.189 3692395991.829 1718159940.207 3385078105.065 4876807181.588 
1675273462.793 4077316734.557 2172254967.665 6891091089.804

# kernel1: each thread produces one output matrix element


In [198]:
%%writefile kernel1.cu   
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <fstream>
#include <iostream>
#include <sstream>
#define MAX_ERR 1e-6
// kernel1: one thread per output element. The thread's global (x, y)
// coordinates select (col, row) of a row-major rows x cols matrix.
__global__ void matrixAddKernel1(double* C, double* A, double* B, size_t rows, size_t cols) {
    const size_t col = blockIdx.x * blockDim.x + threadIdx.x;
    const size_t row = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads whose coordinates fall outside the matrix have nothing to do.
    if (row >= rows || col >= cols) {
        return;
    }

    const size_t element = row * cols + col;
    C[element] = A[element] + B[element];
}

/*
 * Adds two row-major rows x cols host matrices on the GPU: C = A + B.
 *
 * Allocates device buffers, copies A and B to the device, launches
 * matrixAddKernel1 with one thread per output element, copies the result back
 * into C and releases the device buffers.
 *
 * Returns cudaSuccess, or the first CUDA error encountered. On error the
 * contents of C are unspecified.
 */
cudaError_t addMatricesWithCuda(double* C, double* A, double* B, size_t rows, size_t cols) {
    double* dev_A = nullptr;
    double* dev_B = nullptr;
    double* dev_C = nullptr;

    const size_t size = rows * cols * sizeof(double);
    if (size == 0) {
        // Nothing to add; a grid with a zero dimension is not a valid launch.
        return cudaSuccess;
    }

    // Allocate GPU buffers; every later step runs only if all earlier ones succeeded
    cudaError_t cudaStatus = cudaMalloc((void**)&dev_C, size);
    if (cudaStatus == cudaSuccess) cudaStatus = cudaMalloc((void**)&dev_A, size);
    if (cudaStatus == cudaSuccess) cudaStatus = cudaMalloc((void**)&dev_B, size);

    // Copy input matrices from host memory to GPU buffers
    if (cudaStatus == cudaSuccess) cudaStatus = cudaMemcpy(dev_A, A, size, cudaMemcpyHostToDevice);
    if (cudaStatus == cudaSuccess) cudaStatus = cudaMemcpy(dev_B, B, size, cudaMemcpyHostToDevice);

    if (cudaStatus == cudaSuccess) {
        // 16x16 threads per block; round the grid up so partial tiles are covered
        dim3 threadsPerBlock(16, 16);
        dim3 numBlocks((cols + threadsPerBlock.x - 1) / threadsPerBlock.x,
                       (rows + threadsPerBlock.y - 1) / threadsPerBlock.y);

        matrixAddKernel1<<<numBlocks, threadsPerBlock>>>(dev_C, dev_A, dev_B, rows, cols);

        // A kernel launch returns nothing, so fetch launch errors explicitly
        cudaStatus = cudaGetLastError();
    }

    // Copy output matrix from GPU buffer to host memory (blocks until the kernel is done)
    if (cudaStatus == cudaSuccess) cudaStatus = cudaMemcpy(C, dev_C, size, cudaMemcpyDeviceToHost);

    // Release device memory on every path; cudaFree on a null pointer does nothing
    cudaFree(dev_A);
    cudaFree(dev_B);
    cudaFree(dev_C);

    return cudaStatus;
}

/*
 * Entry point: reads matrix-addition test cases from argv[1], adds each pair
 * on the GPU through addMatricesWithCuda, checks the result on the host, and
 * writes every result matrix to argv[2] (each matrix is also echoed to stdout).
 *
 * Input format: a test count, then for each test "rows cols" followed by
 * rows*cols values of A and rows*cols values of B.
 *
 * Returns 0 on success and 1 on a usage, file, parse, allocation or CUDA error.
 */
int main(int argc, char* argv[]) {
    if (argc < 3) {
        printf("Usage: %s <input_file> <output_file>\n", argc > 0 ? argv[0] : "program");
        return 1;
    }

    // Open the file in read mode
    FILE *file_reading = fopen(argv[1], "r");
    if (file_reading == NULL) {
        printf("Error opening file!\n");
        return 1;
    }

    // Open the output once, before the loop: reopening it with "w" for every
    // test case truncated the file and kept only the last result.
    FILE *file_writing = fopen(argv[2], "w");
    if (file_writing == NULL) {
        perror("Error opening file");
        fclose(file_reading);
        return 1;
    }

    int status = 0;

    // Read number of tests
    int numberOfTests = 0;
    if (fscanf(file_reading, "%d", &numberOfTests) != 1 || numberOfTests < 0) {
        printf("Invalid number of tests!\n");
        status = 1;
    }

    for (int test = 0; status == 0 && test < numberOfTests; test++) {
        // Read matrix dimensions
        size_t rows = 0, cols = 0;
        if (fscanf(file_reading, "%zu %zu", &rows, &cols) != 2) {
            printf("Invalid matrix dimensions!\n");
            status = 1;
            break;
        }
        const size_t count = rows * cols;
        // Request at least one element: malloc(0) is allowed to return NULL.
        const size_t bytes = sizeof(double) * (count > 0 ? count : 1);

        // Allocate host matrices
        double* A = (double*)malloc(bytes);
        double* B = (double*)malloc(bytes);
        double* C = (double*)malloc(bytes);

        if (A == NULL || B == NULL || C == NULL) {
            printf("Memory allocation failed!\n");
            status = 1;
        } else {
            // Read matrices A and B, stopping at the first value that fails to parse
            int parsed = 1;
            for (size_t k = 0; parsed && k < count; k++) {
                parsed = (fscanf(file_reading, "%lf", &A[k]) == 1);
            }
            for (size_t k = 0; parsed && k < count; k++) {
                parsed = (fscanf(file_reading, "%lf", &B[k]) == 1);
            }
            if (!parsed) {
                printf("Error reading matrix data!\n");
                status = 1;
            }
        }

        if (status == 0) {
            // Add matrices using CUDA
            cudaError_t cudaStatus = addMatricesWithCuda(C, A, B, rows, cols);
            if (cudaStatus != cudaSuccess) {
                printf("CUDA error: %s\n", cudaGetErrorString(cudaStatus));
                status = 1;
            }
        }

        if (status == 0) {
            // Verification against a host-side recomputation
            for (size_t k = 0; k < count; k++) {
                assert(fabs(C[k] - A[k] - B[k]) < MAX_ERR);
            }

            printf("Vector addition completed successfully!\n");

            // Write matrix C to stdout and to the output file
            for (size_t r = 0; r < rows; r++) {
                for (size_t c = 0; c < cols; c++) {
                    printf("%.3f ", C[r * cols + c]);
                    fprintf(file_writing, "%.3f ", C[r * cols + c]); // three decimal places
                }
                printf("\n");
                fprintf(file_writing, "\n"); // New line after each row
            }
        }

        // Cleanup (free(NULL) is a no-op)
        free(A);
        free(B);
        free(C);
    }

    fclose(file_writing);
    fclose(file_reading);
    return status;
}

Overwriting kernel1.cu


In [199]:
!nvcc kernel1.cu -o kernel1
!nvprof ./kernel1 inputfile.txt outputfile_kernel1.txt

==5242== NVPROF is profiling process 5242, command: ./kernel1 inputfile.txt outputfile_kernel1.txt
Vector addition completed successfully!
7682593151.425 9307621257.798 8854249266.964 4087549386.149 3439707194.127 
8803873489.240 1909966970.166 1397633034.193 7068590033.049 3491590656.183 
7948104694.498 4313069925.916 4640805840.438 4960592514.210 711771544.461 
Vector addition completed successfully!
4258394483.178 3383874166.151 
5002761596.659 3018116653.388 
Vector addition completed successfully!
7151195548.145 3552891609.456 2462738634.944 1605793866.303 6614305951.769 6019915126.339 
7535842869.965 5249547090.566 537876028.792 1643638077.207 4910906587.083 3822890372.163 
6929633903.204 5232428985.481 6466724766.609 3930741405.756 3848236662.653 3429254488.717 
5307709439.581 5313845795.434 4157938493.647 4175912989.887 5350863888.465 4186056914.385 
6584231625.400 7177001344.189 3692395991.829 1718159940.207 3385078105.065 4876807181.588 
1675273462.793 4077316734.557 21722549

# kernel2: each thread produces one output matrix row


In [200]:
%%writefile kernel2.cu   
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <fstream>
#include <iostream>
#include <sstream>
#define MAX_ERR 1e-6
// kernel2: one thread per output row. The thread's global x index selects the
// row, and the thread then walks that row's columns of the row-major matrix.
// Threads with neighbouring x indices therefore touch addresses `cols`
// elements apart at each step.
__global__ void matrixAddKernel1(double* C, double* A, double* B, size_t rows, size_t cols) {
    const size_t row = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads past the last row have nothing to do.
    if (row >= rows) {
        return;
    }

    const size_t rowStart = row * cols;
    for (size_t offset = 0; offset < cols; ++offset) {
        C[rowStart + offset] = A[rowStart + offset] + B[rowStart + offset];
    }
}

/*
 * Adds two row-major rows x cols host matrices on the GPU: C = A + B.
 *
 * Allocates device buffers, copies A and B to the device, launches
 * matrixAddKernel1 with one thread per output ROW, copies the result back
 * into C and releases the device buffers.
 *
 * Returns cudaSuccess, or the first CUDA error encountered. On error the
 * contents of C are unspecified.
 */
cudaError_t addMatricesWithCuda(double* C, double* A, double* B, size_t rows, size_t cols) {
    double* dev_A = nullptr;
    double* dev_B = nullptr;
    double* dev_C = nullptr;

    const size_t size = rows * cols * sizeof(double);
    if (size == 0) {
        // Nothing to add; a grid with a zero dimension is not a valid launch.
        return cudaSuccess;
    }

    // Allocate GPU buffers; every later step runs only if all earlier ones succeeded
    cudaError_t cudaStatus = cudaMalloc((void**)&dev_C, size);
    if (cudaStatus == cudaSuccess) cudaStatus = cudaMalloc((void**)&dev_A, size);
    if (cudaStatus == cudaSuccess) cudaStatus = cudaMalloc((void**)&dev_B, size);

    // Copy input matrices from host memory to GPU buffers
    if (cudaStatus == cudaSuccess) cudaStatus = cudaMemcpy(dev_A, A, size, cudaMemcpyHostToDevice);
    if (cudaStatus == cudaSuccess) cudaStatus = cudaMemcpy(dev_B, B, size, cudaMemcpyHostToDevice);

    if (cudaStatus == cudaSuccess) {
        // The kernel derives its row from the x dimension only, so launch a 1D
        // grid sized by the number of ROWS. The previous 2D launch sized grid x
        // from `cols`, which left rows uncomputed whenever rows exceeded the
        // x-thread count and made every y-thread repeat the same work.
        const unsigned int threadsPerBlock = 256;
        const unsigned int numBlocks =
            (unsigned int)((rows + threadsPerBlock - 1) / threadsPerBlock);

        matrixAddKernel1<<<numBlocks, threadsPerBlock>>>(dev_C, dev_A, dev_B, rows, cols);

        // A kernel launch returns nothing, so fetch launch errors explicitly
        cudaStatus = cudaGetLastError();
    }

    // Copy output matrix from GPU buffer to host memory (blocks until the kernel is done)
    if (cudaStatus == cudaSuccess) cudaStatus = cudaMemcpy(C, dev_C, size, cudaMemcpyDeviceToHost);

    // Release device memory on every path; cudaFree on a null pointer does nothing
    cudaFree(dev_A);
    cudaFree(dev_B);
    cudaFree(dev_C);

    return cudaStatus;
}

/*
 * Entry point: reads matrix-addition test cases from argv[1], adds each pair
 * on the GPU through addMatricesWithCuda, checks the result on the host, and
 * writes every result matrix to argv[2] (each matrix is also echoed to stdout).
 *
 * Input format: a test count, then for each test "rows cols" followed by
 * rows*cols values of A and rows*cols values of B.
 *
 * Returns 0 on success and 1 on a usage, file, parse, allocation or CUDA error.
 */
int main(int argc, char* argv[]) {
    if (argc < 3) {
        printf("Usage: %s <input_file> <output_file>\n", argc > 0 ? argv[0] : "program");
        return 1;
    }

    // Open the file in read mode
    FILE *file_reading = fopen(argv[1], "r");
    if (file_reading == NULL) {
        printf("Error opening file!\n");
        return 1;
    }

    // Open the output once, before the loop: reopening it with "w" for every
    // test case truncated the file and kept only the last result.
    FILE *file_writing = fopen(argv[2], "w");
    if (file_writing == NULL) {
        perror("Error opening file");
        fclose(file_reading);
        return 1;
    }

    int status = 0;

    // Read number of tests
    int numberOfTests = 0;
    if (fscanf(file_reading, "%d", &numberOfTests) != 1 || numberOfTests < 0) {
        printf("Invalid number of tests!\n");
        status = 1;
    }

    for (int test = 0; status == 0 && test < numberOfTests; test++) {
        // Read matrix dimensions
        size_t rows = 0, cols = 0;
        if (fscanf(file_reading, "%zu %zu", &rows, &cols) != 2) {
            printf("Invalid matrix dimensions!\n");
            status = 1;
            break;
        }
        const size_t count = rows * cols;
        // Request at least one element: malloc(0) is allowed to return NULL.
        const size_t bytes = sizeof(double) * (count > 0 ? count : 1);

        // Allocate host matrices
        double* A = (double*)malloc(bytes);
        double* B = (double*)malloc(bytes);
        double* C = (double*)malloc(bytes);

        if (A == NULL || B == NULL || C == NULL) {
            printf("Memory allocation failed!\n");
            status = 1;
        } else {
            // Read matrices A and B, stopping at the first value that fails to parse
            int parsed = 1;
            for (size_t k = 0; parsed && k < count; k++) {
                parsed = (fscanf(file_reading, "%lf", &A[k]) == 1);
            }
            for (size_t k = 0; parsed && k < count; k++) {
                parsed = (fscanf(file_reading, "%lf", &B[k]) == 1);
            }
            if (!parsed) {
                printf("Error reading matrix data!\n");
                status = 1;
            }
        }

        if (status == 0) {
            // Add matrices using CUDA
            cudaError_t cudaStatus = addMatricesWithCuda(C, A, B, rows, cols);
            if (cudaStatus != cudaSuccess) {
                printf("CUDA error: %s\n", cudaGetErrorString(cudaStatus));
                status = 1;
            }
        }

        if (status == 0) {
            // Verification against a host-side recomputation
            for (size_t k = 0; k < count; k++) {
                assert(fabs(C[k] - A[k] - B[k]) < MAX_ERR);
            }

            printf("Vector addition completed successfully!\n");

            // Write matrix C to stdout and to the output file
            for (size_t r = 0; r < rows; r++) {
                for (size_t c = 0; c < cols; c++) {
                    printf("%.3f ", C[r * cols + c]);
                    fprintf(file_writing, "%.3f ", C[r * cols + c]); // three decimal places
                }
                printf("\n");
                fprintf(file_writing, "\n"); // New line after each row
            }
        }

        // Cleanup (free(NULL) is a no-op)
        free(A);
        free(B);
        free(C);
    }

    fclose(file_writing);
    fclose(file_reading);
    return status;
}

Overwriting kernel2.cu


In [201]:
!nvcc kernel2.cu -o kernel2
!nvprof ./kernel2 inputfile.txt outputfile_kernel2.txt

==5286== NVPROF is profiling process 5286, command: ./kernel2 inputfile.txt outputfile_kernel2.txt
Vector addition completed successfully!
7682593151.425 9307621257.798 8854249266.964 4087549386.149 3439707194.127 
8803873489.240 1909966970.166 1397633034.193 7068590033.049 3491590656.183 
7948104694.498 4313069925.916 4640805840.438 4960592514.210 711771544.461 
Vector addition completed successfully!
4258394483.178 3383874166.151 
5002761596.659 3018116653.388 
Vector addition completed successfully!
7151195548.145 3552891609.456 2462738634.944 1605793866.303 6614305951.769 6019915126.339 
7535842869.965 5249547090.566 537876028.792 1643638077.207 4910906587.083 3822890372.163 
6929633903.204 5232428985.481 6466724766.609 3930741405.756 3848236662.653 3429254488.717 
5307709439.581 5313845795.434 4157938493.647 4175912989.887 5350863888.465 4186056914.385 
6584231625.400 7177001344.189 3692395991.829 1718159940.207 3385078105.065 4876807181.588 
1675273462.793 4077316734.557 21722549

# kernel3: each thread produces one output matrix column


In [202]:
%%writefile kernel3.cu   
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <fstream>
#include <iostream>
#include <sstream>
#define MAX_ERR 1e-6
// kernel3: one thread per output column. The thread's global x index selects
// the column, and the thread then steps down that column of the row-major
// matrix, advancing `cols` elements per row. Threads with neighbouring x
// indices touch neighbouring addresses at each step.
__global__ void matrixAddKernel1(double* C, double* A, double* B, size_t rows, size_t cols) {
    const size_t col = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads past the last column have nothing to do.
    if (col >= cols) {
        return;
    }

    size_t element = col;
    for (size_t row = 0; row < rows; ++row, element += cols) {
        C[element] = A[element] + B[element];
    }
}

/*
 * Adds two row-major rows x cols host matrices on the GPU: C = A + B.
 *
 * Allocates device buffers, copies A and B to the device, launches
 * matrixAddKernel1 with one thread per output COLUMN, copies the result back
 * into C and releases the device buffers.
 *
 * Returns cudaSuccess, or the first CUDA error encountered. On error the
 * contents of C are unspecified.
 */
cudaError_t addMatricesWithCuda(double* C, double* A, double* B, size_t rows, size_t cols) {
    double* dev_A = nullptr;
    double* dev_B = nullptr;
    double* dev_C = nullptr;

    const size_t size = rows * cols * sizeof(double);
    if (size == 0) {
        // Nothing to add; a grid with a zero dimension is not a valid launch.
        return cudaSuccess;
    }

    // Allocate GPU buffers; every later step runs only if all earlier ones succeeded
    cudaError_t cudaStatus = cudaMalloc((void**)&dev_C, size);
    if (cudaStatus == cudaSuccess) cudaStatus = cudaMalloc((void**)&dev_A, size);
    if (cudaStatus == cudaSuccess) cudaStatus = cudaMalloc((void**)&dev_B, size);

    // Copy input matrices from host memory to GPU buffers
    if (cudaStatus == cudaSuccess) cudaStatus = cudaMemcpy(dev_A, A, size, cudaMemcpyHostToDevice);
    if (cudaStatus == cudaSuccess) cudaStatus = cudaMemcpy(dev_B, B, size, cudaMemcpyHostToDevice);

    if (cudaStatus == cudaSuccess) {
        // The kernel derives its column from the x dimension only, so launch a
        // 1D grid sized by the number of COLUMNS. The previous 2D launch made
        // every y-thread and y-block recompute the same columns.
        const unsigned int threadsPerBlock = 256;
        const unsigned int numBlocks =
            (unsigned int)((cols + threadsPerBlock - 1) / threadsPerBlock);

        matrixAddKernel1<<<numBlocks, threadsPerBlock>>>(dev_C, dev_A, dev_B, rows, cols);

        // A kernel launch returns nothing, so fetch launch errors explicitly
        cudaStatus = cudaGetLastError();
    }

    // Copy output matrix from GPU buffer to host memory (blocks until the kernel is done)
    if (cudaStatus == cudaSuccess) cudaStatus = cudaMemcpy(C, dev_C, size, cudaMemcpyDeviceToHost);

    // Release device memory on every path; cudaFree on a null pointer does nothing
    cudaFree(dev_A);
    cudaFree(dev_B);
    cudaFree(dev_C);

    return cudaStatus;
}

/*
 * Entry point: reads matrix-addition test cases from argv[1], adds each pair
 * on the GPU through addMatricesWithCuda, checks the result on the host, and
 * writes every result matrix to argv[2] (each matrix is also echoed to stdout).
 *
 * Input format: a test count, then for each test "rows cols" followed by
 * rows*cols values of A and rows*cols values of B.
 *
 * Returns 0 on success and 1 on a usage, file, parse, allocation or CUDA error.
 */
int main(int argc, char* argv[]) {
    if (argc < 3) {
        printf("Usage: %s <input_file> <output_file>\n", argc > 0 ? argv[0] : "program");
        return 1;
    }

    // Open the file in read mode
    FILE *file_reading = fopen(argv[1], "r");
    if (file_reading == NULL) {
        printf("Error opening file!\n");
        return 1;
    }

    // Open the output once, before the loop: reopening it with "w" for every
    // test case truncated the file and kept only the last result.
    FILE *file_writing = fopen(argv[2], "w");
    if (file_writing == NULL) {
        perror("Error opening file");
        fclose(file_reading);
        return 1;
    }

    int status = 0;

    // Read number of tests
    int numberOfTests = 0;
    if (fscanf(file_reading, "%d", &numberOfTests) != 1 || numberOfTests < 0) {
        printf("Invalid number of tests!\n");
        status = 1;
    }

    for (int test = 0; status == 0 && test < numberOfTests; test++) {
        // Read matrix dimensions
        size_t rows = 0, cols = 0;
        if (fscanf(file_reading, "%zu %zu", &rows, &cols) != 2) {
            printf("Invalid matrix dimensions!\n");
            status = 1;
            break;
        }
        const size_t count = rows * cols;
        // Request at least one element: malloc(0) is allowed to return NULL.
        const size_t bytes = sizeof(double) * (count > 0 ? count : 1);

        // Allocate host matrices
        double* A = (double*)malloc(bytes);
        double* B = (double*)malloc(bytes);
        double* C = (double*)malloc(bytes);

        if (A == NULL || B == NULL || C == NULL) {
            printf("Memory allocation failed!\n");
            status = 1;
        } else {
            // Read matrices A and B, stopping at the first value that fails to parse
            int parsed = 1;
            for (size_t k = 0; parsed && k < count; k++) {
                parsed = (fscanf(file_reading, "%lf", &A[k]) == 1);
            }
            for (size_t k = 0; parsed && k < count; k++) {
                parsed = (fscanf(file_reading, "%lf", &B[k]) == 1);
            }
            if (!parsed) {
                printf("Error reading matrix data!\n");
                status = 1;
            }
        }

        if (status == 0) {
            // Add matrices using CUDA
            cudaError_t cudaStatus = addMatricesWithCuda(C, A, B, rows, cols);
            if (cudaStatus != cudaSuccess) {
                printf("CUDA error: %s\n", cudaGetErrorString(cudaStatus));
                status = 1;
            }
        }

        if (status == 0) {
            // Verification against a host-side recomputation
            for (size_t k = 0; k < count; k++) {
                assert(fabs(C[k] - A[k] - B[k]) < MAX_ERR);
            }

            printf("Vector addition completed successfully!\n");

            // Write matrix C to stdout and to the output file
            for (size_t r = 0; r < rows; r++) {
                for (size_t c = 0; c < cols; c++) {
                    printf("%.3f ", C[r * cols + c]);
                    fprintf(file_writing, "%.3f ", C[r * cols + c]); // three decimal places
                }
                printf("\n");
                fprintf(file_writing, "\n"); // New line after each row
            }
        }

        // Cleanup (free(NULL) is a no-op)
        free(A);
        free(B);
        free(C);
    }

    fclose(file_writing);
    fclose(file_reading);
    return status;
}

Overwriting kernel3.cu


In [203]:
!nvcc kernel3.cu -o kernel3
!nvprof ./kernel3 inputfile.txt outputfile_kernel3.txt

==5330== NVPROF is profiling process 5330, command: ./kernel3 inputfile.txt outputfile_kernel3.txt
Vector addition completed successfully!
7682593151.425 9307621257.798 8854249266.964 4087549386.149 3439707194.127 
8803873489.240 1909966970.166 1397633034.193 7068590033.049 3491590656.183 
7948104694.498 4313069925.916 4640805840.438 4960592514.210 711771544.461 
Vector addition completed successfully!
4258394483.178 3383874166.151 
5002761596.659 3018116653.388 
Vector addition completed successfully!
7151195548.145 3552891609.456 2462738634.944 1605793866.303 6614305951.769 6019915126.339 
7535842869.965 5249547090.566 537876028.792 1643638077.207 4910906587.083 3822890372.163 
6929633903.204 5232428985.481 6466724766.609 3930741405.756 3848236662.653 3429254488.717 
5307709439.581 5313845795.434 4157938493.647 4175912989.887 5350863888.465 4186056914.385 
6584231625.400 7177001344.189 3692395991.829 1718159940.207 3385078105.065 4876807181.588 
1675273462.793 4077316734.557 21722549