<a href="https://colab.research.google.com/github/AnKiTu03/C/blob/main/Cuda_Lab_Test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### ***Section - A***

### 1

Write an OpenMP program to calculate the value of PI using the critical section.

In [None]:
%%writefile q1.c
#include <stdio.h>
#include <omp.h>

// Approximates PI by midpoint-rule integration of 4 / (1 + x^2) over [0, 1].
// Each thread accumulates a private partial sum over its share of the steps
// and then folds it into the shared total inside a critical section.
int main() {
    const long num_steps = 1000000;
    const double step = 1.0 / num_steps;
    double pi = 0.0;

    #pragma omp parallel
    {
        // Thread-private accumulator: declared inside the parallel region.
        double partial = 0.0;

        #pragma omp for
        for (long k = 0; k < num_steps; k++) {
            const double x = (k + 0.5) * step;  // midpoint of the k-th slice
            partial += 4.0 / (1.0 + x * x);
        }

        // Only one thread at a time may update the shared result.
        #pragma omp critical
        {
            pi += partial * step;
        }
    }

    printf("PI: %.15f\n", pi);
    return 0;
}

Writing q1.c


In [None]:
!gcc -fopenmp q1.c -o q1
!./q1

PI: 3.141592653589899


### 2

Write an OpenMP program to print parallel programming
environment information.

In [None]:
%%writefile q2.c
#include <stdio.h>
#include <omp.h>

// Prints, once per thread of a parallel region, what the OpenMP runtime
// reports about the execution environment.
int main() {
    #pragma omp parallel
    {
        // Query the runtime before entering the critical section so that the
        // protected part only does the printing.
        const int in_parallel_flag = omp_in_parallel();
        const int processor_count = omp_get_num_procs();
        const int thread_limit = omp_get_max_threads();
        const int team_size = omp_get_num_threads();
        const int tid = omp_get_thread_num();

        // Serialize the output so one thread's report is not interleaved
        // with another's.
        #pragma omp critical
        {
            printf("Thread ID: %d\n", tid);
            printf("Total Threads: %d\n", team_size);
            printf("Max Threads: %d\n", thread_limit);
            printf("Number of Processors: %d\n", processor_count);
            printf("In Parallel: %d\n", in_parallel_flag);
            printf("---------------------------\n");
        }
    }

    return 0;
}


Writing q2.c


### 3
Write an OpenMP program to add two arrays in parallel using the `schedule(dynamic)` clause.

In [None]:
%%writefile q3.c
#include <stdio.h>
#include <omp.h>

#define SIZE 1000

// Adds two SIZE-element arrays element-wise in parallel, handing iterations
// to threads with dynamic scheduling, then prints the first ten results.
int main() {
    int a[SIZE], b[SIZE], c[SIZE];
    int idx;

    // a counts up from 0 and b counts down from SIZE.
    for (idx = 0; idx < SIZE; idx++) {
        b[idx] = SIZE - idx;
        a[idx] = idx;
    }

    // The loop variable of a parallel for is private to each thread.
    #pragma omp parallel for schedule(dynamic)
    for (idx = 0; idx < SIZE; idx++) {
        c[idx] = a[idx] + b[idx];
    }

    printf("First 10 elements of array C:\n");
    for (idx = 0; idx < 10; idx++) {
        printf("c[%d] = %d\n", idx, c[idx]);
    }

    return 0;
}


### 4

Write an OpenMP program to add and multiply two arrays with two different threads (work sharing).

In [None]:
%%writefile q4.c

#include <stdio.h>
#include <omp.h>

#define SIZE 1000

// Computes the element-wise sum and the element-wise product of two arrays
// as two independent OpenMP sections, then prints element 0 of each result.
int main() {
    int a[SIZE], b[SIZE], sum[SIZE], product[SIZE];

    for (int k = 0; k < SIZE; k++) {
        b[k] = SIZE - k;
        a[k] = k;
    }

    #pragma omp parallel sections
    {
        // Section 1: addition.
        #pragma omp section
        {
            for (int k = 0; k < SIZE; k++) {
                sum[k] = a[k] + b[k];
            }
        }

        // Section 2: multiplication.
        #pragma omp section
        {
            for (int k = 0; k < SIZE; k++) {
                product[k] = a[k] * b[k];
            }
        }
    }

    printf("sum[0]=%d, product[0]=%d\n", sum[0], product[0]);
    return 0;
}


In [None]:
!gcc -fopenmp q4.c -o q4
!./q4

### 5
Write an OpenMP program to perform matrix multiplication.

In [None]:
%%writefile q5.c
#include <stdio.h>
#include <omp.h>

#define N 3

// Multiplies two N x N integer matrices with OpenMP and prints C[0][0].
// A[i][j] = i + j and B[i][j] = i - j; C starts zeroed and accumulates A * B.
int main() {
    int A[N][N], B[N][N], C[N][N] = {0};

    // Initialize matrices A and B
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            A[i][j] = i + j;
            B[i][j] = i - j;
        }
    }

    // collapse(2) distributes the combined (i, j) iteration space over the
    // threads. Every (i, j) pair is executed by exactly one thread, so the
    // accumulation into C[i][j] in the inner k loop needs no synchronization.
    #pragma omp parallel for collapse(2)
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            for (int k = 0; k < N; k++) {
                C[i][j] += A[i][k] * B[k][j];
            }
        }
    }

    // Print the element the label names (this used to print C[0][1] under
    // the "C[0][0]" label).
    printf("C[0][0] = %d\n", C[0][0]);

    return 0;
}

Writing q5.c


In [None]:
!gcc -fopenmp q5.c -o q5
!./q5

C[0][0] = 2


### 6

Write an OpenMP program to demonstrate the `firstprivate` clause.

In [None]:
%%writefile q6.c
#include <stdio.h>
#include <omp.h>

// Demonstrates firstprivate: every thread works on its own copy of x that
// starts with the value x had before the parallel region.
int main() {
    int x = 10; // Original variable; listed as firstprivate below

    printf("Initial value of x: %d\n", x);

    #pragma omp parallel firstprivate(x)
    {
        // Inside the region, x names the thread's private copy.
        const int tid = omp_get_thread_num();
        x = x + tid;
        printf("Thread %d: x = %d\n", tid, x);
    }

    // Outside the region, x names the original variable again.
    printf("Value of x after parallel region: %d\n", x);

    return 0;
}


Writing q6.c


### 7

Write an OpenMP program to add all the numbers in a vector by demonstrating the use of the reduction clause.


In [None]:
%%writefile q7.c
#include <stdio.h>
#include <omp.h>

#define SIZE 1000

// Sums the values 1..SIZE stored in a vector using an OpenMP reduction.
int main() {
    int vector[SIZE];
    int sum = 0;
    int pos;

    // Fill the vector with 1, 2, ..., SIZE.
    for (pos = 0; pos < SIZE; pos++) {
        vector[pos] = pos + 1;
    }

    // reduction(+:sum) gives each thread a private partial sum and adds the
    // partial sums into sum when the loop finishes.
    #pragma omp parallel for reduction(+:sum)
    for (pos = 0; pos < SIZE; pos++) {
        sum = sum + vector[pos];
    }

    printf("Sum of vector elements: %d\n", sum);
    return 0;
}

### ***Section - B***

### 1

Write a CUDA program to add 2 numbers.

In [None]:
%%writefile q1.cu
#include <stdio.h>

// Kernel: stores the sum of the integers pointed to by a and b into *c.
// Every launched thread performs the same single store, so the caller in
// this file launches it with one block of one thread.
__global__ void add(int *a, int *b, int *c) {
    const int lhs = a[0];
    const int rhs = b[0];
    c[0] = lhs + rhs;
}

// Evaluate a CUDA runtime call; on failure print the error to stderr and
// return 1 from the enclosing function (meant to be used inside main()).
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t status_ = (call);                                      \
        if (status_ != cudaSuccess) {                                      \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,         \
                    __LINE__, cudaGetErrorString(status_));                \
            return 1;                                                      \
        }                                                                  \
    } while (0)

// Adds two host integers on the GPU and prints the result.
// Returns 0 on success and 1 if any CUDA call fails.
int main() {
    int a = 5, b = 10, c;
    int *d_a, *d_b, *d_c;

    // Allocate memory on the GPU
    CUDA_CHECK(cudaMalloc((void **)&d_a, sizeof(int)));
    CUDA_CHECK(cudaMalloc((void **)&d_b, sizeof(int)));
    CUDA_CHECK(cudaMalloc((void **)&d_c, sizeof(int)));

    // Copy inputs to GPU
    CUDA_CHECK(cudaMemcpy(d_a, &a, sizeof(int), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, &b, sizeof(int), cudaMemcpyHostToDevice));

    // Launch kernel. A launch does not return a status itself, so ask the
    // runtime for launch errors explicitly.
    add<<<1, 1>>>(d_a, d_b, d_c);
    CUDA_CHECK(cudaGetLastError());

    // Copy result back to host. This blocking copy also reports errors that
    // occurred while the kernel was running.
    CUDA_CHECK(cudaMemcpy(&c, d_c, sizeof(int), cudaMemcpyDeviceToHost));

    printf("The sum of %d and %d is %d\n", a, b, c);

    // Free GPU memory
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));

    return 0;
}

Overwriting q1.cu


In [None]:
!nvcc -o q1 q1.cu
!./q1


The sum of 5 and 10 is 15


### 2

Write a CUDA program to perform vector addition.

In [None]:
%%writefile q2.cu
#include <stdio.h>

#define N 1000

// Kernel: c[i] = a[i] + b[i] for the first n elements.
// Expects a 1D launch; each thread handles the element at its global index
// and threads whose index falls past the end of the data do nothing.
__global__ void vector_add(int *a, int *b, int *c, int n) {
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= n) {
        return;
    }
    c[gid] = a[gid] + b[gid];
}

// Evaluate a CUDA runtime call; on failure print the error to stderr and
// return 1 from the enclosing function (meant to be used inside main()).
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t status_ = (call);                                      \
        if (status_ != cudaSuccess) {                                      \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,         \
                    __LINE__, cudaGetErrorString(status_));                \
            return 1;                                                      \
        }                                                                  \
    } while (0)

// Adds two N-element vectors on the GPU and prints the first ten results.
// Returns 0 on success and 1 if any CUDA call fails.
int main() {
    int a[N], b[N], c[N];
    int *d_a, *d_b, *d_c;
    const size_t bytes = N * sizeof(int);

    for (int i = 0; i < N; i++) {
        a[i] = i;
        b[i] = N - i;
    }

    CUDA_CHECK(cudaMalloc((void **)&d_a, bytes));
    CUDA_CHECK(cudaMalloc((void **)&d_b, bytes));
    CUDA_CHECK(cudaMalloc((void **)&d_c, bytes));

    CUDA_CHECK(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice));

    // Round the grid size up so that every element is covered even when N is
    // not a multiple of the block size.
    int block_size = 256;
    int grid_size = (N + block_size - 1) / block_size;
    vector_add<<<grid_size, block_size>>>(d_a, d_b, d_c, N);
    // A launch does not return a status itself; query launch errors here.
    CUDA_CHECK(cudaGetLastError());

    // This blocking copy also reports errors raised while the kernel ran.
    CUDA_CHECK(cudaMemcpy(c, d_c, bytes, cudaMemcpyDeviceToHost));

    printf("First 10 elements of the result:\n");
    for (int i = 0; i < 10; i++) {
        printf("c[%d] = %d\n", i, c[i]);
    }

    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));

    return 0;
}


Writing q2.cu


In [None]:
!nvcc -o q2 q2.cu
!./q2

First 10 elements of the result:
c[0] = 1000
c[1] = 1000
c[2] = 1000
c[3] = 1000
c[4] = 1000
c[5] = 1000
c[6] = 1000
c[7] = 1000
c[8] = 1000
c[9] = 1000


### 3

Write a CUDA program to perform matrix addition.

In [None]:
%%writefile q3.cu
#include <stdio.h>

#define N 3

// Kernel: element-wise sum of two n x n matrices stored row-major in flat
// arrays. Expects a 2D launch where x indexes columns and y indexes rows;
// threads outside the matrix do nothing.
__global__ void matrix_add(int *a, int *b, int *c, int n) {
    const int col = threadIdx.x + blockDim.x * blockIdx.x;
    const int row = threadIdx.y + blockDim.y * blockIdx.y;
    if (row >= n || col >= n) {
        return;
    }
    const int offset = row * n + col;
    c[offset] = a[offset] + b[offset];
}

// Evaluate a CUDA runtime call; on failure print the error to stderr and
// return 1 from the enclosing function (meant to be used inside main()).
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t status_ = (call);                                      \
        if (status_ != cudaSuccess) {                                      \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,         \
                    __LINE__, cudaGetErrorString(status_));                \
            return 1;                                                      \
        }                                                                  \
    } while (0)

// Prints an N x N matrix, one row per line, each entry followed by a space.
static void print_matrix(int m[N][N]) {
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            printf("%d ", m[i][j]);
        }
        printf("\n");
    }
}

// Builds two N x N matrices, adds them on the GPU and prints the inputs and
// the result. Returns 0 on success and 1 if any CUDA call fails.
int main() {
    int a[N][N], b[N][N], c[N][N], *d_a, *d_b, *d_c;
    const size_t size = N * N * sizeof(int);

    for (int i = 0; i < N; i++)
        for (int j = 0; j < N; j++) {
            a[i][j] = i + j;
            b[i][j] = i - j;
        }

    printf("Matrix A:\n");
    print_matrix(a);
    printf("Matrix B:\n");
    print_matrix(b);

    CUDA_CHECK(cudaMalloc(&d_a, size));
    CUDA_CHECK(cudaMalloc(&d_b, size));
    CUDA_CHECK(cudaMalloc(&d_c, size));

    // The host matrices are contiguous, so they can be copied as flat arrays.
    CUDA_CHECK(cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice));

    // 16 x 16 threads per block; the grid is rounded up to cover N x N.
    dim3 threadsPerBlock(16, 16);
    dim3 blocksPerGrid((N + 15) / 16, (N + 15) / 16);
    matrix_add<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, N);
    // A launch does not return a status itself; query launch errors here.
    CUDA_CHECK(cudaGetLastError());

    // This blocking copy also reports errors raised while the kernel ran.
    CUDA_CHECK(cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost));

    printf("Result matrix:\n");
    print_matrix(c);

    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));

    return 0;
}


Overwriting q3.cu


In [None]:
!nvcc -o q3 q3.cu
!./q3

Matrix A:
0 1 2 
1 2 3 
2 3 4 
Matrix B:
0 -1 -2 
1 0 -1 
2 1 0 
Result matrix:
0 0 0 
2 2 2 
4 4 4 


### 4

Examiner’s choice program.

In [None]:
#let god save us


### 5

Write a CUDA program to perform matrix multiplication.

In [None]:
%%writefile q5.cu
#include <stdio.h>

#define N 3

// Kernel: c = a * b for n x n matrices stored row-major in flat arrays.
// Expects a 2D launch where x indexes columns and y indexes rows; each
// in-range thread computes one output element, the others do nothing.
__global__ void matrix_multiply(int *a, int *b, int *c, int n) {
    const int col = threadIdx.x + blockDim.x * blockIdx.x;
    const int row = threadIdx.y + blockDim.y * blockIdx.y;
    if (row >= n || col >= n) {
        return;
    }

    // Dot product of row `row` of a with column `col` of b.
    int acc = 0;
    for (int k = 0; k < n; k++) {
        acc += a[row * n + k] * b[k * n + col];
    }
    c[row * n + col] = acc;
}

// Evaluate a CUDA runtime call; on failure print the error to stderr and
// return 1 from the enclosing function (meant to be used inside main()).
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t status_ = (call);                                      \
        if (status_ != cudaSuccess) {                                      \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,         \
                    __LINE__, cudaGetErrorString(status_));                \
            return 1;                                                      \
        }                                                                  \
    } while (0)

// Builds two N x N matrices, multiplies them on the GPU and prints the
// product. Returns 0 on success and 1 if any CUDA call fails.
int main() {
    int a[N][N], b[N][N], c[N][N], *d_a, *d_b, *d_c;
    const size_t size = N * N * sizeof(int);

    for (int i = 0; i < N; i++)
        for (int j = 0; j < N; j++) {
            a[i][j] = i + j;
            b[i][j] = i - j;
        }

    CUDA_CHECK(cudaMalloc(&d_a, size));
    CUDA_CHECK(cudaMalloc(&d_b, size));
    CUDA_CHECK(cudaMalloc(&d_c, size));

    // The host matrices are contiguous, so they can be copied as flat arrays.
    CUDA_CHECK(cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice));

    // 16 x 16 threads per block; the grid is rounded up to cover N x N.
    dim3 threadsPerBlock(16, 16);
    dim3 blocksPerGrid((N + 15) / 16, (N + 15) / 16);
    matrix_multiply<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, N);
    // A launch does not return a status itself; query launch errors here.
    CUDA_CHECK(cudaGetLastError());

    // This blocking copy also reports errors raised while the kernel ran.
    CUDA_CHECK(cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost));

    printf("Result matrix:\n");
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            printf("%d ", c[i][j]);
        }
        printf("\n");
    }

    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));

    return 0;
}


Writing q5.cu


In [None]:
!nvcc -o q5 q5.cu
!./q5

Result matrix:
5 2 -1 
8 2 -4 
11 2 -7 


### 6

Write a CUDA program to perform the following operations:
- Take 2 matrices A, B
- Find the transpose TA, TB
- Perform C = A*B, TC = TA*TB
- Verify whether C and TC are equal or not.


In [None]:
%%writefile q6.cu

#include <stdio.h>

#define N 10

// Kernel: c = a * b for n x n matrices stored row-major in flat arrays.
// Expects a 2D launch where x indexes columns and y indexes rows; each
// in-range thread computes one output element, the others do nothing.
__global__ void matrix_multiply(int *a, int *b, int *c, int n) {
    const int out_col = blockDim.x * blockIdx.x + threadIdx.x;
    const int out_row = blockDim.y * blockIdx.y + threadIdx.y;
    if (out_row >= n || out_col >= n) {
        return;
    }

    // Walk along row out_row of a and down column out_col of b.
    int total = 0;
    int k = 0;
    while (k < n) {
        total += a[out_row * n + k] * b[k * n + out_col];
        k++;
    }
    c[out_row * n + out_col] = total;
}

// Kernel: writes the transpose of the n x n row-major matrix `input` into
// `output`. Expects a 2D launch where x indexes columns and y indexes rows;
// threads outside the matrix do nothing.
__global__ void matrix_transpose(int *input, int *output, int n) {
    const int src_col = threadIdx.x + blockDim.x * blockIdx.x;
    const int src_row = threadIdx.y + blockDim.y * blockIdx.y;
    if (src_row >= n || src_col >= n) {
        return;
    }
    // Element (row, col) of the input lands at (col, row) of the output.
    const int value = input[src_row * n + src_col];
    output[src_col * n + src_row] = value;
}

// Kernel: clears *result to 0 if any of the first n * n elements of a and b
// differ. It never sets *result, so the caller must initialize it to a
// non-zero value before the launch. Expects a 1D launch over the flat
// element index.
__global__ void matrix_compare(int *a, int *b, int *result, int n) {
    const int flat = threadIdx.x + blockDim.x * blockIdx.x;
    if (flat >= n * n) {
        return;
    }
    // Several threads may store here, but they all store the same value.
    if (a[flat] != b[flat]) {
        *result = 0;
    }
}

// Evaluate a CUDA runtime call; on failure print the error to stderr and
// return 1 from the enclosing function (meant to be used inside main()).
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t status_ = (call);                                      \
        if (status_ != cudaSuccess) {                                      \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,         \
                    __LINE__, cudaGetErrorString(status_));                \
            return 1;                                                      \
        }                                                                  \
    } while (0)

// Prints an N x N matrix, one row per line, each entry followed by a space.
static void print_matrix(int m[N][N]) {
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            printf("%d ", m[i][j]);
        }
        printf("\n");
    }
}

// Computes C = A * B and TC = TA * TB on the GPU (TA, TB being the
// transposes of A and B), prints both products and reports whether they are
// element-wise equal. Returns 0 on success and 1 if any CUDA call fails.
int main() {
    // The transposes only ever live on the device, so there are no host
    // copies of them (the unused ta/tb host arrays were removed).
    int a[N][N], b[N][N], c[N][N], tc[N][N];
    int *d_a, *d_b, *d_c, *d_ta, *d_tb, *d_tc;
    const size_t size = N * N * sizeof(int);
    // Starts at 1 ("equal"); the compare kernel only ever clears it to 0.
    int result = 1, *d_result;

    for (int i = 0; i < N; i++)
        for (int j = 0; j < N; j++) {
            a[i][j] = i + 1;
            b[i][j] = j + 1;
        }

    CUDA_CHECK(cudaMalloc(&d_a, size));
    CUDA_CHECK(cudaMalloc(&d_b, size));
    CUDA_CHECK(cudaMalloc(&d_c, size));
    CUDA_CHECK(cudaMalloc(&d_ta, size));
    CUDA_CHECK(cudaMalloc(&d_tb, size));
    CUDA_CHECK(cudaMalloc(&d_tc, size));
    CUDA_CHECK(cudaMalloc(&d_result, sizeof(int)));

    CUDA_CHECK(cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_result, &result, sizeof(int), cudaMemcpyHostToDevice));

    // 16 x 16 threads per block; the grid is rounded up to cover N x N.
    dim3 threadsPerBlock(16, 16);
    dim3 blocksPerGrid((N + 15) / 16, (N + 15) / 16);

    // A launch does not return a status itself, so launch errors are queried
    // with cudaGetLastError() after each kernel.

    // Transpose A and B
    matrix_transpose<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_ta, N);
    CUDA_CHECK(cudaGetLastError());
    matrix_transpose<<<blocksPerGrid, threadsPerBlock>>>(d_b, d_tb, N);
    CUDA_CHECK(cudaGetLastError());

    // C = A * B
    matrix_multiply<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, N);
    CUDA_CHECK(cudaGetLastError());

    // TC = TA * TB
    matrix_multiply<<<blocksPerGrid, threadsPerBlock>>>(d_ta, d_tb, d_tc, N);
    CUDA_CHECK(cudaGetLastError());

    // Compare C and TC (1D launch over the N * N flat elements)
    matrix_compare<<<(N * N + 255) / 256, 256>>>(d_c, d_tc, d_result, N);
    CUDA_CHECK(cudaGetLastError());

    // These blocking copies also report errors raised while the kernels ran.
    CUDA_CHECK(cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(tc, d_tc, size, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(&result, d_result, sizeof(int), cudaMemcpyDeviceToHost));

    printf("Matrix C:\n");
    print_matrix(c);

    printf("Matrix TC:\n");
    print_matrix(tc);

    printf("C and TC are %sequal.\n", result ? "" : "not ");

    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));
    CUDA_CHECK(cudaFree(d_ta));
    CUDA_CHECK(cudaFree(d_tb));
    CUDA_CHECK(cudaFree(d_tc));
    CUDA_CHECK(cudaFree(d_result));

    return 0;
}


Writing q6.cu


In [None]:
!nvcc -o q6 q6.cu
!./q6

      int a[10][10], b[10][10], c[10][10], ta[10][10], tb[10][10], tc[10][10];
                                           ^


      int a[10][10], b[10][10], c[10][10], ta[10][10], tb[10][10], tc[10][10];
                                                       ^

Matrix C:
10 20 30 40 50 60 70 80 90 100 
20 40 60 80 100 120 140 160 180 200 
30 60 90 120 150 180 210 240 270 300 
40 80 120 160 200 240 280 320 360 400 
50 100 150 200 250 300 350 400 450 500 
60 120 180 240 300 360 420 480 540 600 
70 140 210 280 350 420 490 560 630 700 
80 160 240 320 400 480 560 640 720 800 
90 180 270 360 450 540 630 720 810 900 
100 200 300 400 500 600 700 800 900 1000 
Matrix TC:
385 385 385 385 385 385 385 385 385 385 
385 385 385 385 385 385 385 385 385 385 
385 385 385 385 385 385 385 385 385 385 
385 385 385 385 385 385 385 385 385 385 
385 385 385 385 385 385 385 385 385 385 
385 385 385 385 385 385 385 385 385 385 
385 385 385 385 385 385 385 385 385 385 
385 385 385 385 385 385 385 385 385 385 


### 7

Write a CUDA program to perform dot product on two vectors.

In [None]:
%%writefile q7.cu

#include <stdio.h>

#define N 1000

// Kernel: adds the dot product of the first N elements of a and b to *result.
//
// Launch layout: 1D grid of 1D blocks with at most 1024 threads per block
// (the size of the static shared buffer). Each thread multiplies one element
// pair, the block reduces its products in shared memory, and thread 0 of
// each block adds the block total to *result with a single atomicAdd.
// The kernel only adds to *result, so the caller must zero it beforehand.
__global__ void vector_dot_product(int *a, int *b, int *result) {
    __shared__ int temp[1024];
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int thread_id = threadIdx.x;

    // Threads past the end of the data contribute 0, so the reduction below
    // can run over the whole block without further bounds checks.
    temp[thread_id] = (idx < N) ? a[idx] * b[idx] : 0;
    __syncthreads();

    // Tree reduction in shared memory. Each round folds the upper part of the
    // active range [0, active) onto the lower part, which also works when the
    // block size is not a power of two. `active` depends only on blockDim.x,
    // so every thread runs the same number of rounds and reaches every
    // __syncthreads().
    for (unsigned int active = blockDim.x; active > 1;) {
        unsigned int half = (active + 1) / 2;
        // Reads come from [half, active) and writes go to [0, active - half),
        // so no slot is read and written in the same round.
        if (thread_id + half < active) {
            temp[thread_id] += temp[thread_id + half];
        }
        __syncthreads();
        active = half;
    }

    // One atomic per block combines the block totals.
    if (thread_id == 0) {
        atomicAdd(result, temp[0]);
    }
}

// Evaluate a CUDA runtime call; on failure print the error to stderr and
// return 1 from the enclosing function (meant to be used inside main()).
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t status_ = (call);                                      \
        if (status_ != cudaSuccess) {                                      \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,         \
                    __LINE__, cudaGetErrorString(status_));                \
            return 1;                                                      \
        }                                                                  \
    } while (0)

// Computes the dot product of two N-element vectors on the GPU and prints it.
// Returns 0 on success and 1 if any CUDA call fails.
int main() {
    int a[N], b[N], result = 0;
    int *d_a, *d_b, *d_result;
    const size_t bytes = N * sizeof(int);

    for (int i = 0; i < N; i++) {
        a[i] = i + 1;
        b[i] = N - i;
    }

    CUDA_CHECK(cudaMalloc(&d_a, bytes));
    CUDA_CHECK(cudaMalloc(&d_b, bytes));
    CUDA_CHECK(cudaMalloc(&d_result, sizeof(int)));
    CUDA_CHECK(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice));
    // The kernel accumulates into *d_result, so it must start at 0.
    CUDA_CHECK(cudaMemcpy(d_result, &result, sizeof(int), cudaMemcpyHostToDevice));

    // Round the grid size up so that every element is covered.
    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    vector_dot_product<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_result);
    // A launch does not return a status itself; query launch errors here.
    CUDA_CHECK(cudaGetLastError());

    // This blocking copy also reports errors raised while the kernel ran.
    CUDA_CHECK(cudaMemcpy(&result, d_result, sizeof(int), cudaMemcpyDeviceToHost));

    printf("Dot product: %d\n", result);

    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_result));
    return 0;
}


Overwriting q7.cu


In [None]:
!nvcc -o q7 q7.cu
!./q7

Dot product: 167167000
