In [1]:
!nvidia-smi



Tue May  6 20:19:36 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   69C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
# Write the CUDA code to a .cu file

%%writefile vector_add.cu

// Filename: vector_add.cu

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

#define N 1000000  // 1 million elements

// CUDA kernel for element-wise vector addition: c[i] = a[i] + b[i] for 0 <= i < n.
//
// Parameters:
//   a, b - input vectors, only read; each must hold at least n elements
//   c    - output vector; must hold at least n elements
//   n    - number of elements to add; nothing is written when n <= 0
//
// Launch layout: 1D grid of 1D blocks (only the .x dimensions are used).
// Each thread starts at its global index and then advances by the total number
// of launched threads, so the result is complete for any grid size, not only
// for launches that provide at least n threads. Uses no shared memory.
__global__ void vectorAdd(const int* a, const int* b, int* c, int n) {
    // 64-bit arithmetic so neither the start index nor the stride can wrap.
    const long long stride = (long long)gridDim.x * blockDim.x;
    long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x;

    for (; idx < n; idx += stride) {
        c[idx] = a[idx] + b[idx];
    }
}

// Populate the first n entries of arr with pseudo-random values in [0, 99].
// Values come from rand(), so the sequence depends on the current srand() seed.
// Nothing is written when n <= 0.
void fillArray(int* arr, int n) {
    int* cursor = arr;
    int remaining = n;
    while (remaining > 0) {
        *cursor = rand() % 100;
        ++cursor;
        --remaining;
    }
}

// Abort with a readable message when a CUDA runtime call did not succeed.
// `what` names the operation that produced `status`.
static void checkCuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what,
                cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Adds two N-element vectors of random integers on the GPU and prints the
// first few sums. Returns 0 on success; exits with EXIT_FAILURE when a host
// allocation or any CUDA call fails.
int main() {
    // size_t, not int: N * sizeof(int) must not be truncated for large N.
    const size_t size = (size_t)N * sizeof(int);

    // Allocate memory on host
    int *h_a = (int*)malloc(size);
    int *h_b = (int*)malloc(size);
    int *h_c = (int*)malloc(size);
    if (h_a == NULL || h_b == NULL || h_c == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", size);
        free(h_a); free(h_b); free(h_c);
        return EXIT_FAILURE;
    }

    // Fill arrays with random data
    fillArray(h_a, N);
    fillArray(h_b, N);

    // Allocate memory on device
    int *d_a = NULL, *d_b = NULL, *d_c = NULL;
    checkCuda(cudaMalloc((void**)&d_a, size), "cudaMalloc(d_a)");
    checkCuda(cudaMalloc((void**)&d_b, size), "cudaMalloc(d_b)");
    checkCuda(cudaMalloc((void**)&d_c, size), "cudaMalloc(d_c)");

    // Copy input vectors to device
    checkCuda(cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice), "copy of a to device");
    checkCuda(cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice), "copy of b to device");

    // Launch kernel: one thread per element, rounded up to whole blocks
    const int threadsPerBlock = 256;
    const int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, N);

    // A launch does not return a status itself; a bad configuration is
    // reported through cudaGetLastError().
    checkCuda(cudaGetLastError(), "vectorAdd launch");

    // Wait for kernel to finish; errors raised while it runs surface here
    checkCuda(cudaDeviceSynchronize(), "vectorAdd execution");

    // Copy result back to host
    checkCuda(cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost), "copy of c to host");

    // Print first 10 results (fewer if N is smaller than 10)
    printf("Vector Addition Result (first 10 elements):\n");
    for (int i = 0; i < 10 && i < N; i++) {
        printf("%d + %d = %d\n", h_a[i], h_b[i], h_c[i]);
    }

    // Free memory - host buffers first, then device buffers
    free(h_a); free(h_b); free(h_c);
    checkCuda(cudaFree(d_a), "cudaFree(d_a)");
    checkCuda(cudaFree(d_b), "cudaFree(d_b)");
    checkCuda(cudaFree(d_c), "cudaFree(d_c)");

    return 0;
}





Writing vector_add.cu


In [3]:
!nvcc -arch=sm_75 vector_add.cu -o vector_add



In [4]:
!./vector_add


Vector Addition Result (first 10 elements):
83 + 89 = 172
86 + 63 = 149
77 + 84 = 161
15 + 93 = 108
93 + 81 = 174
35 + 55 = 90
86 + 6 = 92
92 + 93 = 185
49 + 61 = 110
21 + 50 = 71


In [5]:

%%writefile matrix_mul.cu

// Filename: matrix_mul.cu

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

#define N 4  // You can increase this to 512 or 1024 for bigger matrices

// CUDA kernel for square matrix multiplication: c = a * b.
//
// All three matrices are width x width, stored row-major in flat arrays
// (element (r, col) lives at index r * width + col).
//
// Parameters:
//   a, b  - input matrices, only read
//   c     - output matrix
//   width - number of rows/columns; nothing is written when width <= 0
//
// Launch layout: 2D grid of 2D blocks; x indexes columns, y indexes rows.
// Each thread computes one element of c, so the grid must cover at least
// width x width threads. Threads outside the matrix return immediately.
// Uses no shared memory.
__global__ void matrixMulKernel(const int* a, const int* b, int* c, int width) {
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row >= width || col >= width) {
        return;
    }

    // Offsets are formed in size_t so row * width cannot overflow a 32-bit int
    // for large matrices. width is known to be positive past the guard above.
    const size_t w = (size_t)width;
    const int* aRow = a + (size_t)row * w;

    int sum = 0;
    for (int k = 0; k < width; ++k) {
        sum += aRow[k] * b[(size_t)k * w + col];
    }
    c[(size_t)row * w + col] = sum;
}

// Populate a width x width matrix (flat array of width * width ints) with
// pseudo-random digits 0-9 drawn from rand().
void fillMatrix(int* mat, int width) {
    const int total = width * width;
    int* cell = mat;
    for (int left = total; left > 0; --left) {
        *cell++ = rand() % 10; // random value in 0-9
    }
}

// Write a width x width row-major matrix to stdout (for verification).
// Each value is printed right-aligned in a 4-character field followed by a
// space, one matrix row per line, with a blank line after the last row.
void printMatrix(int* mat, int width) {
    int r = 0;
    while (r < width) {
        const int* rowStart = mat + r * width;
        int c = 0;
        while (c < width) {
            printf("%4d ", rowStart[c]);
            ++c;
        }
        printf("\n");
        ++r;
    }
    printf("\n");
}

// Abort with a readable message when a CUDA runtime call did not succeed.
// `what` names the operation that produced `status`.
static void checkCuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what,
                cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Multiplies two N x N matrices of random digits on the GPU and prints both
// inputs and the product. Returns 0 on success; exits with EXIT_FAILURE when
// a host allocation or any CUDA call fails.
int main() {
    // size_t, not int: N * N * sizeof(int) must not be truncated for large N.
    const size_t size = (size_t)N * N * sizeof(int);

    // Allocate memory on host
    int *h_a = (int*)malloc(size);
    int *h_b = (int*)malloc(size);
    int *h_c = (int*)malloc(size);
    if (h_a == NULL || h_b == NULL || h_c == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", size);
        free(h_a); free(h_b); free(h_c);
        return EXIT_FAILURE;
    }

    // Fill host matrices with random values
    fillMatrix(h_a, N);
    fillMatrix(h_b, N);

    // Allocate memory on device
    int *d_a = NULL, *d_b = NULL, *d_c = NULL;
    checkCuda(cudaMalloc((void**)&d_a, size), "cudaMalloc(d_a)");
    checkCuda(cudaMalloc((void**)&d_b, size), "cudaMalloc(d_b)");
    checkCuda(cudaMalloc((void**)&d_c, size), "cudaMalloc(d_c)");

    // Copy host matrices to device
    checkCuda(cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice), "copy of A to device");
    checkCuda(cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice), "copy of B to device");

    // Define grid and block dimensions.
    // 16 x 16 = 256 threads per block (a multiple of the 32-thread warp size);
    // the grid is rounded up so it covers the matrix for any N, and the kernel
    // discards threads that fall outside it.
    const dim3 dimBlock(16, 16);
    const dim3 dimGrid((N + dimBlock.x - 1) / dimBlock.x,
                       (N + dimBlock.y - 1) / dimBlock.y);

    // Launch kernel
    matrixMulKernel<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);

    // A launch does not return a status itself; a bad configuration is
    // reported through cudaGetLastError().
    checkCuda(cudaGetLastError(), "matrixMulKernel launch");

    // Wait for the kernel; errors raised while it runs surface here
    checkCuda(cudaDeviceSynchronize(), "matrixMulKernel execution");

    // Copy result back to host
    checkCuda(cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost), "copy of C to host");

    // Print results (for verification)
    printf("Matrix A:\n"); printMatrix(h_a, N);
    printf("Matrix B:\n"); printMatrix(h_b, N);
    printf("Matrix C = A * B:\n"); printMatrix(h_c, N);

    // Free memory - host buffers first, then device buffers
    free(h_a); free(h_b); free(h_c);
    checkCuda(cudaFree(d_a), "cudaFree(d_a)");
    checkCuda(cudaFree(d_b), "cudaFree(d_b)");
    checkCuda(cudaFree(d_c), "cudaFree(d_c)");

    return 0;
}




Writing matrix_mul.cu


In [6]:
!nvcc -arch=sm_75 matrix_mul.cu -o matrix_mul

In [7]:
!./matrix_mul

Matrix A:
   3    6    7    5 
   3    5    6    2 
   9    1    2    7 
   0    9    3    6 

Matrix B:
   0    6    2    6 
   1    8    7    9 
   2    0    2    3 
   7    5    9    2 

Matrix C = A * B:
  55   91  107  103 
  31   68   71   85 
  54   97   92   83 
  57  102  123  102 

