In [None]:
!pip install git+https://github.com/afnan47/cuda.git
%load_ext nvcc_plugin

Collecting git+https://github.com/afnan47/cuda.git
  Cloning https://github.com/afnan47/cuda.git to /tmp/pip-req-build-n99cvtoo
  Running command git clone --filter=blob:none --quiet https://github.com/afnan47/cuda.git /tmp/pip-req-build-n99cvtoo
  Resolved https://github.com/afnan47/cuda.git to commit aac710a35f52bb78ab34d2e52517237941399eff
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... done
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4289 sha256=c1535c7e1abb49ec2b0856641c42870aaec4ab6c1b14c5f7efa9d58bffef5ed3
  Stored in directory: /tmp/pip-ephem-wheel-cache-7qp28wfh/wheels/aa/f3/44/e10c1d226ec561d971fcd4b0463f6bff08602afa928a3e7bc7
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2
created output directory at /content/src
Out bin /content/result.out


In [None]:
%%cu
#include <iostream>
#include <cuda.h>
#include <cuda_runtime.h>
using namespace std;

#define N 10 // Size of vectors

// Element-wise vector sum kernel.
// Each thread derives one global index from its block and thread ids and,
// when that index is below N, stores a[idx] + b[idx] into c[idx].
// Threads whose index is N or greater do nothing.
__global__ void add(int *a, int *b, int *c) {
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;

    // Guard clause: surplus threads past the end of the vectors exit early.
    if (idx >= N) {
        return;
    }

    c[idx] = a[idx] + b[idx];
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaSucceeded(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << "\nCUDA error (" << what << "): "
              << cudaGetErrorString(status) << std::endl;
    return false;
}

// Adds two N-element vectors with the `add` kernel and prints the operands
// and the element-wise sums.
// Returns 0 on success and 1 if any CUDA call or the kernel launch failed.
int main() {

    // Host arrays have automatic storage: they are released when main returns
    // and must never be handed to delete[].
    int host_a[N], host_b[N], host_c[N];

    // Device pointers start out null so the unconditional cudaFree calls at
    // the end are safe even when an allocation never happened.
    int *device_a = NULL, *device_b = NULL, *device_c = NULL;

    const size_t size = N * sizeof(int);

    // Initialize host vectors
    for (int i = 0; i < N; i++) {
        host_a[i] = (i + 1);
        host_b[i] = (i + 1) * (i + 1);
    }

    // Display vectors
    std::cout << "\nVector A: ";
    for (int i = 0; i < N; i++) {
        std::cout << host_a[i] << ", ";
    }
    std::cout << "\nVector B: ";
    for (int i = 0; i < N; i++) {
        std::cout << host_b[i] << ", ";
    }

    // Allocate device memory and upload the inputs; && short-circuits, so the
    // first failure stops the chain and is the one that gets reported.
    bool ok =
        cudaSucceeded(cudaMalloc(&device_a, size), "cudaMalloc device_a") &&
        cudaSucceeded(cudaMalloc(&device_b, size), "cudaMalloc device_b") &&
        cudaSucceeded(cudaMalloc(&device_c, size), "cudaMalloc device_c") &&
        cudaSucceeded(cudaMemcpy(device_a, host_a, size, cudaMemcpyHostToDevice),
                      "copy host_a to device") &&
        cudaSucceeded(cudaMemcpy(device_b, host_b, size, cudaMemcpyHostToDevice),
                      "copy host_b to device");

    if (ok) {
        // Ceil-div launch configuration; the kernel's own bounds check
        // discards the threads beyond N.
        const int threadsPerBlock = 256;
        const int blocks = (N + threadsPerBlock - 1) / threadsPerBlock;
        add<<<blocks, threadsPerBlock>>>(device_a, device_b, device_c);

        // A launch does not return a status itself: query it explicitly.
        // The blocking copy that follows reports errors raised while the
        // kernel was executing.
        ok = cudaSucceeded(cudaGetLastError(), "add kernel launch") &&
             cudaSucceeded(cudaMemcpy(host_c, device_c, size, cudaMemcpyDeviceToHost),
                           "copy result to host");
    }

    // Display result only when host_c actually holds values computed on the GPU
    if (ok) {
        std::cout << "\nVector addition on GPU: " << std::endl;
        for (int i = 0; i < N; i++) {
            std::cout << host_a[i] << " + " << host_b[i] << " = " << host_c[i] << std::endl;
        }
    }

    // Free device memory (cudaFree on a null pointer performs no operation)
    cudaFree(device_a);
    cudaFree(device_b);
    cudaFree(device_c);

    return ok ? 0 : 1;
}



Vector A: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 
Vector B: 1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 
Vector addition on GPU: 
1 + 1 = 0
2 + 4 = 0
3 + 9 = 0
4 + 16 = 0
5 + 25 = 0
6 + 36 = 0
7 + 49 = 0
8 + 64 = 0
9 + 81 = 0
10 + 100 = 0
double free or corruption (out)



In [None]:
%%cu
#include <iostream>
#include <cuda_runtime.h>
using namespace std;

const int N = 4; // Matrix size (NxN)

// CUDA kernel computing c = a * b for n x n int matrices indexed row-major
// (element (r, k) is read at offset r * n + k).
// Each thread maps its 2D block/thread ids to one (row, col) pair and writes
// that single output element; threads falling outside the n x n range exit
// without touching memory.
__global__ void matrixMul(int *a, int *b, int *c, int n) {
    const int col = threadIdx.x + blockDim.x * blockIdx.x;
    const int row = threadIdx.y + blockDim.y * blockIdx.y;

    if (row >= n || col >= n) {
        return;
    }

    // Dot product of row `row` of a with column `col` of b.
    const int *aRow = a + row * n;
    int acc = 0;
    for (int k = 0; k < n; ++k) {
        acc += aRow[k] * b[k * n + col];
    }

    c[row * n + col] = acc;
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaCallOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << "\nCUDA error (" << what << "): "
              << cudaGetErrorString(status) << std::endl;
    return false;
}

// Prints an n x n row-major matrix, one row per line, with a space after
// every element.
static void printMatrix(const int *m, int n) {
    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < n; ++j) {
            std::cout << m[i * n + j] << " ";
        }
        std::cout << "\n";
    }
}

// Multiplies two N x N matrices filled with 2s using the matrixMul kernel and
// prints both inputs and the product.
// Returns 0 on success and 1 if a host allocation, a CUDA call, or the kernel
// launch failed.
int main() {
    const size_t size = static_cast<size_t>(N) * N * sizeof(int);

    // Allocate memory on host
    int *h_a = (int*)malloc(size);
    int *h_b = (int*)malloc(size);
    int *h_c = (int*)malloc(size);

    // Device pointers start out null so the unconditional cudaFree calls at
    // the end are safe even when an allocation never happened.
    int *d_a = NULL, *d_b = NULL, *d_c = NULL;

    if (h_a == NULL || h_b == NULL || h_c == NULL) {
        std::cerr << "Host allocation failed" << std::endl;
        free(h_a);  // free(NULL) is a no-op, so partial success is handled too
        free(h_b);
        free(h_c);
        return 1;
    }

    // Initialize matrices on host
    for (int i = 0; i < N * N; ++i) {
        h_a[i] = 2;
        h_b[i] = 2;
    }

    // print the matrix
    std::cout << "Matrix Multiplication using CUDA: " << std::endl;
    std::cout << "\nMatrix A: " << std::endl;
    printMatrix(h_a, N);

    std::cout << "\nMatrix B: " << std::endl;
    printMatrix(h_b, N);

    // Allocate device memory and upload the inputs; && short-circuits, so the
    // first failure stops the chain and is the one that gets reported.
    bool ok =
        cudaCallOk(cudaMalloc(&d_a, size), "cudaMalloc d_a") &&
        cudaCallOk(cudaMalloc(&d_b, size), "cudaMalloc d_b") &&
        cudaCallOk(cudaMalloc(&d_c, size), "cudaMalloc d_c") &&
        cudaCallOk(cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice),
                   "copy h_a to device") &&
        cudaCallOk(cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice),
                   "copy h_b to device");

    if (ok) {
        // Define grid and block dimensions (ceil-div so every element is covered)
        dim3 threadsPerBlock(16, 16);
        dim3 numBlocks((N + threadsPerBlock.x - 1) / threadsPerBlock.x,
                       (N + threadsPerBlock.y - 1) / threadsPerBlock.y);

        // Launch kernel
        matrixMul<<<numBlocks, threadsPerBlock>>>(d_a, d_b, d_c, N);

        // A launch does not return a status itself: query it explicitly.
        // The blocking copy that follows reports errors raised while the
        // kernel was executing.
        ok = cudaCallOk(cudaGetLastError(), "matrixMul kernel launch") &&
             cudaCallOk(cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost),
                        "copy result to host");
    }

    // Print result only when h_c actually holds values computed on the GPU
    if (ok) {
        std::cout << "\nResult : " << std::endl;
        printMatrix(h_c, N);
    }

    // Free host memory
    free(h_a);
    free(h_b);
    free(h_c);

    // Free device memory (cudaFree on a null pointer performs no operation)
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    return ok ? 0 : 1;
}


Matrix Multiplication using CUDA: 

Matrix A: 
2 2 2 2 
2 2 2 2 
2 2 2 2 
2 2 2 2 

Matrix B: 
2 2 2 2 
2 2 2 2 
2 2 2 2 
2 2 2 2 

Result : 
0 0 0 0 
0 0 0 0 
0 0 0 0 
0 0 0 0 

