<a href="https://colab.research.google.com/github/Anamikacoder/Anamikacoder/blob/main/assignmnetcuda.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#code1: print the block index, thread index and global ID of every launched thread
%%writefile thread.cu
#include<stdio.h>
#include <cuda.h>
/*
 * Debug kernel: every launched thread prints its block index, its index
 * inside the block, and the flat global ID derived from the two.
 *
 * Only the x components of blockIdx / threadIdx / blockDim are used.
 * Takes no arguments and writes nothing besides the device-side printf.
 */
__global__ void printThreadIDs() {
    const int block = blockIdx.x;
    const int local = threadIdx.x;

    // Flat ID = threads in all preceding blocks + position inside this block.
    const int flat = local + blockDim.x * block;

    printf("blockIdx.x: %d, threadIdx.x: %d, globalID: %d\n", block, local, flat);
}

/*
 * Launches printThreadIDs on 2 blocks of 4 threads and waits for the device
 * so that the kernel's printf output is flushed before the process exits.
 *
 * Returns 0 on success. Returns 1 (after printing the CUDA error string to
 * stderr) if the launch is rejected or the kernel fails while running;
 * previously such failures were silent and the program still returned 0.
 */
int main() {
    printThreadIDs<<<2, 4>>>();

    // A kernel launch returns nothing; launch-time errors (bad configuration,
    // no kernel image for this GPU) are only visible via cudaGetLastError().
    cudaError_t status = cudaGetLastError();

    // Errors raised while the kernel executes are reported by the next
    // synchronizing call, which is also what flushes the device printf buffer.
    if (status == cudaSuccess) {
        status = cudaDeviceSynchronize();
    }

    if (status != cudaSuccess) {
        fprintf(stderr, "printThreadIDs failed: %s\n", cudaGetErrorString(status));
        return 1;
    }
    return 0;
}


Overwriting thread.cu


In [None]:
!nvcc -arch=sm_75 thread.cu -o thread

In [None]:
!./thread

blockIdx.x: 0, threadIdx.x: 0, globalID: 0
blockIdx.x: 0, threadIdx.x: 1, globalID: 1
blockIdx.x: 0, threadIdx.x: 2, globalID: 2
blockIdx.x: 0, threadIdx.x: 3, globalID: 3
blockIdx.x: 1, threadIdx.x: 0, globalID: 4
blockIdx.x: 1, threadIdx.x: 1, globalID: 5
blockIdx.x: 1, threadIdx.x: 2, globalID: 6
blockIdx.x: 1, threadIdx.x: 3, globalID: 7


In [41]:
#code2: element-wise vector addition C = A + B (written to vectorsqr.cu)
%%writefile vectorsqr.cu
#include <stdio.h>
#include <cuda.h>

/*
 * Element-wise integer vector addition: C[k] = A[k] + B[k] for k in [0, N).
 *
 * Each thread handles at most one element, selected by its flat index along
 * the x dimension of the launch; threads whose index is >= N do nothing.
 *
 * A, B : input arrays, read at indices below N
 * C    : output array, written at indices below N
 * N    : number of elements to process
 */
__global__ void add(int *A, int *B, int *C, int N) {
    const int element = threadIdx.x + blockDim.x * blockIdx.x;

    // Surplus threads in the last block fall outside the arrays.
    if (element >= N) {
        return;
    }

    C[element] = A[element] + B[element];
}

/*
 * Host driver for the add kernel.
 *
 * Fills A[i] = i + 1 and B[i] = N * (i + 1) for N = 1000 elements, copies
 * both to the device, launches add with 256 threads per block, copies the
 * result back and prints the first five sums.
 *
 * Returns 0 on success. Returns 1, after printing a diagnostic to stderr,
 * if a host allocation or any CUDA call fails. All buffers that were
 * allocated are released on every path.
 */
int main() {
    const int N = 1000;
    const size_t size = N * sizeof(int);
    int exitCode = 1;

    int *h_A = (int *)malloc(size);
    int *h_B = (int *)malloc(size);
    int *h_C = (int *)malloc(size);

    // Start as NULL so the shared cleanup below is safe even when an
    // allocation was never reached (cudaFree(NULL) is a no-op).
    int *d_A = NULL, *d_B = NULL, *d_C = NULL;

    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        fprintf(stderr, "host allocation of %zu bytes failed\n", size);
    } else {
        for (int i = 0; i < N; i++) {
            h_A[i] = i + 1;
            h_B[i] = (N * (i + 1));
        }

        // Each step runs only if everything before it succeeded, so `status`
        // always holds the first error encountered.
        cudaError_t status = cudaMalloc(&d_A, size);
        if (status == cudaSuccess) status = cudaMalloc(&d_B, size);
        if (status == cudaSuccess) status = cudaMalloc(&d_C, size);

        if (status == cudaSuccess)
            status = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
        if (status == cudaSuccess)
            status = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

        if (status == cudaSuccess) {
            const int threads = 256;
            const int blocks = (N + threads - 1) / threads;  // ceil(N / threads)
            add<<<blocks, threads>>>(d_A, d_B, d_C, N);

            // Launches do not return a status; fetch launch errors explicitly.
            status = cudaGetLastError();
        }

        // This blocking copy waits for the kernel to finish and reports any
        // error raised while it ran, so no separate device-wide
        // synchronization is needed afterwards.
        if (status == cudaSuccess)
            status = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

        if (status == cudaSuccess) {
            for (int i = 0; i < 5; i++) {
                printf("C[%d] = %d\n", i, h_C[i]);
            }
            exitCode = 0;
        } else {
            fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
        }
    }

    cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
    free(h_A); free(h_B); free(h_C);

    return exitCode;
}

Overwriting vectorsqr.cu


In [42]:
!nvcc -arch=sm_75 vectorsqr.cu -o vectorsqr

In [43]:
!./vectorsqr

C[0] = 1001
C[1] = 2002
C[2] = 3003
C[3] = 4004
C[4] = 5005


In [47]:
#code3: square every array element on the GPU using unified (managed) memory
%%writefile arrsqr.cu
#include <stdio.h>
#include <cuda.h>

/*
 * Element-wise square: C[k] = A[k] * A[k] for k in [0, N).
 *
 * Each thread handles at most one element, selected by its flat index along
 * the x dimension of the launch; threads whose index is >= N do nothing.
 *
 * A : input array, read at indices below N
 * C : output array, written at indices below N
 * N : number of elements to process
 */
__global__ void squareElements(int *A, int *C, int N) {
    const int pos = threadIdx.x + blockDim.x * blockIdx.x;

    // Surplus threads in the last block fall outside the arrays.
    if (pos >= N) {
        return;
    }

    C[pos] = A[pos] * A[pos];
}

/*
 * Host driver for the squareElements kernel using managed memory.
 *
 * Allocates two managed arrays of N = 1024 ints, fills A[i] = i + 1 on the
 * host, launches squareElements with 256 threads per block, waits for the
 * device, and prints the first five squares.
 *
 * Returns 0 on success. Returns 1, after printing the CUDA error string to
 * stderr, if an allocation, the launch, or the kernel itself fails. Both
 * buffers are released on every path.
 */
int main() {
    const int N = 1024;
    const size_t bytes = N * sizeof(int);
    int exitCode = 1;

    // Start as NULL so the shared cleanup below is safe even when an
    // allocation failed (cudaFree(NULL) is a no-op).
    int *A = NULL, *C = NULL;

    // Each step runs only if everything before it succeeded, so `status`
    // always holds the first error encountered.
    cudaError_t status = cudaMallocManaged(&A, bytes);
    if (status == cudaSuccess) status = cudaMallocManaged(&C, bytes);

    if (status == cudaSuccess) {
        // Only touch A once the allocation is known to be valid.
        for (int i = 0; i < N; i++) A[i] = i + 1;

        squareElements<<<(N + 255) / 256, 256>>>(A, C, N);

        // Launches do not return a status; fetch launch errors explicitly.
        status = cudaGetLastError();
    }

    // The host must not read C until the kernel has finished; the
    // synchronization also reports errors raised while the kernel ran.
    if (status == cudaSuccess) status = cudaDeviceSynchronize();

    if (status == cudaSuccess) {
        for (int i = 0; i < 5; i++)
            printf("C[%d] = %d\n", i, C[i]);
        exitCode = 0;
    } else {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
    }

    cudaFree(A); cudaFree(C);
    return exitCode;
}

Overwriting arrsqr.cu


In [48]:
!nvcc -arch=sm_75 arrsqr.cu -o arrsqr

In [49]:
!./arrsqr

C[0] = 1
C[1] = 4
C[2] = 9
C[3] = 16
C[4] = 25
