In [None]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [None]:
%%writefile firstprogram.cu
#include <stdio.h>

#include <stdlib.h>

int main()

{

int deviceCount;

cudaGetDeviceCount(&deviceCount);

if (deviceCount == 0)

{

printf("There is no device supporting CUDA\n");

}

int dev;

for (dev = 0; dev < deviceCount; ++dev)

{

cudaDeviceProp deviceProp;

cudaGetDeviceProperties(&deviceProp, dev);

if (dev == 0)

{

if (deviceProp.major < 1)

{

printf("There is no device supporting CUDA.\n");

}

else if (deviceCount == 1)

{

printf("There is 1 device supporting CUDA\n");

}

else

{

printf("There are %d devices supporting CUDA\n", deviceCount);

}

}

printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);

printf(" Major revision number: %d\n", deviceProp.major);

printf(" Minor revision number: %d\n", deviceProp.minor);

printf(" Total amount of global memory: %ld bytes\n", deviceProp.totalGlobalMem);

printf(" Total amount of constant memory: %ld bytes\n", deviceProp.totalConstMem);

printf(" Total amount of shared memory per block: %ld bytes\n", deviceProp.sharedMemPerBlock);

printf(" Total number of registers available per block: %d\n", deviceProp.regsPerBlock);

printf(" Warp size: %d\n", deviceProp.warpSize);

printf(" Multiprocessor count: %d\n",deviceProp.multiProcessorCount );


printf(" Maximum number of threads per block: %d\n", deviceProp.maxThreadsPerBlock);

printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n", deviceProp.maxThreadsDim[0],deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]);

printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n", deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], deviceProp.maxGridSize[2]);

printf(" Maximum memory pitch: %ld bytes\n", deviceProp.memPitch);

printf(" Texture alignment: %ld bytes\n", deviceProp.textureAlignment);

printf(" Clock rate: %d kilohertz\n", deviceProp.clockRate);

}

}

Writing firstprogram.cu


In [None]:
!nvcc -o p1 firstprogram.cu

In [None]:
! ./p1

There is 1 device supporting CUDA

Device 0: "Tesla T4"
 Major revision number: 7
 Minor revision number: 5
 Total amount of global memory: 15828320256 bytes
 Total amount of constant memory: 65536 bytes
 Total amount of shared memory per block: 49152 bytes
 Total number of registers available per block: 65536
 Warp size: 32
 Multiprocessor count: 40
 Maximum number of threads per block: 1024
 Maximum sizes of each dimension of a block: 1024 x 1024 x 64
 Maximum sizes of each dimension of a grid: 2147483647 x 65535 x 65535
 Maximum memory pitch: 2147483647 bytes
 Texture alignment: 512 bytes
 Clock rate: 1590000 kilohertz


In [None]:
%%writefile secondprogram.cu
#include <stdio.h>

#include <cuda_runtime.h>

__global__ void helloWorldKernel() {
    // Get the thread ID within the block
    int threadId = threadIdx.x;

    // Print the thread ID and "Hello World"
    printf("Hello, World! from thread %d\n", threadId);
}


int main() {
    // Define the number of threads per block
    int numThreads = 10; // You can adjust this to any number of threads

    // Launch the kernel with 1 block and `numThreads` threads
    helloWorldKernel<<<1, numThreads>>>();

    // Wait for GPU to finish before accessing the results
   // =();


    cudaError_t err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
    printf("CUDA Error: %s\n", cudaGetErrorString(err));
}
    return 0;
}


Writing secondprogram.cu


In [None]:
!nvcc -arch=sm_75 -o sp secondprogram.cu

In [None]:
# Run the compiled executable
!./sp

Hello, World! from thread 0
Hello, World! from thread 1
Hello, World! from thread 2
Hello, World! from thread 3
Hello, World! from thread 4
Hello, World! from thread 5
Hello, World! from thread 6
Hello, World! from thread 7
Hello, World! from thread 8
Hello, World! from thread 9


In [None]:
%%writefile thirdprogram.cu
#include <stdio.h>
#include <cuda_runtime.h>

// Kernel function that each thread will execute
__global__ void helloWorldKernel() {
    // Get the global thread ID (unique for all threads)
    int globalThreadId = blockIdx.x * blockDim.x + threadIdx.x;

    // Print the thread ID and "Hello World"
    //printf("Hello, World! from thread %d\n", globalThreadId);
    printf("Hello World from Block %d, Thread %d (Global ID: %d)\n", blockIdx.x, threadIdx.x, globalThreadId);
}

int main() {
    // Define the number of threads per block and number of blocks
    int threadsPerBlock = 2;  // Threads per block
    int numBlocks = 2;         // Number of blocks

    // Launch the kernel with multiple blocks and multiple threads per block
    helloWorldKernel<<<numBlocks, threadsPerBlock>>>();

    // Wait for GPU to finish before accessing the results
    cudaDeviceSynchronize();

    return 0;
}


Overwriting thirdprogram.cu


In [None]:
!nvcc -arch=sm_75 -o sp thirdprogram.cu


In [None]:
!./sp

Hello World from Block 0, Thread 0 (Global ID: 0)
Hello World from Block 0, Thread 1 (Global ID: 1)
Hello World from Block 1, Thread 0 (Global ID: 2)
Hello World from Block 1, Thread 1 (Global ID: 3)


In [None]:
%%writefile hello2D.cu
#include <stdio.h>
#include <cuda_runtime.h>

__global__ void hello2D() {


    int global_x = threadIdx.x + blockIdx.x * blockDim.x;
    int global_y = threadIdx.y + blockIdx.y * blockDim.y;


    printf("Hello from thread (%d, %d) in block (%d, %d) -> Global ID (%d, %d)\n",
           threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y, global_x, global_y);


}

int main() {
    dim3 threadsPerBlock(2, 2); // 2x2 = 4 threads per block
    dim3 numBlocks(2, 2);       // 2x2 = 4 blocks in the grid

    hello2D<<<numBlocks, threadsPerBlock>>>();
    cudaDeviceSynchronize();
    return 0;
}



Writing hello2D.cu


In [None]:
!nvcc -arch=sm_75 -o sp hello2D.cu

In [None]:
!./sp

Hello from thread (0, 0) in block (0, 1) -> Global ID (0, 2)
Hello from thread (1, 0) in block (0, 1) -> Global ID (1, 2)
Hello from thread (0, 1) in block (0, 1) -> Global ID (0, 3)
Hello from thread (1, 1) in block (0, 1) -> Global ID (1, 3)
Hello from thread (0, 0) in block (0, 0) -> Global ID (0, 0)
Hello from thread (1, 0) in block (0, 0) -> Global ID (1, 0)
Hello from thread (0, 1) in block (0, 0) -> Global ID (0, 1)
Hello from thread (1, 1) in block (0, 0) -> Global ID (1, 1)
Hello from thread (0, 0) in block (1, 1) -> Global ID (2, 2)
Hello from thread (1, 0) in block (1, 1) -> Global ID (3, 2)
Hello from thread (0, 1) in block (1, 1) -> Global ID (2, 3)
Hello from thread (1, 1) in block (1, 1) -> Global ID (3, 3)
Hello from thread (0, 0) in block (1, 0) -> Global ID (2, 0)
Hello from thread (1, 0) in block (1, 0) -> Global ID (3, 0)
Hello from thread (0, 1) in block (1, 0) -> Global ID (2, 1)
Hello from thread (1, 1) in block (1, 0) -> Global ID (3, 1)


In [None]:
%%writefile VectorAddition.cu
#include <stdio.h>
#include <cuda_runtime.h>

#define N 10  // Size of the vectors, can be adjusted

// CUDA kernel to perform element-wise addition of two vectors A and B
__global__ void vectorAdd(int *A, int *B, int *C, int n) {
    // Get the thread index
    int idx = threadIdx.x + blockIdx.x * blockDim.x;

    // Ensure the thread index is within bounds
    if (idx < n) {
        C[idx] = A[idx] + B[idx];
    }
}

int main() {
    // Host vectors
    int h_A[N], h_B[N], h_C[N];

    // Initialize vectors A and B
    for (int i = 0; i < N; i++) {
        h_A[i] = i;        // Vector A: 0, 1, 2, ...
        h_B[i] = i * 2;    // Vector B: 0, 2, 4, ...
    }

    // Device vectors
    int *d_A, *d_B, *d_C;

    // Allocate memory on the GPU for vectors A, B, and C
    cudaMalloc((void**)&d_A, N * sizeof(int));
    cudaMalloc((void**)&d_B, N * sizeof(int));
    cudaMalloc((void**)&d_C, N * sizeof(int));

    // Copy vectors A and B from host to device
    cudaMemcpy(d_A, h_A, N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, N * sizeof(int), cudaMemcpyHostToDevice);

    // Define block size (number of threads per block)
    int blockSize = 256;  // Max threads per block
    int numBlocks = (N + blockSize - 1) / blockSize;  // Calculate number of blocks

    // Launch the kernel
    vectorAdd<<<numBlocks, blockSize>>>(d_A, d_B, d_C, N);

    // Check for kernel launch errors
    cudaDeviceSynchronize();

    // Copy the result vector C from device to host
    cudaMemcpy(h_C, d_C, N * sizeof(int), cudaMemcpyDeviceToHost);

    // Print the result
    printf("Vector A: ");
    for (int i = 0; i < N; i++) {
        printf("%d ", h_A[i]);
    }
    printf("\n");

    printf("Vector B: ");
    for (int i = 0; i < N; i++) {
        printf("%d ", h_B[i]);
    }
    printf("\n");

    printf("Result Vector C (A + B): ");
    for (int i = 0; i < N; i++) {
        printf("%d ", h_C[i]);
    }
    printf("\n");

    // Free device memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    return 0;
}


Writing VectorAddition.cu


In [None]:
!nvcc -arch=sm_75 -o sp VectorAddition.cu

In [None]:
!./sp

Vector A: 0 1 2 3 4 5 6 7 8 9 
Vector B: 0 2 4 6 8 10 12 14 16 18 
Result Vector C (A + B): 0 3 6 9 12 15 18 21 24 27 


In [None]:
%%writefile VectorAdditionSerialAndParallel.cu
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda_runtime.h>

// CUDA kernel for vector addition
__global__ void vectorAddCUDA(float *A, float *B, float *C, int N) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < N)
        C[idx] = A[idx] + B[idx];
}

// Serial CPU implementation of vector addition
void vectorAddCPU(float *A, float *B, float *C, int N) {
    for (int i = 0; i < N; i++) {
        C[i] = A[i] + B[i];
    }
}

int main() {
    int Ns[] = {100000, 1000000, 10000000}; // Test with different sizes

    for (int test = 0; test < 3; test++) {
        int N = Ns[test];
        size_t size = N * sizeof(float);
        printf("\n========== N = %d ==========\n", N);

        // Allocate host memory
        float *h_A = (float *)malloc(size);
        float *h_B = (float *)malloc(size);
        float *h_C_CPU = (float *)malloc(size);
        float *h_C_GPU = (float *)malloc(size);

        // Initialize input vectors
        for (int i = 0; i < N; i++) {
            h_A[i] = i * 1.0f;
            h_B[i] = i * 2.0f;
        }

        // -------------------- CPU Vector Addition --------------------
        clock_t start_cpu = clock();
        vectorAddCPU(h_A, h_B, h_C_CPU, N);
        clock_t end_cpu = clock();
        double time_cpu = ((double)(end_cpu - start_cpu)) / CLOCKS_PER_SEC;
        printf("CPU Time: %f seconds\n", time_cpu);

        // -------------------- GPU Vector Addition --------------------
        float *d_A, *d_B, *d_C;
        cudaMalloc((void **)&d_A, size);
        cudaMalloc((void **)&d_B, size);
        cudaMalloc((void **)&d_C, size);

        // Copy inputs to device
        cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
        cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

        // CUDA timing events
        cudaEvent_t start_gpu, stop_gpu;
        cudaEventCreate(&start_gpu);
        cudaEventCreate(&stop_gpu);
        cudaEventRecord(start_gpu);

        int threadsPerBlock = 256;
        int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

        vectorAddCUDA<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
        cudaEventRecord(stop_gpu);
        cudaEventSynchronize(stop_gpu);

        float time_gpu = 0;
        cudaEventElapsedTime(&time_gpu, start_gpu, stop_gpu);
        printf("GPU Time: %f milliseconds\n", time_gpu);

        // Copy result back to host
        cudaMemcpy(h_C_GPU, d_C, size, cudaMemcpyDeviceToHost);

        // Optional correctness check
        int errors = 0;
        for (int i = 0; i < N; i++) {
            if (abs(h_C_CPU[i] - h_C_GPU[i]) > 1e-5) {
                errors++;
                if (errors < 10) {
                    printf("Mismatch at index %d: CPU = %f, GPU = %f\n", i, h_C_CPU[i], h_C_GPU[i]);
                }
            }
        }
        if (errors == 0) {
            printf("✅ CPU and GPU results match!\n");
        } else {
            printf("❌ Mismatches found: %d\n", errors);
        }

        // Free memory
        free(h_A); free(h_B); free(h_C_CPU); free(h_C_GPU);
        cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
        cudaEventDestroy(start_gpu);
        cudaEventDestroy(stop_gpu);
    }

    return 0;
}


Writing VectorAdditionSerialAndParallel.cu


In [None]:
!nvcc -arch=sm_75 -o sp VectorAdditionSerialAndParallel.cu

In [None]:
!./sp


CPU Time: 0.000809 seconds
GPU Time: 0.000000 milliseconds
Mismatch at index 1: CPU = 3.000000, GPU = 0.000000
Mismatch at index 2: CPU = 6.000000, GPU = 0.000000
Mismatch at index 3: CPU = 9.000000, GPU = 0.000000
Mismatch at index 4: CPU = 12.000000, GPU = 0.000000
Mismatch at index 5: CPU = 15.000000, GPU = 0.000000
Mismatch at index 6: CPU = 18.000000, GPU = 0.000000
Mismatch at index 7: CPU = 21.000000, GPU = 0.000000
Mismatch at index 8: CPU = 24.000000, GPU = 0.000000
Mismatch at index 9: CPU = 27.000000, GPU = 0.000000
❌ Mismatches found: 99999

CPU Time: 0.006473 seconds
GPU Time: 0.000000 milliseconds
Mismatch at index 1: CPU = 3.000000, GPU = 0.000000
Mismatch at index 2: CPU = 6.000000, GPU = 0.000000
Mismatch at index 3: CPU = 9.000000, GPU = 0.000000
Mismatch at index 4: CPU = 12.000000, GPU = 0.000000
Mismatch at index 5: CPU = 15.000000, GPU = 0.000000
Mismatch at index 6: CPU = 18.000000, GPU = 0.000000
Mismatch at index 7: CPU = 21.000000, GPU = 0.000000
Mismatch at 

In [None]:
%%writefile fifthprogram.cu
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda_runtime.h>

// Serial version of vector addition
void vector_add_cpu(float *A, float *B, float *C, int N) {
    for (int i = 0; i < N; i++) {
        C[i] = A[i] + B[i];
    }
}

// CUDA kernel for vector addition
__global__ void vector_add_cuda(float *A, float *B, float *C, int N) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N) {
        C[i] = A[i] + B[i];
    }
}

int main() {
    // Different sizes of N to test
    int sizes[] = {100000, 1000000, 10000000};
    int num_sizes = sizeof(sizes) / sizeof(sizes[0]);

    // Loop over different N sizes
    for (int s = 0; s < num_sizes; s++) {
        int N = sizes[s];
        float *A, *B, *C;
        float *d_A, *d_B, *d_C;

        // Allocate memory for vectors on the host
        A = (float*)malloc(N * sizeof(float));
        B = (float*)malloc(N * sizeof(float));
        C = (float*)malloc(N * sizeof(float));

        // Initialize random number generator
        srand(time(NULL));

        // Initialize vectors A and B with random numbers
        for (int i = 0; i < N; i++) {
            A[i] = rand() % 1000;
            B[i] = rand() % 1000;
        }

        // Run the CPU-based vector addition
        clock_t start_time = clock();
        vector_add_cpu(A, B, C, N);
        clock_t end_time = clock();
        double cpu_time = ((double)(end_time - start_time)) / CLOCKS_PER_SEC;
        printf("\nFor N = %d, CPU execution time: %f seconds\n", N, cpu_time);

        // Allocate memory on the GPU
        cudaMalloc((void**)&d_A, N * sizeof(float));
        cudaMalloc((void**)&d_B, N * sizeof(float));
        cudaMalloc((void**)&d_C, N * sizeof(float));

        // Copy vectors A and B from host to device
        cudaMemcpy(d_A, A, N * sizeof(float), cudaMemcpyHostToDevice);
        cudaMemcpy(d_B, B, N * sizeof(float), cudaMemcpyHostToDevice);

        // Define block and grid size
        int blockSize = 256;
        int gridSize = (N + blockSize - 1) / blockSize;

        // Measure execution time for GPU-based implementation
        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
        cudaEventRecord(start);

        // Launch the CUDA kernel
        vector_add_cuda<<<gridSize, blockSize>>>(d_A, d_B, d_C, N);

        // Wait for GPU to finish
        cudaDeviceSynchronize();

        cudaEventRecord(stop);
        cudaEventSynchronize(stop);

        float gpu_time;
        cudaEventElapsedTime(&gpu_time, start, stop);
        printf("GPU execution time: %f milliseconds\n", gpu_time);

        // Copy the result from device to host
        cudaMemcpy(C, d_C, N * sizeof(float), cudaMemcpyDeviceToHost);

        // Free memory on the GPU
        cudaFree(d_A);
        cudaFree(d_B);
        cudaFree(d_C);

        // Free memory on the host
        free(A);
        free(B);
        free(C);

        // Calculate and report the speedup
        double speedup = cpu_time / (gpu_time / 1000.0); // Convert GPU time to seconds
        printf("Speedup: %f\n", speedup);
    }

    return 0;
}


Writing fifthprogram.cu


In [None]:
!nvcc -arch=sm_75 -o sp fifthprogram.cu

In [None]:
!./sp


For N = 100000, CPU execution time: 0.000584 seconds
GPU execution time: 0.000000 milliseconds
Speedup: 1627954399492735387821024212502147848208384.000000

For N = 1000000, CPU execution time: 0.005590 seconds
GPU execution time: 0.000000 milliseconds
Speedup: 15582645707473272524007270571691354634584064.000000

For N = 10000000, CPU execution time: 0.060235 seconds
GPU execution time: 0.000000 milliseconds
Speedup: 167910673379186489224696619917750606657224704.000000


In [1]:
%%writefile fifthprogram1.cu
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda_runtime.h>

// Serial version of vector addition
void vector_add_cpu(float *A, float *B, float *C, int N) {
    for (int i = 0; i < N; i++) {
        C[i] = A[i] + B[i];
    }
}

// CUDA kernel for vector addition
__global__ void vector_add_cuda(float *A, float *B, float *C, int N) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N) {
        C[i] = A[i] + B[i];
    }
}

int main() {
    int sizes[] = {100000, 1000000, 10000000};
    int num_sizes = sizeof(sizes) / sizeof(sizes[0]);

    for (int s = 0; s < num_sizes; s++) {
        int N = sizes[s];
        float *A, *B, *C;
        float *d_A, *d_B, *d_C;

        // Allocate host memory
        A = (float*)malloc(N * sizeof(float));
        B = (float*)malloc(N * sizeof(float));
        C = (float*)malloc(N * sizeof(float));

        // Initialize random numbers
        srand((unsigned int)time(NULL));
        for (int i = 0; i < N; i++) {
            A[i] = rand() % 1000;
            B[i] = rand() % 1000;
        }

        // CPU timing
        clock_t start_time = clock();
        vector_add_cpu(A, B, C, N);
        clock_t end_time = clock();
        double cpu_time = ((double)(end_time - start_time)) / CLOCKS_PER_SEC;
        printf("\nFor N = %d, CPU execution time: %f seconds\n", N, cpu_time);

        // Allocate device memory
        cudaMalloc((void**)&d_A, N * sizeof(float));
        cudaMalloc((void**)&d_B, N * sizeof(float));
        cudaMalloc((void**)&d_C, N * sizeof(float));

        // Copy data to device
        cudaMemcpy(d_A, A, N * sizeof(float), cudaMemcpyHostToDevice);
        cudaMemcpy(d_B, B, N * sizeof(float), cudaMemcpyHostToDevice);

        // Configure kernel
        int blockSize = 256;
        int gridSize = (N + blockSize - 1) / blockSize;

        // Create CUDA events for timing
        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);

        // Start timing and launch kernel
        cudaEventRecord(start);
        vector_add_cuda<<<gridSize, blockSize>>>(d_A, d_B, d_C, N);
        cudaEventRecord(stop);

        // Check for kernel errors
        cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess) {
            printf("CUDA Error: %s\n", cudaGetErrorString(err));
        }

        // Wait for kernel to finish and get time
        cudaEventSynchronize(stop);
        float gpu_time;
        cudaEventElapsedTime(&gpu_time, start, stop);

        printf("GPU execution time: %f milliseconds\n", gpu_time);

        // Copy result back to host
        cudaMemcpy(C, d_C, N * sizeof(float), cudaMemcpyDeviceToHost);

        // Calculate speedup
        double speedup = cpu_time / (gpu_time / 1000.0); // Convert ms to sec
        printf("Speedup: %f\n", speedup);

        // Cleanup
        cudaFree(d_A);
        cudaFree(d_B);
        cudaFree(d_C);
        free(A);
        free(B);
        free(C);
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
    }

    return 0;
}


Writing fifthprogram1.cu


In [2]:
!nvcc -arch=sm_75 -o sp fifthprogram1.cu

In [3]:
!./sp


For N = 100000, CPU execution time: 0.000479 seconds
GPU execution time: 0.129856 milliseconds
Speedup: 3.688701

For N = 1000000, CPU execution time: 0.004948 seconds
GPU execution time: 0.051584 milliseconds
Speedup: 95.921213

For N = 10000000, CPU execution time: 0.055042 seconds
GPU execution time: 0.457632 milliseconds
Speedup: 120.275679


In [4]:
%%writefile sixthprogram.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <time.h>
#include <math.h>

__global__ void matrixAddCUDA(float* A, float* B, float* C, int M, int N) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < M && col < N) {
        int idx = row * N + col;
        C[idx] = A[idx] + B[idx];
    }
}

void matrixAddCPU(float* A, float* B, float* C, int M, int N) {
    for (int i = 0; i < M; ++i)
        for (int j = 0; j < N; ++j)
            C[i * N + j] = A[i * N + j] + B[i * N + j];
}

void initMatrix(float* mat, int size) {
    for (int i = 0; i < size; ++i)
        mat[i] = (float)(rand() % 100);
}

double getCPUTime() {
    struct timespec t;
    clock_gettime(CLOCK_MONOTONIC, &t);
    return (t.tv_sec * 1e3) + (t.tv_nsec / 1e6);  // milliseconds
}

int main() {
    int sizes[] = {100, 500, 1000};  // Use provided sizes only
    int numSizes = sizeof(sizes) / sizeof(sizes[0]);

    for (int s = 0; s < numSizes; ++s) {
        int M = sizes[s], N = sizes[s];
        int size = M * N * sizeof(float);

        float *h_A = (float*)malloc(size);
        float *h_B = (float*)malloc(size);
        float *h_C_cpu = (float*)malloc(size);
        float *h_C_gpu = (float*)malloc(size);

        initMatrix(h_A, M * N);
        initMatrix(h_B, M * N);

        // CPU timing
        double cpu_start = getCPUTime();
        matrixAddCPU(h_A, h_B, h_C_cpu, M, N);
        double cpu_end = getCPUTime();
        double cpu_time = cpu_end - cpu_start;

        // Allocate GPU memory
        float *d_A, *d_B, *d_C;
        cudaMalloc((void**)&d_A, size);
        cudaMalloc((void**)&d_B, size);
        cudaMalloc((void**)&d_C, size);

        cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
        cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

        dim3 blockSize(16, 16);
        dim3 gridSize((N + blockSize.x - 1) / blockSize.x,
                      (M + blockSize.y - 1) / blockSize.y);

        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);

        // GPU timing with correct event placement
        cudaEventRecord(start);
        matrixAddCUDA<<<gridSize, blockSize>>>(d_A, d_B, d_C, M, N);
        cudaEventRecord(stop);

        // Error checking
        cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess) {
            printf("CUDA kernel error: %s\n", cudaGetErrorString(err));
        }

        cudaEventSynchronize(stop);
        float gpu_time;
        cudaEventElapsedTime(&gpu_time, start, stop);  // in milliseconds

        cudaMemcpy(h_C_gpu, d_C, size, cudaMemcpyDeviceToHost);

        // Check result correctness
        int error = 0;
        for (int i = 0; i < M * N; ++i) {
            if (fabs(h_C_cpu[i] - h_C_gpu[i]) > 1e-5) {
                error = 1;
                break;
            }
        }

        // Safe speedup calculation
        double speedup = (gpu_time > 0.0) ? (cpu_time / gpu_time) : 0.0;

        printf("Matrix size: %dx%d\n", M, N);
        printf("CPU Time: %.4f ms\n", cpu_time);
        printf("GPU Time: %.4f ms\n", gpu_time);
        printf("Speedup: %.2fx\n", speedup);
        printf("Result match: %s\n\n", error ? "No" : "Yes");

        // Cleanup
        free(h_A); free(h_B); free(h_C_cpu); free(h_C_gpu);
        cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
        cudaEventDestroy(start); cudaEventDestroy(stop);
    }

    return 0;
}



Writing sixthprogram.cu


In [5]:
!nvcc -arch=sm_75 -o sp sixthprogram.cu

In [6]:
!./sp

Matrix size: 100x100
CPU Time: 0.0747 ms
GPU Time: 0.0938 ms
Speedup: 0.80x
Result match: Yes

Matrix size: 500x500
CPU Time: 1.2672 ms
GPU Time: 0.0224 ms
Speedup: 56.49x
Result match: Yes

Matrix size: 1000x1000
CPU Time: 5.1848 ms
GPU Time: 0.0555 ms
Speedup: 93.39x
Result match: Yes



In [19]:
%%writefile seventhprogram.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <time.h>

#define BLOCK_SIZE 256

// CUDA kernel for dot product using shared memory and atomicAdd
__global__ void dotProductKernel(float *A, float *B, float *result, int N) {
    __shared__ float cache[BLOCK_SIZE];
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int cacheIdx = threadIdx.x;

    float temp = 0.0;
    while (tid < N) {
        temp += A[tid] * B[tid];
        tid += blockDim.x * gridDim.x;
    }

    cache[cacheIdx] = temp;
    __syncthreads();

    // Reduction within block
    int i = blockDim.x / 2;
    while (i != 0) {
        if (cacheIdx < i)
            cache[cacheIdx] += cache[cacheIdx + i];
        __syncthreads();
        i /= 2;
    }

    // Add block result to global result
    if (cacheIdx == 0)
        atomicAdd(result, cache[0]);
}

// Serial CPU implementation
float dotProductCPU(float *A, float *B, int N) {
    float sum = 0.0;
    for (int i = 0; i < N; ++i)
        sum += A[i] * B[i];
    return sum;
}

// Function to initialize vectors with random floats
void initializeVectors(float *A, float *B, int N) {
    for (int i = 0; i < N; ++i) {
        A[i] = (float)(rand()) / RAND_MAX;
        B[i] = (float)(rand()) / RAND_MAX;
    }
}

// Run and time both CPU and GPU dot product
void runDotProduct(int N) {
    float *h_A, *h_B;
    float *d_A, *d_B, *d_result;
    float gpu_result = 0.0, cpu_result = 0.0;

    size_t size = N * sizeof(float);

    // Allocate host memory
    h_A = (float*)malloc(size);
    h_B = (float*)malloc(size);

    initializeVectors(h_A, h_B, N);

    // CPU computation and timing
    clock_t cpu_start = clock();
    cpu_result = dotProductCPU(h_A, h_B, N);
    clock_t cpu_end = clock();
    double cpu_time = ((double)(cpu_end - cpu_start)) / CLOCKS_PER_SEC * 1000;  // Convert to ms

    // Allocate device memory
    cudaMalloc((void**)&d_A, size);
    cudaMalloc((void**)&d_B, size);
    cudaMalloc((void**)&d_result, sizeof(float));

    // Copy data to device
    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
    cudaMemset(d_result, 0, sizeof(float));

    // GPU computation and timing
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);

    int numBlocks = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
    dotProductKernel<<<numBlocks, BLOCK_SIZE>>>(d_A, d_B, d_result, N);

    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);

    // Copy result back to host
    cudaMemcpy(&gpu_result, d_result, sizeof(float), cudaMemcpyDeviceToHost);

    // Output results
    printf("Vector Size: %d\n", N);
    printf("CPU Result: %f, Time: %.3f ms\n", cpu_result, cpu_time);
    printf("GPU Result: %f, Time: %.3f ms\n", gpu_result, milliseconds);
    printf("Speedup: %.2fx\n\n", cpu_time / milliseconds);

    // Cleanup
    free(h_A); free(h_B);
    cudaFree(d_A); cudaFree(d_B); cudaFree(d_result);
}

int main() {
    srand(time(NULL));
    int sizes[] = {100000, 1000000, 10000000};  // Different sizes of N

    for (int i = 0; i < 3; ++i)
        runDotProduct(sizes[i]);

    return 0;
}


Overwriting seventhprogram.cu


In [20]:
!nvcc -arch=sm_75 -o sp seventhprogram.cu

In [21]:
!./sp

Vector Size: 100000
CPU Result: 24868.974609, Time: 0.324 ms
GPU Result: 24869.150391, Time: 0.102 ms
Speedup: 3.18x

Vector Size: 1000000
CPU Result: 249918.468750, Time: 3.038 ms
GPU Result: 249954.234375, Time: 0.107 ms
Speedup: 28.48x

Vector Size: 10000000
CPU Result: 2471850.250000, Time: 30.714 ms
GPU Result: 2500996.000000, Time: 0.998 ms
Speedup: 30.76x



In [25]:
%%writefile eighthprogram.cu
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda_runtime.h>

#define TILE_SIZE 16  // Defining tile size for shared memory optimization

// CUDA kernel for matrix multiplication
__global__ void matrixMultiplyCUDA(float* A, float* B, float* C, int M, int N, int P) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < M && col < P) {
        float value = 0;
        for (int k = 0; k < N; k++) {
            value += A[row * N + k] * B[k * P + col];
        }
        C[row * P + col] = value;
    }
}

// Serial matrix multiplication on the CPU
void matrixMultiplySerial(float* A, float* B, float* C, int M, int N, int P) {
    for (int i = 0; i < M; i++) {
        for (int j = 0; j < P; j++) {
            C[i * P + j] = 0;
            for (int k = 0; k < N; k++) {
                C[i * P + j] += A[i * N + k] * B[k * P + j];
            }
        }
    }
}

// Function to initialize a matrix with random values
void initializeMatrix(float* mat, int rows, int cols) {
    for (int i = 0; i < rows; i++) {
        for (int j = 0; j < cols; j++) {
            mat[i * cols + j] = rand() % 100;
        }
    }
}

int main() {
    // Matrix sizes to test
    int matrix_sizes[][3] = {
        {100, 100, 100},
        {500, 500, 500},
        {1000, 1000, 1000}
    };

    // Loop over different matrix sizes
    for (int idx = 0; idx < 3; idx++) {
        int M = matrix_sizes[idx][0]; // Rows of A and C
        int N = matrix_sizes[idx][1]; // Columns of A and Rows of B
        int P = matrix_sizes[idx][2]; // Columns of B and C

        printf("\nRunning matrix multiplication for size %dx%dx%d\n", M, N, P);

        // Allocate memory for matrices on the host (CPU)
        float* A = (float*)malloc(M * N * sizeof(float));
        float* B = (float*)malloc(N * P * sizeof(float));
        float* C_serial = (float*)malloc(M * P * sizeof(float));
        float* C_cuda = (float*)malloc(M * P * sizeof(float));

        // Initialize matrices A and B with random values
        srand(time(NULL));
        initializeMatrix(A, M, N);
        initializeMatrix(B, N, P);

        // Measure the execution time for serial matrix multiplication
        clock_t start = clock();
        matrixMultiplySerial(A, B, C_serial, M, N, P);
        clock_t end = clock();
        double serial_time = (double)(end - start) / CLOCKS_PER_SEC;
        printf("Serial execution time: %f seconds\n", serial_time);

        // Allocate memory for matrices on the device (GPU)
        float *d_A, *d_B, *d_C;
        cudaMalloc((void**)&d_A, M * N * sizeof(float));
        cudaMalloc((void**)&d_B, N * P * sizeof(float));
        cudaMalloc((void**)&d_C, M * P * sizeof(float));

        // Copy matrices A and B from host to device
        cudaMemcpy(d_A, A, M * N * sizeof(float), cudaMemcpyHostToDevice);
        cudaMemcpy(d_B, B, N * P * sizeof(float), cudaMemcpyHostToDevice);

        // Define the grid and block sizes for the kernel
        dim3 dimBlock(TILE_SIZE, TILE_SIZE);
        dim3 dimGrid((P + TILE_SIZE - 1) / TILE_SIZE, (M + TILE_SIZE - 1) / TILE_SIZE);

        // Measure the execution time for parallel matrix multiplication (GPU)
        start = clock();
        matrixMultiplyCUDA<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, M, N, P);
        cudaDeviceSynchronize();
        end = clock();
        double cuda_time = (double)(end - start) / CLOCKS_PER_SEC;
        printf("CUDA execution time: %f seconds\n", cuda_time);

        // Copy the result matrix C from device to host
        cudaMemcpy(C_cuda, d_C, M * P * sizeof(float), cudaMemcpyDeviceToHost);

        // Calculate speedup
        double speedup = serial_time / cuda_time;
        printf("Speedup: %f\n", speedup);

        // Free device memory
        cudaFree(d_A);
        cudaFree(d_B);
        cudaFree(d_C);

        // Free host memory
        free(A);
        free(B);
        free(C_serial);
        free(C_cuda);
    }

    return 0;
}


NameError: name 'free' is not defined

In [23]:
!nvcc -arch=sm_75 -o sp eighthprogram.cu

In [24]:
!./sp


Running matrix multiplication for size 100x100x100
Serial execution time: 0.005033 seconds
CUDA execution time: 0.000117 seconds
Speedup: 43.017094

Running matrix multiplication for size 500x500x500
Serial execution time: 0.759604 seconds
CUDA execution time: 0.001124 seconds
Speedup: 675.804270

Running matrix multiplication for size 1000x1000x1000
Serial execution time: 7.171067 seconds
CUDA execution time: 0.007191 seconds
Speedup: 997.228063
