<a href="https://colab.research.google.com/github/BhavanaSrinivasegowda/HW_for_AI_ML/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvidia-smi

Wed Apr 23 22:19:18 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   63C    P8             11W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [3]:
%%writefile saxpy.cu
#include <stdio.h>
#include <math.h> // for fabsf

// SAXPY kernel: for each index idx below n, stores a*x[idx] + y[idx] back into y[idx].
// Indexing uses only the .x components of the launch configuration; each thread
// touches at most one element, and threads whose index is n or larger do nothing.
__global__
void saxpy(int n, float a, float *x, float *y)
{
  const int idx = threadIdx.x + blockDim.x*blockIdx.x;

  // Guard first: the grid may contain more threads than there are elements.
  if (idx >= n) return;

  y[idx] = a*x[idx] + y[idx];
}

// Prints a diagnostic on stderr for a failed CUDA runtime call.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaCallOk(cudaError_t status, const char *what)
{
  if (status == cudaSuccess) return true;
  fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
  return false;
}

// Runs y = 2*x + y on 2^20 elements (x = 1, y = 2 everywhere) on the device and
// prints the largest absolute deviation from the expected value 4.0.
// Returns 0 on success and 1 if a host allocation, a CUDA runtime call or the
// kernel launch fails; all buffers are released on every path.
int main(void)
{
  const int N = 1<<20;
  const size_t bytes = N*sizeof(float);

  float *x = (float*)malloc(bytes);
  float *y = (float*)malloc(bytes);
  float *d_x = NULL, *d_y = NULL;

  bool ok = (x != NULL && y != NULL);
  if (!ok) fprintf(stderr, "Host allocation of %zu bytes failed\n", bytes);

  ok = ok && cudaCallOk(cudaMalloc(&d_x, bytes), "cudaMalloc(d_x)");
  ok = ok && cudaCallOk(cudaMalloc(&d_y, bytes), "cudaMalloc(d_y)");

  if (ok) {
    for (int i = 0; i < N; i++) {
      x[i] = 1.0f;
      y[i] = 2.0f;
    }
  }

  ok = ok && cudaCallOk(cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(x -> d_x)");
  ok = ok && cudaCallOk(cudaMemcpy(d_y, y, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(y -> d_y)");

  if (ok) {
    // Perform SAXPY on 1M elements
    saxpy<<<(N+255)/256, 256>>>(N, 2.0f, d_x, d_y);
    // A launch does not return a status; a bad configuration is reported here.
    ok = cudaCallOk(cudaGetLastError(), "saxpy launch");
  }

  // The blocking copy also reports any fault raised while the kernel executed.
  ok = ok && cudaCallOk(cudaMemcpy(y, d_y, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(d_y -> y)");

  if (ok) {
    float maxError = 0.0f;
    for (int i = 0; i < N; i++)
      maxError = fmaxf(maxError, fabsf(y[i]-4.0f));
    printf("Max error: %f\n", maxError);
  }

  // cudaFree and free both accept null pointers, so partial setups unwind safely.
  cudaFree(d_x);
  cudaFree(d_y);
  free(x);
  free(y);
  return ok ? 0 : 1;
}

Writing saxpy.cu


In [4]:
!nvcc -arch=sm_70 saxpy.cu -o saxpy

In [5]:
!./saxpy

Max error: 0.000000


In [6]:
%%writefile saxpy_debug.cu
#include <stdio.h>
#include <math.h>
#include <cuda_runtime.h>

// Single-precision a*x + y, written in place into y.
// The flat element index is built from the .x block and thread coordinates only;
// elements at index n and beyond are left untouched.
__global__
void saxpy(int n, float a, float *x, float *y)
{
  const unsigned int blockBase = blockIdx.x * blockDim.x;
  const int gid = blockBase + threadIdx.x;

  if (gid < n) {
    y[gid] = a * x[gid] + y[gid];
  }
}

// Prints a diagnostic for a failed CUDA runtime call and clears the recorded error
// so it cannot be mistaken for a launch failure in a later iteration.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaCallOk(cudaError_t status, const char *what)
{
  if (status == cudaSuccess) return true;
  printf("%s failed: %s\n", what, cudaGetErrorString(status));
  cudaGetLastError();
  return false;
}

// Runs SAXPY (y = 2*x + y with x = 1, y = 2) for N = 2^15 .. 2^25, timing the kernel
// with CUDA events and printing the max deviation from 4.0 plus a few sample values.
// A failure in any step skips the report for that size; every host buffer, device
// buffer and event created in an iteration is released before the next one starts.
int main(void)
{
  for (int exp = 15; exp <= 25; exp++) {
    const int N = 1 << exp;
    const size_t bytes = N * sizeof(float);

    float *x = (float*)malloc(bytes);
    float *y = (float*)malloc(bytes);
    float *d_x = NULL, *d_y = NULL;
    cudaEvent_t start = NULL, stop = NULL;
    float milliseconds = 0;

    bool ok = (x != NULL && y != NULL);
    if (!ok) printf("Host malloc failed for N = %d\n", N);

    if (ok) {
      // Allocate one buffer at a time so a failure on d_y cannot strand d_x.
      if (cudaMalloc(&d_x, bytes) != cudaSuccess) {
        d_x = NULL;
        ok = false;
      } else if (cudaMalloc(&d_y, bytes) != cudaSuccess) {
        d_y = NULL;
        ok = false;
      }
      if (!ok) {
        printf("CUDA malloc failed for N = %d\n", N);
        cudaGetLastError();  // reset the recorded allocation error
      }
    }

    if (ok) {
      for (int i = 0; i < N; i++) {
        x[i] = 1.0f;
        y[i] = 2.0f;
      }
    }

    ok = ok && cudaCallOk(cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(x -> d_x)");
    ok = ok && cudaCallOk(cudaMemcpy(d_y, y, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(y -> d_y)");

    ok = ok && cudaCallOk(cudaEventCreate(&start), "cudaEventCreate(start)");
    ok = ok && cudaCallOk(cudaEventCreate(&stop), "cudaEventCreate(stop)");
    ok = ok && cudaCallOk(cudaEventRecord(start), "cudaEventRecord(start)");

    if (ok) {
      saxpy<<<(N + 255) / 256, 256>>>(N, 2.0f, d_x, d_y);

      // Check for kernel launch errors
      cudaError_t err = cudaGetLastError();
      if (err != cudaSuccess) {
        printf("CUDA kernel launch failed: %s\n", cudaGetErrorString(err));
        ok = false;
      }
    }

    ok = ok && cudaCallOk(cudaEventRecord(stop), "cudaEventRecord(stop)");
    ok = ok && cudaCallOk(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
    ok = ok && cudaCallOk(cudaEventElapsedTime(&milliseconds, start, stop), "cudaEventElapsedTime");

    ok = ok && cudaCallOk(cudaMemcpy(y, d_y, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(d_y -> y)");

    if (ok) {
      float maxError = 0.0f;
      for (int i = 0; i < N; i++)
        maxError = fmaxf(maxError, fabsf(y[i] - 4.0f));

      printf("N = 2^%d (%d): Max error = %f, Time = %.3f ms\n", exp, N, maxError, milliseconds);

      // Print a few y values to confirm correctness
      printf("Sample values: y[0] = %.1f, y[1] = %.1f, y[N-1] = %.1f\n\n", y[0], y[1], y[N-1]);
    }

    // Events are destroyed only if they were created; cudaFree/free accept null.
    if (start != NULL) cudaEventDestroy(start);
    if (stop != NULL) cudaEventDestroy(stop);
    cudaFree(d_x);
    cudaFree(d_y);
    free(x);
    free(y);
  }

  return 0;
}

Writing saxpy_debug.cu


In [7]:
!nvcc -arch=sm_70 saxpy_debug.cu -o saxpy_debug

In [8]:
!./saxpy_debug

N = 2^15 (32768): Max error = 0.000000, Time = 0.095 ms
Sample values: y[0] = 4.0, y[1] = 4.0, y[N-1] = 4.0

N = 2^16 (65536): Max error = 0.000000, Time = 0.012 ms
Sample values: y[0] = 4.0, y[1] = 4.0, y[N-1] = 4.0

N = 2^17 (131072): Max error = 0.000000, Time = 0.013 ms
Sample values: y[0] = 4.0, y[1] = 4.0, y[N-1] = 4.0

N = 2^18 (262144): Max error = 0.000000, Time = 0.020 ms
Sample values: y[0] = 4.0, y[1] = 4.0, y[N-1] = 4.0

N = 2^19 (524288): Max error = 0.000000, Time = 0.026 ms
Sample values: y[0] = 4.0, y[1] = 4.0, y[N-1] = 4.0

N = 2^20 (1048576): Max error = 0.000000, Time = 0.053 ms
Sample values: y[0] = 4.0, y[1] = 4.0, y[N-1] = 4.0

N = 2^21 (2097152): Max error = 0.000000, Time = 0.100 ms
Sample values: y[0] = 4.0, y[1] = 4.0, y[N-1] = 4.0

N = 2^22 (4194304): Max error = 0.000000, Time = 0.198 ms
Sample values: y[0] = 4.0, y[1] = 4.0, y[N-1] = 4.0

N = 2^23 (8388608): Max error = 0.000000, Time = 0.388 ms
Sample values: y[0] = 4.0, y[1] = 4.0, y[N-1] = 4.0

N = 2^24

In [9]:
%%writefile saxpy_fancy.cu
#include <stdio.h>
#include <math.h>
#include <cuda_runtime.h>

// Element-wise y <- a*x + y for the first n entries of x and y.
// Each thread derives one index from blockIdx.x, blockDim.x and threadIdx.x and
// updates that entry only when the index is below n.
__global__
void saxpy(int n, float a, float *x, float *y) {
  const int k = blockIdx.x * blockDim.x + threadIdx.x;
  if (k >= n) {
    return;  // surplus thread in the last block
  }

  const float xv = x[k];
  const float yv = y[k];
  y[k] = a * xv + yv;
}

// Prints a diagnostic for a failed CUDA runtime call and clears the recorded error
// so it cannot be mistaken for a launch failure in a later iteration.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaCallOk(cudaError_t status, const char *what) {
  if (status == cudaSuccess) return true;
  printf("%s failed: %s\n", what, cudaGetErrorString(status));
  cudaGetLastError();
  return false;
}

// Runs SAXPY (y = 2*x + y with x = 1, y = 2) for N = 2^15 .. 2^25 and reports two
// event-based timings per size: the kernel alone, and the span from just before host
// allocation to just after the device-to-host copy.
// A failure in any step skips the report for that size; every buffer and event
// created in an iteration is released before the next one starts.
int main(void) {
  for (int exp = 15; exp <= 25; exp++) {
    const int N = 1 << exp;
    const size_t bytes = N * sizeof(float);

    float *x = NULL, *y = NULL, *d_x = NULL, *d_y = NULL;
    cudaEvent_t total_start = NULL, total_stop = NULL;
    cudaEvent_t kernel_start = NULL, kernel_stop = NULL;
    float kernel_time = 0, total_time = 0;

    // Start total timer
    bool ok = cudaCallOk(cudaEventCreate(&total_start), "cudaEventCreate(total_start)");
    ok = ok && cudaCallOk(cudaEventCreate(&total_stop), "cudaEventCreate(total_stop)");
    ok = ok && cudaCallOk(cudaEventRecord(total_start), "cudaEventRecord(total_start)");

    if (ok) {
      x = (float*)malloc(bytes);
      y = (float*)malloc(bytes);
      if (x == NULL || y == NULL) {
        printf("Host malloc failed for N = %d\n", N);
        ok = false;
      }
    }

    if (ok) {
      // Allocate one buffer at a time so a failure on d_y cannot strand d_x.
      if (cudaMalloc(&d_x, bytes) != cudaSuccess) {
        d_x = NULL;
        ok = false;
      } else if (cudaMalloc(&d_y, bytes) != cudaSuccess) {
        d_y = NULL;
        ok = false;
      }
      if (!ok) {
        printf("CUDA malloc failed for N = %d\n", N);
        cudaGetLastError();  // reset the recorded allocation error
      }
    }

    if (ok) {
      for (int i = 0; i < N; i++) {
        x[i] = 1.0f;
        y[i] = 2.0f;
      }
    }

    ok = ok && cudaCallOk(cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(x -> d_x)");
    ok = ok && cudaCallOk(cudaMemcpy(d_y, y, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(y -> d_y)");

    // Start kernel-only timer
    ok = ok && cudaCallOk(cudaEventCreate(&kernel_start), "cudaEventCreate(kernel_start)");
    ok = ok && cudaCallOk(cudaEventCreate(&kernel_stop), "cudaEventCreate(kernel_stop)");
    ok = ok && cudaCallOk(cudaEventRecord(kernel_start), "cudaEventRecord(kernel_start)");

    if (ok) {
      saxpy<<<(N + 255) / 256, 256>>>(N, 2.0f, d_x, d_y);

      // A launch does not return a status; a bad configuration is reported here.
      cudaError_t err = cudaGetLastError();
      if (err != cudaSuccess) {
        printf("CUDA kernel launch failed: %s\n", cudaGetErrorString(err));
        ok = false;
      }
    }

    ok = ok && cudaCallOk(cudaEventRecord(kernel_stop), "cudaEventRecord(kernel_stop)");
    ok = ok && cudaCallOk(cudaEventSynchronize(kernel_stop), "cudaEventSynchronize(kernel_stop)");

    ok = ok && cudaCallOk(cudaMemcpy(y, d_y, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(d_y -> y)");

    // End total timer
    ok = ok && cudaCallOk(cudaEventRecord(total_stop), "cudaEventRecord(total_stop)");
    ok = ok && cudaCallOk(cudaEventSynchronize(total_stop), "cudaEventSynchronize(total_stop)");

    // Calculate times
    ok = ok && cudaCallOk(cudaEventElapsedTime(&kernel_time, kernel_start, kernel_stop), "cudaEventElapsedTime(kernel)");
    ok = ok && cudaCallOk(cudaEventElapsedTime(&total_time, total_start, total_stop), "cudaEventElapsedTime(total)");

    if (ok) {
      float maxError = 0.0f;
      for (int i = 0; i < N; i++)
        maxError = fmaxf(maxError, fabsf(y[i] - 4.0f));

      printf("N = 2^%d (%d): Max error = %.6f\n", exp, N, maxError);
      printf(" → Kernel-only time: %.3f ms\n", kernel_time);
      printf(" → Total execution time: %.3f ms\n", total_time);
      printf(" → Sample y[0] = %.1f, y[N-1] = %.1f\n\n", y[0], y[N - 1]);
    }

    // Cleanup: events are destroyed only if they were created; cudaFree/free accept null.
    cudaFree(d_x); cudaFree(d_y); free(x); free(y);
    if (total_start != NULL) cudaEventDestroy(total_start);
    if (total_stop != NULL) cudaEventDestroy(total_stop);
    if (kernel_start != NULL) cudaEventDestroy(kernel_start);
    if (kernel_stop != NULL) cudaEventDestroy(kernel_stop);
  }

  return 0;
}

Writing saxpy_fancy.cu


In [10]:
!nvcc -arch=sm_70 saxpy_fancy.cu -o saxpy_fancy

In [11]:
!./saxpy_fancy

N = 2^15 (32768): Max error = 0.000000
 → Kernel-only time: 0.098 ms
 → Total execution time: 0.576 ms
 → Sample y[0] = 4.0, y[N-1] = 4.0

N = 2^16 (65536): Max error = 0.000000
 → Kernel-only time: 0.011 ms
 → Total execution time: 0.749 ms
 → Sample y[0] = 4.0, y[N-1] = 4.0

N = 2^17 (131072): Max error = 0.000000
 → Kernel-only time: 0.015 ms
 → Total execution time: 1.170 ms
 → Sample y[0] = 4.0, y[N-1] = 4.0

N = 2^18 (262144): Max error = 0.000000
 → Kernel-only time: 0.020 ms
 → Total execution time: 2.186 ms
 → Sample y[0] = 4.0, y[N-1] = 4.0

N = 2^19 (524288): Max error = 0.000000
 → Kernel-only time: 0.022 ms
 → Total execution time: 4.029 ms
 → Sample y[0] = 4.0, y[N-1] = 4.0

N = 2^20 (1048576): Max error = 0.000000
 → Kernel-only time: 0.055 ms
 → Total execution time: 9.370 ms
 → Sample y[0] = 4.0, y[N-1] = 4.0

N = 2^21 (2097152): Max error = 0.000000
 → Kernel-only time: 0.102 ms
 → Total execution time: 17.589 ms
 → Sample y[0] = 4.0, y[N-1] = 4.0

N = 2^22 (4194304):

In [12]:
%%writefile fibonacci.cu
#include <stdio.h>

// Writes the first N Fibonacci numbers (F[0] = 0, F[1] = 1) into fib[0..N-1].
//
// fib: output buffer; the kernel stores to indices 0..N-1 only.
// N:   number of terms to produce. N <= 0 writes nothing.
//
// The sequence is computed serially by thread 0 of block 0; every other thread in
// the launch returns immediately. Sums use unsigned 64-bit arithmetic, so terms
// from F[94] onward wrap modulo 2^64.
__global__ void fibonacci_kernel(unsigned long long *fib, int N) {
  // Only one thread computes the whole sequence
  if (threadIdx.x != 0 || blockIdx.x != 0) return;

  // With no terms requested there is no valid slot to store F[0] into.
  if (N <= 0) return;

  fib[0] = 0;
  if (N > 1) fib[1] = 1;

  for (int i = 2; i < N; ++i) {
    fib[i] = fib[i - 1] + fib[i - 2];
  }
}

// Computes the first N Fibonacci numbers on the device with a single-thread launch
// and prints them as "F[i] = value", one per line.
// Returns 0 on success and 1 if the host allocation, a CUDA runtime call or the
// kernel launch fails; nothing is printed to stdout in that case.
int main() {
  // 93 terms: F[0] .. F[92]. F[93] is the last value that still fits in an
  // unsigned long long, so up to 94 terms could be produced without wrap-around.
  const int N = 93;
  const size_t bytes = N * sizeof(unsigned long long);
  unsigned long long *fib = NULL, *d_fib = NULL;
  cudaError_t err = cudaSuccess;
  const char *step = "";

  // Allocate memory on host and device
  fib = (unsigned long long*)malloc(bytes);
  if (fib == NULL) {
    fprintf(stderr, "Host allocation of %zu bytes failed\n", bytes);
    return 1;
  }

  step = "cudaMalloc";
  err = cudaMalloc(&d_fib, bytes);
  if (err != cudaSuccess) d_fib = NULL;

  if (err == cudaSuccess) {
    // Launch 1 thread to do the entire sequence computation
    fibonacci_kernel<<<1, 1>>>(d_fib, N);
    // A launch does not return a status; a bad configuration is reported here.
    step = "fibonacci_kernel launch";
    err = cudaGetLastError();
  }

  if (err == cudaSuccess) {
    // Copy back result; the blocking copy also reports faults raised by the kernel
    step = "cudaMemcpy";
    err = cudaMemcpy(fib, d_fib, bytes, cudaMemcpyDeviceToHost);
  }

  if (err == cudaSuccess) {
    // Print result
    for (int i = 0; i < N; i++)
      printf("F[%d] = %llu\n", i, fib[i]);
  } else {
    fprintf(stderr, "%s failed: %s\n", step, cudaGetErrorString(err));
  }

  // Free memory (cudaFree accepts a null pointer)
  cudaFree(d_fib);
  free(fib);
  return err == cudaSuccess ? 0 : 1;
}

Writing fibonacci.cu


In [13]:
!nvcc -arch=sm_70 fibonacci.cu -o fibo

In [14]:
!./fibo

F[0] = 0
F[1] = 1
F[2] = 1
F[3] = 2
F[4] = 3
F[5] = 5
F[6] = 8
F[7] = 13
F[8] = 21
F[9] = 34
F[10] = 55
F[11] = 89
F[12] = 144
F[13] = 233
F[14] = 377
F[15] = 610
F[16] = 987
F[17] = 1597
F[18] = 2584
F[19] = 4181
F[20] = 6765
F[21] = 10946
F[22] = 17711
F[23] = 28657
F[24] = 46368
F[25] = 75025
F[26] = 121393
F[27] = 196418
F[28] = 317811
F[29] = 514229
F[30] = 832040
F[31] = 1346269
F[32] = 2178309
F[33] = 3524578
F[34] = 5702887
F[35] = 9227465
F[36] = 14930352
F[37] = 24157817
F[38] = 39088169
F[39] = 63245986
F[40] = 102334155
F[41] = 165580141
F[42] = 267914296
F[43] = 433494437
F[44] = 701408733
F[45] = 1134903170
F[46] = 1836311903
F[47] = 2971215073
F[48] = 4807526976
F[49] = 7778742049
F[50] = 12586269025
F[51] = 20365011074
F[52] = 32951280099
F[53] = 53316291173
F[54] = 86267571272
F[55] = 139583862445
F[56] = 225851433717
F[57] = 365435296162
F[58] = 591286729879
F[59] = 956722026041
F[60] = 1548008755920
F[61] = 2504730781961
F[62] = 4052739537881
F[63] = 6557470319842
F[