<a href="https://colab.research.google.com/github/BhavanaSrinivasegowda/HW_for_AI_ML/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvidia-smi

Wed Apr 30 21:25:55 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   60C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [3]:
%%writefile saxpy.cu
#include <stdio.h>
#include <math.h> // for fabsf

// SAXPY kernel: y[i] = a * x[i] + y[i] for every i in [0, n).
// Each thread updates the single element at its global 1-D index; threads
// whose index is n or larger return without touching memory.
__global__
void saxpy(int n, float a, float *x, float *y)
{
  int idx = blockIdx.x*blockDim.x + threadIdx.x;
  if (idx >= n) return;
  y[idx] = a*x[idx] + y[idx];
}

// Reports a failed CUDA runtime call on stderr; returns true when `err` is cudaSuccess.
static bool cudaOk(cudaError_t err, const char *what)
{
  if (err == cudaSuccess) return true;
  fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
  return false;
}

// Runs SAXPY (y = 2*x + y) on 2^20 elements with x = 1 and y = 2, then prints
// the largest deviation from the expected value 4.
// Returns 0 on success and 1 if any allocation, copy or the launch failed.
int main(void)
{
  const int N = 1<<20;
  const size_t bytes = N*sizeof(float);
  float *x = (float*)malloc(bytes);
  float *y = (float*)malloc(bytes);
  float *d_x = NULL, *d_y = NULL;

  bool ok = (x != NULL && y != NULL);
  if (!ok) fprintf(stderr, "Host allocation failed\n");

  ok = ok && cudaOk(cudaMalloc(&d_x, bytes), "cudaMalloc(d_x)");
  ok = ok && cudaOk(cudaMalloc(&d_y, bytes), "cudaMalloc(d_y)");

  if (ok) {
    for (int i = 0; i < N; i++) {
      x[i] = 1.0f;
      y[i] = 2.0f;
    }
    ok = cudaOk(cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_x)") &&
         cudaOk(cudaMemcpy(d_y, y, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_y)");
  }

  if (ok) {
    // Perform SAXPY on 1M elements
    saxpy<<<(N+255)/256, 256>>>(N, 2.0f, d_x, d_y);
    // A launch returns nothing, so its status has to be fetched explicitly;
    // otherwise a kernel that never ran would just yield a wrong "Max error".
    ok = cudaOk(cudaGetLastError(), "saxpy launch");
  }

  // The blocking copy back also reports errors raised while the kernel ran.
  ok = ok && cudaOk(cudaMemcpy(y, d_y, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(y)");

  if (ok) {
    float maxError = 0.0f;
    for (int i = 0; i < N; i++)
      maxError = fmaxf(maxError, fabsf(y[i]-4.0f));
    printf("Max error: %f\n", maxError);
  }

  // cudaFree and free both accept a null pointer, so no guards are needed.
  cudaFree(d_x);
  cudaFree(d_y);
  free(x);
  free(y);
  return ok ? 0 : 1;
}

Writing saxpy.cu


In [4]:
!nvcc -arch=sm_70 saxpy.cu -o saxpy

In [5]:
!./saxpy

Max error: 0.000000


In [6]:
%%writefile saxpy_debug.cu
#include <stdio.h>
#include <math.h>
#include <cuda_runtime.h>

// Element-wise y = a * x + y over the first n entries, one thread per entry.
__global__
void saxpy(int n, float a, float *x, float *y)
{
  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  // Threads whose global index falls outside [0, n) have nothing to update.
  if (tid < n) {
    y[tid] = a * x[tid] + y[tid];
  }
}

// Runs and times one SAXPY launch on 2^exp elements and prints the outcome.
// Host buffers, device buffers and both timing events are released on every
// exit path, so a failure at one size cannot leak into the following sizes.
static void run_saxpy_case(int exp)
{
  const int N = 1 << exp;
  const size_t bytes = N * sizeof(float);
  float *x = (float*)malloc(bytes);
  float *y = (float*)malloc(bytes);
  float *d_x = NULL, *d_y = NULL;
  cudaEvent_t start, stop;
  bool have_start = false, have_stop = false;

  do {
    if (x == NULL || y == NULL) {
      printf("Host malloc failed for N = %d\n", N);
      break;
    }

    if (cudaMalloc(&d_x, bytes) != cudaSuccess ||
        cudaMalloc(&d_y, bytes) != cudaSuccess) {
      printf("CUDA malloc failed for N = %d\n", N);
      // Reading the error resets it, so it is not reported again by the
      // launch check of a later size.
      cudaGetLastError();
      break;
    }

    for (int i = 0; i < N; i++) {
      x[i] = 1.0f;
      y[i] = 2.0f;
    }

    cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, y, bytes, cudaMemcpyHostToDevice);

    have_start = (cudaEventCreate(&start) == cudaSuccess);
    have_stop = have_start && (cudaEventCreate(&stop) == cudaSuccess);
    if (!have_stop) {
      printf("CUDA event creation failed for N = %d\n", N);
      break;
    }

    cudaEventRecord(start);
    saxpy<<<(N + 255) / 256, 256>>>(N, 2.0f, d_x, d_y);
    cudaEventRecord(stop);

    // Check for kernel launch errors
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
      printf("CUDA kernel launch failed: %s\n", cudaGetErrorString(err));
      break;
    }

    cudaEventSynchronize(stop);
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);

    // The blocking copy also surfaces errors raised while the kernel ran.
    err = cudaMemcpy(y, d_y, bytes, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
      printf("CUDA copy back failed: %s\n", cudaGetErrorString(err));
      break;
    }

    float maxError = 0.0f;
    for (int i = 0; i < N; i++)
      maxError = fmaxf(maxError, fabsf(y[i] - 4.0f));

    printf("N = 2^%d (%d): Max error = %f, Time = %.3f ms\n", exp, N, maxError, milliseconds);

    // Print a few y values to confirm correctness
    printf("Sample values: y[0] = %.1f, y[1] = %.1f, y[N-1] = %.1f\n\n", y[0], y[1], y[N-1]);
  } while (0);

  // Only destroy events that were actually created.
  if (have_start) cudaEventDestroy(start);
  if (have_stop) cudaEventDestroy(stop);
  cudaFree(d_x);
  cudaFree(d_y);
  free(x);
  free(y);
}

// Sweeps SAXPY over problem sizes 2^15 .. 2^25.
int main(void)
{
  for (int exp = 15; exp <= 25; exp++)
    run_saxpy_case(exp);

  return 0;
}

Writing saxpy_debug.cu


In [7]:
!nvcc -arch=sm_70 saxpy_debug.cu -o saxpy_debug

In [8]:
!./saxpy_debug

N = 2^15 (32768): Max error = 0.000000, Time = 0.099 ms
Sample values: y[0] = 4.0, y[1] = 4.0, y[N-1] = 4.0

N = 2^16 (65536): Max error = 0.000000, Time = 0.016 ms
Sample values: y[0] = 4.0, y[1] = 4.0, y[N-1] = 4.0

N = 2^17 (131072): Max error = 0.000000, Time = 0.013 ms
Sample values: y[0] = 4.0, y[1] = 4.0, y[N-1] = 4.0

N = 2^18 (262144): Max error = 0.000000, Time = 0.017 ms
Sample values: y[0] = 4.0, y[1] = 4.0, y[N-1] = 4.0

N = 2^19 (524288): Max error = 0.000000, Time = 0.022 ms
Sample values: y[0] = 4.0, y[1] = 4.0, y[N-1] = 4.0

N = 2^20 (1048576): Max error = 0.000000, Time = 0.053 ms
Sample values: y[0] = 4.0, y[1] = 4.0, y[N-1] = 4.0

N = 2^21 (2097152): Max error = 0.000000, Time = 0.102 ms
Sample values: y[0] = 4.0, y[1] = 4.0, y[N-1] = 4.0

N = 2^22 (4194304): Max error = 0.000000, Time = 0.192 ms
Sample values: y[0] = 4.0, y[1] = 4.0, y[N-1] = 4.0

N = 2^23 (8388608): Max error = 0.000000, Time = 0.386 ms
Sample values: y[0] = 4.0, y[1] = 4.0, y[N-1] = 4.0

N = 2^24

In [9]:
%%writefile saxpy_fancy.cu
#include <stdio.h>
#include <math.h>
#include <cuda_runtime.h>

// Per-thread SAXPY step: the thread with global index g computes
// y[g] = a * x[g] + y[g] when g < n and does nothing otherwise.
__global__
void saxpy(int n, float a, float *x, float *y) {
  const int gid = blockIdx.x * blockDim.x + threadIdx.x;
  const bool in_range = gid < n;
  if (in_range) y[gid] = a * x[gid] + y[gid];
}

// Runs SAXPY on 2^exp elements and prints the kernel-only time and the total
// time (host allocation + initialisation + copies + kernel).
// Every exit path releases the buffers and events it acquired; the original
// `continue` on allocation failure leaked the timer events and possibly d_x.
static void run_timed_case(int exp) {
  const int N = 1 << exp;
  const size_t bytes = N * sizeof(float);
  float *x = NULL, *y = NULL, *d_x = NULL, *d_y = NULL;

  // events[0]/[1] bracket the whole case, events[2]/[3] bracket only the kernel.
  cudaEvent_t events[4];
  int created = 0;
  while (created < 4 && cudaEventCreate(&events[created]) == cudaSuccess) created++;

  do {
    if (created < 4) {
      printf("CUDA event creation failed for N = %d\n", N);
      break;
    }
    cudaEvent_t total_start = events[0], total_stop = events[1];
    cudaEvent_t kernel_start = events[2], kernel_stop = events[3];

    // Start total timer
    cudaEventRecord(total_start);

    x = (float*)malloc(bytes);
    y = (float*)malloc(bytes);
    if (x == NULL || y == NULL) {
      printf("Host malloc failed for N = %d\n", N);
      break;
    }

    if (cudaMalloc(&d_x, bytes) != cudaSuccess ||
        cudaMalloc(&d_y, bytes) != cudaSuccess) {
      printf("CUDA malloc failed for N = %d\n", N);
      // Reading the error resets it so it is not reported against a later size.
      cudaGetLastError();
      break;
    }

    for (int i = 0; i < N; i++) {
      x[i] = 1.0f;
      y[i] = 2.0f;
    }

    cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, y, bytes, cudaMemcpyHostToDevice);

    // Kernel-only timer
    cudaEventRecord(kernel_start);
    saxpy<<<(N + 255) / 256, 256>>>(N, 2.0f, d_x, d_y);
    cudaEventRecord(kernel_stop);

    // A launch returns nothing; fetch its status explicitly so a kernel that
    // never ran is reported instead of producing misleading timings.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
      printf("CUDA kernel launch failed: %s\n", cudaGetErrorString(err));
      break;
    }
    cudaEventSynchronize(kernel_stop);

    err = cudaMemcpy(y, d_y, bytes, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
      printf("CUDA copy back failed: %s\n", cudaGetErrorString(err));
      break;
    }

    // End total timer
    cudaEventRecord(total_stop);
    cudaEventSynchronize(total_stop);

    // Calculate times
    float kernel_time = 0, total_time = 0;
    cudaEventElapsedTime(&kernel_time, kernel_start, kernel_stop);
    cudaEventElapsedTime(&total_time, total_start, total_stop);

    float maxError = 0.0f;
    for (int i = 0; i < N; i++)
      maxError = fmaxf(maxError, fabsf(y[i] - 4.0f));

    printf("N = 2^%d (%d): Max error = %.6f\n", exp, N, maxError);
    printf(" → Kernel-only time: %.3f ms\n", kernel_time);
    printf(" → Total execution time: %.3f ms\n", total_time);
    printf(" → Sample y[0] = %.1f, y[N-1] = %.1f\n\n", y[0], y[N - 1]);
  } while (0);

  // Cleanup: cudaFree/free accept null pointers; only created events are destroyed.
  cudaFree(d_x); cudaFree(d_y); free(x); free(y);
  for (int i = 0; i < created; i++) cudaEventDestroy(events[i]);
}

// Sweeps the timed SAXPY case over problem sizes 2^15 .. 2^25.
int main(void) {
  for (int exp = 15; exp <= 25; exp++)
    run_timed_case(exp);

  return 0;
}

Writing saxpy_fancy.cu


In [10]:
!nvcc -arch=sm_70 saxpy_fancy.cu -o saxpy_fancy

In [11]:
!./saxpy_fancy

N = 2^15 (32768): Max error = 0.000000
 → Kernel-only time: 0.133 ms
 → Total execution time: 0.641 ms
 → Sample y[0] = 4.0, y[N-1] = 4.0

N = 2^16 (65536): Max error = 0.000000
 → Kernel-only time: 0.015 ms
 → Total execution time: 0.825 ms
 → Sample y[0] = 4.0, y[N-1] = 4.0

N = 2^17 (131072): Max error = 0.000000
 → Kernel-only time: 0.013 ms
 → Total execution time: 1.201 ms
 → Sample y[0] = 4.0, y[N-1] = 4.0

N = 2^18 (262144): Max error = 0.000000
 → Kernel-only time: 0.019 ms
 → Total execution time: 2.192 ms
 → Sample y[0] = 4.0, y[N-1] = 4.0

N = 2^19 (524288): Max error = 0.000000
 → Kernel-only time: 0.023 ms
 → Total execution time: 4.267 ms
 → Sample y[0] = 4.0, y[N-1] = 4.0

N = 2^20 (1048576): Max error = 0.000000
 → Kernel-only time: 0.052 ms
 → Total execution time: 9.197 ms
 → Sample y[0] = 4.0, y[N-1] = 4.0

N = 2^21 (2097152): Max error = 0.000000
 → Kernel-only time: 0.104 ms
 → Total execution time: 18.682 ms
 → Sample y[0] = 4.0, y[N-1] = 4.0

N = 2^22 (4194304):

In [12]:
%%writefile fibonacci.cu
#include <stdio.h>

// Fills fib[0..N-1] with the first N Fibonacci numbers (F(0) = 0, F(1) = 1).
// The recurrence is serial, so only thread 0 of block 0 does any work.
// Nothing is written when N <= 0. Sums are unsigned 64-bit and wrap silently
// once a value no longer fits.
__global__ void fibonacci_kernel(unsigned long long *fib, int N) {
  if (threadIdx.x != 0 || blockIdx.x != 0) return;
  // Guard the empty case: writing fib[0] unconditionally would be out of
  // bounds for a zero-length buffer.
  if (N <= 0) return;

  fib[0] = 0;
  if (N > 1) fib[1] = 1;

  for (int i = 2; i < N; ++i) {
    fib[i] = fib[i - 1] + fib[i - 2];
  }
}

// Computes the first N Fibonacci numbers on the GPU and prints them.
// Returns 0 on success and 1 if an allocation, the launch or the copy failed.
int main() {
  // N terms, i.e. F(0)..F(N-1). F(93) is the largest Fibonacci number that
  // fits in an unsigned long long, so N can be at most 94 before values wrap.
  const int N = 93;
  const size_t bytes = N * sizeof(unsigned long long);
  unsigned long long *fib = (unsigned long long*)malloc(bytes);
  unsigned long long *d_fib = NULL;
  cudaError_t err = cudaSuccess;
  int status = 1;

  if (fib == NULL) {
    fprintf(stderr, "Host allocation failed\n");
  } else if ((err = cudaMalloc(&d_fib, bytes)) != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(err));
  } else {
    // Launch 1 thread to do the entire sequence computation
    fibonacci_kernel<<<1, 1>>>(d_fib, N);
    err = cudaGetLastError();

    // The blocking copy back also reports errors raised while the kernel ran.
    if (err == cudaSuccess)
      err = cudaMemcpy(fib, d_fib, bytes, cudaMemcpyDeviceToHost);

    if (err != cudaSuccess) {
      fprintf(stderr, "Fibonacci computation failed: %s\n", cudaGetErrorString(err));
    } else {
      // Print result
      for (int i = 0; i < N; i++)
        printf("F[%d] = %llu\n", i, fib[i]);
      status = 0;
    }
  }

  // Free memory (both calls accept a null pointer)
  cudaFree(d_fib);
  free(fib);
  return status;
}

Writing fibonacci.cu


In [13]:
!nvcc -arch=sm_70 fibonacci.cu -o fibo

In [14]:
!./fibo

F[0] = 0
F[1] = 1
F[2] = 1
F[3] = 2
F[4] = 3
F[5] = 5
F[6] = 8
F[7] = 13
F[8] = 21
F[9] = 34
F[10] = 55
F[11] = 89
F[12] = 144
F[13] = 233
F[14] = 377
F[15] = 610
F[16] = 987
F[17] = 1597
F[18] = 2584
F[19] = 4181
F[20] = 6765
F[21] = 10946
F[22] = 17711
F[23] = 28657
F[24] = 46368
F[25] = 75025
F[26] = 121393
F[27] = 196418
F[28] = 317811
F[29] = 514229
F[30] = 832040
F[31] = 1346269
F[32] = 2178309
F[33] = 3524578
F[34] = 5702887
F[35] = 9227465
F[36] = 14930352
F[37] = 24157817
F[38] = 39088169
F[39] = 63245986
F[40] = 102334155
F[41] = 165580141
F[42] = 267914296
F[43] = 433494437
F[44] = 701408733
F[45] = 1134903170
F[46] = 1836311903
F[47] = 2971215073
F[48] = 4807526976
F[49] = 7778742049
F[50] = 12586269025
F[51] = 20365011074
F[52] = 32951280099
F[53] = 53316291173
F[54] = 86267571272
F[55] = 139583862445
F[56] = 225851433717
F[57] = 365435296162
F[58] = 591286729879
F[59] = 956722026041
F[60] = 1548008755920
F[61] = 2504730781961
F[62] = 4052739537881
F[63] = 6557470319842
F[

In [15]:
%%writefile cuda_NN.cu
#include <iostream>
#include <cuda_runtime.h>
#include <cmath>

#define INPUT_SIZE 4
#define HIDDEN_SIZE 5
#define OUTPUT_SIZE 1

// Rectified linear unit: passes strictly positive inputs through, maps everything else to 0.
__device__ float relu(float x) {
    if (x > 0) {
        return x;
    }
    return 0;
}

// Logistic function 1 / (1 + e^-x), evaluated in single precision.
__device__ float sigmoid(float x) {
    const float decay = expf(-x);
    return 1.0f / (1.0f + decay);
}

// Hidden layer: thread t computes relu(dot(input, row t of weights1) + bias1[t]).
// weights1 is read row-major with INPUT_SIZE entries per hidden unit; threads
// with t >= HIDDEN_SIZE do nothing.
__global__ void compute_hidden(float *input, float *weights1, float *bias1, float *hidden_out) {
    const int unit = threadIdx.x;
    if (unit >= HIDDEN_SIZE) return;

    const float *row = weights1 + unit * INPUT_SIZE;
    float acc = 0;
    for (int k = 0; k < INPUT_SIZE; k++) {
        acc += input[k] * row[k];
    }
    hidden_out[unit] = relu(acc + bias1[unit]);
}

// Output layer: writes sigmoid(dot(hidden_out, weights2) + bias2[0]) to output[0].
// There is no thread-index guard, so every launched thread performs the same
// computation and store.
__global__ void compute_output(float *hidden_out, float *weights2, float *bias2, float *output) {
    float acc = 0;
    int k = 0;
    while (k < HIDDEN_SIZE) {
        acc += hidden_out[k] * weights2[k];
        ++k;
    }
    output[0] = sigmoid(acc + bias2[0]);
}

// Runs one forward pass of the fixed 4 -> 5 (ReLU) -> 1 (sigmoid) network on
// the GPU and prints the scalar output.
// Returns 0 on success; on the first CUDA failure it prints the error and
// returns 1 instead of printing an output that was never computed.
int main() {
    float h_input[INPUT_SIZE] = {1.0, 2.0, 3.0, 4.0};
    // Row-major: one row of INPUT_SIZE weights per hidden unit.
    float h_weights1[HIDDEN_SIZE * INPUT_SIZE] = {
        0.2, 0.4, 0.1, 0.3,
        0.5, 0.6, 0.2, 0.1,
        0.3, 0.8, 0.5, 0.2,
        0.9, 0.4, 0.3, 0.7,
        0.6, 0.5, 0.2, 0.8
    };
    float h_bias1[HIDDEN_SIZE] = {0.1, 0.2, 0.3, 0.1, 0.0};
    float h_weights2[HIDDEN_SIZE] = {0.3, 0.7, 0.5, 0.6, 0.4};
    float h_bias2 = 0.1;
    float h_output = 0;

    // Null-initialised so the unconditional cudaFree calls below are safe
    // even when an allocation failed.
    float *d_input = nullptr, *d_weights1 = nullptr, *d_bias1 = nullptr, *d_hidden_out = nullptr;
    float *d_weights2 = nullptr, *d_bias2 = nullptr, *d_output = nullptr;

    // Remember only the first failure; later calls are skipped once it is set.
    cudaError_t err = cudaSuccess;
    auto track = [&err](cudaError_t status) {
        if (err == cudaSuccess) err = status;
    };

    track(cudaMalloc((void**)&d_input, INPUT_SIZE * sizeof(float)));
    track(cudaMalloc((void**)&d_weights1, HIDDEN_SIZE * INPUT_SIZE * sizeof(float)));
    track(cudaMalloc((void**)&d_bias1, HIDDEN_SIZE * sizeof(float)));
    track(cudaMalloc((void**)&d_hidden_out, HIDDEN_SIZE * sizeof(float)));
    track(cudaMalloc((void**)&d_weights2, HIDDEN_SIZE * sizeof(float)));
    track(cudaMalloc((void**)&d_bias2, sizeof(float)));
    track(cudaMalloc((void**)&d_output, sizeof(float)));

    if (err == cudaSuccess) {
        track(cudaMemcpy(d_input, h_input, INPUT_SIZE * sizeof(float), cudaMemcpyHostToDevice));
        track(cudaMemcpy(d_weights1, h_weights1, HIDDEN_SIZE * INPUT_SIZE * sizeof(float), cudaMemcpyHostToDevice));
        track(cudaMemcpy(d_bias1, h_bias1, HIDDEN_SIZE * sizeof(float), cudaMemcpyHostToDevice));
        track(cudaMemcpy(d_weights2, h_weights2, HIDDEN_SIZE * sizeof(float), cudaMemcpyHostToDevice));
        track(cudaMemcpy(d_bias2, &h_bias2, sizeof(float), cudaMemcpyHostToDevice));
    }

    if (err == cudaSuccess) {
        compute_hidden<<<1, HIDDEN_SIZE>>>(d_input, d_weights1, d_bias1, d_hidden_out);
        compute_output<<<1, 1>>>(d_hidden_out, d_weights2, d_bias2, d_output);
        // Launches return nothing, so their status must be fetched explicitly.
        track(cudaGetLastError());
        // The blocking copy also reports errors raised while the kernels ran.
        track(cudaMemcpy(&h_output, d_output, sizeof(float), cudaMemcpyDeviceToHost));
    }

    if (err == cudaSuccess) {
        std::cout << "Output: " << h_output << std::endl;
    } else {
        std::cerr << "CUDA error: " << cudaGetErrorString(err) << std::endl;
    }

    cudaFree(d_input);
    cudaFree(d_weights1);
    cudaFree(d_bias1);
    cudaFree(d_hidden_out);
    cudaFree(d_weights2);
    cudaFree(d_bias2);
    cudaFree(d_output);

    return err == cudaSuccess ? 0 : 1;
}


Overwriting cuda_NN.cu


In [16]:
!nvcc -arch=sm_70 cuda_NN.cu -o NN

In [17]:
!./NN

Output: 0.999976


In [18]:
%%writefile pytorch_NN.py
import torch
import torch.nn as nn
import torch.nn.functional as F

class SimpleFFNN(nn.Module):
    """Two-layer feed-forward net: Linear(4 -> 5) + ReLU, then Linear(5 -> 1) + sigmoid."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(4, 5)
        self.fc2 = nn.Linear(5, 1)

    def forward(self, x):
        """Return the sigmoid output for `x`, whose last dimension must be 4."""
        hidden = F.relu(self.fc1(x))
        return torch.sigmoid(self.fc2(hidden))

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Build the network with its default random initialisation and place it on `device`.
model = SimpleFFNN()
model = model.to(device)

# A single sample with four features, shaped (1, 4).
input_tensor = torch.tensor([[1.0, 2.0, 3.0, 4.0]], device=device)
output = model(input_tensor)

print("Using device:", device)
print("Output:", output.item())

Writing pytorch_NN.py


In [19]:
!pip install torch --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m59.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m54.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m41.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [20]:
# Import necessary libraries
import torch
import torch.nn as nn
import torch.nn.functional as F

# Define a simple feedforward neural network
class SimpleFFNN(nn.Module):
    """Feed-forward network with one hidden layer: 4 inputs, 5 hidden units, 1 output."""

    def __init__(self):
        super().__init__()
        # Input layer (4) → Hidden layer (5)
        self.fc1 = nn.Linear(4, 5)
        # Hidden layer (5) → Output layer (1)
        self.fc2 = nn.Linear(5, 1)

    def forward(self, x):
        """Apply fc1 + ReLU, then fc2 + sigmoid, and return the result."""
        hidden_activations = F.relu(self.fc1(x))
        output_logits = self.fc2(hidden_activations)
        return torch.sigmoid(output_logits)

# Seed PyTorch's RNG so the randomly initialised weights, and therefore the
# printed output, are the same on every run of this cell.
torch.manual_seed(0)

# Choose device: GPU if available, else CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize model, move it to the device and switch it to evaluation mode
model = SimpleFFNN().to(device).eval()

# Dummy input tensor: one sample with four features
input_tensor = torch.tensor([[1.0, 2.0, 3.0, 4.0]], device=device)

# Forward pass; no_grad skips building an autograd graph that inference never uses
with torch.no_grad():
    output = model(input_tensor)

# Print results
print("Using device:", device)
print("Model output:", output.item())

Using device: cuda
Model output: 0.42889878153800964


In [21]:
%%writefile cuda_NN_timing.cu
#include <iostream>
#include <cuda_runtime.h>
#include <cmath>
#include <chrono>

#define INPUT_SIZE 4
#define HIDDEN_SIZE 5
#define OUTPUT_SIZE 1
#define ITERATIONS 10000

// ReLU activation: returns x when x > 0 and 0 for every other input.
__device__ float relu(float x) {
    const bool positive = x > 0;
    return positive ? x : 0;
}

// Sigmoid activation, 1 / (1 + exp(-x)), using the single-precision expf.
__device__ float sigmoid(float x) {
    const float denominator = 1.0f + expf(-x);
    return 1.0f / denominator;
}

// Computes one hidden activation per thread:
//   hidden_out[t] = relu(sum_j input[j] * weights1[t * INPUT_SIZE + j] + bias1[t])
// Threads with index >= HIDDEN_SIZE exit immediately.
__global__ void compute_hidden(float *input, float *weights1, float *bias1, float *hidden_out) {
    const int unit = threadIdx.x;
    if (unit >= HIDDEN_SIZE) {
        return;
    }

    const int row_start = unit * INPUT_SIZE;
    float weighted_sum = 0;
    for (int j = 0; j < INPUT_SIZE; j++) {
        weighted_sum += input[j] * weights1[row_start + j];
    }
    hidden_out[unit] = relu(weighted_sum + bias1[unit]);
}

// Computes the single network output:
//   output[0] = sigmoid(sum_i hidden_out[i] * weights2[i] + bias2[0])
// No thread index is consulted, so every launched thread does the same work.
__global__ void compute_output(float *hidden_out, float *weights2, float *bias2, float *output) {
    float weighted_sum = 0;
    for (int unit = 0; unit < HIDDEN_SIZE; ++unit) {
        weighted_sum += hidden_out[unit] * weights2[unit];
    }
    const float logit = weighted_sum + *bias2;
    *output = sigmoid(logit);
}

// Benchmarks the two-kernel forward pass: 100 untimed warm-up passes, then
// ITERATIONS timed passes, printing the average wall-clock time per pass and
// the final network output.
int main() {
    float h_input[INPUT_SIZE] = {1.0, 2.0, 3.0, 4.0};
    // Row-major: one row of INPUT_SIZE weights per hidden unit.
    float h_weights1[HIDDEN_SIZE * INPUT_SIZE] = {
        0.2, 0.4, 0.1, 0.3,
        0.5, 0.6, 0.2, 0.1,
        0.3, 0.8, 0.5, 0.2,
        0.9, 0.4, 0.3, 0.7,
        0.6, 0.5, 0.2, 0.8
    };
    float h_bias1[HIDDEN_SIZE] = {0.1, 0.2, 0.3, 0.1, 0.0};
    float h_weights2[HIDDEN_SIZE] = {0.3, 0.7, 0.5, 0.6, 0.4};
    float h_bias2 = 0.1;
    float h_output = 0;

    // Byte sizes of the device buffers.
    const size_t input_bytes = INPUT_SIZE * sizeof(float);
    const size_t weights1_bytes = HIDDEN_SIZE * INPUT_SIZE * sizeof(float);
    const size_t hidden_bytes = HIDDEN_SIZE * sizeof(float);
    const size_t scalar_bytes = sizeof(float);

    float *d_input, *d_weights1, *d_bias1, *d_hidden_out;
    float *d_weights2, *d_bias2, *d_output;

    cudaMalloc(&d_input, input_bytes);
    cudaMalloc(&d_weights1, weights1_bytes);
    cudaMalloc(&d_bias1, hidden_bytes);
    cudaMalloc(&d_hidden_out, hidden_bytes);
    cudaMalloc(&d_weights2, hidden_bytes);
    cudaMalloc(&d_bias2, scalar_bytes);
    cudaMalloc(&d_output, scalar_bytes);

    cudaMemcpy(d_input, h_input, input_bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_weights1, h_weights1, weights1_bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_bias1, h_bias1, hidden_bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_weights2, h_weights2, hidden_bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_bias2, &h_bias2, scalar_bytes, cudaMemcpyHostToDevice);

    // One forward pass: the hidden-layer kernel followed by the output kernel.
    auto forward_pass = [&]() {
        compute_hidden<<<1, HIDDEN_SIZE>>>(d_input, d_weights1, d_bias1, d_hidden_out);
        compute_output<<<1, 1>>>(d_hidden_out, d_weights2, d_bias2, d_output);
    };

    // Warmup
    for (int pass = 0; pass < 100; ++pass) {
        forward_pass();
    }
    cudaDeviceSynchronize();

    // Benchmarking: launches are asynchronous, so the clock is stopped only
    // after the device has finished all queued work.
    using hires_clock = std::chrono::high_resolution_clock;
    const auto start = hires_clock::now();
    for (int pass = 0; pass < ITERATIONS; ++pass) {
        forward_pass();
    }
    cudaDeviceSynchronize();
    const auto end = hires_clock::now();

    const std::chrono::duration<double, std::milli> elapsed = end - start;
    std::cout << "CUDA Avg Inference Time: " << (elapsed.count() / ITERATIONS) << " ms" << std::endl;

    cudaMemcpy(&h_output, d_output, scalar_bytes, cudaMemcpyDeviceToHost);
    std::cout << "Final Output: " << h_output << std::endl;

    cudaFree(d_input);
    cudaFree(d_weights1);
    cudaFree(d_bias1);
    cudaFree(d_hidden_out);
    cudaFree(d_weights2);
    cudaFree(d_bias2);
    cudaFree(d_output);

    return 0;
}


Writing cuda_NN_timing.cu


In [22]:
!nvcc -arch=sm_70 cuda_NN_timing.cu -o NN_timing

In [23]:
!./NN_timing

CUDA Avg Inference Time: 0.00681973 ms
Final Output: 0.999976


In [24]:
%%writefile pytorch_NN_timing.py
import torch
import torch.nn as nn
import torch.nn.functional as F
import time

class SimpleFFNN(nn.Module):
    """Small MLP used as the benchmark subject: 4 -> 5 (ReLU) -> 1 (sigmoid)."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(4, 5)
        self.fc2 = nn.Linear(5, 1)

    def forward(self, x):
        """Run the two layers on `x` and return the sigmoid output."""
        return torch.sigmoid(self.fc2(F.relu(self.fc1(x))))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SimpleFFNN().to(device).eval()

input_tensor = torch.tensor([[1.0, 2.0, 3.0, 4.0]], device=device)
iterations = 10000

# This is an inference benchmark: no_grad keeps autograd bookkeeping out of
# the measured time.
with torch.no_grad():
    # Warm-up
    for _ in range(100):
        _ = model(input_tensor)
    if device.type == "cuda":
        torch.cuda.synchronize()

    # Benchmarking. perf_counter is monotonic and has a higher resolution than
    # time.time(), which matters for sub-millisecond iterations.
    start = time.perf_counter()
    for _ in range(iterations):
        output = model(input_tensor)
        # CUDA work is asynchronous; wait for it so each iteration is fully counted.
        if device.type == "cuda":
            torch.cuda.synchronize()
    end = time.perf_counter()

avg_time = (end - start) * 1000 / iterations
print(f"PyTorch Avg Inference Time: {avg_time:.6f} ms")
print("Final Output:", output.item())


Writing pytorch_NN_timing.py


In [25]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import time

class SimpleFFNN(nn.Module):
    """4-input, 1-output network with a single 5-unit ReLU hidden layer and a sigmoid output."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(4, 5)
        self.fc2 = nn.Linear(5, 1)

    def forward(self, x):
        """Return sigmoid(fc2(relu(fc1(x))))."""
        pre_activation = self.fc1(x)
        hidden = F.relu(pre_activation)
        logit = self.fc2(hidden)
        return torch.sigmoid(logit)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SimpleFFNN().to(device).eval()

input_tensor = torch.tensor([[1.0, 2.0, 3.0, 4.0]], device=device)
iterations = 10000

# Inference only: disable autograd so graph construction is not part of the
# time being measured.
with torch.no_grad():
    # Warm-up
    for _ in range(100):
        _ = model(input_tensor)
    if device.type == "cuda":
        torch.cuda.synchronize()

    # Benchmarking with a monotonic, high-resolution clock
    start = time.perf_counter()
    for _ in range(iterations):
        output = model(input_tensor)
        # Wait for the asynchronous GPU work before moving to the next iteration.
        if device.type == "cuda":
            torch.cuda.synchronize()
    end = time.perf_counter()

avg_time = (end - start) * 1000 / iterations
print(f"PyTorch Avg Inference Time: {avg_time:.6f} ms")
print("Final Output:", output.item())

PyTorch Avg Inference Time: 0.235297 ms
Final Output: 0.2794197201728821


In [1]:
%%writefile NN_SIZES_compare.py
import torch
import torch.nn as nn
import time
import matplotlib.pyplot as plt

# Define FFNN model builder
def create_model(input_size, hidden_size, depth, output_size):
    """Build a ReLU multilayer perceptron that ends in a sigmoid.

    The stack is Linear(input_size, hidden_size), followed by ``depth - 1``
    repetitions of ReLU + Linear(hidden_size, hidden_size), followed by ReLU,
    Linear(hidden_size, output_size) and Sigmoid.

    Returns:
        An ``nn.Sequential`` holding the layers in that order.
    """
    stack = [nn.Linear(input_size, hidden_size)]
    for _ in range(depth - 1):
        stack.extend((nn.ReLU(), nn.Linear(hidden_size, hidden_size)))
    stack.extend((nn.ReLU(), nn.Linear(hidden_size, output_size), nn.Sigmoid()))
    return nn.Sequential(*stack)

# Benchmark function
def benchmark_model(model, input_tensor, iterations=1000):
    """Measure the average forward-pass latency of `model` on `input_tensor`.

    The model is switched to eval mode and every pass runs under
    ``torch.no_grad()``, so autograd graph construction is not part of the
    measurement. Ten untimed warm-up passes run first. On CUDA the device is
    synchronised after every timed pass because kernel launches are asynchronous.

    Args:
        model: the ``nn.Module`` to time.
        input_tensor: tensor passed to ``model`` on every call.
        iterations: number of timed forward passes.

    Returns:
        Average wall-clock time per forward pass, in milliseconds.
    """
    model.eval()
    # Use the device of the first parameter; a parameter-less module falls
    # back to the input's device instead of raising StopIteration.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else input_tensor.device
    on_cuda = device.type == 'cuda'

    with torch.no_grad():
        # Warm-up
        for _ in range(10):
            model(input_tensor)
        if on_cuda:
            torch.cuda.synchronize()

        # Benchmarking with a monotonic, high-resolution clock
        start = time.perf_counter()
        for _ in range(iterations):
            model(input_tensor)
            if on_cuda:
                torch.cuda.synchronize()
        end = time.perf_counter()

    return (end - start) * 1000 / iterations  # average ms

# Config: every (width, depth) pair below is benchmarked once
depths = [2, 4, 6, 8, 10]
widths = [64, 128, 256, 512]
input_size = 4
output_size = 1
batch_size = 1
iterations = 1000
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Maps a legend label ("Width <w>") to the average times, one entry per depth
results = {}

# Benchmarking loop
for width in widths:
    label = f'Width {width}'
    results[label] = []
    for depth in depths:
        model = create_model(input_size, width, depth, output_size).to(device)
        input_tensor = torch.rand(batch_size, input_size, device=device)
        results[label].append(benchmark_model(model, input_tensor, iterations))

# Plotting: one line per width, depth on the x-axis
fig, ax = plt.subplots(figsize=(10, 6))
for label, times in results.items():
    ax.plot(depths, times, marker='o', label=label)

ax.set_xlabel('Network Depth (# of Layers)')
ax.set_ylabel('Avg Inference Time per Sample (ms)')
ax.set_title('PyTorch FFNN Benchmark (Variable Depth & Width)')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()


Writing NN_SIZES_compare.py


In [2]:
%%writefile sorting_code.py
import torch

# Enable GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def bubble_sort_gpu(arr):
    """Bubble-sort `arr` in ascending order inside a float32 tensor on `device`.

    Args:
        arr: sequence of numbers accepted by ``torch.tensor``; it is copied,
            not modified.

    Returns:
        A float32 NumPy array holding the sorted values.
    """
    n = len(arr)
    data = torch.tensor(arr, dtype=torch.float32, device=device)

    for i in range(n):
        swapped = False
        for j in range(n - i - 1):
            # Every .item() pulls one scalar back to Python (a device-to-host
            # copy when the tensor is on the GPU), so read each neighbour once
            # and reuse the Python floats for the swap.
            left = data[j].item()
            right = data[j + 1].item()
            if left > right:
                data[j] = right
                data[j + 1] = left
                swapped = True
        if not swapped:
            # A full pass without a swap means the data is already sorted.
            break

    return data.cpu().numpy()

# Read one line of whitespace-separated numbers and parse each token as a float
user_input = input("Enter numbers separated by spaces: ")
arr = [float(token) for token in user_input.strip().split()]

# Sort the parsed values and show the result
sorted_arr = bubble_sort_gpu(arr)
print("Sorted Array:", sorted_arr)

Writing sorting_code.py


In [3]:
%%writefile sorting_compare.py
import torch
import time
import matplotlib.pyplot as plt
import numpy as np

# Run on the GPU when PyTorch can see one, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

def bubble_sort_gpu(arr):
    """Return the values of `arr` sorted ascending as a float32 NumPy array.

    The values are copied into a float32 tensor on `device` and bubble-sorted
    there one comparison at a time; `arr` itself is not modified.
    """
    n = len(arr)
    data = torch.tensor(arr, dtype=torch.float32, device=device)

    # `unsorted` is the length of the prefix that may still be out of order;
    # each outer pass moves the largest value of that prefix to its end.
    for unsorted in range(n, 0, -1):
        for left in range(unsorted - 1):
            right = left + 1
            if data[left].item() > data[right].item():
                held = data[left].item()
                data[left] = data[right].item()
                data[right] = held

    return data.cpu().numpy()

sizes = [10, 100, 500, 1000]
times = []

# Untimed warm-up call: one-off start-up work on the first GPU use would
# otherwise be charged to the smallest array size.
bubble_sort_gpu([2.0, 1.0])
if device.type == 'cuda':
    torch.cuda.synchronize()

for size in sizes:
    arr = np.random.rand(size) * 1000
    # perf_counter is monotonic and higher resolution than time.time()
    start = time.perf_counter()
    bubble_sort_gpu(arr)
    if device.type == 'cuda':
        torch.cuda.synchronize()
    end = time.perf_counter()
    times.append((end - start) * 1000)  # seconds -> milliseconds

plt.plot(sizes, times, marker='o')
plt.title("Bubble Sort on GPU - Execution Time vs Input Size")
plt.xlabel("Array Size")
plt.ylabel("Execution Time (ms)")
plt.grid(True)
plt.show()

Writing sorting_compare.py


In [6]:
%%writefile sorting_test.py
import unittest

import numpy as np
import torch

# Bubble sort function (from your code)
def bubble_sort_gpu(arr):
    """Bubble-sort `arr` in ascending order inside a float32 tensor.

    The tensor lives on the GPU when PyTorch reports one and on the CPU
    otherwise. `arr` is copied, not modified.

    Returns:
        A float32 NumPy array holding the sorted values.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n = len(arr)
    data = torch.tensor(arr, dtype=torch.float32, device=device)

    for i in range(n):
        swapped = False
        for j in range(n - i - 1):
            # Every .item() pulls one scalar back to Python, so read each
            # neighbour once and reuse the Python floats for the swap.
            left = data[j].item()
            right = data[j + 1].item()
            if left > right:
                data[j] = right
                data[j + 1] = left
                swapped = True
        if not swapped:
            # A full pass without a swap means the data is already sorted.
            break

    return data.cpu().numpy()

# Unit test class
class TestBubbleSortGPU(unittest.TestCase):
    """Checks bubble_sort_gpu on fixed inputs, comparing results with np.allclose."""

    def _assert_sorts_to(self, values, expected):
        """Assert that sorting `values` gives a result np.allclose to `expected`."""
        self.assertTrue(np.allclose(bubble_sort_gpu(values), expected))

    def test_sorted_array(self):
        self._assert_sorts_to([1, 2, 3], [1, 2, 3])

    def test_reverse_array(self):
        self._assert_sorts_to([3, 2, 1], [1, 2, 3])

    def test_unsorted_array(self):
        values = [64, 34, 25, 12, 22, 11, 90]
        self._assert_sorts_to(values, sorted(values))

    def test_duplicates(self):
        values = [5, 1, 2, 2, 3]
        self._assert_sorts_to(values, sorted(values))

    def test_empty_array(self):
        self._assert_sorts_to([], [])

    def test_single_element(self):
        self._assert_sorts_to([42], [42])

if __name__ == '__main__':
    # numpy is imported at module level, so the test methods can resolve `np`
    # even when a test runner imports this module instead of executing it.
    # argv=[''] stops unittest from parsing the host process's command line,
    # and exit=False stops it from calling sys.exit() when the run finishes.
    unittest.main(argv=[''], exit=False)

Overwriting sorting_test.py


In [8]:
!python sorting_test.py

......
----------------------------------------------------------------------
Ran 6 tests in 0.373s

OK
