In [1]:
import os, subprocess, shutil

# replace this with the nvcc path you posted (we compute CUDA_HOME automatically)
nvcc_path = "/packages/apps/spack/21/opt/spack/linux-rocky8-zen3/gcc-12.1.0/cuda-12.0.1-x3bnvayrybncl3rqu6zk4zzu4oztblqi/bin/nvcc"

if not os.path.exists(nvcc_path):
    raise FileNotFoundError(f"nvcc not found at {nvcc_path} — update nvcc_path if different")

cuda_home = os.path.dirname(os.path.dirname(nvcc_path))
os.environ["CUDA_HOME"] = cuda_home
os.environ["PATH"] = os.path.join(cuda_home, "bin") + os.pathsep + os.environ.get("PATH", "")
# Append existing LD_LIBRARY_PATH if present
os.environ["LD_LIBRARY_PATH"] = os.path.join(cuda_home, "lib64") + os.pathsep + os.environ.get("LD_LIBRARY_PATH", "")

print("CUDA_HOME =", cuda_home)
print("which nvcc ->", shutil.which("nvcc"))

# quick verify (runs in a shell inheriting these env vars)
proc = subprocess.run("/bin/bash -lc 'which nvcc && nvcc --version'", shell=True, capture_output=True, text=True, env=os.environ)
print(proc.stdout)
if proc.returncode != 0:
    print("nvcc failed to run; stderr:\n", proc.stderr)

CUDA_HOME = /packages/apps/spack/21/opt/spack/linux-rocky8-zen3/gcc-12.1.0/cuda-12.0.1-x3bnvayrybncl3rqu6zk4zzu4oztblqi
which nvcc -> /packages/apps/spack/21/opt/spack/linux-rocky8-zen3/gcc-12.1.0/cuda-12.0.1-x3bnvayrybncl3rqu6zk4zzu4oztblqi/bin/nvcc
/packages/apps/spack/21/opt/spack/linux-rocky8-zen3/gcc-12.1.0/cuda-12.0.1-x3bnvayrybncl3rqu6zk4zzu4oztblqi/bin/nvcc
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Fri_Jan__6_16:45:21_PST_2023
Cuda compilation tools, release 12.0, V12.0.140
Build cuda_12.0.r12.0/compiler.32267302_0



In [2]:
%%bash
export CUDA_HOME=/packages/apps/spack/21/opt/spack/linux-rocky8-zen3/gcc-12.1.0/cuda-12.0.1-x3bnvayrybncl3rqu6zk4zzu4oztblqi
export PATH=$CUDA_HOME/bin:$PATH
export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH

cat > add.cu <<'EOF'
#include <cstdio>

__global__ void add(int n, float *x, float *y) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) y[i] = x[i] + 1.0f;
}

int main() {
  int N = 16;
  float *x, *y;
  cudaMallocManaged(&x, N*sizeof(float));
  cudaMallocManaged(&y, N*sizeof(float));
  for (int i=0;i<N;i++) x[i]=i;
  add<<<1,32>>>(N,x,y);
  cudaDeviceSynchronize();
  for (int i=0;i<8;i++) printf("%f ", y[i]);
  printf("\n");
  cudaFree(x); cudaFree(y);
  return 0;
}
EOF

nvcc -std=c++17 add.cu -o add_test && ./add_test


1.000000 2.000000 3.000000 4.000000 5.000000 6.000000 7.000000 8.000000 


In [5]:
%%bash
cat > vector_add.cu <<'EOF'

#include <iostream>
#include <cuda_runtime.h>

// Define the size of the arrays
const int N = 1 << 20; // 1,048,576 elements

// --- KERNEL DEFINITION (Executed on Device) ---
// The kernel is the parallel function
__global__ void vectorAdd(const int *A, const int *B, int *C, int size) {
    // Calculate the unique global index 'i' for this thread
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Safety check: ensure the index is within the array bounds
    if (i < size) {
        C[i] = A[i] + B[i];
    }
}

// --- HOST MAIN FUNCTION (Executed on CPU) ---
int main() {
    // --- 0. Setup and Initialization ---
    // Memory size in bytes
    const int BYTES = N * sizeof(int);

    // Host Pointers (CPU memory, allocated with standard malloc)
    int *h_A, *h_B, *h_C;
    // Device Pointers (GPU memory, allocated with cudaMalloc)
    int *d_A, *d_B, *d_C;

    // Allocate Host memory
    h_A = (int*)malloc(BYTES);
    h_B = (int*)malloc(BYTES);
    h_C = (int*)malloc(BYTES);

    // Initialize Host arrays A and B
    for (int i = 0; i < N; i++) {
        h_A[i] = i;      // A = [0, 1, 2, 3, ...]
        h_B[i] = i * 2;  // B = [0, 2, 4, 6, ...]
    }
    std::cout << "Data initialized on Host (CPU)." << std::endl;

    // 1. MEMORY ALLOCATION (Device/GPU)
    cudaMalloc((void**)&d_A, BYTES);
    cudaMalloc((void**)&d_B, BYTES);
    cudaMalloc((void**)&d_C, BYTES);
    std::cout << "Memory allocated on Device (GPU)." << std::endl;

    // 2. DATA TRANSFER (Host -> Device)
    // Copy the initialized input arrays A and B from CPU to GPU
    cudaMemcpy(d_A, h_A, BYTES, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, BYTES, cudaMemcpyHostToDevice);
    std::cout << "Input data copied to Device." << std::endl;
    
    // --- 3. KERNEL LAUNCH SETUP ---
    // Define the execution configuration
    int threadsPerBlock = 256; 
    // Calculate the grid size (number of blocks) using ceiling division
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock; 

    // 4. KERNEL LAUNCH (The work begins on the GPU)
    std::cout << "Launching kernel: " << blocksPerGrid << " blocks, " 
              << threadsPerBlock << " threads/block." << std::endl;

    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);

    // Wait for the GPU to finish execution
    cudaDeviceSynchronize();
    
    // 5. DATA TRANSFER (Device -> Host)
    // Copy the result array C back from GPU to CPU
    cudaMemcpy(h_C, d_C, BYTES, cudaMemcpyDeviceToHost);
    std::cout << "Result copied back to Host." << std::endl;
    
    // --- 6. Validation and Output (Host) ---
    // Check first few results: C[i] = i + 2*i = 3*i
    if (h_C[0] == 0 && h_C[1] == 3 && h_C[N-1] == 3 * (N-1)) {
        std::cout << "SUCCESS! Vector addition verified." << std::endl;
        std::cout << "Example: C[1] = " << h_C[1] << " (Expected 3)" << std::endl;
        std::cout << "Example: C[" << N-1 << "] = " << h_C[N-1] << std::endl;
    } else {
        std::cerr << "FAILURE! Results did not match." << std::endl;
    }

    // 7. CLEANUP
    // Free Host memory
    free(h_A); 
    free(h_B); 
    free(h_C); 
    // Free Device memory
    cudaFree(d_A); 
    cudaFree(d_B); 
    cudaFree(d_C); 
    std::cout << "Cleanup complete. Exiting." << std::endl;

    return 0;
}

EOF

nvcc -std=c++17 vector_add.cu -o vector_add_test && ./vector_add_test

Data initialized on Host (CPU).
Memory allocated on Device (GPU).
Input data copied to Device.
Launching kernel: 4096 blocks, 256 threads/block.
Result copied back to Host.
SUCCESS! Vector addition verified.
Example: C[1] = 3 (Expected 3)
Example: C[1048575] = 3145725
Cleanup complete. Exiting.


In [8]:
%%bash
cat > vector_add_compare.cu <<'EOF'
#include <iostream>
#include <cuda_runtime.h>
#include <chrono>

const int N = 1 << 20;  // 1,048,576 elements

__global__ void vectorAdd(const int *A, const int *B, int *C, int size) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < size) C[i] = A[i] + B[i];
}

int main() {
    const int BYTES = N * sizeof(int);

    int *h_A = (int*)malloc(BYTES);
    int *h_B = (int*)malloc(BYTES);
    int *h_C = (int*)malloc(BYTES);

    for (int i = 0; i < N; i++) {
        h_A[i] = i;
        h_B[i] = i * 2;
    }

    // ---------------- CPU TIMING ----------------
    auto cpu_start = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < N; i++) {
        h_C[i] = h_A[i] + h_B[i];
    }
    auto cpu_end = std::chrono::high_resolution_clock::now();
    double cpu_ms = std::chrono::duration<double, std::milli>(cpu_end - cpu_start).count();
    std::cout << "CPU time: " << cpu_ms << " ms" << std::endl;

    // ---------------- GPU -----------------------
    int *d_A, *d_B, *d_C;
    cudaMalloc((void**)&d_A, BYTES);
    cudaMalloc((void**)&d_B, BYTES);
    cudaMalloc((void**)&d_C, BYTES);

    auto gpu_start = std::chrono::high_resolution_clock::now();

    cudaMemcpy(d_A, h_A, BYTES, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, BYTES, cudaMemcpyHostToDevice);

    int threads = 256;
    int blocks = (N + threads - 1) / threads;

    vectorAdd<<<blocks, threads>>>(d_A, d_B, d_C, N);
    cudaDeviceSynchronize();

    cudaMemcpy(h_C, d_C, BYTES, cudaMemcpyDeviceToHost);

    auto gpu_end = std::chrono::high_resolution_clock::now();
    double gpu_ms = std::chrono::duration<double, std::milli>(gpu_end - gpu_start).count();

    std::cout << "GPU time (HtoD + kernel + DtoH): " << gpu_ms << " ms" << std::endl;
    std::cout << "Speedup (CPU/GPU): " << cpu_ms / gpu_ms << "x" << std::endl;

    // Validate last result
    if (h_C[N-1] == 3*(N-1)) std::cout << "Result OK\n";
    else std::cout << "Wrong result!\n";

    free(h_A); free(h_B); free(h_C);
    cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
}
EOF

nvcc -std=c++17 vector_add_compare.cu -o vector_add_compare
./vector_add_compare


CPU time: 2.81444 ms
GPU time (HtoD + kernel + DtoH): 3.7577 ms
Speedup (CPU/GPU): 0.74898x
Result OK


In [10]:
%%bash
cat > reduce_sum.cu <<'EOF'
#include <iostream>
#include <cuda_runtime.h>
#include <cmath>

// Define the size and thread configuration
const int N = 1 << 20;     // 1,048,576 elements
const int THREADS_PER_BLOCK = 256; 

// --- KERNEL DEFINITION (Assumed to be defined here, copied from prior context) ---
__global__ void reduceSum(int *g_input, int *g_output, int size) {
    // Shared Memory declaration (size of the block)
    extern __shared__ int s_data[]; 
    
    int tid = threadIdx.x; 
    int i = blockIdx.x * blockDim.x + tid; // Global index

    // 1. Data Staging (Global -> Shared)
    if (i < size) {
        s_data[tid] = g_input[i];
    } else {
        s_data[tid] = 0; // Handle out-of-bounds safety
    }

    __syncthreads(); 

    // 2. Reduction Loop (Parallel Summation)
    for (int stride = blockDim.x / 2; stride > 0; stride /= 2) {
        if (tid < stride) {
            s_data[tid] += s_data[tid + stride];
        }
        __syncthreads(); 
    }
    
    // 3. Write Result (Only Thread 0 writes the final block sum)
    if (tid == 0) {
        g_output[blockIdx.x] = s_data[0]; 
    }
}
// --- END KERNEL DEFINITION ---


// --- HOST MAIN FUNCTION (Executed on CPU) ---
int main() {
    // --- 0. Setup and Initialization ---
    const int BYTES = N * sizeof(int);
    
    // Host Pointers
    int *h_A, *h_PartialSum, *h_FinalSum; 
    
    // Device Pointers
    int *d_A, *d_PartialSums, *d_FinalSum;

    // Allocate Host memory
    h_A = (int*)malloc(BYTES);
    h_FinalSum = (int*)malloc(sizeof(int));

    // Initialize Host array A (e.g., A = [1, 1, 1, ..., 1]. Expected sum = N)
    long long expected_sum = 0;
    for (int i = 0; i < N; i++) {
        h_A[i] = 1; 
        expected_sum += h_A[i];
    }
    std::cout << "Data initialized on Host. Expected Sum: " << expected_sum << std::endl;

    // 1. MEMORY ALLOCATION (Device/GPU)
    cudaMalloc((void**)&d_A, BYTES);

    // Calculate number of blocks for Pass 1 (K)
    int K = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; 
    
    // Allocate Device memory for K partial sums and 1 final sum
    cudaMalloc((void**)&d_PartialSums, K * sizeof(int));
    cudaMalloc((void**)&d_FinalSum, 1 * sizeof(int));
    
    // Allocate Host memory for partial sums result check (optional)
    h_PartialSum = (int*)malloc(K * sizeof(int));

    // 2. DATA TRANSFER (Host -> Device)
    cudaMemcpy(d_A, h_A, BYTES, cudaMemcpyHostToDevice);
    std::cout << "Input data copied to Device. K (Partial Sums) = " << K << std::endl;

    // --- PASS 1: Reduce N elements to K partial sums ---
    // Execution Configuration: K blocks, 256 threads/block. Shared memory size: 256 * sizeof(int)
    reduceSum<<<K, THREADS_PER_BLOCK, THREADS_PER_BLOCK * sizeof(int)>>>(d_A, d_PartialSums, N); 
    cudaDeviceSynchronize();
    
    std::cout << "Pass 1 complete. Partial sums created." << std::endl;

    // --- PASS 2: Reduce K partial sums to 1 final sum ---
    
    // Calculate blocks needed for Pass 2
    int blocksForPass2 = (K + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; 

    // If K is small (<= 1 block size), we can write the final result directly.
    // Otherwise, we use d_PartialSums as the input and d_FinalSum as the output.
    reduceSum<<<blocksForPass2, THREADS_PER_BLOCK, THREADS_PER_BLOCK * sizeof(int)>>>(d_PartialSums, d_FinalSum, K);
    cudaDeviceSynchronize();

    std::cout << "Pass 2 complete. Final sum calculated." << std::endl;

    // 3. DATA TRANSFER (Device -> Host)
    // Copy the single final result back to the host
    cudaMemcpy(h_FinalSum, d_FinalSum, sizeof(int), cudaMemcpyDeviceToHost); 
    
    // --- 4. Validation and Output (Host) ---
    if (*h_FinalSum == expected_sum) {
        std::cout << "✅ SUCCESS! Total Sum Verified: " << *h_FinalSum << std::endl;
    } else {
        std::cerr << "❌ FAILURE! Expected " << expected_sum << ", Got " << *h_FinalSum << std::endl;
    }

    // 5. CLEANUP
    free(h_A); 
    free(h_PartialSum); 
    free(h_FinalSum); 
    cudaFree(d_A); 
    cudaFree(d_PartialSums); 
    cudaFree(d_FinalSum); 
    std::cout << "Cleanup complete. Exiting." << std::endl;

    return 0;
}

EOF

nvcc -std=c++17 reduce_sum.cu -o reduce_sum_output && ./reduce_sum_output


Data initialized on Host. Expected Sum: 1048576
Input data copied to Device. K (Partial Sums) = 4096
Pass 1 complete. Partial sums created.
Pass 2 complete. Final sum calculated.
Cleanup complete. Exiting.


❌ FAILURE! Expected 1048576, Got 65536
