In [None]:
# Fetch the CUDA 11.8 runfile installer (about 4 GB). `-c` resumes a partial
# download and does nothing when the complete file is already present, so
# re-running this cell neither repeats the transfer nor leaves `.run.1` copies.
!wget -c https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run
# Run the installer non-interactively, requesting only the toolkit component.
# NOTE(review): the recorded output of this cell ends with
# "Unknown option: --silent --toolkit", so the installer may have rejected the
# flags — confirm that /usr/local/cuda-11.8 exists before relying on it.
!sudo sh cuda_11.8.0_520.61.05_linux.run --silent --toolkit

--2025-04-22 09:48:37--  https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run
Resolving developer.download.nvidia.com (developer.download.nvidia.com)... 23.59.88.195, 23.59.88.207
Connecting to developer.download.nvidia.com (developer.download.nvidia.com)|23.59.88.195|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4336730777 (4.0G) [application/octet-stream]
Saving to: ‘cuda_11.8.0_520.61.05_linux.run’


2025-04-22 09:49:08 (132 MB/s) - ‘cuda_11.8.0_520.61.05_linux.run’ saved [4336730777/4336730777]

Unknown option: --silent --toolkit


In [None]:
import os

CUDA_HOME = '/usr/local/cuda-11.8'


def prepend_env_path(var: str, entry: str) -> None:
    """Put ``entry`` at the front of the colon-separated variable ``var``.

    Works when ``var`` is unset or empty (the original cell raised ``KeyError``
    for an unset ``LD_LIBRARY_PATH``) and is idempotent: an existing occurrence
    of ``entry`` is moved to the front rather than duplicated, so the cell can
    be re-run safely.
    """
    current = os.environ.get(var)
    # ''.split(':') would yield [''], which would leave a trailing ':' behind.
    parts = current.split(':') if current else []
    os.environ[var] = ':'.join([entry] + [p for p in parts if p != entry])


# Make the CUDA 11.8 binaries and shared libraries take precedence for the
# shell commands launched by later cells.
prepend_env_path('PATH', f'{CUDA_HOME}/bin')
prepend_env_path('LD_LIBRARY_PATH', f'{CUDA_HOME}/lib64')

In [None]:
%%writefile non_linear_cpu.cpp
// non_linear_cpu.cpp
#include <stdio.h>
#include <math.h>
#include <chrono>

#define N 10000000

// Element-wise transform: for each of the first n entries,
// y[k] = sinf(x[k]) + logf(x[k] + 1.0f) + sqrtf(x[k]).
// Does nothing when n <= 0.
void nonlinear_cpu(float* x, float* y, int n) {
    const float* src = x;
    float* dst = y;
    for (int remaining = n; remaining > 0; --remaining, ++src, ++dst) {
        // Read the input once; the three terms are summed in the same order
        // as before so the floating-point result is unchanged.
        const float v = *src;
        *dst = sinf(v) + logf(v + 1.0f) + sqrtf(v);
    }
}

// CPU benchmark driver: fills N inputs in [0, 1], times a single pass of
// nonlinear_cpu over them, and prints the elapsed seconds plus a checksum of
// the output. Returns 0 on completion.
int main() {
    float* x = new float[N];
    float* y = new float[N];

    // Initialize input. rand() is left unseeded, so every run sees the same data.
    for (int i = 0; i < N; i++) {
        x[i] = ((float)rand() / RAND_MAX);
    }

    // steady_clock is monotonic; high_resolution_clock may alias system_clock,
    // which can be adjusted while the measurement is in progress.
    auto t1 = std::chrono::steady_clock::now();
    nonlinear_cpu(x, y, N);
    auto t2 = std::chrono::steady_clock::now();

    std::chrono::duration<double> elapsed = t2 - t1;
    printf("CPU time: %.4f s\n", elapsed.count());

    // Consume the output (outside the timed region) so an optimizing build
    // cannot discard the measured work, and so the result can be sanity-checked.
    double checksum = 0.0;
    for (int i = 0; i < N; i++) {
        checksum += y[i];
    }
    printf("Checksum: %.6f\n", checksum);

    delete[] x;
    delete[] y;
    return 0;
}


Writing non_linear_cpu.cpp


In [None]:
# Build the CPU baseline written by the %%writefile cell above into ./cpu_exec.
!nvcc non_linear_cpu.cpp -o cpu_exec

In [None]:
# Run the CPU baseline built by the previous cell; it reports the elapsed time of one nonlinear_cpu pass.
!./cpu_exec

CPU time: 0.3272 s


In [None]:
%%writefile non_linear_gpu.cu
// non_linear_gpu.cu
#include <stdio.h>
#include <math.h>
#include <cuda_runtime.h>

#define N 10000000

// Element-wise kernel: the thread with global index i computes
// y[i] = sinf(x[i]) + logf(x[i] + 1.0f) + sqrtf(x[i]).
// Threads whose index is not below n exit without touching memory.
__global__ void nonlinear_kernel(float* x, float* y, int n) {
    const int i = threadIdx.x + blockDim.x * blockIdx.x;
    if (i >= n) {
        return;
    }
    // Load the input once; the terms are summed in the original order.
    const float v = x[i];
    y[i] = sinf(v) + logf(v + 1.0f) + sqrtf(v);
}

// Prints a diagnostic to stderr when a CUDA runtime call did not return
// cudaSuccess; returns true on success so the caller can bail out early.
static bool cuda_ok(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// GPU benchmark driver: fills N inputs in [0, 1] on the host, copies them to
// the device, times one launch of nonlinear_kernel with CUDA events (kernel
// only; the host<->device copies are outside the timed region), copies the
// result back and prints a checksum.
// Returns 0 on success and 1 if a host allocation or any CUDA call fails; on
// the failure paths the remaining cleanup is left to process exit.
int main() {
    const size_t bytes = N * sizeof(float);

    float* x_host = (float*)malloc(bytes);
    float* y_host = (float*)malloc(bytes);
    if (x_host == NULL || y_host == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", bytes);
        free(x_host);
        free(y_host);
        return 1;
    }

    // rand() is left unseeded, so every run sees the same data.
    for (int i = 0; i < N; i++) {
        x_host[i] = ((float)rand() / RAND_MAX);
    }

    float* x_dev = NULL;
    float* y_dev = NULL;
    if (!cuda_ok(cudaMalloc(&x_dev, bytes), "cudaMalloc(x_dev)") ||
        !cuda_ok(cudaMalloc(&y_dev, bytes), "cudaMalloc(y_dev)") ||
        !cuda_ok(cudaMemcpy(x_dev, x_host, bytes, cudaMemcpyHostToDevice),
                 "host-to-device copy")) {
        return 1;
    }

    int threadsPerBlock = 256;
    // Round up so the final, partially filled block is still launched.
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    printf("Threads per block: %d\n", threadsPerBlock);
    printf("Blocks per grid:   %d\n", blocksPerGrid);
    printf("Total GPU threads launched: %d\n", blocksPerGrid * threadsPerBlock);

    // Untimed warm-up launch: the first launch of a kernel can include
    // one-time start-up cost (e.g. module loading), which would otherwise be
    // attributed to the kernel itself. It also surfaces launch errors early.
    nonlinear_kernel<<<blocksPerGrid, threadsPerBlock>>>(x_dev, y_dev, N);
    if (!cuda_ok(cudaGetLastError(), "warm-up kernel launch") ||
        !cuda_ok(cudaDeviceSynchronize(), "warm-up kernel execution")) {
        return 1;
    }

    cudaEvent_t start, stop;
    if (!cuda_ok(cudaEventCreate(&start), "cudaEventCreate(start)") ||
        !cuda_ok(cudaEventCreate(&stop), "cudaEventCreate(stop)") ||
        !cuda_ok(cudaEventRecord(start), "cudaEventRecord(start)")) {
        return 1;
    }

    nonlinear_kernel<<<blocksPerGrid, threadsPerBlock>>>(x_dev, y_dev, N);

    // Launches do not return a status; launch failures are reported through
    // cudaGetLastError(). Without this check a failed launch would still
    // print a (meaningless) elapsed time.
    if (!cuda_ok(cudaEventRecord(stop), "cudaEventRecord(stop)") ||
        !cuda_ok(cudaGetLastError(), "kernel launch") ||
        !cuda_ok(cudaEventSynchronize(stop), "kernel execution")) {
        return 1;
    }

    float ms = 0;
    if (!cuda_ok(cudaEventElapsedTime(&ms, start, stop), "cudaEventElapsedTime")) {
        return 1;
    }
    printf("GPU time: %.4f s\n", ms / 1000.0f);

    if (!cuda_ok(cudaMemcpy(y_host, y_dev, bytes, cudaMemcpyDeviceToHost),
                 "device-to-host copy")) {
        return 1;
    }

    // Summarize the copied-back result so it can be sanity-checked instead of
    // being discarded unseen.
    double checksum = 0.0;
    for (int i = 0; i < N; i++) {
        checksum += y_host[i];
    }
    printf("Checksum: %.6f\n", checksum);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    free(x_host);
    free(y_host);
    cudaFree(x_dev);
    cudaFree(y_dev);
    return 0;
}


Writing non_linear_gpu.cu


In [None]:
# Build the CUDA program written by the %%writefile cell above into ./gpu_exec.
!nvcc non_linear_gpu.cu -o gpu_exec


In [None]:
# Run the GPU benchmark built by the previous cell; it prints the launch configuration and the event-timed kernel duration.
!./gpu_exec

Threads per block: 256
Blocks per grid:   39063
Total GPU threads launched: 10000128
GPU time: 0.0506 s
