In [None]:
# %% Example 1: Basic CuPy Array Operations
# !pip install cupy
import cupy as cp
import numpy as np

# Create large arrays
x_cpu = np.random.rand(10**7)
x_gpu = cp.random.rand(10**7)

# GPU-accelerated operations
y_gpu = cp.sin(x_gpu) * 2 + cp.log(x_gpu)
y_cpu = y_gpu.get()  # Transfer back to CPU

# Timing comparison
%timeit np.sin(x_cpu) * 2 + np.log(x_cpu)  # CPU
%timeit cp.sin(x_gpu) * 2 + cp.log(x_gpu)   # GPU

In [None]:
# %% Example 2: Custom CUDA Kernel with CuPy
import cupy as cp

# Element-wise float32 addition: one thread per output element, with a bounds
# check so the surplus threads of the last block do nothing.
kernel_code = '''
extern "C" __global__
void vector_add(const float* a, const float* b, float* c, int n) {
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    if (tid < n) {
        c[tid] = a[tid] + b[tid];
    }
}
'''
vector_add = cp.RawKernel(kernel_code, 'vector_add')

n = 10**7
a = cp.random.rand(n, dtype=cp.float32)
b = cp.random.rand(n, dtype=cp.float32)
c = cp.empty_like(a)

# Configure grid/block dimensions (ceil-divide so every element is covered)
threads_per_block = 128
blocks_per_grid = (n + threads_per_block - 1) // threads_per_block

# RawKernel does no type checking and forwards a Python int as a 64-bit
# integer; the kernel declares a 32-bit `int n`, so pass an explicit int32
# scalar to match the signature.
vector_add((blocks_per_grid,), (threads_per_block,), (a, b, c, cp.int32(n)))
cp.cuda.Stream.null.synchronize()
print(c[:5])  # Show first 5 elements

In [None]:
# %% Example 3: Matrix Multiplication Comparison
import cupy as cp

# Create large matrices
a = cp.random.rand(5000, 5000)
b = cp.random.rand(5000, 5000)

# Built-in matmul
%timeit a @ b

# Custom matrix multiplication kernel (naive implementation)
matmul_kernel = cp.RawKernel(r'''
extern "C" __global__
void matmul(const float* A, const float* B, float* C, int M, int N, int K) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    
    if (row < M && col < N) {
        float sum = 0.0f;
        for (int k = 0; k < K; k++) {
            sum += A[row * K + k] * B[k * N + col];
        }
        C[row * N + col] = sum;
    }
}
''', 'matmul')

c = cp.zeros((5000, 5000))
block = (16, 16)
grid = ( (5000 + block[0] - 1) // block[0], (5000 + block[1] - 1) // block[1] )

%timeit matmul_kernel(grid, block, (a, b, c, 5000, 5000, 5000))

In [None]:
# %% Example 4: Numba CUDA Acceleration
# !pip install numba
from numba import cuda
import numpy as np


@cuda.jit
def numba_vector_add(a, b, c):
    """CUDA kernel: write a[i] + b[i] into c[i] for each in-range thread index i."""
    idx = cuda.grid(1)
    # Threads past the end of the output (from the rounded-up grid) do nothing.
    if idx >= c.shape[0]:
        return
    c[idx] = a[idx] + b[idx]


# Host-side inputs and output buffer (float32)
n = 10**7
a = np.random.rand(n).astype(np.float32)
b = np.random.rand(n).astype(np.float32)
c = np.empty(n, dtype=np.float32)

# Stage the inputs on the device and allocate a matching device output
d_a, d_b = cuda.to_device(a), cuda.to_device(b)
d_c = cuda.device_array_like(c)

# Launch configuration: enough 128-thread blocks to cover all n elements
threads_per_block = 128
blocks_per_grid = (n - 1) // threads_per_block + 1

numba_vector_add[blocks_per_grid, threads_per_block](d_a, d_b, d_c)

# Copy the result back into the preallocated host array
d_c.copy_to_host(ary=c)
print(c[:5])  # Show first 5 elements

In [None]:
# %% Example 5: GPU-Accelerated Sorting
import cupy as cp
import numpy as np

n = 10**7
data_gpu = cp.random.rand(n)
data_cpu = data_gpu.get()

# GPU sorting
%timeit cp.sort(data_gpu)

# CPU sorting
%timeit np.sort(data_cpu)

In [None]:
# %% Example 6: Image Processing with CUDA
# !pip install Pillow
import cupy as cp
import numpy as np  # imported here so the cell does not rely on an earlier cell
from PIL import Image

# Generate random image (1024x1024 RGB)
cpu_img = np.random.randint(0, 256, (1024, 1024, 3), dtype=np.uint8)
gpu_img = cp.asarray(cpu_img)
height, width = cpu_img.shape[:2]

# Grayscale conversion kernel: one thread per pixel, reading three interleaved
# bytes per input pixel and writing one weighted-sum byte per output pixel.
gray_kernel = cp.RawKernel(r'''
extern "C" __global__
void rgb2gray(const unsigned char* input, unsigned char* output, int width, int height) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < width && y < height) {
        int idx = y * width + x;
        output[idx] = 0.299f * input[3*idx] +
                      0.587f * input[3*idx+1] +
                      0.114f * input[3*idx+2];
    }
}
''', 'rgb2gray')

# Prepare output buffer (single channel, same height/width as the input)
gray_gpu = cp.empty((height, width), dtype=cp.uint8)

# Launch kernel: grid.x covers the width, grid.y covers the height
block = (16, 16)
grid = ( (width + block[0] - 1) // block[0], (height + block[1] - 1) // block[1] )
# The kernel declares 32-bit `int` dimensions; Python ints are forwarded as
# 64-bit values by RawKernel, so pass explicit int32 scalars.
gray_kernel(grid, block, (gpu_img, gray_gpu, np.int32(width), np.int32(height)))

# Convert back to PIL Image (.get() copies the result to the host)
gray_cpu = gray_gpu.get()
Image.fromarray(gray_cpu).save('grayscale.jpg')

In [None]:
# %% Example 7: Machine Learning with cuML
# !pip install cuml
# NOTE(review): cuML is distributed through RAPIDS; confirm the install command
# that matches your CUDA version in the RAPIDS installation guide.
from cuml import KMeans
import cupy as cp

# Synthetic dataset generated directly on the GPU: one million samples with
# 50 uniformly distributed features each.
n_samples = 1_000_000
n_features = 50
X_gpu = cp.random.rand(n_samples, n_features)

# Fit a 5-cluster K-Means model on the device-resident data
kmeans = KMeans(n_clusters=5, max_iter=300)
kmeans.fit(X_gpu)

# Summarize the fitted model; the label slice is copied to the host for printing
centers_shape = kmeans.cluster_centers_.shape
first_labels = kmeans.labels_[:5].get()
print("Cluster centers shape:", centers_shape)
print("First 5 labels:", first_labels)