In [20]:
#Import package

In [21]:
import numpy as np
import time
import os

In [22]:
#Check for CUDA

In [23]:
try:
    import cupy as cp
    cuda_available = True
    print("CUDA is available. Using GPU for computation.")
except (ImportError, Exception) as e:
    cuda_available = False
    print(f"CUDA is not available: {str(e)}")
    print("Falling back to CPU-only computation.")

CUDA is available. Using GPU for computation.


In [4]:
#Data Points

In [5]:
N = 1000   # Number of points (10,000)
B = 100    # Block size (adjust based on memory)
np.random.seed(42)  # For reproducibility

In [6]:
#Generate randon ponits

In [7]:
points_cpu = np.random.uniform(0, 100, (N, 3))
print(f"Generated {N} random 3D points")

Generated 1000 random 3D points


In [8]:
#Create temporary files for memory-mapped arrays

In [9]:
cpu_memmap_file = "cpu_dist_matrix.dat"
gpu_memmap_file = "gpu_dist_matrix.dat"

In [10]:
#Create memory-mapped arrays

In [11]:
dist_cpu_memmap = np.memmap(cpu_memmap_file, dtype="float32", mode="w+", shape=(N, N))
if cuda_available:
    dist_gpu_memmap = np.memmap(gpu_memmap_file, dtype="float32", mode="w+", shape=(N, N))
    # Transfer points to GPU
    points_gpu = cp.asarray(points_cpu)

CUDARuntimeError: cudaErrorInsufficientDriver: CUDA driver version is insufficient for CUDA runtime version

In [12]:
#CPU batch-wise computation

In [13]:
print("\nStarting CPU batch-wise computation...")
start_cpu = time.time()

for i in range(0, N, B):
    i_end = min(i + B, N)
    Xi = points_cpu[i:i_end]
    Xi_norm = np.sum(Xi**2, axis=1)[:, np.newaxis]

    for j in range(0, N, B):
        j_end = min(j + B, N)
        Xj = points_cpu[j:j_end]
        Xj_norm = np.sum(Xj**2, axis=1)[np.newaxis, :]

        # Compute block distance
        dist_block = np.sqrt(np.maximum(
            Xi_norm + Xj_norm - 2 * Xi @ Xj.T,
            0.0  # Ensure no negative values due to floating point errors
        ))

        # Store in memmap
        dist_cpu_memmap[i:i_end, j:j_end] = dist_block

end_cpu = time.time()
cpu_time = end_cpu - start_cpu
print(f"CPU batch-wise computation finished in {cpu_time:.2f} seconds")


Starting CPU batch-wise computation...
CPU batch-wise computation finished in 0.24 seconds


In [14]:
#GPU batch-wise computation (if available)

In [15]:
if cuda_available:
    print("\nStarting GPU batch-wise computation...")
    start_gpu = time.time()

    for i in range(0, N, B):
        i_end = min(i + B, N)
        Xi = points_gpu[i:i_end]
        Xi_norm = cp.sum(Xi**2, axis=1)[:, cp.newaxis]

        for j in range(0, N, B):
            j_end = min(j + B, N)
            Xj = points_gpu[j:j_end]
            Xj_norm = cp.sum(Xj**2, axis=1)[cp.newaxis, :]

            # Compute block distance
            dist_block = cp.sqrt(cp.maximum(
                Xi_norm + Xj_norm - 2 * Xi @ Xj.T,
                0.0  # Ensure no negative values
            ))

            # Move block to CPU & store in memmap
            dist_gpu_memmap[i:i_end, j:j_end] = cp.asnumpy(dist_block)

    end_gpu = time.time()
    gpu_time = end_gpu - start_gpu
    print(f"GPU batch-wise computation finished in {gpu_time:.2f} seconds")

    # Performance comparison
    if gpu_time > 0:
        speedup = cpu_time / gpu_time
        print(f"\n Speedup = {speedup:.2f}x (GPU vs CPU)")


Starting GPU batch-wise computation...


NameError: name 'points_gpu' is not defined

In [16]:
#Verification (compare a small sample)

In [17]:
print("\nSample CPU distances (5x5):\n", dist_cpu_memmap[:5, :5])
if cuda_available:
    print("\nSample GPU distances (5x5):\n", dist_gpu_memmap[:5, :5])

    # Check if results are close
    if np.allclose(dist_cpu_memmap[:5, :5], dist_gpu_memmap[:5, :5], rtol=1e-5, atol=1e-5):
        print("\nVerification: CPU and GPU results match! ✓")
    else:
        print("\nVerification: CPU and GPU results differ! ✗")


Sample CPU distances (5x5):
 [[  0.       100.675     35.27332  101.636086 102.83767 ]
 [100.675      0.        99.7335    83.232925  24.185583]
 [ 35.27332   99.7335     0.       112.850365 109.67857 ]
 [101.636086  83.232925 112.850365   0.        82.05581 ]
 [102.83767   24.185583 109.67857   82.05581    0.      ]]

Sample GPU distances (5x5):
 [[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]

Verification: CPU and GPU results differ! ✗


In [18]:
# Clean up temporary files

In [19]:
try:
    os.remove(cpu_memmap_file)
    if cuda_available:
        os.remove(gpu_memmap_file)
    print("\nTemporary files cleaned up.")
except:
    print("\nWarning: Could not remove temporary files.")


