## traditional

In [1]:
import numpy as np

def make_cluster(natoms, radius=20, seed=1981):
    """Generate a reproducible random cloud of atom positions.

    Coordinates are drawn from a normal distribution with mean 0 and standard
    deviation ``radius`` and are then shifted by -0.5 along every axis.

    A private ``RandomState`` is used instead of ``np.random.seed`` so that
    calling this function does not reset NumPy's global random stream. The
    values produced for a given ``seed`` are the same as those obtained by
    seeding the global generator.

    Parameters
    ----------
    natoms : int
        Number of atoms (rows) to generate.
    radius : float, optional
        Standard deviation of the normal distribution.
    seed : int, optional
        Seed for the random number generator.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(natoms, 3)``.
    """
    rng = np.random.RandomState(seed)
    arr = rng.normal(0, radius, size=(natoms, 3)) - 0.5
    return arr

In [2]:
class lj_pure(object):
    """Lennard-Jones cluster energy computed with plain Python loops."""

    @classmethod
    def lj(cls, r):
        """Return the pair potential 4 * ((1/r)**12 - (1/r)**6) at distance ``r``."""
        inv_r6 = (1. / r) ** 6
        return 4. * (inv_r6 * inv_r6 - inv_r6)

    @classmethod
    def distance(cls, atom1, atom2):
        """Return the Euclidean distance between two positions.

        Only the first three components of each position are used.
        """
        delta = [atom2[axis] - atom1[axis] for axis in range(3)]
        squared = delta[0] * delta[0] + delta[1] * delta[1] + delta[2] * delta[2]
        return squared ** 0.5

    @classmethod
    def potential(cls, cluster):
        """Sum the pair potential over every unordered pair of atoms.

        Returns 0.0 when the cluster holds fewer than two atoms.
        """
        natoms = len(cluster)
        total = 0.0
        for first in range(natoms - 1):
            atom_a = cluster[first]
            # Start at first + 1 so each pair is counted exactly once.
            for second in range(first + 1, natoms):
                total += cls.lj(cls.distance(atom_a, cluster[second]))
        return total

In [3]:
import numpy as np
class lj_numpy(object):
    """Lennard-Jones cluster energy computed with vectorised NumPy operations."""

    @classmethod
    def lj(cls, r):
        """Return the pair potential 4 * ((1/r)**12 - (1/r)**6), element-wise."""
        inv_r6 = (1. / r) ** 6
        return 4. * (inv_r6 * inv_r6 - inv_r6)

    @classmethod
    def distances(cls, cluster):
        """Return the full square matrix of pairwise Euclidean distances.

        ``cluster`` is indexed as a 2-D array (atoms x coordinates). The
        intermediate difference array has one extra leading axis, so its size
        grows with the square of the number of atoms.
        """
        as_rows = cluster[:, np.newaxis, :]
        as_cols = cluster[np.newaxis, :, :]
        delta = as_rows - as_cols
        return np.sqrt((delta * delta).sum(-1))

    @classmethod
    def potential(cls, cluster):
        """Return the total pair energy of ``cluster``.

        Only the upper triangle of the distance matrix is kept so each pair
        is counted once.
        """
        upper = np.triu(cls.distances(cluster))
        # The 1e-4 threshold discards the zeroed lower triangle and the
        # diagonal, and with them any pair closer than 1e-4.
        keep = upper > 1e-4
        return cls.lj(upper[keep]).sum()

In [4]:
# Shared test system for the benchmarks below: 2000 atoms, radius=100.
cluster = make_cluster(int(2e3), radius=100)

In [5]:
# Baseline: time the pure-Python double loop over all atom pairs.
%timeit lj_pure.potential(cluster)

4.22 s ± 39.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [6]:
# Time the vectorised NumPy implementation on the same cluster.
%timeit lj_numpy.potential(cluster)

297 ms ± 4.23 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## numba

In [10]:
import numba

@numba.jit(nopython=True)
def lj(r):
    """Return the pair potential 4 * ((1/r)**12 - (1/r)**6) at distance ``r``.

    Compiled by Numba in nopython mode.
    """
    inv_r6 = (1. / r) ** 6
    return 4. * (inv_r6 * inv_r6 - inv_r6)


@numba.jit(nopython=True)
def distance(atom1, atom2):
    """Return the Euclidean distance between two positions.

    Only components 0, 1 and 2 of each position are read. Compiled by Numba
    in nopython mode.
    """
    x = atom2[0] - atom1[0]
    y = atom2[1] - atom1[1]
    z = atom2[2] - atom1[2]
    squared = x * x + y * y + z * z
    return np.sqrt(squared)


@numba.jit(nopython=True)
def potential(cluster):
    """Sum the pair potential over every unordered pair of atoms in ``cluster``.

    Returns 0.0 when the cluster holds fewer than two atoms. Compiled by
    Numba in nopython mode.
    """
    natoms = len(cluster)
    total = 0.0
    for first in range(natoms - 1):
        atom_a = cluster[first]
        # Start at first + 1 so each pair is counted exactly once.
        for second in range(first + 1, natoms):
            total += lj(distance(atom_a, cluster[second]))
    return total

In [8]:
# First call on a one-atom input: no pairs, so the result is 0.0.
# Presumably a warm-up so that Numba's compilation cost (it compiles on the
# first call) is not included in the later %timeit measurement.
%time potential(np.zeros(shape=(1,3)))

CPU times: user 228 ms, sys: 12.7 ms, total: 241 ms
Wall time: 218 ms


0.0

In [None]:
# NOTE(review): this cell has no execution count and no recorded output.
# It rebinds the shared `cluster` name to 20000 atoms, which every later cell
# would then pick up. lj_numpy.distances builds an (n, n, 3) difference array,
# roughly 9.6 GB of float64 at n = 20000. Confirm this is intended before
# running.
cluster = make_cluster(int(2e4), radius=100)
%timeit lj_numpy.potential(cluster)

In [9]:
# Time the Numba-compiled double loop.
%timeit potential(cluster)

16.1 ms ± 262 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## cuda

In [9]:
# Pre-compute the pairwise distance matrix; it is the input of the
# potential-evaluation benchmarks below.
d = lj_numpy.distances(cluster)

In [10]:
# Time the two stages of the NumPy version separately: building the distance
# matrix, then evaluating the potential on every element of it.
%timeit lj_numpy.distances(cluster)
%timeit lj_numpy.lj(d)

183 ms ± 1.47 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)




153 ms ± 1.39 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [11]:
from numba import cuda
# Report the CUDA devices Numba can see and whether each one is supported.
cuda.detect()

Found 1 CUDA devices
id 0      b'GeForce GT 750M'                              [SUPPORTED]
                      compute capability: 3.0
                           pci device id: 0
                              pci bus id: 1
Summary:
	1/1 devices are supported


True

In [12]:
@numba.vectorize(['float64(float64)'], target='cuda')
def cu_lj(r):
    """Return the pair potential 4 * ((1/r)**12 - (1/r)**6) at distance ``r``.

    Declared as a float64 -> float64 element-wise function for Numba's CUDA
    vectorize target.
    """
    inv_r6 = (1. / r) ** 6
    return 4. * (inv_r6 * inv_r6 - inv_r6)

In [13]:
# Time the CUDA ufunc when it is handed the NumPy array `d` directly.
%timeit cu_lj(d)

21.1 ms ± 449 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


### the easy function first

In [14]:
def cu_simple(cluster):
    """Return the cluster energy using NumPy distances and the CUDA ``cu_lj``.

    The distance matrix is built with ``lj_numpy.distances`` and the pair
    potential is evaluated on every element of it with ``cu_lj``.
    """
    pair_distances = lj_numpy.distances(cluster)
    pair_energies = cu_lj(pair_distances)
    # The matrix is symmetric, so every pair appears twice: halve the sum.
    # nansum skips NaN entries (presumably those produced by the zero
    # distances on the diagonal).
    return np.nansum(pair_energies) / 2

In [15]:
# Reference energy from the NumPy implementation.
lj_numpy.potential(cluster)

-1.3431786584750056

In [16]:
# Time the mixed pipeline: NumPy distances plus CUDA potential.
%timeit cu_simple(cluster)

222 ms ± 1.57 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [17]:
# Time the pure NumPy pipeline again for a side-by-side comparison.
%timeit lj_numpy.potential(cluster)

291 ms ± 4.56 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### try a little harder

In [19]:
# Copy the distance matrix to the GPU once, up front, and time cu_lj on the
# device array instead of the NumPy array.
d_device = cuda.to_device(d)

%timeit cu_lj(d_device)

4.5 ms ± 223 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [20]:
@cuda.jit('float64(float64)', device=True)
def cu_lj_device(d):
    """Return the pair potential 4 * ((1/d)**12 - (1/d)**6) at distance ``d``.

    Declared with ``device=True`` and a float64 -> float64 signature; it is
    called from the ``cu_potential`` kernel.
    """
    inv_d6 = (1. / d) ** 6
    return 4. * (inv_d6 * inv_d6 - inv_d6)

@cuda.jit
def cu_potential(result, d):
    """CUDA kernel: accumulate the pair energy of a distance matrix.

    Each thread of the 2-D launch grid handles one element ``d[idx, idy]``
    and atomically adds its pair energy to ``result[0]``.

    Parameters
    ----------
    result : array
        One-element accumulator; ``result[0]`` receives the sum. It must be
        zeroed by the caller before the launch.
    d : 2-D array
        Matrix of pairwise distances. When it is the full symmetric matrix,
        every pair is summed twice and the caller must halve ``result[0]``.
    """
    idx, idy = cuda.grid(2)

    # Threads in the padding of the launch grid have no matrix element and
    # must not contribute to the sum.
    if idx >= d.shape[0] or idy >= d.shape[1]:
        return

    r = d[idx, idy]
    # Skip zero / near-zero distances (the diagonal), using the same 1e-4
    # cut-off as lj_numpy.potential. At r == 0 the potential evaluates to
    # inf - inf = NaN, and a single NaN would poison the whole sum.
    if r > 1e-4:
        cuda.atomic.add(result, 0, cu_lj_device(r))

In [23]:
import math 

# Build the distance matrix on the host, copy it to the GPU, and allocate a
# one-element float64 accumulator for the kernel to add into.
d = lj_numpy.distances(cluster)
d_device = cuda.to_device(d)
result = np.zeros(shape=(1,), dtype=np.float64)

# 8x8 threads per block; round the block counts up so the grid covers every
# element of the matrix even when its size is not a multiple of 8.
threadsperblock = (8, 8)
blockspergrid_x = math.ceil(d_device.shape[0] / threadsperblock[0])
blockspergrid_y = math.ceil(d_device.shape[1] / threadsperblock[1])
blockspergrid = (blockspergrid_x, blockspergrid_y)

# NOTE(review): the recorded run of this cell failed with
# CUDA_ERROR_LAUNCH_TIMEOUT.
cu_potential[blockspergrid, threadsperblock](result, d_device)


# The kernel sums over the full symmetric matrix, so halve the total to count
# each pair once.
result[0] / 2

CudaAPIError: [702] Call to cuMemcpyDtoH results in CUDA_ERROR_LAUNCH_TIMEOUT