In [8]:
from numba import cuda, vectorize, guvectorize
from numba import void, uint8 , uint32, uint64, int32, int64, float32, float64, f8
import numpy as np

In [9]:
# Handle to the currently active CUDA device; queried below for WARP_SIZE
device = cuda.get_current_device()

# Number of elements in each vector
n = 100

# Host memory: two separate float32 arrays, each holding 0 .. n-1
a, b = (np.arange(n, dtype=np.float32) for _ in range(2))

# Copy each host array into its own device buffer
da, db = (cuda.to_device(host_arr) for host_arr in (a, b))

# Device buffer for the output, shaped and typed like `a`
dc = cuda.device_array_like(a)

# Thread count (presumably threads-per-block) taken from the device warp size
tpb = device.WARP_SIZE

In [15]:
@cuda.jit('void(float32[:],float32,float32,float32,float32,float32[:],float32[:] ,float32[:], uint64)',device = True)
def calcLocalContribution(uvals,P,Q,R,vjsqrinv,quadCoeffs,JLvals,JRvals,JSize):
        """Build second-difference coefficients of ``uvals`` and run two
        recursive accumulations over them: one left-to-right (``JL``) and one
        right-to-left (``JR``).

        NOTE(review): the body below still reads like a host-side class method
        and does not match the signature declared above. The mismatches visible
        in this block are:

        * ``size`` and ``self`` are used but are neither parameters nor locals.
          Presumably ``JSize`` replaces ``size``, ``P``/``Q``/``R`` replace
          ``self.P_``/``self.Q_``/``self.R_`` and ``vjsqrinv`` replaces
          ``1/self.vj_**2`` -- TODO confirm.
        * ``self.dj_`` has no counterpart in the parameter list.
        * ``JLval``/``JRval`` are used, but the parameters are named
          ``JLvals``/``JRvals``.
        * ``.append()``, ``.reverse()`` and the list comprehension are list
          operations, while ``quadCoeffs``, ``JLvals`` and ``JRvals`` are
          declared as ``float32[:]`` arrays in the signature.
        * ``JLval[j-1]`` (first read at ``j == 1``) and ``JRval[j]`` (first read
          at ``j == 0``) read an element this function never writes, so both
          sequences presumably need a seed value from the caller -- TODO confirm.
        * The signature declares a ``void`` return, yet the body returns the
          result of ``self.reportEndPoints()``.
        * ``P``, ``Q``, ``R``, ``vjsqrinv``, ``JLvals``, ``JRvals`` and ``JSize``
          are never referenced in the body.
        """
        # Finite-difference stencil for the second derivative at every mesh
        # point. Interior points use the centred three-point difference; the
        # two end points use a one-sided four-point stencil to preserve accuracy.
        for i in range(size):
            if i != 0 and i != (size-1):
                # interior point: u[i-1] - 2*u[i] + u[i+1]
                quadCoeffs.append((uvals[i-1] - 2*uvals[i] + uvals[i+1])*1/self.vj_**2 )
            elif i == 0:
                # left end: one-sided stencil over uvals[0..3]
                quadCoeffs.append((2*uvals[0] - 5*uvals[1] + 4*uvals[2] - uvals[3])*1/self.vj_**2 )
            else:
                # right end: mirrored one-sided stencil over the last four values
                quadCoeffs.append((2*uvals[-1] - 5*uvals[-2] +4*uvals[-3] - uvals[-4])*1/self.vj_**2)
        # Evaluate the polynomial integral for each J between 0 and N.
        for j in range(size):

            if j != 0:
                # Recursive formula to update the value of J (left-to-right):
                # new term from uvals[j], uvals[j-1] and quadCoeffs[j], added to
                # the previous JL value scaled by dj_.
                JLcurr  = self.P_*uvals[j] + self.Q_*uvals[j-1] + quadCoeffs[j]*self.R_
                JLval.append( self.dj_*JLval[j-1] + JLcurr )

            if j != size-1:
                # Same recursion walked from the right end using negative
                # indices; values are appended in right-to-left order and
                # flipped after the loop.
                JRreverse = self.P_*uvals[-(j+2)]+self.Q_*uvals[ -(j+1) ] + self.R_*quadCoeffs[-(j+2)] 
                JRval.append(self.dj_*JRval[j] + JRreverse)
        # Put the right-going values back into mesh order
        JRval.reverse()
        # Element-wise sum of the two sweeps, stored on self
        self.w_ = [JLval[i]+JRval[i]  for i in range(size)]
        # reportEndPoints() is not defined in this block; its result is returned as-is
        return self.reportEndPoints()