In [1]:
import pyopencl as cl
import pyopencl.array as cl_array
import numpy as np
import numpy.linalg as la
import math
import torch
import torch.nn as nn
import torch.functional as F

In [2]:
%load_ext pyopencl.ipython_ext

In [3]:
# Shorthand for the memory-flag constants used when allocating device buffers.
mf = cl.mem_flags

# Create an OpenCL context, then a command queue bound to that context.
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

In [4]:
class LeNet(nn.Module):
    """LeNet-style CNN: two conv/batch-norm/ReLU/max-pool stages and a 3-layer MLP head.

    The head's first ``Linear`` layer expects ``16*5*5`` features per sample,
    which is what the two convolutional stages produce for ``1x28x28`` inputs
    (28 -> 28 -> 14 -> 10 -> 5 along each spatial axis).

    Args:
        num_classes: number of output logits produced by the last ``Linear`` layer.
    """

    def __init__(self, num_classes):
        super().__init__()
        # Stage 1: padding=2 keeps the spatial size through the 5x5 convolution;
        # the pool then halves it.
        self.layer1 = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(6),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        # Stage 2: unpadded 5x5 convolution shrinks each spatial axis by 4
        # before the pool halves it.
        self.layer2 = nn.Sequential(
            nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5, stride=1),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        # Classifier head: 400 -> 120 -> 84 -> num_classes.
        self.fclayer = nn.Sequential(
            nn.Linear(16*5*5, 120),
            nn.ReLU(),
            nn.Linear(120, 84),
            nn.ReLU(),
            nn.Linear(84, num_classes)
        )

    def forward(self, x):
        """Return the logits for a batch ``x`` of shape ``(N, 1, H, W)``.

        Raises:
            RuntimeError: (from ``nn.Linear``) if the per-sample feature count
                after the two stages is not ``16*5*5``.
        """
        x = self.layer1(x)
        x = self.layer2(x)
        # Flatten per sample and keep the batch axis fixed. view(-1, 16*5*5)
        # would silently re-batch inputs of the wrong spatial size whenever
        # the total element count happens to be divisible by 400.
        x = torch.flatten(x, start_dim=1)
        return self.fclayer(x)

model = LeNet(10)
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source.
# map_location='cpu' lets a checkpoint saved on a GPU load on a CPU-only host and keeps
# the parameters on the CPU, which the later .numpy() conversions require.
model.load_state_dict(torch.load('model.pth', map_location='cpu'))
# Inference only: put BatchNorm into evaluation mode so it uses its stored running statistics.
model.eval()
# First fully connected layer of the head; its parameters feed the OpenCL kernel below.
fclayer_Linear_1 = model.fclayer[0]

In [5]:
# Show the parameter shapes of the first fully connected layer:
# weight is (out_features, in_features), bias is (out_features,).
fclayer_Linear_1.weight.shape, fclayer_Linear_1.bias.shape

(torch.Size([120, 400]), torch.Size([120]))

In [6]:
def linear(input_numpy, weight_numpy, bias_numpy):
    """Reference fully connected layer computed with plain Python loops.

    Computes ``out[j] = sum_k input_numpy[k] * weight_numpy[j][k] + bias_numpy[j]``.

    Args:
        input_numpy: 1-D sequence of ``Ci`` input features.
        weight_numpy: array with shape ``(Co, Ci)``.
        bias_numpy: 1-D sequence of ``Co`` biases.

    Returns:
        ``np.ndarray`` of shape ``(Co,)`` with the default ``np.zeros`` dtype.

    Raises:
        ValueError: if the input or bias length does not match the weight shape.
    """
    Co, Ci = weight_numpy.shape
    # Check lengths up front: an over-long input or bias would otherwise be
    # silently truncated, and a short one would fail with a bare IndexError.
    if len(input_numpy) != Ci:
        raise ValueError(
            "input has %d features but weight expects %d" % (len(input_numpy), Ci)
        )
    if len(bias_numpy) != Co:
        raise ValueError(
            "bias has %d entries but weight has %d output rows" % (len(bias_numpy), Co)
        )

    out = np.zeros((Co,))
    for j in range(Co):
        row = weight_numpy[j]
        for k in range(Ci):
            out[j] += input_numpy[k] * row[k]
        # The bias is added after the whole dot product has been accumulated.
        out[j] += bias_numpy[j]
    return out

In [7]:
# Layer parameters as NumPy arrays; the weight matrix is (out_features, in_features).
weight_cpu = fclayer_Linear_1.weight.detach().numpy()
bias_cpu = fclayer_Linear_1.bias.detach().numpy()

Co, Ci = weight_cpu.shape

# Seeded generator so the random test input is reproducible after a kernel restart.
# Its length follows the layer's input width instead of a hard-coded 400.
input_cpu = np.random.RandomState(0).rand(Ci).astype(np.float32)

# Host-side destination for the kernel result.
output_cpu = np.zeros((Co,), dtype=np.float32)

In [8]:
# Flags shared by every kernel input: read-only for the kernel, initialised from host memory.
read_only_copy = mf.READ_ONLY | mf.COPY_HOST_PTR

# Kernel inputs: the feature vector, then the layer parameters.
input_gpu = cl.Buffer(ctx, read_only_copy, hostbuf=input_cpu)
weight_gpu = cl.Buffer(ctx, read_only_copy, hostbuf=weight_cpu)
bias_gpu = cl.Buffer(ctx, read_only_copy, hostbuf=bias_cpu)

# The two layer dimensions are handed to the kernel as single-int32 buffers.
output_channel_gpu = cl.Buffer(ctx, read_only_copy, hostbuf=np.int32(Co))
input_channel_gpu = cl.Buffer(ctx, read_only_copy, hostbuf=np.int32(Ci))

# Result buffer with the same byte size as output_cpu; it is not initialised from the host.
output_gpu = cl.Buffer(ctx, mf.WRITE_ONLY, output_cpu.nbytes)

In [9]:
%%cl_kernel -o "-cl-fast-relaxed-math"

/*
 * Fully connected layer: oft[j] = bias[j] + sum_k ift[k] * weight[j*Ci + k].
 *
 * One work-item computes one output element, selected by get_global_id(0).
 *   ift            - Ci input features
 *   weight         - Co x Ci weights, row-major (row j starts at j*Ci)
 *   bias           - Co biases
 *   output_channel - pointer to Co
 *   input_channel  - pointer to Ci
 *   oft            - Co outputs; only written, never read
 */
__kernel void Linear(__global const float *ift,
                     __global const float *weight, __global const float *bias,
                     __global const int *output_channel, __global const int *input_channel,
                     __global float *oft)
{
    const int Co = *output_channel, Ci = *input_channel;
    const int posCo = get_global_id(0);

    /* Guard for launches whose global size exceeds the number of outputs. */
    if (posCo >= Co) {
        return;
    }

    /*
     * Accumulate in a private variable and store once. The host allocates oft
     * as a write-only buffer, so reading it back with `oft[posCo] += ...` is
     * undefined, and it costs a global read-modify-write on every iteration.
     */
    __global const float *row = weight + posCo * Ci;
    float acc = bias[posCo];
    for (int k = 0; k < Ci; k++) {
        acc += ift[k] * row[k];
    }
    oft[posCo] = acc;
}

In [17]:
# Arguments in the order the kernel signature declares them.
kernel_args = (
    input_gpu,
    weight_gpu, bias_gpu,
    output_channel_gpu, input_channel_gpu,
    output_gpu,
)
# Global size is output_cpu.shape (one work-item per output element); local size is None.
Linear(queue, output_cpu.shape, None, *kernel_args)

<pyopencl._cl.Event at 0x1729ccd3d08>

In [18]:
# Read the kernel result back from the device buffer output_gpu into the host array output_cpu.
cl.enqueue_copy(queue, output_cpu, output_gpu)

<pyopencl._cl.NannyEvent at 0x1729ccd50a8>

In [19]:
# Display the values copied back from the OpenCL kernel.
output_cpu

array([ 6.9857401e-01, -1.0514938e+00,  6.9363385e-01,  9.5965707e-01,
        1.2066766e+00,  1.3771670e+00, -1.3590750e+00, -2.0118271e-01,
       -7.7706903e-01, -4.8265231e-01, -1.0459918e+00, -5.5095005e-01,
       -2.2567570e+00,  8.6947680e-01,  3.6334932e-01,  4.9455038e-01,
       -1.0909736e+00, -5.3922325e-01, -3.6158512e+00, -2.1679578e+00,
        4.2211663e-02, -2.8100297e-01,  6.8839595e-02, -4.4237435e-01,
       -9.7579271e-01, -7.7192599e-01, -2.9107885e+00, -2.2154856e+00,
        1.2929473e+00,  2.7048808e-01, -1.3069842e+00, -4.1822740e-01,
        4.4032171e-01,  3.9952981e-01, -1.6993780e+00, -1.8120874e+00,
       -1.1808238e+00, -4.4633274e+00,  2.0462520e+00,  8.7749553e-01,
       -1.0146166e+00,  8.0423284e-01,  2.2040368e-03, -7.8716916e-01,
       -8.2852429e-01, -1.6030115e+00, -2.1449836e-01,  5.5198008e-01,
       -3.1762860e+00,  5.5280790e+00, -2.4784594e+00,  6.0961530e-02,
       -1.6547663e+00, -6.9561124e-01, -3.2892818e+00,  1.2655298e+00,
      

In [20]:
# Compute the same layer on the host with the pure-Python reference, for comparison with output_cpu.
np_res = linear(input_cpu, weight_cpu, bias_cpu)

In [21]:
# Display the host reference result; it should agree with output_cpu up to float rounding.
# NOTE(review): a numeric check such as np.testing.assert_allclose(output_cpu, np_res, rtol=1e-4)
# would be more reliable than comparing the two printed arrays by eye — tolerance to be confirmed.
np_res

array([ 6.98574177e-01, -1.05149326e+00,  6.93633602e-01,  9.59656661e-01,
        1.20667679e+00,  1.37716678e+00, -1.35907468e+00, -2.01182844e-01,
       -7.77068943e-01, -4.82652498e-01, -1.04599104e+00, -5.50949993e-01,
       -2.25675623e+00,  8.69476712e-01,  3.63349369e-01,  4.94550438e-01,
       -1.09097350e+00, -5.39224082e-01, -3.61585078e+00, -2.16795845e+00,
        4.22116029e-02, -2.81003163e-01,  6.88396066e-02, -4.42374418e-01,
       -9.75792973e-01, -7.71925610e-01, -2.91078899e+00, -2.21548594e+00,
        1.29294735e+00,  2.70487657e-01, -1.30698395e+00, -4.18227691e-01,
        4.40321088e-01,  3.99529642e-01, -1.69937735e+00, -1.81208597e+00,
       -1.18082446e+00, -4.46333073e+00,  2.04625404e+00,  8.77495702e-01,
       -1.01461632e+00,  8.04233158e-01,  2.20383961e-03, -7.87169555e-01,
       -8.28524309e-01, -1.60301172e+00, -2.14498302e-01,  5.51979905e-01,
       -3.17628453e+00,  5.52807809e+00, -2.47845943e+00,  6.09613112e-02,
       -1.65476657e+00, -