In [None]:
import numpy as np
import torch.nn as nn
import torch
from SparsityAnalysis import extract_patterns, SparseConvArrays

### A Simple Demo Test

In [9]:
weight = np.array(
        [[[[0., 0., 0.],
           [0., 1., 0.],
           [1., 1., 1.]],

          [[0., 0., 0.],
           [0., 1., 0.],
           [1., 1., 1.]],

          [[0., 0., 0.],
           [0., 1., 0.],
           [1., 1., 1.]]]])
weight.shape

(1, 3, 3, 3)

In [10]:
patterns = np.array(extract_patterns(weight))
patterns

array([[0, 0, 0, 0, 1, 0, 1, 1, 1]])

In [11]:
sparse_conv_arrays = SparseConvArrays(weight, patterns)

In [12]:
offset = sparse_conv_arrays.offset
reorder = sparse_conv_arrays.reorder
index = sparse_conv_arrays.index
stride = sparse_conv_arrays.stride
weight = sparse_conv_arrays.weight
ptset = sparse_conv_arrays.ptset
print(f"offset:{offset}\n\nreorder:{reorder}\n\nindex:{index}\n\nstride:{stride}\n\nweight:{weight}\n\nptset:\n{ptset}")

offset:[0 3]

reorder:[0]

index:[0 1 2]

stride:[0 3]

weight:[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]

ptset:
[[[1 1]
  [2 0]
  [2 1]
  [2 2]]]


### Load Actual Weights from Pattern-Pruned ResNet-34

In [10]:
path = 'resnet34_6_pattern_connectivity_pruning.pt'
state_dict = torch.load(path, map_location=torch.device('cpu'))

# residual_conv_dict = {k:v.cpu().numpy() for (k,v) in state_dict.items() if "layer" in k and "conv" in k}
residual_convs = [v.cpu().numpy() for (k, v) in state_dict.items() if "layer" in k and "conv" in k]
data_shapes = [
    [1, 64, 32, 32], [1, 64, 32, 32], [1, 64, 32, 32], [1, 64, 32, 32], [1, 64, 32, 32], [1, 64, 32, 32],
    [1, 64, 32, 32], [1, 128, 16, 16], [1, 128, 16, 16], [1, 128, 16, 16], [1, 128, 16, 16], [1, 128, 16, 16],
    [1, 128, 16, 16], [1, 128, 16, 16], [1, 128, 16, 16], [1, 256, 8, 8], [1, 256, 8, 8], [1, 256, 8, 8],
    [1, 256, 8, 8], [1, 256, 8, 8], [1, 256, 8, 8], [1, 256, 8, 8], [1, 256, 8, 8], [1, 256, 8, 8],
    [1, 256, 8, 8], [1, 256, 8, 8], [1, 256, 8, 8], [1, 512, 4, 4], [1, 512, 4, 4], [1, 512, 4, 4],
    [1, 512, 4, 4], [1, 512, 4, 4],
]

### Correctness Check - unit test

In [7]:
from conv_naive import Convolution
conv = Convolution()
ip = np.ones((1,64,32,32)).astype(np.float32)
mask = np.ones((128,64,3,3)).astype(np.float32)
output_1,_ = conv.conv_multiple_filters(ip, mask)
output_gt = nn.functional.conv2d(torch.tensor(ip), torch.tensor(mask),padding=1)
output_gt = output_gt.cpu().numpy()
# print(output_gt)
# print(output_1)
print(f'conv_cuda|nnpack: {np.allclose(output_1,output_gt)}')

conv_cuda|nnpack: True


### Correctness Check - full model

In [21]:
from conv_naive import Convolution
import numpy as np
conv = Convolution()
for idx in range(len(residual_convs[:1])):
    input_data = np.ones(data_shapes[idx]).astype(np.float32)
    conv_mask = residual_convs[idx].astype(np.float32)
    output_1, _ = conv.conv_multiple_filters(input_data, conv_mask)
    output_gt = nn.functional.conv2d(torch.tensor(input_data), torch.tensor(conv_mask),padding=1)
    output_gt = output_gt.cpu().numpy()
    print(np.sum(output_1))
    print(np.sum(output_gt))
    print('*'*10 + f' conv layer {idx} ' + '*'*10)
    print(f'conv_cuda|nnpack: {np.allclose(output_1,output_gt,rtol=1e-5)}\n')


0.0
-14950.9375
********** conv layer 0 **********
conv_cuda|nnpack: False



### Time Cost w/o memory transfer - nnpack

In [1]:
from conv_naive import Convolution
import numpy as np
from tqdm import tqdm
import time
conv = Convolution()
cuda0 = torch.device('cuda:0')
cpu = torch.device('cpu')
total_time = 0
output_time_nnpack = []


for idx in tqdm(range(len(residual_convs[:]))):
    input_data = np.ones(data_shapes[idx]).astype(np.float32)
    conv_mask = residual_convs[idx].astype(np.float32)
    input_data_g = torch.tensor(input_data, device = cuda0)
    conv_mask_g = torch.tensor(conv_mask, device = cuda0)
    torch.cuda.synchronize()
    start = time.time()
    output_gt = nn.functional.conv2d(input_data_g, conv_mask_g,padding=1)
    #output_gt = nn.functional.conv2d(torch.tensor(input_data), torch.tensor(conv_mask),padding=1)
    torch.cuda.synchronize()
    end =  time.time()
    total_time += end - start
    output_time_nnpack.append(end - start)


print(f'{round(total_time,3)}s')



NameError: name 'torch' is not defined

### Time Cost w/o memory transfer - conv_naive

In [16]:
from conv_naive import Convolution
import numpy as np
from tqdm import tqdm
import time
conv = Convolution()
cuda0 = torch.device('cuda:0')
total_time = 0
output_list = []

for idx in tqdm(range(len(residual_convs[:]))):
    input_data = np.ones(data_shapes[idx]).astype(np.float32)
    conv_mask = residual_convs[idx].astype(np.float32)
    output_1, time_= conv.conv_multiple_filters(input_data, conv_mask)
    output_list.append(time_)
    total_time += time_

print(total_time)
print(output_list)



100%|██████████| 32/32 [00:00<00:00, 235.60it/s]

0.08982825589179994
[0.0024256000518798827, 0.0022520639896392823, 0.0024153919219970705, 0.002389663934707642, 0.0023896000385284426, 0.0024553918838500976, 0.004481056213378906, 0.002386176109313965, 0.002385215997695923, 0.0023887999057769778, 0.0023819200992584227, 0.002382528066635132, 0.0023797760009765627, 0.0023854079246520997, 0.004687488079071045, 0.0021965761184692383, 0.0021887359619140625, 0.002189568042755127, 0.0021876800060272217, 0.002187295913696289, 0.0021900479793548586, 0.002189568042755127, 0.0021905601024627686, 0.002204416036605835, 0.0021965439319610596, 0.0021927359104156495, 0.00426262378692627, 0.00425276803970337, 0.004253759860992432, 0.004253727912902832, 0.00425161600112915, 0.004253952026367187]





### Space Cost Analysis

In [18]:
for idx in range(len(residual_convs[-3:-1])):
    patterns = np.array(extract_patterns(residual_convs[idx]))
    sparse_conv_arrays = SparseConvArrays(residual_convs[idx], patterns)
    offset = sparse_conv_arrays.offset
    reorder = sparse_conv_arrays.reorder
    index = sparse_conv_arrays.index
    stride = sparse_conv_arrays.stride
    weight = sparse_conv_arrays.weight
    ptset = sparse_conv_arrays.ptset
    #print(f"offset:{offset}\n\nreorder:{reorder}\n\nindex:{index}\n\nstride:{stride}\n\nweight:{weight}\n\nptset:\n{ptset}")
    #print(conv_layer_weight)
    ### Space
    print('*'*10 + f' conv layer {idx} ' + '*'*10)
    print(f'Normal_conv_mask:\n{residual_convs[idx].nbytes}')
    print(f'FKW_conv_mask:\n{offset.nbytes+reorder.nbytes+index.nbytes+stride.nbytes+weight.nbytes+ptset.nbytes}\n')
    #print(patterns)

********** conv layer 0 **********
Normal_conv_mask:
147456
FKW_conv_mask:
6600

********** conv layer 1 **********
Normal_conv_mask:
147456
FKW_conv_mask:
6600



In [19]:
147456/6600

22.341818181818184

### Time Cost w/o memory transfer - sparse_naive

In [17]:
from execution_time import SparseConvolution
from tqdm import tqdm


conv = SparseConvolution()

path = 'resnet34_6_pattern_connectivity_pruning.pt'
state_dict = torch.load(path, map_location=torch.device('cpu'))

residual_convs = [v.cpu().numpy() for (k, v) in state_dict.items() if "layer" in k and "conv" in k]
data_shapes = [
        [1, 64, 32, 32], [1, 64, 32, 32], [1, 64, 32, 32], [1, 64, 32, 32], [1, 64, 32, 32], [1, 64, 32, 32],
        [1, 64, 32, 32], [1, 128, 16, 16], [1, 128, 16, 16], [1, 128, 16, 16], [1, 128, 16, 16], [1, 128, 16, 16],
        [1, 128, 16, 16], [1, 128, 16, 16], [1, 128, 16, 16], [1, 256, 8, 8], [1, 256, 8, 8], [1, 256, 8, 8],
        [1, 256, 8, 8], [1, 256, 8, 8], [1, 256, 8, 8], [1, 256, 8, 8], [1, 256, 8, 8], [1, 256, 8, 8],
        [1, 256, 8, 8], [1, 256, 8, 8], [1, 256, 8, 8], [1, 512, 4, 4], [1, 512, 4, 4], [1, 512, 4, 4],
        [1, 512, 4, 4], [1, 512, 4, 4]]

time_without_mem_list_naive = []
time_include_mem_list_naive = []
time_without_mem_list_shared = []
time_include_mem_list_shared = []
time_without_mem_list_constant = []
time_include_mem_list_constant = []

time_wo_naive = 0
time_wo_shared = 0
time_wo_constant = 0


for i in tqdm(range(len(residual_convs[:]))):
    input_data = np.float32(np.ones(data_shapes[i]))

    if i == len(residual_convs) - 1:
        output_data =  np.float32(np.zeros(data_shapes[i]))
    else:
        output_data =  np.float32(np.zeros(data_shapes[i + 1]))

    conv_layer_weight = residual_convs[i].astype(np.float32)
    patterns = np.array(extract_patterns(conv_layer_weight))
    sparse_conv_arrays = SparseConvArrays(conv_layer_weight, patterns)
    offset = sparse_conv_arrays.offset
    reorder = sparse_conv_arrays.reorder
    index = sparse_conv_arrays.index
    stride = sparse_conv_arrays.stride
    sparse_weight = sparse_conv_arrays.weight
    ptset = np.float32(sparse_conv_arrays.ptset)

    # step 卷积步长
    if i == len(residual_convs) - 1:
        step = 1
    else:
        step = int(data_shapes[i][2] / data_shapes[i + 1][2])
        
    output_naive, time_without_mem_naive, time_include_mem_naive = conv.conv_sparse_naive(input_data, offset, reorder, index, stride, sparse_weight, ptset, step, output_data)
    output_shared, time_without_mem_shared, time_include_mem_shared = conv.conv_sparse_shared_mem(input_data, offset, reorder, index, stride, sparse_weight, ptset, step, output_data)

    time_without_mem_list_naive.append(time_without_mem_naive)
    time_include_mem_list_naive.append(time_include_mem_naive)
    time_without_mem_list_shared.append(time_without_mem_shared)
    time_include_mem_list_shared.append(time_include_mem_shared)
    
    time_wo_naive += time_without_mem_naive
    time_wo_shared += time_without_mem_shared
    

        #constant memory limit
    if sparse_weight.shape[0] <= 16384:
        output_constant, time_without_mem_constant,  time_include_mem_constant = conv.conv_sparse_shared_constant_mem(input_data, offset, reorder, index, stride, sparse_weight, ptset, step, output_data)
        time_without_mem_list_constant.append(time_without_mem_constant)
        time_include_mem_list_constant.append(time_include_mem_constant)



print(time_wo_naive)
print(time_wo_shared)
print(time_without_mem_list_naive)
print(time_without_mem_list_shared)

100%|██████████| 32/32 [00:22<00:00,  1.41it/s]

0.005999040022492409
0.0052144320085644735
[0.00015772800147533417, 0.0001335040032863617, 0.00014745600521564483, 0.00012931199371814727, 0.0001446399986743927, 0.00013475200533866882, 0.00012166400253772736, 0.00013926400244235993, 0.0001566399931907654, 0.0001228799968957901, 0.00015568000078201294, 0.00012492799758911133, 0.000179967999458313, 0.0001433600038290024, 0.00014921599626541138, 0.0001470720022916794, 0.0002229440063238144, 0.0001536320000886917, 0.00024374400079250335, 0.00015836800634860994, 0.0002260800004005432, 0.00017369599640369417, 0.0002258239984512329, 0.0001842560023069382, 0.0001966399997472763, 0.0002396160066127777, 0.0002070080041885376, 0.00030649599432945254, 0.0003256320059299469, 0.0002825599908828735, 0.0003645760118961334, 0.00019990399479866028]
[0.00013385599851608278, 0.00013046400249004366, 0.0001380160003900528, 0.00012086399644613266, 0.00015641599893569946, 0.00013836799561977388, 0.00011161600053310394, 0.00011846400052309037, 0.0001295039951


