In [None]:
from __future__ import print_function
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR

import pandas as pd
import os
import itertools
import numpy as np

from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
# Record the intra-op CPU thread count PyTorch reports at this point, before
# any later cell changes it with torch.set_num_threads().
default_threads = torch.get_num_threads()
print(default_threads)

In [None]:
class Net(nn.Module):
    """Two-convolution CNN classifier with a configurable width.

    The layer sizes fix the expected input to single-channel 28x28 images:
    two unpadded 3x3 convolutions followed by a 2x2 max-pool leave a 12x12
    map per channel, which is what the first dense layer is sized for.

    Parameters
    ----------
    filters1 : int
        Output channels of the first convolution.
    filters2 : int
        Output channels of the second convolution.
    units1 : int
        Width of the hidden dense layer.
    """

    def __init__(self, filters1=32, filters2=64, units1=128):
        super(Net, self).__init__()

        # 28 -> 26 -> 24 through the two convolutions, then 24 / 2 = 12.
        flattened_features = 12 * 12 * filters2

        # Registration order is kept stable: it determines both the order of
        # state_dict() keys and the order in which seeded weights are drawn.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=filters1,
                               kernel_size=3, stride=1)
        self.conv2 = nn.Conv2d(in_channels=filters1, out_channels=filters2,
                               kernel_size=3, stride=1)
        self.dropout1 = nn.Dropout(p=0.25)
        self.dropout2 = nn.Dropout(p=0.5)
        self.fc1 = nn.Linear(flattened_features, units1)
        self.fc2 = nn.Linear(units1, 10)

    def forward(self, x):
        """Return per-class log-probabilities of shape ``(batch, 10)``."""
        features = F.relu(self.conv1(x))
        features = F.relu(self.conv2(features))
        features = self.dropout1(F.max_pool2d(features, 2))

        hidden = F.relu(self.fc1(torch.flatten(features, 1)))
        logits = self.fc2(self.dropout2(hidden))
        return F.log_softmax(logits, dim=1)

In [None]:
def train(model, device, train_loader, optimizer, epoch, log_interval, dry_run):
    """Run one pass of NLL-loss training over ``train_loader``.

    Parameters
    ----------
    model : callable module producing log-probabilities for a batch.
    device : device every batch is moved to before the forward pass.
    train_loader : iterable of ``(data, target)`` batches; its ``dataset``
        attribute and length are only used for the progress message.
    optimizer : optimizer whose gradients are zeroed and stepped per batch.
    epoch : value printed in the progress message.
    log_interval : a progress line is printed every ``log_interval`` batches.
    dry_run : when true, stop right after the first progress line.
    """
    model.train()
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        loss = F.nll_loss(model(inputs), labels)
        loss.backward()
        optimizer.step()

        if step % log_interval != 0:
            continue

        percent_done = 100. * step / len(train_loader)
        print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            epoch, step * len(inputs), len(train_loader.dataset),
            percent_done, loss.item()))
        # The dry-run exit is only reachable on a logging step.
        if dry_run:
            break

In [None]:
def test(model, device, test_loader, cpu_threads, verbose):
    print(device)
    #torch.set_num_threads(cpu_threads)
    
    model.eval()
    test_loss = 0
    correct = 0
    
    dt_inf = []
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            #torch.set_num_threads(cpu_threads)
            dt = %timeit -o -n 1 -r 1 -q model(data)
            dt_inf.append(dt)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)
    #torch.set_num_threads(1)

    if not verbose:
        return dt_inf
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

In [None]:
def main(batch_size=64, test_batch_size=1000,
         epochs=14, lr=1, gamma=0.7, no_cuda=False, dry_run=False,
         seed=1, log_interval=10, save_model=True,
         filters1=32, filters2=64, units1=128, verbose=True, cpu_threads=1):
    """Train ``Net`` on MNIST and collect what ``test`` returns after each epoch.

    Parameters
    ----------
    batch_size, test_batch_size : int
        Batch sizes of the training and test loaders.
    epochs : int
        Number of train/test rounds.
    lr : float
        Learning rate passed to ``Adadelta``.
    gamma : float
        Per-epoch decay factor of the ``StepLR`` scheduler.
    no_cuda : bool
        Force the CPU even when CUDA is available.
    dry_run : bool
        Forwarded to ``train``.
    seed : int
        Seed passed to ``torch.manual_seed``.
    log_interval : int
        Forwarded to ``train``.
    save_model : bool
        When true, write the final ``state_dict`` to ``mnist_cnn.pt``.
    filters1, filters2, units1 : int
        Architecture parameters forwarded to ``Net``.
    verbose, cpu_threads
        Forwarded unchanged to ``test``.

    Returns
    -------
    list
        One entry per epoch: the value returned by ``test`` for that epoch.
    """
    # CUDA is used only when it is both requested and available; otherwise the
    # run silently falls back to the CPU.
    use_cuda = not no_cuda and torch.cuda.is_available()

    torch.manual_seed(seed)

    device = torch.device("cuda" if use_cuda else "cpu")

    print('Using CUDA', use_cuda, 'with device', device)

    # Shuffle the training data on every device and never shuffle the test
    # data, so CPU and GPU runs are fed the same way. The shuffle flag
    # used to sit in the CUDA-only options, which left CPU training unshuffled
    # and shuffled the GPU test set.
    train_kwargs = {'batch_size': batch_size, 'num_workers': 0, 'shuffle': True}
    test_kwargs = {'batch_size': test_batch_size, 'num_workers': 0, 'shuffle': False}
    if use_cuda:
        cuda_kwargs = {'num_workers': 1,
                       'pin_memory': True}
        train_kwargs.update(cuda_kwargs)
        test_kwargs.update(cuda_kwargs)

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])
    # download=True on the training split fetches the data into ../data if needed.
    dataset1 = datasets.MNIST('../data', train=True, download=True,
                              transform=transform)
    dataset2 = datasets.MNIST('../data', train=False,
                              transform=transform)
    train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs)
    test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)

    print('Number of test batches: ', len(test_loader))

    model = Net(filters1=filters1, filters2=filters2, units1=units1).to(device)
    optimizer = optim.Adadelta(model.parameters(), lr=lr)

    scheduler = StepLR(optimizer, step_size=1, gamma=gamma)
    inference_times = []

    for epoch in range(1, epochs + 1):
        train(model, device, train_loader, optimizer,
              epoch, log_interval, dry_run)
        dt = test(model, device, test_loader, cpu_threads, verbose)
        inference_times.append(dt)

        scheduler.step()

    if save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")

    return inference_times

In [None]:
def print_summary(model, verbose=True):
    """Count the elements of every tensor in ``model.state_dict()``.

    Everything in the state dict is counted, not only trainable weights.

    Parameters
    ----------
    model : object exposing ``state_dict()``.
    verbose : when true, print each entry name followed by the total.

    Returns
    -------
    int
        Total number of elements across all state-dict tensors.
    """
    total = 0

    for name, tensor in model.state_dict().items():
        if verbose:
            print(name)
        total += tensor.numel()

    if verbose:
        print('Total parameters: ', total)

    return total


# Try the counter on a network with a 4-unit dense layer; the total returned
# by print_summary is the cell's displayed value.
model = Net(units1=4)
print_summary(model)

In [None]:
# CSV file the sweep results are written to and, optionally, reloaded from.
filename = 'cpu-vs-gpu-inference.csv'

# Columns of the results table: the swept configuration first, then the
# parameter count and thread setting, then the timing statistics in seconds.
columns = [
    'use_cuda',
    'test_batch_size',
    'filters1',
    'filters2',
    'units1',
    'params',
    'cpu_threads',
    'timings_avg_s',
    'timings_best_s',
    'timings_worst_s',
]

In [None]:
# Set to False to extend the results already stored in `filename`.
overwrite = True

# Start from an empty table unless asked to reuse a results file that exists.
if overwrite or not os.path.exists(filename):
    df = pd.DataFrame(columns=columns)
else:
    df = pd.read_csv(filename)

In [None]:
# define value range for the experiment

# measure the execution time of model(data)

v_use_cuda = [True, False]
v_test_batch_size = [32]
v_filters1 = [1, 2, 4, 8, 16, 32, 64]
v_filters2 = [1, 2, 4, 8, 16, 32, 64]
v_units1 = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]
cpu_threads = ['auto'] # [1, 8]

# main() silently falls back to the CPU when CUDA is unavailable, which would
# store CPU timings in rows labelled use_cuda=True. Skip those configurations
# instead of recording mislabelled measurements.
if not torch.cuda.is_available():
    print('CUDA is not available: skipping the use_cuda=True configurations')
    v_use_cuda = [flag for flag in v_use_cuda if not flag]

# Collect one record per configuration and build the frame once at the end,
# rather than concatenating onto `df` inside the loop.
records = []

for use_cuda, test_batch_size, filters1, filters2, units1, cpu_thread in itertools.product(v_use_cuda, v_test_batch_size, v_filters1, v_filters2, v_units1, cpu_threads):
    print(use_cuda, test_batch_size, filters1, filters2, units1, cpu_thread)

    no_cuda = not use_cuda

    # Throw-away instance, used only to count the parameters of this configuration.
    params = print_summary(Net(units1=units1, filters1=filters1, filters2=filters2),
                           verbose=False)

    inference_times = main(log_interval=1000, epochs=3, test_batch_size=test_batch_size,
                           dry_run=True, verbose=False, no_cuda=no_cuda,
                           filters1=filters1, filters2=filters2, units1=units1, cpu_threads=cpu_thread)

    # inference_times is a list (one entry per epoch) of lists (one timing
    # result per test batch); flatten both levels into a single array.
    timings = np.array([result.timings[:]
                        for epoch_results in inference_times
                        for result in epoch_results])

    records.append(dict(use_cuda=use_cuda,
                        test_batch_size=test_batch_size,
                        filters1=filters1,
                        filters2=filters2,
                        units1=units1,
                        params=params,
                        cpu_threads=cpu_thread,
                        timings_avg_s=np.mean(timings),
                        timings_best_s=np.min(timings),
                        timings_worst_s=np.max(timings)))

if records:
    new_df = pd.DataFrame(records)
    # Replacing an empty table, rather than concatenating onto it, keeps the
    # numeric dtypes of the new rows; otherwise append with a fresh unique index.
    df = new_df if df.empty else pd.concat([df, new_df], axis=0, ignore_index=True)

In [None]:
# Write the results without the row index: the reload path uses a plain
# pd.read_csv(filename), which would turn a saved index into an extra
# "Unnamed: 0" column.
df.to_csv(filename, index=False)

In [None]:
# Display the collected results table.
df

In [None]:
sns.set_context('talk')
sns.set_style('ticks')

# Stored timings are in seconds; add a millisecond column for plotting.
# Later cells read this column from `df` as well.
df['timings_avg_ms'] = df['timings_avg_s'] * 1e3 # to ms

# Mean model-call time against parameter count, one line per use_cuda value.
ax = sns.lineplot(data=df, x='params', y='timings_avg_ms', hue='use_cuda',
                  marker='o', legend=False)
ax.set_yscale('log')
ax.set_xscale('log')
# NOTE(review): these labels are attached by position, which assumes the
# use_cuda=False line is drawn first — confirm against the rendered figure.
ax.legend(['CPU', 'GPU'], ncol=2)
ax.set(xlabel='Network parameters', ylabel='Model call (ms)')
# Fixed ticks are applied after switching to the log scale.
ax.set_yticks([0.1, 0.2, 0.5, 1, 2, 5, 10])
ax.set_yticklabels([0.1, 0.2, 0.5, 1, 2, 5, 10])

ax.figure.tight_layout()
plt.show()

In [None]:
# Mean model-call time against dense-layer width, one line per use_cuda value.
# Reads the `timings_avg_ms` column of `df`; the x axis is left linear here.
ax = sns.lineplot(data=df, x='units1', y='timings_avg_ms', hue='use_cuda',
                  marker='o', legend=False)
ax.set_yscale('log')
# NOTE(review): these labels are attached by position, which assumes the
# use_cuda=False line is drawn first — confirm against the rendered figure.
ax.legend(['CPU', 'GPU'], ncol=2)
ax.set(xlabel='Dense layer units', ylabel='Model call (ms)')
# Fixed ticks are applied after switching to the log scale.
ax.set_yticks([0.1, 0.2, 0.5, 1, 2, 5, 10])
ax.set_yticklabels([0.1, 0.2, 0.5, 1, 2, 5, 10])

ax.figure.tight_layout()
plt.show()

In [None]:
%%time
# Wall-clock a single CPU dry run (one epoch) with PyTorch set to 250 threads.
# The thread setting is global and is not reset by this cell.
torch.set_num_threads(250)
t1 = main(no_cuda=True, dry_run=True, epochs=1, verbose=False)

In [None]:
# Thread counts to benchmark, and the mean model-call time (seconds) for each.
threads = [1, 2, 4, 8, 16, 32, 64, 128, 256]
thread_times = []

try:
    for thread in threads:
        torch.set_num_threads(thread)
        t1 = main(no_cuda=True, dry_run=True, epochs=1, verbose=False, filters1=32, filters2=64, units1=128)
        # t1 is a list (per epoch) of lists (per test batch) of timing results;
        # flatten both levels before averaging.
        t2 = np.array([result.timings[:]
                       for epoch_results in t1
                       for result in epoch_results])
        thread_times.append(np.mean(t2))
finally:
    # torch.set_num_threads is a global setting: put back the value recorded in
    # `default_threads` at the top of the notebook so that anything run after
    # this sweep is not left on the last swept thread count.
    torch.set_num_threads(default_threads)

In [None]:
tdf = pd.DataFrame(dict(threads=threads, thread_times=thread_times))

# thread_times holds seconds, while the axis label and the fixed ticks below
# are in milliseconds: convert before plotting so the curve is on that scale.
# The original seconds column is kept untouched.
tdf['thread_times_ms'] = tdf['thread_times'] * 1e3

sns.pointplot(data=tdf, x='threads', y='thread_times_ms', linestyles=':')

plt.xlabel('Pytorch CPU threads')
plt.ylabel('Model call (ms)')
ax=plt.gca()

ax.set_yticks(np.arange(0.1, 0.9, 0.1))
plt.savefig('model_call_vs_cpu_threads.png', dpi=300)

In [None]:
# Thread count with the lowest mean model-call time in the sweep.
threads[np.argmin(thread_times)]

In [None]:
# Rows measured with Net's default architecture (32 and 64 filters, 128 dense units).
df.loc[
    (df['units1'] == 128)
    & (df['filters2'] == 64)
    & (df['filters1'] == 32)
]

In [None]:
# Split the results by device. Each half gets a fresh 0..n-1 index, and the
# old index, which reset_index() exposes as an 'index' column, is discarded.
cpu_df = df.loc[df['use_cuda'] == False].reset_index().drop(columns='index')
gpu_df = df.loc[df['use_cuda'] == True].reset_index().drop(columns='index')

In [None]:
# The ratio below pairs CPU and GPU rows by position, so both frames must list
# the same configurations in the same order. Fail loudly if they do not,
# instead of silently producing a misaligned or NaN-padded ratio.
config_cols = ['test_batch_size', 'filters1', 'filters2', 'units1', 'cpu_threads']
assert cpu_df[config_cols].equals(gpu_df[config_cols]), \
    'cpu_df and gpu_df must contain the same configurations in the same order'

# How many times slower the CPU model call is than the GPU one.
cpu_df['cpu_to_gpu_factor'] = cpu_df['timings_avg_s'] / gpu_df['timings_avg_s']
cpu_df

In [None]:
# Bucket every configuration by the order of magnitude of its parameter count
# (log10 rounded to the nearest integer); this adds a column to cpu_df.
cpu_df['mag_params'] = np.round(np.log10(cpu_df['params'].astype(float)))

# CPU/GPU time ratio per magnitude bucket.
ax = sns.lineplot(data=cpu_df, x='mag_params', y='cpu_to_gpu_factor', marker='o')
ax.set(xlabel='Network parameters',
       ylabel=r'$\tau_\mathrm{CPU} / \tau_\mathrm{GPU}$')

# The x values are exponents, so label them as powers of ten.
ax.set_xticks([2, 3, 4, 5, 6, 7])
ax.set_xticklabels([r'$10^2$', r'$10^3$', r'$10^4$', r'$10^5$', r'$10^6$', r'$10^7$'])

# Fixed ticks are applied after switching to the log scale.
ax.set_yscale('log')
ax.set_yticks([1, 2, 5, 10, 20, 50])
ax.set_yticklabels([1, 2, 5, 10, 20, 50])

plt.savefig('cpu_to_gpu_factor_vs_params.png', dpi=300)