# Train the network using the RedisAI database as the exchange point, and debug the problems that arise


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import torchvision.models as models
import torch.utils.data as tdata

import numpy as np
import redisai as rai

from dataclasses import dataclass

# import the modules used in the program
import train_utils

c:\users\diego\cs\thesis\venv\lib\site-packages\numpy\.libs\libopenblas.NOIJJG62EMASZI6NYURL6JBKM4EVBGM7.gfortran-win_amd64.dll
c:\users\diego\cs\thesis\venv\lib\site-packages\numpy\.libs\libopenblas.PYQHXLVVQ7VESDPUVUADXEVJOBGHJPAY.gfortran-win_amd64.dll
  stacklevel=1)


## Create the network

In [3]:
@dataclass
class TrainParams:
    """Parameters describing one training job launched from this notebook.

    Field descriptions marked "presumably" are inferred from the single
    instance built in this notebook and should be confirmed against
    ``train_utils``.
    """
    # Identifier of the job; presumably the prefix of the database keys
    # (a key such as "example:conv2.bias.grad/0" is read later) -- TODO confirm
    ps_id: str
    # Presumably the number of parallel functions/workers -- TODO confirm
    N: int
    # Name of the task to run; this notebook passes 'train'
    task: str
    # Presumably the index of this function/worker (the "/0" key suffix) -- TODO confirm
    func_id: int
    # Learning rate
    lr: float
    # Mini-batch size
    batch_size: int
    

class Net(nn.Module):
    """Small convolutional classifier: two conv layers, two linear layers.

    The output of ``forward`` is a tensor of per-class log-probabilities
    (10 classes).
    """

    def __init__(self):
        super().__init__()
        # Layers are registered in this exact order so that the state_dict
        # keys and any seeded initialization stay the same.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, stride=1)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1)
        self.dropout1 = nn.Dropout(p=0.25)
        self.dropout2 = nn.Dropout(p=0.5)
        # 9216 input features = 64 channels * 12 * 12, which is what the
        # conv/pool stack produces for a 1x28x28 input.
        self.fc1 = nn.Linear(in_features=9216, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=10)

    def forward(self, x):
        """Map a batch of single-channel images to log-probabilities over 10 classes."""
        # Convolutional feature extractor followed by 2x2 max pooling
        features = F.relu(self.conv1(x))
        features = F.max_pool2d(F.relu(self.conv2(features)), 2)
        # Dropout, then flatten everything except the batch dimension
        features = torch.flatten(self.dropout1(features), 1)
        # Fully connected head
        hidden = self.dropout2(F.relu(self.fc1(features)))
        return F.log_softmax(self.fc2(hidden), dim=1)
    
def create_model(init: bool):
    """Build the network trained in this notebook.

    The architecture is the simple model from the PyTorch MNIST example
    (https://github.com/pytorch/examples/blob/master/mnist/main.py).

    Args:
        init: when true, re-initialize every Conv2d and Linear layer
            (Xavier-uniform weights, biases set to 0.01) and print a notice.

    Returns:
        The newly constructed ``Net`` instance.
    """

    def init_weights(m: nn.Module):
        """Xavier-initialize weights and set constant biases on conv/linear layers."""
        # Every other module type is left untouched
        if not isinstance(m, (nn.Conv2d, nn.Linear)):
            return
        nn.init.xavier_uniform_(m.weight)
        nn.init.constant_(m.bias, 0.01)

    net = Net()

    # Only touch the default initialization when explicitly requested
    if init:
        print('Initializing layers...')
        net.apply(init_weights)

    return net

In [3]:
# Show whether a CUDA device is usable; the same check selects the device later on
torch.cuda.is_available()

True

## Create the Dataloader

In [4]:

# TODO: the maximum document size is 16 MB, which could cause problems in the future
# when the datasets are larger; we should calculate the size (easy) and divide the dataset accordingly
def split_dataset(X, Y, subsets):
    """Split X and Y into matching subsets along the first axis.

    Args:
        X: array of samples.
        Y: array of labels, split the same way as ``X``.
        subsets: number of subsets (int) or explicit split indices, as
            accepted by ``numpy.array_split``.

    Returns:
        ``(X_split, Y_split)``: two lists of arrays, one entry per subset.
    """
    # array_split gives the same result as split when the length divides
    # evenly, and otherwise makes the leading subsets one element longer
    # instead of raising ValueError.
    X_split = np.array_split(X, subsets)
    Y_split = np.array_split(Y, subsets)

    return X_split, Y_split


def approx_size(a: np.ndarray):
    """Approximate size in MB (1e6 bytes) of ``a`` if stored as float32.

    Only ``a.shape`` is used: 4 bytes per element are assumed regardless of
    the actual dtype.
    """
    # Accumulate in 64 bits: where the default NumPy integer is 32-bit the
    # product of a large shape would otherwise overflow.
    n_elements = np.prod(a.shape, dtype=np.int64)
    return (32 / 8) * n_elements / 1e6



In [15]:
# Scratch arithmetic: number of samples in 47 and in 16 batches of 128.
# NOTE(review): 128 matches the batch size used in this notebook; where the 47 comes from is not visible here -- confirm or remove
47*128, 16*128

(6016, 2048)

In [21]:
# `datasets` and `transforms` are used below but were never imported in this
# notebook, so the cell failed with NameError on a fresh kernel.
from torchvision import datasets, transforms

# Convert images to tensors, then normalize with a fixed mean/std
# (presumably the MNIST statistics used in the PyTorch MNIST example -- confirm)
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
])

# download=False: the dataset must already be present under ./data
train_data = datasets.MNIST('./data', train=True, download=False, transform=transform)
val_data = datasets.MNIST('./data', train=False, download=False, transform=transform)

# Keep only the first 3000 training and 2000 validation samples
train_data.data, train_data.targets = train_data.data[:3000], train_data.targets[:3000]
val_data.data, val_data.targets = val_data.data[:2000], val_data.targets[:2000]

In [22]:

# Batch both subsets in groups of 128 samples, then show how many batches each loader yields
train_loader = tdata.DataLoader(train_data, batch_size=128)
val_loader = tdata.DataLoader(val_data, batch_size=128)
(len(train_loader), len(val_loader))

(24, 16)

# Define the train and validation methods


In [23]:
def train(model: nn.Module, device,
          train_loader: tdata.DataLoader,
          optimizer: torch.optim.Optimizer, tensor_dict,
          epoch: int = 1, log_interval: int = 5) -> float:
    """Run one pass over ``train_loader``, updating ``model`` with ``optimizer``.

    Args:
        model: network to train; it is switched to training mode.
        device: device the batches are moved to before the forward pass.
        train_loader: loader yielding ``(data, target)`` batches.
        optimizer: optimizer stepping the parameters of ``model``.
        tensor_dict: dictionary handed to ``train_utils.update_tensor_dict``
            after every backward pass (presumably filled with the current
            gradients so they can be published to the database -- confirm
            against train_utils).
        epoch: epoch number shown in the progress lines. Defaults to 1, the
            value that was previously hard-coded.
        log_interval: a progress line is printed every ``log_interval``
            batches. Defaults to 5, the previous fixed value.

    Returns:
        The mean of the per-batch losses over the pass.
    """
    model.train()
    tot = 0.0
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)

        loss = F.nll_loss(output, target)
        tot += loss.item()
        loss.backward()

        # Gradients are available only after backward(); hand the model to
        # train_utils before the optimizer applies them.
        train_utils.update_tensor_dict(model, tensor_dict)
        optimizer.step()

        if batch_idx % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

    return tot / len(train_loader)


def validate(model, device, val_loader: tdata.DataLoader) -> (float, float):
    """Evaluate ``model`` on ``val_loader`` with gradients disabled.

    Prints a one-line summary and returns ``(accuracy, test_loss)``, where
    ``accuracy`` is the percentage of correctly classified samples and
    ``test_loss`` is the negative log-likelihood averaged over all samples.
    """
    model.eval()
    summed_loss, n_correct = 0, 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            log_probs = model(inputs)
            # Accumulate the summed (not averaged) loss of the batch
            summed_loss += F.nll_loss(log_probs, labels, reduction='sum').item()
            # Predicted class = index of the largest log-probability
            predicted = log_probs.argmax(dim=1, keepdim=True)
            n_correct += predicted.eq(labels.view_as(predicted)).sum().item()

    n_samples = len(val_loader.dataset)
    test_loss = summed_loss / n_samples
    accuracy = 100. * n_correct / n_samples

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, n_correct, n_samples, accuracy))
    return accuracy, test_loss

## Main entrypoint of the code

In [30]:
from copy import deepcopy

# Parameters of this run
params = TrainParams(ps_id='example', func_id=0, N=2, task='train', lr=0.01, batch_size=128)

# Fix the seed so the weight initialization below is reproducible
torch.manual_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Create the model with freshly initialized layers and move it to the device
model = create_model(True).to(device)

# Independent snapshot of the initial weights, used for later comparisons
initial = deepcopy(model.state_dict())

# Left disabled: `con` is only created in a later cell
# train_utils.save_model_weights(model, params, con)

Initializing layers...


In [11]:
# Raise the print precision first, then leave the tensor as the last
# expression so the notebook actually displays it.
torch.set_printoptions(precision=10)
initial['conv2.bias']

In [26]:
# Client for the RedisAI instance used as the exchange point.
# debug=True is kept: later cells show each issued command echoed in the output.
addr, port = '192.168.99.101', 31618
con = rai.Client(host=addr, port=port, debug=True)

In [28]:
# Load into `model` the weights stored in the database under id '268b3327'
# (the output shows one AI.TENSORGET per layer weight and bias)
train_utils.load_model_weights(model, '268b3327', con)

Loading weights for layer conv1
AI.TENSORGET 268b3327:conv1.weight META BLOB
Loading bias for layer conv1
AI.TENSORGET 268b3327:conv1.bias META BLOB
Loading weights for layer conv2
AI.TENSORGET 268b3327:conv2.weight META BLOB
Loading bias for layer conv2
AI.TENSORGET 268b3327:conv2.bias META BLOB
Loading weights for layer fc1
AI.TENSORGET 268b3327:fc1.weight META BLOB
Loading bias for layer fc1
AI.TENSORGET 268b3327:fc1.bias META BLOB
Loading weights for layer fc2
AI.TENSORGET 268b3327:fc2.weight META BLOB
Loading bias for layer fc2
AI.TENSORGET 268b3327:fc2.bias META BLOB
Bias layer is tensor([0.0098, 0.0102, 0.0091, 0.0090, 0.0100, 0.0104, 0.0104, 0.0115, 0.0095,
        0.0100])
Loaded state dict from the database dict_keys(['conv1.weight', 'conv1.bias', 'conv2.weight', 'conv2.bias', 'fc1.weight', 'fc1.bias', 'fc2.weight', 'fc2.bias'])


  state[weight_key[9:]] = torch.from_numpy(w)


In [29]:
# Evaluate the weights just loaded from the database; returns (accuracy %, mean loss)
validate(model, device, val_loader)


Test set: Average loss: 2.3226, Accuracy: 218/2000 (11%)



(10.9, 2.322626625061035)

### Train for a couple of epochs

In [31]:
# Dictionary passed to train(), which hands it to train_utils.update_tensor_dict
tdict = dict()

# Learning rate comes from the run parameters (same value, 0.01)
optimizer = optim.SGD(model.parameters(), lr=params.lr)

# `i` used to be read without being defined in this cell, so it only worked
# with a value left over in the kernel. Loop explicitly instead; one epoch
# reproduces the single train/validate pass this cell performed.
NUM_EPOCHS = 1
for i in range(1, NUM_EPOCHS + 1):
    print('Epoch', i)
    train(model, device, train_loader, optimizer, tdict)
    epoch_result = validate(model, device, val_loader)

# Display the (accuracy, loss) of the last epoch
epoch_result


Epoch 5

Test set: Average loss: 1.9604, Accuracy: 1177/2000 (59%)



(58.85, 1.9604190673828126)

In [32]:
# Independent copy of the weights after local training, kept for comparison
final = deepcopy(model.state_dict())


In [34]:
# Overwrite the model with the weights stored under id 'e27bf8b8'
# (presumably produced by the database-backed training run -- TODO confirm)
train_utils.load_model_weights(model, 'e27bf8b8', con)

Loading weights for layer conv1
AI.TENSORGET e27bf8b8:conv1.weight META BLOB
Loading bias for layer conv1
AI.TENSORGET e27bf8b8:conv1.bias META BLOB
Loading weights for layer conv2
AI.TENSORGET e27bf8b8:conv2.weight META BLOB
Loading bias for layer conv2
AI.TENSORGET e27bf8b8:conv2.bias META BLOB
Loading weights for layer fc1
AI.TENSORGET e27bf8b8:fc1.weight META BLOB
Loading bias for layer fc1
AI.TENSORGET e27bf8b8:fc1.bias META BLOB
Loading weights for layer fc2
AI.TENSORGET e27bf8b8:fc2.weight META BLOB
Loading bias for layer fc2
AI.TENSORGET e27bf8b8:fc2.bias META BLOB
Bias layer is tensor([0.0095, 0.0102, 0.0094, 0.0098, 0.0101, 0.0101, 0.0103, 0.0107, 0.0100,
        0.0099])
Loaded state dict from the database dict_keys(['conv1.weight', 'conv1.bias', 'conv2.weight', 'conv2.bias', 'fc1.weight', 'fc1.bias', 'fc2.weight', 'fc2.bias'])


In [35]:
# Weights currently in the model, i.e. the ones just loaded from the database.
# NOTE(review): unlike `initial`/`final` this is not deep-copied, so it shares
# storage with the live model -- fine only while the model is left unchanged.
dist = model.state_dict()

In [37]:
# Element-wise difference between the locally trained conv1 bias and the one loaded from the database
final['conv1.bias'] - dist['conv1.bias']

tensor([-5.3157e-04,  2.4102e-03,  4.9774e-03, -2.2192e-03,  1.1392e-03,
        -1.0168e-03,  1.7234e-03, -1.9600e-03, -5.1682e-04, -1.1000e-03,
         2.9163e-03,  4.1495e-03, -8.2636e-06,  4.6715e-04, -1.2589e-03,
        -2.0384e-03,  5.5351e-03,  2.3986e-03,  2.8878e-03,  4.5331e-03,
         3.3866e-03,  5.2829e-03,  9.3734e-04, -3.7543e-04,  6.0971e-03,
        -2.5996e-03,  1.7083e-03, -4.0774e-04,  2.4535e-03,  1.4973e-03,
         1.7151e-03, -1.7721e-03], device='cuda:0')

In [40]:
# Mean value of every tensor collected in tdict, keyed by its name
[(name, grad.mean()) for name, grad in tdict.items()]

[('conv1.weight.grad', tensor(0.0010779696, device='cuda:0')),
 ('conv1.bias.grad', tensor(0.0054637371, device='cuda:0')),
 ('conv2.weight.grad', tensor(0.0006221278, device='cuda:0')),
 ('conv2.bias.grad', tensor(0.0024095972, device='cuda:0')),
 ('fc1.weight.grad', tensor(0.0002331834, device='cuda:0')),
 ('fc1.bias.grad', tensor(0.0006510335, device='cuda:0')),
 ('fc2.weight.grad', tensor(-1.3606040739e-09, device='cuda:0')),
 ('fc2.bias.grad', tensor(-3.7252902985e-09, device='cuda:0'))]

In [19]:


# Publish the collected tensors to the database.
# NOTE(review): the output hit the IOPub rate limit, presumably because the
# client was created with debug=True and echoes every command -- confirm
train_utils.save_gradients(tdict,params, con)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
# Inspect the current conv2 bias of the model
model.conv2.bias

In [None]:
# Inspect the conv2 bias gradient stored in tdict
tdict['conv2.bias.grad']

In [None]:
# True only if every conv2 weight still equals its value in the `initial` snapshot
torch.eq(model.conv2.weight, initial['conv2.weight']).all()

In [None]:
# numpy is already imported in the first cell; this repeat is harmless
import numpy as np
# Read one stored tensor back from the database without converting it to numpy.
# The key presumably follows "<ps_id>:<tensor name>/<func_id>" -- TODO confirm against train_utils
a = con.tensorget("example:conv2.bias.grad/0", as_numpy=False)
a