In [1]:
%matplotlib inline


`Learn the Basics <intro.html>`_ ||
**Quickstart** ||
`Tensors <tensorqs_tutorial.html>`_ ||
`Datasets & DataLoaders <data_tutorial.html>`_ ||
`Transforms <transforms_tutorial.html>`_ ||
`Build Model <buildmodel_tutorial.html>`_ ||
`Autograd <autogradqs_tutorial.html>`_ ||
`Optimization <optimization_tutorial.html>`_ ||
`Save & Load Model <saveloadrun_tutorial.html>`_

Quickstart
===================
This section runs through the API for common tasks in machine learning. Refer to the links in each section to dive deeper.

Working with data
-----------------
PyTorch has two `primitives to work with data <https://pytorch.org/docs/stable/data.html>`_:
``torch.utils.data.DataLoader`` and ``torch.utils.data.Dataset``.
``Dataset`` stores the samples and their corresponding labels, and ``DataLoader`` wraps an iterable around
the ``Dataset``.




In [2]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor, Lambda, Compose
import matplotlib.pyplot as plt
import torch.nn.functional as F


PyTorch offers domain-specific libraries such as `TorchText <https://pytorch.org/text/stable/index.html>`_,
`TorchVision <https://pytorch.org/vision/stable/index.html>`_, and `TorchAudio <https://pytorch.org/audio/stable/index.html>`_,
all of which include datasets. For this tutorial, we  will be using a TorchVision dataset.

The ``torchvision.datasets`` module contains ``Dataset`` objects for many real-world vision data like
CIFAR, COCO (`full list here <https://pytorch.org/vision/stable/datasets.html>`_). In this tutorial, we
use the FashionMNIST dataset. Every TorchVision ``Dataset`` includes two arguments: ``transform`` and
``target_transform`` to modify the samples and labels respectively.



In [3]:
# Download training data from open datasets.
training_data = datasets.MNIST(
    root="data",
    train=True,
    download=True,
    transform=ToTensor(),
)

# Download test data from open datasets.
test_data = datasets.MNIST(
    root="data",
    train=False,
    download=True,
    transform=ToTensor(),
)

We pass the ``Dataset`` as an argument to ``DataLoader``. This wraps an iterable over our dataset, and supports
automatic batching, sampling, shuffling and multiprocess data loading. Here we define a batch size of 64, i.e. each element
in the dataloader iterable will return a batch of 64 features and labels.



In [4]:
batch_size = 64

# Create data loaders.
train_dataloader = DataLoader(training_data, batch_size=batch_size)
test_dataloader = DataLoader(test_data, batch_size=batch_size)

for X, y in test_dataloader:
    print("Shape of X [N, C, H, W]: ", X.shape)
    print("Shape of y: ", y.shape, y.dtype)
    break


Shape of X [N, C, H, W]:  torch.Size([64, 1, 28, 28])
Shape of y:  torch.Size([64]) torch.int64


Read more about `loading data in PyTorch <data_tutorial.html>`_.




--------------




Creating Models
------------------
To define a neural network in PyTorch, we create a class that inherits
from `nn.Module <https://pytorch.org/docs/stable/generated/torch.nn.Module.html>`_. We define the layers of the network
in the ``__init__`` function and specify how data will pass through the network in the ``forward`` function. To accelerate
operations in the neural network, we move it to the GPU if available.



Read more about `building neural networks in PyTorch <buildmodel_tutorial.html>`_.




--------------




In [5]:
import torch
print(torch.__version__)
# define the network
# Get cpu or gpu device for training.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

class DLRANet(nn.Module):
    weights_K:list[torch.Tensor]  # n1 x r
    weights_Lt:list[torch.Tensor] # n2 x r
    weights_S:list[torch.Tensor]  # r x r
    aux_U:list[torch.Tensor]      # n1 x r
    aux_Unp1:list[torch.Tensor]   # n1 x r
    aux_Vt:list[torch.Tensor]     # r x n2
    aux_Vtnp1:list[torch.Tensor]  # r x n2
    aux_N:list[torch.Tensor]      # r x r
    aux_M:list[torch.Tensor]      # r x r
    biases:list[torch.Tensor]     # n2
    num_layers:int 
    layer_width:int
    
    def __init__(self,input_dim:int,output_dim:int,layer_width:int,num_layers:int,low_rank:int=10):
        self.num_layers = num_layers
        self.layer_width = layer_width
        # weight initialization
        W = torch.rand(input_dim,layer_width)
        W = W/torch.norm(W)
        #print(W.size())
        u, s, v = torch.svd(W)
        #print(W)
        #print(torch.mm(u,torch.mm(torch.diag(s),torch.transpose(v,0,1))))
        #print(torch.mm(torch.narrow(u, 1, 0, low_rank),torch.mm(torch.narrow(torch.narrow(torch.diag(s), 0, 0, low_rank) , 1, 0, low_rank),torch.narrow(torch.transpose(v,0,1), 0, 0, low_rank))))
        #print(torch.mm(torch.narrow(u, 1, 0, low_rank),torch.mm(torch.narrow(torch.narrow(torch.diag(s), 0, 0, low_rank) , 1, 0, low_rank),torch.narrow(torch.transpose(v,0,1), 0, 0, low_rank)))-W)
        #print(u.size())
        #print(s.size())
        #print(v.size())
        #t = torch.narrow(torch.narrow(torch.diag(s), 0, 0, low_rank) , 1, 0, low_rank)
        #print(t)
        vt=torch.transpose(v,0,1)
        self.weights_K:list[torch.Tensor] = [torch.rand(input_dim,low_rank)]    # gets overwritten in K-step
        self.weights_Lt:list[torch.Tensor] = [torch.rand(layer_width,low_rank)] # gets overwritten in L-step
        self.weights_S:list[torch.Tensor] = [torch.narrow(torch.narrow(torch.diag(s), 0, 0, low_rank) , 1, 0, low_rank)]       # narrow to rxr
        self.aux_U:list[torch.Tensor] = [torch.narrow(u, 1, 0, low_rank)]      # narrow to n2xr
        self.aux_Unp1:list[torch.Tensor] = [torch.rand(input_dim,low_rank)]     # gets overwritten 
        self.aux_Vt:list[torch.Tensor] = [torch.narrow(vt, 0, 0, low_rank)]    # narrow to rxn1
        self.aux_Vtnp1:list[torch.Tensor] = [torch.rand(low_rank,layer_width)]   # gets overwritten 
        self.aux_N:list[torch.Tensor] = [torch.rand(low_rank,low_rank)]  # gets overwritten 
        self.aux_M:list[torch.Tensor] = [torch.rand(low_rank,low_rank)]  # gets overwritten 
        self.biases:list[torch.Tensor] = [torch.rand(layer_width)]       
        for i in range(1,num_layers-1):
            # weight initialization
            self.W = torch.rand(layer_width,layer_width)
            self.W = self.W/torch.norm(self.W)
            u, s, v = torch.svd(self.W)
            vt=torch.transpose(v,0,1)
            self.weights_K.append(torch.rand(layer_width,low_rank, requires_grad=True))
            self.weights_Lt.append(torch.rand(layer_width,low_rank, requires_grad=True))
            self.weights_S.append(torch.narrow(torch.narrow(torch.diag(s), 0, 0, low_rank) , 1, 0, low_rank))
            self.aux_U.append(torch.narrow(u, 1, 0, low_rank))
            self.aux_Unp1.append(torch.rand(layer_width,low_rank, requires_grad=False))
            self.aux_Vt.append(torch.narrow(vt, 0, 0, low_rank))
            self.aux_Vtnp1.append(torch.rand(low_rank,layer_width, requires_grad=False))
            self.aux_N.append(torch.rand(low_rank,low_rank, requires_grad=False))
            self.aux_M.append(torch.rand(low_rank,low_rank, requires_grad=False))
            self.biases.append(torch.rand(layer_width, requires_grad=True))
        W = torch.rand(layer_width,output_dim)
        W = W/torch.norm(W)
        u, s, v = torch.svd(W)
        vt=torch.transpose(v,0,1)
        self.weights_K.append(torch.rand(layer_width,low_rank, requires_grad=True))
        self.weights_Lt.append(torch.rand(output_dim,low_rank, requires_grad=True))
        self.weights_S.append(torch.narrow(torch.narrow(torch.diag(s), 0, 0, low_rank) , 1, 0, low_rank))
        self.aux_U.append(torch.narrow(u, 1, 0, low_rank))
        self.aux_Unp1.append(torch.rand(layer_width,low_rank, requires_grad=False))
        self.aux_Vt.append(torch.narrow(vt, 0, 0, low_rank))
        self.aux_Vtnp1.append(torch.rand(low_rank,output_dim, requires_grad=False))
        self.aux_N.append(torch.rand(low_rank,low_rank, requires_grad=False))
        self.aux_M.append(torch.rand(low_rank,low_rank, requires_grad=False))
        self.biases.append(torch.rand(output_dim, requires_grad=True))

        with torch.no_grad():
            for i in range(0,self.num_layers):
                # mark for auto differentiation tape
                self.weights_K[i].requires_grad = True
                self.weights_Lt[i].requires_grad = True
                self.weights_S[i].requires_grad = True
                self.aux_U[i].requires_grad = False
                self.aux_Vt[i].requires_grad = False
                self.aux_N[i].requires_grad = False
                self.aux_M[i].requires_grad = False
                self.biases[i].requires_grad = True
        return None
    
    def K_step_forward(self,input_tensor:torch.Tensor)->torch.Tensor:
        # K-step of DRLA (forward pass)
        # prepare  K 
        with torch.no_grad():
            for i in range(0,self.num_layers):
                # a) update K
                self.weights_K[i] = torch.matmul(self.aux_U[i],self.weights_S[i])
                self.weights_K[i].requires_grad = True
        z = input_tensor
        # pass forward
        for i in range(0,self.num_layers-1):
            #z = f(xW+b) = f(xKV + b)
            z = F.relu(torch.matmul(z,torch.matmul(self.weights_K[i],self.aux_Vt[i])))

        return F.log_softmax(torch.matmul(z,torch.matmul(self.weights_K[self.num_layers-1],self.aux_Vt[self.num_layers-1])),-1)
    
    def K_step_update(self,stepsize:float=1e-3):
        # K-step of DRLA (update)
        with torch.no_grad():
            for i in range(0,self.num_layers):
                # gradient update
                self.weights_K[i]=self.weights_K[i]- stepsize* self.weights_K[i].grad
                self.weights_K[i].requires_grad = True
                # Create U
                self.aux_Unp1[i],_ = torch.qr(self.weights_K[i])
                #print(self.weights_K[i].size())
                #print(t.size())
                #print("unp1")
                #print(self.aux_Unp1[i].size())
                #print("U")
                #print(self.aux_U[i].size())
                #print("prod")
                #tmp = torch.matmul(torch.transpose(self.aux_Unp1[i], 0, 1),self.aux_U[i])
                #print(tmp.size())
                #print("____")
                # Create N
                self.aux_N[i] = torch.matmul(torch.transpose(self.aux_Unp1[i], 0, 1),self.aux_U[i])
        return None
    
    def L_step_forward(self,input_tensor:torch.Tensor)->torch.Tensor:
        # L-step of DLRA (forward)
        # prepare  L 
        with torch.no_grad():
            for i in range(0,self.num_layers):
                # a) update L
                self.weights_Lt[i] = torch.matmul(self.weights_S[i],self.aux_Vt[i]) # L-transpose 
                self.weights_Lt[i].requires_grad = True
        z = input_tensor
        # pass forward
        for i in range(0,self.num_layers-1):
            #z = f(xW+b) = f(xUL + b)
            z = F.relu(torch.matmul(z,torch.matmul(self.aux_U[i],self.weights_Lt[i])))
        return F.log_softmax(torch.matmul(z,torch.matmul(self.aux_U[self.num_layers-1],self.weights_Lt[self.num_layers-1])),-1) 
     
    def L_step_update(self,stepsize:float=1e-3):
        # L-step of DRLA (update)
        with torch.no_grad():
            for i in range(0,self.num_layers):
                # gradient update
                self.weights_Lt[i]=self.weights_Lt[i]- stepsize* self.weights_Lt[i].grad
                self.weights_Lt[i].requires_grad = True
                # Create V_np1
                self.aux_Vtnp1[i], _ = torch.qr(torch.transpose(self.weights_Lt[i], 0, 1))
                self.aux_Vtnp1[i] = torch.transpose(self.aux_Vtnp1[i], 0, 1)
                # Create N
                #print("S")
                #print(self.weights_S[i].size())
                #print("V")
                #print(self.aux_Vt[i].size())
                #print("L")
                #print(self.weighweights_Ltts_L[i].size())
                #print("Vnp1")
                #print(self.aux_Vtnp1[i].size())
                #print("V")
                #print(self.aux_Vt[i].size())
                #print("prod")
                #tmp =  torch.matmul(self.aux_Vtnp1[i],torch.transpose(self.aux_Vt[i], 0, 1))
                #print(tmp.size())
                #print("____")
                self.aux_M[i] = torch.matmul(self.aux_Vtnp1[i],torch.transpose(self.aux_Vt[i], 0, 1)) #Vtnp1*V
                # Update U_np1
                #self.aux_U[i] = U_np1
        return None
    
    def S_step_forward(self,input_tensor:torch.Tensor)->torch.Tensor:
        # S-step of DLRA (forward)
        # prepare  S
        with torch.no_grad():
            for i in range(0,self.num_layers):
                # a) update S
                self.weights_S[i] = torch.matmul(torch.matmul(self.aux_N[i],self.weights_S[i]),torch.transpose(self.aux_M[i], 0, 1)) # S
                self.weights_S[i].requires_grad = True
        
        z = input_tensor
        # pass forward
        for i in range(0,self.num_layers-1):
            #z = f(xW+b) \approx f(xUnp1 S Vnp1^T + b)
            z = F.relu(torch.matmul(z,torch.matmul(self.aux_Unp1[i],torch.matmul(self.weights_S[i],self.aux_Vtnp1[i]))))
        #print(self.W.size())
        #print(self.W)
        #t = torch.matmul(self.aux_Unp1[self.num_layers-2],torch.matmul(self.weights_S[self.num_layers-2],self.aux_Vtnp1[self.num_layers-2]))
        #print(t.size())
        #print(t)
        #print("____")
        return F.log_softmax(torch.matmul(z,torch.matmul(self.aux_Unp1[self.num_layers-1],torch.matmul(self.weights_S[self.num_layers-1],self.aux_Vtnp1[self.num_layers-1]))),-1) 
     
    def S_step_update(self,stepsize:float=1e-3):
        # S-step of DRLA (update)
        with torch.no_grad():
            for i in range(0,self.num_layers):
                # gradient update
                self.weights_S[i]=self.weights_S[i]- stepsize* self.weights_S[i].grad
                self.weights_S[i].requires_grad = True
                # update U to Unp1 and V to Vnp1
                self.aux_U[i] = self.aux_Unp1[i]
                self.aux_Vt[i] = self.aux_Vtnp1[i]
        return None
    
    def clear_grads(self):
        #print("Clear Grads")
        for i in range(0,self.num_layers):
            if self.weights_K[i].grad is not None:
                self.weights_K[i].grad.data.zero_()
            if self.weights_Lt[i].grad is not None:
                self.weights_Lt[i].grad.data.zero_()
            if self.weights_S[i].grad is not None:
                self.weights_S[i].grad.data.zero_()
            if self.aux_U[i].grad is not None:
                self.aux_U[i].grad.data.zero_()
            if self.aux_Unp1[i].grad is not None:
                self.aux_Unp1[i].grad.data.zero_()
            if self.aux_Vt[i].grad is not None:
                self.aux_Vt[i].grad.data.zero_()
            if self.aux_Vtnp1[i].grad is not None:
                self.aux_Vtnp1[i].grad.data.zero_()
            if self.aux_N[i].grad is not None:
                self.aux_N[i].grad.data.zero_()
            if self.aux_M[i].grad is not None:
                self.aux_M[i].grad.data.zero_()
            if self.biases[i].grad is not None:
                self.biases[i].grad.data.zero_()
        return None
        
    def print_layer_size(self):
         for i in range(0,self.num_layers):
            print("layer "+ str(i))
            print("K")
            print(self.weights_K[i].size())
            print("Lt")
            print(self.weights_Lt[i].size())
            print("S")
            print(self.weights_S[i].size())
            print("U")
            print(self.aux_U[i].size())
            print("Unp1")
            print(self.aux_Unp1[i].size())
            print("Vt")
            print(self.aux_Vt[i].size())
            print("Vtnp1")
            print(self.aux_Vtnp1[i].size())
            print("N")
            print(self.aux_N[i].size())
            print("M")
            print(self.aux_M[i].size())
            print("b")
            print(self.biases[i].size())
    
    def print_layer_weights(self):
        for i in range(0,self.num_layers):
            print("layer "+ str(i))
            print("K")
            print(self.weights_K[i])
            print("L")
            print(self.weights_Lt[i])
            print("S")
            print(self.weights_S[i])
            print("U")
            print(self.aux_U[i])
            print("Unp1")
            print(self.aux_Unp1[i])
            print("Vt")
            print(self.aux_Vt[i])
            print("Vtnp1")
            print(self.aux_Vtnp1[i])
            print("N")
            print(self.aux_N[i])
            print("M")
            print(self.aux_M[i])
            print("b")
            print(self.biases[i])
        return None
    
    def print_weights_K(self):
        print("K")
        for i in range(0,self.num_layers):
            print("layer "+ str(i))
            print(self.weights_K[i])
            
    def print_weights_Lt(self):
        print("L")
        for i in range(0,self.num_layers):
            print("layer "+ str(i))
            print(self.weights_Lt[i])
            
    def print_weights_S(self):
        print("S")
        for i in range(0,self.num_layers):
            print("layer "+ str(i))
            print(self.weights_S[i])
    
    def print_aux_M(self):
        print("M")
        for i in range(0,self.num_layers):
            print("layer "+ str(i))
            print(self.aux_M[i])
    
    def print_aux_N(self):
        print("N")
        for i in range(0,self.num_layers):
            print("layer "+ str(i))
            print(self.aux_N[i])
    

1.7.1
Using cpu device


In [6]:
## some sanity checks 
test_net = DLRANet(input_dim=10,output_dim=10,layer_width=10,num_layers=3,low_rank=10)
#test_net.print_layer_size()
print("input")
x = torch.rand(5,10) # random flattened images
y = torch.randint(0,9,(5,)) #random labels
#print(x)
#print(x.size())
#print("labels")
#print(y.size())

input


In [7]:
print("layer size")
#test_net.print_layer_size()

layer size


In [8]:
print("layer weights")
#test_net.print_layer_weights()

layer weights


In [9]:
print("K_step")
out = test_net.K_step_forward(x)
print("output")
print(out.size())
loss = F.nll_loss(out, y)
print("loss")
print(loss.size())
print(loss)
loss.backward()
test_net.K_step_update(stepsize=1e-2)
test_net.clear_grads()

K_step
output
torch.Size([5, 10])
loss
torch.Size([])
tensor(2.3223, grad_fn=<NllLossBackward>)


In [10]:
print("L_step_forward")
out = test_net.L_step_forward(x)
print("output")
print(out.size())
loss = F.nll_loss(out, y)
print("loss")
print(loss.size())
print(loss)
loss.backward()
test_net.L_step_update(stepsize=1e-2)
test_net.clear_grads()

L_step_forward
output
torch.Size([5, 10])
loss
torch.Size([])
tensor(2.3223, grad_fn=<NllLossBackward>)


In [11]:
print("S_step_forward")
out = test_net.S_step_forward(x)
print("output")
print(out.size())
loss = F.nll_loss(out, y)
print("loss")
print(loss.size())
print(loss)
loss.backward()
test_net.S_step_update(stepsize=1e-2)
test_net.clear_grads()

S_step_forward
output
torch.Size([5, 10])
loss
torch.Size([])
tensor(2.3223, grad_fn=<NllLossBackward>)


# Define Network for MNIST

In a single training loop, the model makes predictions on the training dataset (fed to it in batches), and
backpropagates the prediction error to adjust the model's parameters.



We also check the model's performance against the test dataset to ensure it is learning.



The training process is conducted over several iterations (*epochs*). During each epoch, the model learns
parameters to make better predictions. We print the model's accuracy and loss at each epoch; we'd like to see the
accuracy increase and the loss decrease with every epoch.



Read more about `Training your model <optimization_tutorial.html>`_.




--------------




In [17]:
def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    #model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            x = nn.Flatten()(X)
            ## Only S-Step ##
            pred = model.S_step_forward(x)
            ## L-Step ##
            test_loss += F.nll_loss(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [18]:
def train(dataloader, model, loss_fn,curr_epoch):
    size = len(dataloader.dataset)
    #model.train()
    for batch, (X, y) in enumerate(dataloader):
        
        X, y = X.to(device), y.to(device)
        x = nn.Flatten()(X)
       
        # Compute prediction error
        
        ## K-Step ##
        #print("K-Step")
        out = model.K_step_forward(x)
        loss = F.nll_loss(out, y)
        #print(loss)
        loss.backward()
        model.K_step_update(stepsize=1e-2/(curr_epoch))
        #model.print_weights_K()
        model.clear_grads()
        
        ## L-Step ##
        #print("L-Step")
        out = model.L_step_forward(x)
        loss = F.nll_loss(out, y)
        #print(loss)
        loss.backward()
        model.L_step_update(stepsize=1e-2/(curr_epoch))
        #model.print_weights_Lt()
        model.clear_grads()
        
        ## S-Step ##
        #print("S-Step")
        out = model.S_step_forward(x)
        #model.print_weights_S()

        loss = F.nll_loss(out, y)
        #print(loss)
        loss.backward()
        #model.print_weights_S()
        #model.print_aux_M()
        #model.print_aux_N()
        model.S_step_update(stepsize=1e-2/(curr_epoch))
        model.clear_grads()
        

        #print(model.weights[1])
        #if batch % 100 == 0:
        #    loss, current = loss.item(), batch * len(X)
        #    print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")


In [24]:
## Define network
batch_size = 64
input_dim = 784
output_dim = 10
layer_width = 30
layer_num = 3
low_rank = 10
model = DLRANet(input_dim,output_dim,layer_width,layer_num,low_rank)
loss_fn =  torch.nn.functional.nll_loss


In [25]:
# check weight initializations
#model.print_layer_size()
#model.print_layer_weights()

In [None]:
epochs =100
for t in range(epochs):
    #model.print_layer_size()
    #model.print_layer_weights()
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, model, loss_fn,t+1)
    test(test_dataloader, model, loss_fn)
print("Done!")

Epoch 1
-------------------------------
Test Error: 
 Accuracy: 9.8%, Avg loss: 2.250179 

Epoch 2
-------------------------------
Test Error: 
 Accuracy: 9.8%, Avg loss: 2.116750 

Epoch 3
-------------------------------
Test Error: 
 Accuracy: 9.8%, Avg loss: 2.095636 

Epoch 4
-------------------------------
Test Error: 
 Accuracy: 9.8%, Avg loss: 2.085290 

Epoch 5
-------------------------------
Test Error: 
 Accuracy: 9.8%, Avg loss: 2.077646 

Epoch 6
-------------------------------
Test Error: 
 Accuracy: 9.8%, Avg loss: 2.070265 

Epoch 7
-------------------------------
Test Error: 
 Accuracy: 9.8%, Avg loss: 2.063419 

Epoch 8
-------------------------------
Test Error: 
 Accuracy: 9.8%, Avg loss: 2.057779 

Epoch 9
-------------------------------


In [None]:
torch.save(model.state_dict(), "model.pth")
print("Saved PyTorch Model State to model.pth")