In [1]:
import time
import numpy as np
import torch
import pandas as pd
import torch.nn as nn
import torch.backends.mps
import matplotlib.pyplot as plt
import matplotlib.image as img
import matplotlib.pylab as pylab
from torch.autograd.anomaly_mode import set_detect_anomaly

params = {'font.size' : 16 }
pylab.rcParams.update(params)

In [2]:
#Hyperparameters
batch_size = 10
learning_rate = 0.003
padding_size = 2
filter_size = 3
step_size = 1
num_features = 8
num_connected_layers = 5
num_conv_layers = 2
num_classifications = 10
num_neurons_per_layer = 50
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
    torch.dtype=torch.float32
else:
    device = "cpu"
train=True
if train:
    train_data = pd.read_csv("Data/MNIST/mnist_train.csv")
else:
    train_data = pd.read_csv("Data/MNIST/mnist_test.csv")
torch.autograd.anomaly_mode.set_detect_anomaly(False)

<torch.autograd.anomaly_mode.set_detect_anomaly at 0x14b03dd30>

In [3]:
# for multi dimensioanl images with a step size of 1 - 1:1 image to conved_image size; dimensions increase by 2
# convolution using a matrix multiplication
class Convolver:
    def __init__(self, kern, bias, x, device=device):
        if(x.dim() <= 2):
            self.img = x.unsqueeze(dim=1)
        else:
            self.img = x
        self.w = kern
        self.b = bias
        self.device = device
    
    def flatten_pad(self):
        pad_size = int((filter_size - 1) / 2) 
        img = nn.functional.pad(self.img, (pad_size, pad_size, pad_size, pad_size))
        n = torch.zeros(2)
        for i in range(2):
            n[i] = ((img.shape[-i - 1] - filter_size) / step_size) + 1
        flat_img = img.flatten().clone().view(img.shape[-4], img.shape[-3], img.shape[-2]*img.shape[-1]).float()
        flat_kern = self.w.view(self.w.shape[-3], filter_size * filter_size)
        return n, img, flat_img, flat_kern

    def change_kern_multi(self):
        n, img, flat_img, flat_kern = self.flatten_pad()
        new_kern = torch.zeros(self.w.shape[-3] , int(torch.prod(n)), img.shape[-2] * img.shape[-1], device=device) #produces num_feat x n x n convolution
        flat_kern = torch.cat([flat_kern for _ in range(1)], dim=0)
        for k  in range(new_kern.shape[-3]):
            start_idx = 0
            for i in range(new_kern.shape[-2]):
                j = 0
                c = 0
                if i == 0:
                    start_idx = start_idx
                elif i % int(n[0]) != 0:
                    start_idx += step_size * step_size
                else: 
                    start_idx = int((i+1-step_size)//int(n[0]) * img.shape[-2])
                while j < new_kern.shape[-1] and start_idx+j+filter_size <= new_kern.shape[-1]:
                    new_kern[k][i][start_idx + j:start_idx + j + filter_size] = flat_kern[k][c:c + filter_size]
                    c += filter_size
                    j += img.shape[-2]
                    if c == flat_kern.shape[-1]:
                        break
        return n, new_kern, flat_img
    
    def convolve(self): #convolves image using a kernel made from self.w
        start_time = time.time()
        conved_final = []
        n, changed_kern, flat_img = self.change_kern_multi()
        to_be_cat = [flat_img for _ in range(self.img.shape[-3])]
        copy_flat_img = torch.cat(to_be_cat, dim=1).unsqueeze(dim=-2)
        for batch in range(self.img.shape[-4]):
            conv = [changed_kern @ torch.transpose(copy_flat_img[batch][j], -1, -2) for j in range(self.img.shape[-3])]
            conved_final.append(torch.concat(conv, dim=0).squeeze(dim=0))
        conved_final = torch.stack(conved_final)
        squeezed_convd_mat = conved_final.squeeze(dim=-1)
        squeezed_convd_mat += self.b
        conv_img = squeezed_convd_mat.reshape(batch_size, self.img.shape[-3] * num_features, int(n[-2]), int(n[-1]))
        end_time = time.time()
        return conv_img  
    
    def pool(self, step_size: int, conv_img: torch.tensor):
        pool_kern = torch.ones(step_size, step_size, device=self.device)
        pool_size = [conv_img.shape[-4], conv_img.shape[-3], int((conv_img.shape[-2] - step_size)/step_size + 1), int((conv_img.shape[-1] - step_size)/step_size + 1)]
        pool_img = torch.zeros(pool_size, device=device)
        for batch in range(conv_img.shape[-4]):
            for feat in range(num_features):
                row = 0
                for i in range(pool_img.shape[-2]):
                    col = 0
                    for j in range(pool_img.shape[-1]):
                        pixels = conv_img[batch, feat, row:row + step_size, col:col + step_size]
                        conv = pixels * pool_kern
                        pool_img[feat, i, j] = torch.max(conv)
                        col += step_size 
                    row += step_size 
        return pool_img


In [15]:
class Layer:
    def __init__(self, num_neurons, num_inputs_per_neuron, device):
        self.w = torch.randn(num_neurons, num_inputs_per_neuron, dtype=torch.float32, requires_grad=True, device=device) 
        self.b = torch.randn(num_neurons, 1, dtype=torch.float32, requires_grad=True, device=device) 
        self.gamma = torch.ones(1, dtype=torch.float32, device = device, requires_grad=True)
        self.beta = torch.zeros(1, dtype=torch.float32, device = device, requires_grad=True)
        
    def forward(self, x):
        out = ((self.w @ x) + self.b)
        out = torch.nn.functional.relu(out)
        out = self.batch_norm(out)
        return out
    
    def batch_norm(self, x):
        mean = torch.mean(x, dim=0, keepdim=True)
        variance = torch.std(x, dim=0, keepdim=True)
        # normal = [(x[j][i] - mean) / (variance + 1e-8) for i in range(x.shape[-3])]
        x = self.gamma * (x - mean) / (variance + 1e-8) + self.beta
        return x
    

In [16]:
# Conv Layer using calling convolver
class Conv_layer():
    def __init__(self, img, filter_size:int, num_features: int, device=device):
        self.device = device
        if(img.dim() <= 2):
            self.img = img.unsqueeze(dim=0)
        else:
            self.img = img
        self.num_features = num_features
        self.w = torch.randn(num_features, filter_size, filter_size, requires_grad=True, dtype=torch.float32, device=self.device) 
        self.b = torch.randn(1, self.img.shape[-1] * self.img.shape[-2], requires_grad=True, dtype=torch.float32, device=self.device) 
        self.gamma = torch.ones(1, dtype=torch.float32, device = self.device, requires_grad=True)
        self.beta = torch.zeros(1, dtype=torch.float32, device = self.device, requires_grad=True)

    def convolve(self, image: torch.tensor):
        convolver = Convolver(kern=self.w, x=image, bias=self.b)
        conv_img = convolver.convolve()
        return conv_img
    
    def forward_relu(self, image: torch.tensor):
        
        conv_img = self.convolve(image) 
        out = torch.nn.functional.relu(conv_img)
        conv_img = self.batch_norm(image)
        return out
    
    def batch_norm(self, input):
        conv_img_temp = input.clone()
        stack = []
        # for j in range(conv_img_temp.shape[-4]):
        #     mean = torch.mean(conv_img_temp[j], dim=-3, keepdim=True)
        #     variance = torch.std(conv_img_temp[j], dim=-3, keepdim=True, unbiased=True)
        #     normal = [(conv_img_temp[j][i] - mean) / (variance + 1e-8) for i in range(conv_img_temp.shape[-3])]
        #     stack.append(torch.concat(normal, dim=0))
        # conv_img_temp = self.gamma * torch.stack(stack) + self.beta
        mean = torch.mean(conv_img_temp, dim=0, keepdim=True)
        variance = torch.std(conv_img_temp, dim=0, keepdim=True)
        # normal = [(conv_img_temp[j][i] - mean) / (variance + 1e-8) for i in range(conv_img_temp.shape[-3])]
        conv_img_temp = self.gamma * (conv_img_temp - mean) / (variance + 1e-8) + self.beta
        return conv_img_temp
    


In [17]:
class CNN:
    def __init__(self, num_conected_layers, num_conv_layers, num_neurons_per_layer, 
                 num_final_out, datagen, batch_size, device=device):
        
        self.device = device
        self.datagen = datagen(batch_size)
        img, _ = self.datagen.data_generator()

        self.convlayers = [Conv_layer(img, filter_size, num_features)]
        for _ in range(1, num_conv_layers):
            self.convlayers.append(Conv_layer(img, filter_size, num_features))
        self.num_inputs_per_neuron = (img.shape[-1] * img.shape[-2]) * num_features**(num_conv_layers) 

        self.layers = [Layer(num_neurons_per_layer, self.num_inputs_per_neuron, device=device)]
        for _ in range(1, num_conected_layers-1):
            self.layers.append(Layer(num_neurons_per_layer, num_neurons_per_layer, device=device))
        self.layers.append(Layer(num_final_out, num_neurons_per_layer, device=device))
    
    def forward(self, x):
        out=torch.zeros(1, requires_grad=True)
        for i in range(0, num_conv_layers):
            if i==0:
                out = self.convlayers[i].forward_relu(x)
            elif i==num_conv_layers-1:
                out = self.convlayers[i].forward_relu(out)
                out = out.reshape(self.num_inputs_per_neuron, batch_size)
            else:
                out = self.convlayers[i].forward_relu(out)


        for i in range(0, num_connected_layers):
            out = self.layers[i].forward(out)

        return out
    
    def train(self, epochs, learning_rate):
        loss_func = nn.CrossEntropyLoss()
        for i in range(epochs):
            x, y = self.datagen.data_generator()
            out = self.forward(x).view(y.shape[0], y.shape[1])
            # out = torch.tensor([-torch.inf if i.item()==0.0 else i for i in out.flatten()], device=device, requires_grad=True).view(y.shape[0], y.shape[1])
            loss = loss_func(out, y) 
            # loss = ((y - out)**2).flatten().sum() 
            print(f"Loss: {loss} at epoch: {i}")
            loss.backward()
            for conv_layer in self.convlayers:
                #manual clip grad, to be fixed
                # w_grad = torch.tensor([c.item() if torch.abs(c) < 1000.0 else 1000.0 for c in conv_layer.w.grad.flatten()], dtype=torch.float, device=self.device)
                # print(w_grad, "\n", type(conv_layer.w.grad))
                # conv_layer.w.grad = w_grad
                # b_grad = torch.tensor([c.item() if torch.abs(c) < 100.0 else 100.0 for c in conv_layer.b.grad.flatten()], dtype=torch.float, device=self.device)
                # conv_layer.b.grad = b_grad
                conv_layer.w.data -= learning_rate * conv_layer.w.grad
                conv_layer.b.data -= learning_rate * conv_layer.b.grad
                conv_layer.w.grad = None
                conv_layer.b.grad = None
                
            for layer in self.layers:
                layer.w.data -= learning_rate * layer.w.grad
                layer.b.data -= learning_rate * layer.b.grad
                layer.w.grad = None
                layer.b.grad = None

In [18]:
class Datagen:
    def __init__(self, batch_size) -> None:
        self.batch_size = batch_size

    def one_hot_encoder(self, y):
            y_out = []
            for i in y:
                # num = np.array([-torch.inf for _ in range(num_classifications)])
                num = np.zeros(num_classifications)
                num[i] = 1
                y_out.append(num)
            return(torch.tensor(np.array(y_out), dtype=torch.float32)).to(device)
    
    def data_generator(self, fix_seed=False, train=True):
        
        if fix_seed==True:
            seed_idx = torch.tensor(int(input("Enter seed index number")))
        else:
            seed_idx = (torch.randint(low=0, high=len(train_data) - self.batch_size, size=(1,1))).item()
        
        y_out = torch.tensor(train_data['label'].iloc[seed_idx:seed_idx+self.batch_size].to_numpy()).to(device)
        x = (torch.tensor(train_data.iloc[seed_idx:seed_idx+self.batch_size, 1:].to_numpy()).view(self.batch_size, 1, 28, 28).to(device)).float()
        y_out = self.one_hot_encoder(y_out).float()
        return x, y_out

In [19]:
datagen = Datagen(batch_size=batch_size)

In [20]:
x, y = datagen.data_generator()

In [21]:
cnn = CNN(num_connected_layers, num_conv_layers, num_neurons_per_layer, num_classifications, Datagen, batch_size, device)

In [22]:
cnn.train(epochs=30, learning_rate=0.0005)

Loss: 2.9705283641815186 at epoch: 0
Loss: 2.385960340499878 at epoch: 1
Loss: 2.8118996620178223 at epoch: 2
Loss: 2.7869930267333984 at epoch: 3
Loss: 2.3940792083740234 at epoch: 4
Loss: 2.946594715118408 at epoch: 5
Loss: 2.5264711380004883 at epoch: 6


KeyboardInterrupt: 

In [14]:
cnn.convlayers[0].w.grad

tensor([[[-1.4011e+01, -4.7642e+01, -6.7258e+01],
         [ 1.2936e+02, -1.0393e+01, -2.6423e+01],
         [ 5.9078e+01, -3.3350e+00,  4.6669e+01]],

        [[-1.4132e+02,  7.2798e+01,  6.5263e+01],
         [-1.1615e+02,  7.5025e+00,  2.9697e+01],
         [-2.3716e+01,  3.0742e+01,  2.0322e+01]],

        [[ 1.1718e+00, -1.8158e+01,  5.0958e+00],
         [-3.0504e+01, -3.9549e+01, -5.8561e+01],
         [-1.4784e-01, -3.3997e+01, -4.1191e+01]],

        [[-1.1824e+01, -7.8971e+01, -5.2081e-01],
         [ 3.1681e+01,  3.8837e+01,  4.3595e+01],
         [-3.3055e+01, -5.8856e+00, -3.8480e+00]],

        [[-4.3322e+01, -3.5121e+01,  7.4484e+01],
         [ 6.0489e+01,  4.8112e+00,  8.5939e+00],
         [ 3.2113e+01,  1.4331e+01,  4.2711e+01]],

        [[-1.3780e+02, -5.6488e+01, -7.4232e+01],
         [-8.3689e+01, -9.4623e+01, -7.1628e+01],
         [-1.4588e+01, -2.1396e+01, -6.6734e+01]],

        [[ 1.8737e+01,  2.3232e+01,  2.0566e+01],
         [ 9.4050e+01, -5.9419e+00, -4

In [247]:
cnn.forward(x=x).view(y.shape[0], y.shape[1])

tensor([[ 0.6503,  0.9994,  0.9974, -1.0000, -1.0000,  0.3477, -0.0837, -1.0000,
         -0.9942, -1.0000],
        [-0.7948, -0.5337,  0.9990,  1.0000,  1.0000, -1.0000, -1.0000,  0.9999,
          0.5209, -0.9989],
        [-0.9187,  0.9980, -0.9490,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,
         -1.0000,  1.0000],
        [ 0.9891,  0.9569, -0.9863, -0.7765,  0.9942, -0.9389,  1.0000, -0.2122,
         -0.9985,  1.0000],
        [ 0.9998,  1.0000,  0.9574,  0.9695,  0.9862, -1.0000,  0.7552, -0.9999,
         -0.9924, -0.9989],
        [-0.9440,  0.9330, -1.0000,  0.9540, -0.9996,  1.0000, -1.0000,  1.0000,
         -0.9981,  0.7708],
        [ 0.9835,  0.9984, -1.0000,  0.3683,  0.9994,  0.9934,  0.9641,  1.0000,
          0.9978, -0.9166],
        [ 1.0000,  1.0000, -0.1194,  1.0000,  1.0000,  0.8573, -1.0000,  1.0000,
          1.0000,  1.0000],
        [-1.0000, -0.9999, -0.7459, -1.0000, -0.9591, -0.9800,  1.0000, -0.9865,
         -1.0000,  1.0000],
        [-1.0000, -

In [246]:
y

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.]], device='mps:0')