Programmer: Jacob Maurer
Date: 9/18/2024
Description: This notebook explores the idea of the data footprint, defined as (final trained model weights) - (initial model weights).

In [8]:
import pandas as pd
import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor


In [109]:
# First-layer weights saved before and after training for the "rr" run
# (run name taken from the file paths).
rr_weight_layer_one_pre = pd.read_csv("results/rr_s_weight_pre_1.csv").to_numpy()
# NOTE(review): this path has an upper-case "S" while the "pre" path uses a
# lower-case "s" -- confirm it matches the file on disk (this matters on
# case-sensitive filesystems).
rr_weight_layer_one_post = pd.read_csv("results/rr_S_weight_post_1.csv").to_numpy()

# Footprint = trained weights minus starting weights, element by element.
rr_footprint = rr_weight_layer_one_post - rr_weight_layer_one_pre

# Summary statistics of the footprint, printed as: max, min, mean, median.
for summarize in (np.max, np.min, np.mean, np.median):
    print(summarize(rr_footprint))

0.058070543999999995
-0.055686748999999994
0.0010366239325176113
0.00011877050000000042


In [110]:
# First-layer weights saved before and after training for the "ss" run
# (run name taken from the file paths).
ss_weight_layer_one_pre = pd.read_csv("results/ss_s_weight_pre_1.csv").to_numpy()
ss_weight_layer_one_post = pd.read_csv("results/ss_s_weight_post_1.csv").to_numpy()

# Footprint = trained weights minus starting weights, element by element.
ss_footprint = ss_weight_layer_one_post - ss_weight_layer_one_pre

# Summary statistics of the footprint, printed as: max, min, mean, median.
for summarize in (np.max, np.min, np.mean, np.median):
    print(summarize(ss_footprint))

0.0417123994
-0.049363683000000005
-0.0002584007362623034
-3.2806499999999544e-05


In [111]:
# Cross-apply the footprints: each run's starting weights plus the *other*
# run's footprint.
modified_ss_layer = ss_weight_layer_one_pre + rr_footprint
modified_rr_layer = rr_weight_layer_one_pre + ss_footprint

# Compare the actually-trained "ss" layer against the footprint-modified one.
# Statistics are printed per layer as: max, min, mean, std, median.
layer_stats = (np.max, np.min, np.mean, np.std, np.median)
for heading, layer in (
    ("Old Layer: ", ss_weight_layer_one_post),
    ("New Layer: ", modified_ss_layer),
):
    print(heading)
    for stat in layer_stats:
        print(stat(layer))

Old Layer: 
0.07146686
-0.07438539
-0.00021260627435817675
0.0223022551476396
-0.000205464065
New Layer: 
0.08447945300000001
-0.086401961
0.0010824183944217385
0.021391103875374413
0.0010576660500000003


Why this might be important?

It means that we are able to significantly reduce training time by just adding the footprint to the layer. The only problem is that, as of right now, it only works when the models have similar parameters (at minimum, matching layer shapes, so that the footprint can be added element-wise).

Hypothesis: Reduction/expansion with pooling and convolution not using neural networks. 
Evidence: Pooling/convolution scale the data while retaining most of the original pattern.

Open question: could 2D pooling be used as a way to prune the network?

In [9]:
# Load the FashionMNIST training split from ./data (downloading it if needed),
# converting each image to a tensor.
training_data = datasets.FashionMNIST(root="data", train=True, download=True, transform=ToTensor())

# Same dataset, held-out test split.
test_data = datasets.FashionMNIST(root="data", train=False, download=True, transform=ToTensor())

batch_size = 64

# Wrap both splits in data loaders; `shuffle` is left at its default for both.
train_dataloader = DataLoader(training_data, batch_size=batch_size)
test_dataloader = DataLoader(test_data, batch_size=batch_size)

# Pick the compute device: CUDA first, then Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to data\FashionMNIST\raw\train-images-idx3-ubyte.gz


100%|██████████| 26421880/26421880 [00:07<00:00, 3320078.66it/s]


Extracting data\FashionMNIST\raw\train-images-idx3-ubyte.gz to data\FashionMNIST\raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to data\FashionMNIST\raw\train-labels-idx1-ubyte.gz


100%|██████████| 29515/29515 [00:00<00:00, 327164.73it/s]


Extracting data\FashionMNIST\raw\train-labels-idx1-ubyte.gz to data\FashionMNIST\raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to data\FashionMNIST\raw\t10k-images-idx3-ubyte.gz


100%|██████████| 4422102/4422102 [00:05<00:00, 881581.30it/s] 


Extracting data\FashionMNIST\raw\t10k-images-idx3-ubyte.gz to data\FashionMNIST\raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to data\FashionMNIST\raw\t10k-labels-idx1-ubyte.gz


100%|██████████| 5148/5148 [00:00<?, ?it/s]

Extracting data\FashionMNIST\raw\t10k-labels-idx1-ubyte.gz to data\FashionMNIST\raw






In [136]:
class NeuralNetwork(nn.Module):
    """Fully connected classifier with a single hidden layer.

    The input is flattened (all non-batch dimensions must multiply to 784,
    e.g. 1x28x28 images), passed through a hidden linear layer and an
    activation, then projected to 10 class logits.

    Args:
        hidden_size: Width of the hidden layer. Defaults to 512.
        activation: Zero-argument callable that builds the hidden activation
            module, e.g. ``nn.Sigmoid`` or ``nn.ReLU``. Defaults to
            ``nn.Sigmoid``.

    Note:
        The attribute name ``linear_relu_stack`` is kept regardless of the
        activation used, because it determines the state-dict keys
        (``linear_relu_stack.0.weight`` etc.) that other cells rely on.
    """

    def __init__(self, hidden_size=512, activation=nn.Sigmoid):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28 * 28, hidden_size),
            activation(),
            nn.Linear(hidden_size, 10),
        )

    def forward(self, x):
        """Flatten ``x`` and return the raw (un-normalised) class logits."""
        return self.linear_relu_stack(self.flatten(x))

# Build a fresh network, move it onto the selected device, and show its layout.
model = NeuralNetwork()
model = model.to(device)
print(model)

# Cross-entropy loss, optimised with plain SGD at a learning rate of 0.01.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.01)

def train(dataloader, model, loss_fn, optimizer):
    """Run one training epoch over ``dataloader``, updating ``model`` in place.

    Each batch is moved to the device that holds the model's parameters, so
    the function does not depend on a module-level ``device`` variable being
    defined by another cell.

    Args:
        dataloader: Iterable of ``(X, y)`` batches; must expose ``.dataset``
            with a length (used only for the progress message).
        model: The ``nn.Module`` to train.
        loss_fn: Callable ``loss_fn(pred, y)`` returning a scalar loss tensor.
        optimizer: Optimizer over ``model``'s parameters.

    Returns:
        None. The loss is printed every 100 batches.
    """
    size = len(dataloader.dataset)

    # Follow the model's own device; a model without parameters leaves the
    # batches wherever the dataloader produced them.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    model.train()
    for batch, (X, y) in enumerate(dataloader):
        if target_device is not None:
            X, y = X.to(target_device), y.to(target_device)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation; gradients are cleared after each step so the next
        # batch starts from zero.
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 100 == 0:
            loss_value, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss_value:>7f}  [{current:>5d}/{size:>5d}]")

def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    return correct

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): Sigmoid()
    (2): Linear(in_features=512, out_features=10, bias=True)
  )
)


In [137]:
# Show every parameter tensor currently held by the model (weights and biases
# of both linear layers), keyed by its state-dict name.
current_state = model.state_dict()
print(current_state)

OrderedDict([('linear_relu_stack.0.weight', tensor([[-0.0041, -0.0227,  0.0028,  ..., -0.0200,  0.0312, -0.0309],
        [-0.0341, -0.0111,  0.0178,  ..., -0.0121, -0.0350, -0.0059],
        [ 0.0145, -0.0277,  0.0242,  ..., -0.0223,  0.0331, -0.0082],
        ...,
        [ 0.0355, -0.0140, -0.0332,  ..., -0.0105,  0.0304, -0.0010],
        [-0.0071,  0.0342, -0.0125,  ...,  0.0202,  0.0349, -0.0034],
        [-0.0181,  0.0039,  0.0330,  ..., -0.0007,  0.0001, -0.0082]],
       device='cuda:0')), ('linear_relu_stack.0.bias', tensor([ 2.1006e-02, -8.1743e-03,  5.9776e-03,  2.8017e-03, -2.1296e-02,
         1.0029e-02,  2.0166e-02, -1.7554e-02,  2.6146e-02, -2.7567e-02,
        -2.0030e-02,  3.6295e-03,  3.0830e-02,  1.9033e-02,  1.6304e-02,
         2.2928e-02,  3.5663e-02, -8.9360e-03,  1.5613e-02, -1.1824e-02,
        -1.6636e-02,  2.8873e-02, -3.5365e-02, -2.2869e-02, -1.3977e-02,
         3.1993e-02,  2.5239e-02, -3.4526e-02,  1.4082e-02, -2.3280e-02,
         9.2048e-03, -4.0441e

The tests below are run using the smaller model's training first. We will move on to the larger model footprint after these same-model tests.

In [132]:
#ReLU single: Trial 1: 3 epochs
#Modified: Trial 1: 2 epochs 
#ss footprint: Trial 1: 3 epochs
#Sigmoid single: Trial 1: 14 epochs
#Modified: Trial 1: 13 epochs 
#rr footprint: Trial 1: 10 epochs
# Train until the test accuracy reaches the cutoff; `t` ends up holding the
# number of epochs that were needed. The epoch cap keeps a run that plateaus
# below the cutoff from looping forever.
target_accuracy = 80  # percent
max_epochs = 200
t, acc = 0, 0
while acc < target_accuracy and t < max_epochs:
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, model, loss_fn, optimizer)
    acc = test(test_dataloader, model, loss_fn) * 100  # fraction -> percent
    t += 1
if acc < target_accuracy:
    print(f"Stopped after {t} epochs without reaching {target_accuracy}% accuracy (last: {acc:.1f}%)")
print("Done!")

Epoch 1
-------------------------------
loss: 2.319736  [   64/60000]
loss: 2.227729  [ 6464/60000]
loss: 2.124299  [12864/60000]
loss: 2.050216  [19264/60000]
loss: 1.846248  [25664/60000]
loss: 1.802203  [32064/60000]
loss: 1.670375  [38464/60000]
loss: 1.549551  [44864/60000]
loss: 1.497507  [51264/60000]
loss: 1.406090  [57664/60000]
Test Error: 
 Accuracy: 64.2%, Avg loss: 1.369527 

Epoch 2
-------------------------------
loss: 1.421298  [   64/60000]
loss: 1.378088  [ 6464/60000]
loss: 1.191825  [12864/60000]
loss: 1.300091  [19264/60000]
loss: 1.103312  [25664/60000]
loss: 1.155924  [32064/60000]
loss: 1.123164  [38464/60000]
loss: 1.035343  [44864/60000]
loss: 1.082098  [51264/60000]
loss: 1.024225  [57664/60000]
Test Error: 
 Accuracy: 70.9%, Avg loss: 0.992452 

Epoch 3
-------------------------------
loss: 1.035349  [   64/60000]
loss: 1.033785  [ 6464/60000]
loss: 0.836653  [12864/60000]
loss: 1.037098  [19264/60000]
loss: 0.887570  [25664/60000]
loss: 0.923294  [32064/600

In [138]:
# Add the "rr" footprint to the model's first linear layer and load the
# result back into the model.
# NOTE(review): re-running this cell adds the footprint again on top of the
# already-modified weights; re-create the model first for a clean experiment.
# NOTE(review): rr_footprint presumably is float64 (it comes from read_csv),
# so the sum would be promoted to float64 before load_state_dict copies it
# into the parameter -- confirm this is acceptable.
layer_key = "linear_relu_stack.0.weight"
new_state = model.state_dict()
footprint_tensor = torch.from_numpy(rr_footprint).to(device)
new_state[layer_key] = new_state[layer_key] + footprint_tensor
model.load_state_dict(new_state)

<All keys matched successfully>

In [139]:
# modified random training sequence
# cutoff criteria: 80%
# Train until the test accuracy reaches the cutoff; `t` ends up holding the
# number of epochs that were needed. The epoch cap keeps a run that plateaus
# below the cutoff from looping forever.
target_accuracy = 80  # percent
max_epochs = 200
t, acc = 0, 0
while acc < target_accuracy and t < max_epochs:
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, model, loss_fn, optimizer)
    acc = test(test_dataloader, model, loss_fn) * 100  # fraction -> percent
    t += 1
if acc < target_accuracy:
    print(f"Stopped after {t} epochs without reaching {target_accuracy}% accuracy (last: {acc:.1f}%)")
print("Done!")

Epoch 1
-------------------------------
loss: 2.325613  [   64/60000]
loss: 2.112521  [ 6464/60000]
loss: 1.908429  [12864/60000]
loss: 1.786030  [19264/60000]
loss: 1.508749  [25664/60000]
loss: 1.486175  [32064/60000]
loss: 1.381909  [38464/60000]
loss: 1.276413  [44864/60000]
loss: 1.256505  [51264/60000]
loss: 1.154146  [57664/60000]
Test Error: 
 Accuracy: 73.6%, Avg loss: 1.122879 

Epoch 2
-------------------------------
loss: 1.163631  [   64/60000]
loss: 1.136131  [ 6464/60000]
loss: 0.948411  [12864/60000]
loss: 1.085122  [19264/60000]
loss: 0.913428  [25664/60000]
loss: 0.957501  [32064/60000]
loss: 0.951497  [38464/60000]
loss: 0.885593  [44864/60000]
loss: 0.929091  [51264/60000]
loss: 0.867335  [57664/60000]
Test Error: 
 Accuracy: 75.0%, Avg loss: 0.840928 

Epoch 3
-------------------------------
loss: 0.844175  [   64/60000]
loss: 0.878672  [ 6464/60000]
loss: 0.674682  [12864/60000]
loss: 0.892983  [19264/60000]
loss: 0.773587  [25664/60000]
loss: 0.787558  [32064/600