Programmer: Jacob Maurer
Date: 9/18/2024
Description: This notebook is going to explore the idea of the data footprint, which is just the (final trained model weights) - (starting out weights). 

In [2]:
import pandas as pd
import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor


In [3]:
rr_weight_layer_one_pre = pd.read_csv("results/rr_s_weight_pre_1.csv").to_numpy()
rr_weight_layer_one_post = pd.read_csv("results/rr_s_weight_post_1.csv").to_numpy()
rr_footprint = rr_weight_layer_one_post - rr_weight_layer_one_pre
print(np.max(rr_footprint))
print(np.min(rr_footprint))
print(np.mean(rr_footprint))
print(np.median(rr_footprint))

0.060906764
-0.0401217225
0.001037473949165444
0.00018947100000000269


In [4]:
ss_weight_layer_one_pre = pd.read_csv("results/ss_s_weight_pre_1.csv").to_numpy()
ss_weight_layer_one_post = pd.read_csv("results/ss_s_weight_post_1.csv").to_numpy()
ss_footprint = ss_weight_layer_one_post - ss_weight_layer_one_pre
print(np.max(ss_footprint))
print(np.min(ss_footprint))
print(np.mean(ss_footprint))
print(np.median(ss_footprint))

0.07387706499999999
-0.06480385799999999
-0.00012459526740174432
-7.177000000000572e-06


In [5]:
modified_ss_layer = ss_weight_layer_one_pre + rr_footprint
modified_rr_layer = rr_weight_layer_one_pre + ss_footprint
print("Old Layer: ")
print(np.max(ss_weight_layer_one_post))
print(np.min(ss_weight_layer_one_post))
print(np.mean(ss_weight_layer_one_post))
print(np.std(ss_weight_layer_one_post))
print(np.median(ss_weight_layer_one_post))
print("New Layer: ")
print(np.max(modified_ss_layer))
print(np.min(modified_ss_layer))
print(np.mean(modified_ss_layer))
print(np.std(modified_ss_layer))
print(np.median(modified_ss_layer))

Old Layer: 
0.09991215
-0.09559656
-0.00013230133303434018
0.023558188617021514
-0.000157678935
New Layer: 
0.08342661
-0.0654329772
0.001029767883532848
0.021198322257155098
0.0010217178299999992


Why this might be important?

It means that we are able to significantly reduce training time by just adding the footprint to the layer. The only problem is, as of right now, it only works when the model has these similar parameters

Hypothesis: Reduction/expansion with pooling and convolution not using neural networks. 
Evidence: Pooling/convolution scale the data while retaining most of the original pattern.

Pooling2d used as a possible way to prune the network??

In [7]:
#Import the dataset

training_data = datasets.FashionMNIST(
    root="data",
    train=True,
    download=True,
    transform=ToTensor(),
)

# Download test data from open datasets.
test_data = datasets.FashionMNIST(
    root="data",
    train=False,
    download=True,
    transform=ToTensor(),
)

batch_size = 64

# Create data loaders.
train_dataloader = DataLoader(training_data, batch_size=batch_size)
test_dataloader = DataLoader(test_data, batch_size=batch_size)

device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)

In [15]:
class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28*28, 512),
            nn.ReLU(),
            nn.Linear(512, 10)
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

model = NeuralNetwork().to(device)
print(model)

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)

def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 100 == 0:
            loss, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    return correct

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=10, bias=True)
  )
)


In [12]:
print(model.state_dict())

OrderedDict([('linear_relu_stack.0.weight', tensor([[ 0.0341,  0.0272,  0.0321,  ...,  0.0197, -0.0168, -0.0107],
        [-0.0066,  0.0117, -0.0312,  ...,  0.0284,  0.0262, -0.0167],
        [ 0.0256, -0.0338,  0.0014,  ..., -0.0231, -0.0342,  0.0114],
        ...,
        [-0.0232,  0.0178,  0.0313,  ...,  0.0040,  0.0155, -0.0116],
        [-0.0047,  0.0297, -0.0166,  ..., -0.0259,  0.0153, -0.0306],
        [-0.0340, -0.0062,  0.0120,  ..., -0.0149, -0.0313, -0.0245]],
       device='cuda:0')), ('linear_relu_stack.0.bias', tensor([ 0.0320, -0.0275,  0.0030, -0.0306, -0.0110,  0.0285, -0.0167, -0.0039,
        -0.0225, -0.0339, -0.0263, -0.0272,  0.0219,  0.0334, -0.0208,  0.0185,
         0.0325,  0.0132, -0.0124, -0.0002,  0.0082, -0.0204, -0.0028, -0.0287,
         0.0130,  0.0340, -0.0164, -0.0280,  0.0283,  0.0283, -0.0149, -0.0108,
         0.0045, -0.0319, -0.0247, -0.0050,  0.0145,  0.0341, -0.0151, -0.0076,
        -0.0312, -0.0259, -0.0157, -0.0345, -0.0031,  0.0208,  0.02

Tests below are run using the smaller model traing first. Will move to larger model footprint after these same model tests.

In [16]:
#ReLU single: Trial 1: 3 epochs
#Modified: Trial 1: 2 epochs 
#ss footprint: Trial 1: 3 epochs
#Sigmoid single: Trial 1: 14 epochs
#Modified: Trial 1: 13 epochs 
#rr footprint: Trial 1: 10 epochs
t, acc = 0, 0
while acc < 80:
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, model, loss_fn, optimizer)
    acc = test(test_dataloader, model, loss_fn) * 100
    t += 1
print("Done!")

Epoch 1
-------------------------------
loss: 2.295743  [   64/60000]
loss: 1.815825  [ 6464/60000]
loss: 1.299510  [12864/60000]
loss: 1.198148  [19264/60000]
loss: 0.926620  [25664/60000]
loss: 0.889095  [32064/60000]
loss: 0.893613  [38464/60000]
loss: 0.801828  [44864/60000]
loss: 0.816976  [51264/60000]
loss: 0.748970  [57664/60000]
Test Error: 
 Accuracy: 74.1%, Avg loss: 0.739310 

Epoch 2
-------------------------------
loss: 0.719671  [   64/60000]
loss: 0.797256  [ 6464/60000]
loss: 0.542338  [12864/60000]
loss: 0.781250  [19264/60000]
loss: 0.641137  [25664/60000]
loss: 0.640792  [32064/60000]
loss: 0.676180  [38464/60000]
loss: 0.683666  [44864/60000]
loss: 0.689323  [51264/60000]
loss: 0.607981  [57664/60000]
Test Error: 
 Accuracy: 79.1%, Avg loss: 0.610954 

Epoch 3
-------------------------------
loss: 0.549364  [   64/60000]
loss: 0.639952  [ 6464/60000]
loss: 0.432332  [12864/60000]
loss: 0.687585  [19264/60000]
loss: 0.563966  [25664/60000]
loss: 0.572663  [32064/600

In [13]:
new_state = model.state_dict()
new_state["linear_relu_stack.0.weight"] = new_state["linear_relu_stack.0.weight"] + torch.from_numpy(rr_footprint).to(device)
model.load_state_dict(new_state)

<All keys matched successfully>

In [14]:
# modified random training sequence
# cutoff criteria: 80%
t, acc = 0, 0
while acc < 80:
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, model, loss_fn, optimizer)
    acc = test(test_dataloader, model, loss_fn) * 100
    t += 1
print("Done!")

Epoch 1
-------------------------------
loss: 2.350044  [   64/60000]
loss: 1.286771  [ 6464/60000]
loss: 0.869158  [12864/60000]
loss: 0.912096  [19264/60000]
loss: 0.744546  [25664/60000]
loss: 0.721017  [32064/60000]
loss: 0.731283  [38464/60000]
loss: 0.711199  [44864/60000]
loss: 0.721409  [51264/60000]
loss: 0.614578  [57664/60000]
Test Error: 
 Accuracy: 79.0%, Avg loss: 0.630674 

Epoch 2
-------------------------------
loss: 0.569716  [   64/60000]
loss: 0.671819  [ 6464/60000]
loss: 0.440914  [12864/60000]
loss: 0.674982  [19264/60000]
loss: 0.572050  [25664/60000]
loss: 0.575476  [32064/60000]
loss: 0.593051  [38464/60000]
loss: 0.656144  [44864/60000]
loss: 0.651581  [51264/60000]
loss: 0.517869  [57664/60000]
Test Error: 
 Accuracy: 81.1%, Avg loss: 0.554890 

Done!


In [47]:
class NeuralNetwork2(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.pool = nn.MaxPool1d(3, stride = 2)
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(391, 512),
            nn.Sigmoid(),
            nn.Linear(512, 10)
        )

    def forward(self, x):
        x = self.flatten(x)
        x = self.pool(x)
        logits = self.linear_relu_stack(x)
        return logits

model = NeuralNetwork2().to(device)
print(model)

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)

NeuralNetwork2(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (pool): MaxPool1d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=391, out_features=512, bias=True)
    (1): Sigmoid()
    (2): Linear(in_features=512, out_features=10, bias=True)
  )
)


In [48]:
new_state = model.state_dict()
pooling = nn.MaxPool1d(3, stride=2)
pooled_rr_footprint = pooling(torch.from_numpy(rr_footprint))
pooled_ss_footprint = pooling(torch.from_numpy(ss_footprint))
new_state["linear_relu_stack.0.weight"] = new_state["linear_relu_stack.0.weight"] + torch.from_numpy(np.asarray(pooled_rr_footprint)).to(device)
model.load_state_dict(new_state)

<All keys matched successfully>

In [49]:
#Norm: 5 epochs
#rr_footprint: 5 epochs
#ss_footprint: 6 epochs
#Norm: 24 epochs
#ss_footprint: 26 epochs 
t, acc = 0, 0
while acc < 80:
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, model, loss_fn, optimizer)
    acc = test(test_dataloader, model, loss_fn) * 100
    t += 1
print("Done!")

Epoch 1
-------------------------------
loss: 2.293046  [   64/60000]
loss: 2.224894  [ 6464/60000]
loss: 2.132917  [12864/60000]
loss: 2.049005  [19264/60000]
loss: 1.869001  [25664/60000]
loss: 1.824632  [32064/60000]
loss: 1.706986  [38464/60000]
loss: 1.606118  [44864/60000]
loss: 1.562098  [51264/60000]
loss: 1.470821  [57664/60000]
Test Error: 
 Accuracy: 59.9%, Avg loss: 1.452219 

Epoch 2
-------------------------------
loss: 1.514148  [   64/60000]
loss: 1.450181  [ 6464/60000]
loss: 1.306089  [12864/60000]
loss: 1.367158  [19264/60000]
loss: 1.176891  [25664/60000]
loss: 1.237265  [32064/60000]
loss: 1.181069  [38464/60000]
loss: 1.122512  [44864/60000]
loss: 1.162620  [51264/60000]
loss: 1.112746  [57664/60000]
Test Error: 
 Accuracy: 64.6%, Avg loss: 1.090834 

Epoch 3
-------------------------------
loss: 1.155158  [   64/60000]
loss: 1.137584  [ 6464/60000]
loss: 0.980134  [12864/60000]
loss: 1.134294  [19264/60000]
loss: 0.953459  [25664/60000]
loss: 1.025285  [32064/600