In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd

# Prefer the GPU, but fall back to the CPU so the notebook still runs
# end-to-end on machines without a CUDA device.
device_in_use = 'cuda' if torch.cuda.is_available() else 'cpu'

## Weight Averaging Ensembling Vs Regular Training
### Same number of epochs, same dataset and split
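**In the 1000-run comparison at the end of this notebook, weight averaging gives a lower mean validation loss and a lower standard deviation across runs than regular training.**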

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing

# Load the California housing feature matrix into a DataFrame
# (only the feature columns are used; the bundled target array is not read).
ch = fetch_california_housing()
df = pd.DataFrame(ch.data, columns=ch.feature_names)

# Keep 70% of the rows for training, then halve the remainder into
# validation and test. Fixed random_state keeps the split reproducible.
df_train, df_temp = train_test_split(df, train_size=0.7, random_state=42)
df_val, df_test = train_test_split(df_temp, train_size=0.5, random_state=42)

# Row count of each split, one per line
print(len(df_train), len(df_val), len(df_test), sep='\n')

from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    """Map-style dataset serving (features, label) tensor pairs from a DataFrame.

    Args:
        dataframe: source table. Every column except `target_column` is
            treated as a feature.
        target_column: name of the column used as the regression label.
            Defaults to 'MedInc', the column this notebook predicts.

    Raises:
        KeyError: if `target_column` is not a column of `dataframe`
            (raised by pandas when dropping / selecting the column).
    """

    def __init__(self, dataframe, target_column='MedInc'):
        self.target_column = target_column
        # Stored as NumPy arrays; conversion to tensors happens per item.
        self.features = dataframe.drop(target_column, axis=1).values
        self.labels = dataframe[target_column].values

    def __len__(self):
        """Number of rows (samples) in the dataset."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the `idx`-th sample as a pair of float32 tensors."""
        features = torch.tensor(self.features[idx], dtype=torch.float)
        label = torch.tensor(self.labels[idx], dtype=torch.float)
        return features, label

# Wrap each split in a CustomDataset
train_dataset, val_dataset, test_dataset = (
    CustomDataset(split) for split in (df_train, df_val, df_test)
)

# One loader per split. batch_size equals the split size and shuffling is
# off, so each loader yields its whole split as a single batch.
train_loader, val_loader, test_loader = (
    DataLoader(dataset, batch_size=len(dataset), shuffle=False)
    for dataset in (train_dataset, val_dataset, test_dataset)
)

14447
3096
3097


In [3]:
# Define a function to move an entire dataset to the device in advance
def preload_dataset_to_device(loader, batch_size, device):
    """Copy every batch of `loader` onto `device` and re-wrap them in a DataLoader.

    The returned loader's underlying dataset is the list of already-batched
    (inputs, targets) tuples, so each "sample" it draws is a whole batch and
    the tensors it yields carry an extra leading dimension. The training code
    in this notebook depends on that layout (it calls `targets.unsqueeze(2)`).

    Args:
        loader: iterable yielding (inputs, targets) tensor pairs.
        batch_size: batch size passed to the returned DataLoader.
        device: target device for the tensors.

    Returns:
        A DataLoader over the device-resident batches.
    """
    on_device = []
    for inputs, targets in loader:
        on_device.append((inputs.to(device), targets.to(device)))
    return DataLoader(on_device, batch_size=batch_size)

# Preload datasets to device (if they fit into your device memory).
# The source loaders are rebuilt from the datasets here instead of being read
# back from `train_loader` / `val_loader`, so re-running this cell never wraps
# an already-preloaded loader a second time.
train_loader = preload_dataset_to_device(
    DataLoader(train_dataset, batch_size=len(train_dataset), shuffle=False),
    len(train_dataset),
    device_in_use,
)
val_loader = preload_dataset_to_device(
    DataLoader(val_dataset, batch_size=len(val_dataset), shuffle=False),
    len(val_dataset),
    device_in_use,
)

In [4]:
class LinearModel(nn.Module):
    """Single affine layer mapping `input_size` features to `output_size` outputs."""

    def __init__(self, input_size, output_size):
        super().__init__()
        self.linear = nn.Linear(in_features=input_size, out_features=output_size)

    def forward(self, x):
        """Apply the linear layer to `x` and return the result."""
        prediction = self.linear(x)
        return prediction
    

# Problem dimensions
input_size = 7   # number of feature columns fed to the model
output_size = 1  # one regression output

# Build the model on the selected device
model = LinearModel(input_size, output_size).to(device_in_use)

# Mean-squared-error objective, optimised with plain SGD at a very small step size
criterion = nn.MSELoss()
optimizer = optim.SGD(params=model.parameters(), lr=2e-7)

In [38]:

def harness(epochs, train_loader, val_loader):
    """Train a freshly initialised LinearModel with SGD and report its final losses.

    Each epoch makes one pass over `train_loader` followed by one pass over
    `val_loader`; only the values from the last epoch are returned.

    Args:
        epochs: number of training epochs.
        train_loader: iterable of (inputs, targets) training batches.
        val_loader: iterable of (inputs, targets) validation batches.

    Returns:
        (test_loss, train_loss) as floats: the loss of the last validation
        batch and of the last training batch in the final epoch.
    """
    net = LinearModel(input_size, output_size).to(device_in_use)
    loss_fn = nn.MSELoss()
    sgd = optim.SGD(net.parameters(), lr=0.0000002)

    for _ in range(epochs):
        # --- training pass ---
        net.train()
        for batch_x, batch_y in train_loader:
            batch_x = batch_x.to(device_in_use)
            batch_y = batch_y.to(device_in_use)
            sgd.zero_grad()
            # unsqueeze(2) appends a trailing axis to the targets so they
            # line up with the model output.
            batch_loss = loss_fn(net(batch_x), batch_y.unsqueeze(2))
            batch_loss.backward()
            sgd.step()
        train_loss = batch_loss

        # --- validation pass (no gradients) ---
        net.eval()
        with torch.no_grad():
            for batch_x, batch_y in val_loader:
                batch_x = batch_x.to(device_in_use)
                batch_y = batch_y.to(device_in_use)
                batch_loss = loss_fn(net(batch_x), batch_y.unsqueeze(2))
        test_loss = batch_loss

    return test_loss.item(), train_loss.item()

In [15]:
train_loss = []
test_loss = []

# `harness` takes (epochs, train_loader, val_loader) and builds its own model,
# loss and optimizer, so nothing needs to be constructed per iteration here.
for _ in range(100):
    testloss, trainloss = harness(50, train_loader, val_loader)

    train_loss.append(trainloss)
    test_loss.append(testloss)
    print(trainloss)
    print(testloss)

# Mean and standard deviation of the final losses across the 100 runs
print(np.mean(train_loss), np.mean(test_loss))
print(np.std(train_loss), np.std(test_loss))

441.7445068359375
444.7723083496094
264.6326599121094
266.943359375
151.4241943359375
128.75701904296875
482.56494140625
485.1209716796875
167.10023498535156
148.8119354248047
32.21702575683594
23.027896881103516
327.7943115234375
315.4488220214844
31.97795295715332
22.892133712768555
10.379721641540527
6.296657562255859
219.43775939941406
216.83673095703125
1382.3360595703125
1368.7249755859375
521.5062866210938
524.0630493164062
51.77092361450195
49.194374084472656
62.38676834106445
62.911380767822266
253.6999053955078
245.31967163085938
30.61112403869629
25.487802505493164
35.26033401489258
24.902217864990234
17.64187240600586
16.89542579650879
966.4056396484375
981.4949951171875
28.91860008239746
25.77888298034668
511.3040466308594
512.1527709960938
38.1777458190918
26.499223709106445
330.36865234375
319.0923156738281
663.5975952148438
661.4681396484375
252.0320587158203
249.5060577392578
167.97650146484375
163.39614868164062
18.22220230102539
12.340455055236816
672.3510131835938
6

In [16]:
def _train_in_place(net, epochs):
    """Train `net` in place with SGD and return (val_loss, train_loss) floats.

    `harness` always builds its own model and only returns losses, so it
    cannot be used to train the two models whose weights are averaged in the
    next cell; this helper runs the same training recipe on a given model.
    """
    loss_fn = nn.MSELoss()  # Mean Squared Error Loss for regression tasks
    sgd = optim.SGD(net.parameters(), lr=0.0000002)  # Stochastic Gradient Descent

    net.train()
    for _ in range(epochs):
        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device_in_use), targets.to(device_in_use)
            sgd.zero_grad()
            train_batch_loss = loss_fn(net(inputs), targets.unsqueeze(2))
            train_batch_loss.backward()
            sgd.step()

    net.eval()
    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs, targets = inputs.to(device_in_use), targets.to(device_in_use)
            val_batch_loss = loss_fn(net(inputs), targets.unsqueeze(2))

    return val_batch_loss.item(), train_batch_loss.item()


# First model: initialise, then train for 10 epochs
model = LinearModel(input_size, output_size).to(device_in_use)
test1, train1 = _train_in_place(model, 10)

# Second, independently initialised model trained the same way
model2 = LinearModel(input_size, output_size).to(device_in_use)
test2, train2 = _train_in_place(model2, 10)

In [21]:
# Fresh model that will receive the averaged parameters
# (it is not moved to `device_in_use`).
averaged_model = LinearModel(input_size, output_size)

# Walk the three parameter lists in lockstep and overwrite each parameter of
# averaged_model with the element-wise mean of the two source parameters.
for param1, param2, param_avg in zip(model.parameters(), model2.parameters(), averaged_model.parameters()):
    mean_value = (param1.data + param2.data) / 2
    param_avg.data.copy_(mean_value)

# Show weight and bias of each source model, then of the averaged model
for net in (model, model2, averaged_model):
    x, y = net.parameters()
    print(x)
    print(y)

Parameter containing:
tensor([[-0.2808,  0.1736,  0.2340,  0.0231, -0.0548, -0.2834,  0.2829]],
       device='cuda:0', requires_grad=True)
Parameter containing:
tensor([0.1017], device='cuda:0', requires_grad=True)
Parameter containing:
tensor([[ 0.0573,  0.0978,  0.0317,  0.0038,  0.1684, -0.2727, -0.0177]],
       device='cuda:0', requires_grad=True)
Parameter containing:
tensor([0.0103], device='cuda:0', requires_grad=True)
Parameter containing:
tensor([[-0.1118,  0.1357,  0.1329,  0.0134,  0.0568, -0.2781,  0.1326]],
       requires_grad=True)
Parameter containing:
tensor([0.0560], requires_grad=True)


In [52]:
def weight_avg(epochs, train_loader, val_loader):
    """Train with one round of weight averaging under a fixed epoch budget.

    The budget is split into three equal phases of `epochs // 3` epochs:
    two independently initialised models are trained one phase each, their
    parameters are averaged into a third model, and that third model is
    trained for the final phase.

    Args:
        epochs: total epoch budget; must be at least 3.
        train_loader: iterable of (inputs, targets) training batches.
        val_loader: iterable of (inputs, targets) validation batches.

    Returns:
        (test_loss, train_loss) as floats for the averaged-then-trained model:
        the loss of the last validation batch, and the loss of the last
        training batch of its final epoch.

    Raises:
        ValueError: if `epochs < 3` (a phase would run zero epochs) or if
            `val_loader` yields no batches.
    """
    epochs = epochs // 3
    if epochs < 1:
        raise ValueError("weight_avg needs epochs >= 3 so every phase trains for at least one epoch")

    # A local loss object: the function must not depend on a notebook-level
    # `criterion` happening to exist in the kernel.
    criterion = nn.MSELoss()  # Mean Squared Error Loss for regression tasks

    def train_phase(net):
        """Run `epochs` SGD epochs on `net` in place; return the last batch loss."""
        optimizer = optim.SGD(net.parameters(), lr=0.0000002)  # Stochastic Gradient Descent
        net.train()
        loss = None
        for _ in range(epochs):
            for inputs, targets in train_loader:
                inputs, targets = inputs.to(device_in_use), targets.to(device_in_use)
                optimizer.zero_grad()
                # unsqueeze(2) appends a trailing axis to the targets so they
                # line up with the model output.
                loss = criterion(net(inputs), targets.unsqueeze(2))
                loss.backward()
                optimizer.step()
        return loss

    # All three models are created up front, in the same order as before.
    model1 = LinearModel(input_size, output_size).to(device_in_use)
    model2 = LinearModel(input_size, output_size).to(device_in_use)
    model3 = LinearModel(input_size, output_size).to(device_in_use)

    train_phase(model1)
    train_phase(model2)

    # Overwrite model3's initial parameters with the mean of model1 and model2.
    with torch.no_grad():
        for param1, param2, param_avg in zip(model1.parameters(), model2.parameters(), model3.parameters()):
            param_avg.copy_((param1 + param2) / 2)

    train_loss = train_phase(model3)

    # Only the final validation loss is returned, so evaluate once after training.
    model3.eval()
    test_loss = None
    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs, targets = inputs.to(device_in_use), targets.to(device_in_use)
            test_loss = criterion(model3(inputs), targets.unsqueeze(2))
    if test_loss is None:
        raise ValueError("val_loader yielded no batches")

    return test_loss.item(), train_loss.item()
    

# Single trial run of the weight-averaging procedure with a 120-epoch budget
testloss, trainloss = weight_avg(epochs=120, train_loader=train_loader, val_loader=val_loader)
print(testloss, trainloss)

498.8960876464844 496.87158203125


In [54]:
N_RUNS = 1000       # independent repetitions of each procedure
EPOCH_BUDGET = 120  # total epochs given to each procedure per run

train_loss_traditional = []
test_loss_traditional = []

train_loss_new = []
test_loss_new = []

# Both `harness` and `weight_avg` build their own models and optimizers, so
# nothing is constructed here; each iteration runs both on the same loaders.
for _ in range(N_RUNS):
    testloss, trainloss = harness(EPOCH_BUDGET, train_loader, val_loader)
    train_loss_traditional.append(trainloss)
    test_loss_traditional.append(testloss)

    testloss, trainloss = weight_avg(EPOCH_BUDGET, train_loader, val_loader)
    train_loss_new.append(trainloss)
    test_loss_new.append(testloss)


# Mean and standard deviation of the final validation loss for each procedure
print("Traditional:",np.mean(test_loss_traditional),np.std(test_loss_traditional))
print("Weight Avg:",np.mean(test_loss_new),np.std(test_loss_new))

Traditional: 184.30738068342208 186.52231584919582
Weight Avg: 119.71605833745002 137.85051488397693
