In [3]:
!pip install spuco --upgrade



In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms
from spuco.datasets import SpuCoMNIST
from torch.utils.data import DataLoader
from sklearn.cluster import KMeans
import numpy as np


In [5]:
class ConvNet(nn.Module):
  def __init__(self):
    super(ConvNet, self).__init__()
    self.conv1 = nn.Conv2d(3, 32, kernel_size=3, padding=1)
    self.relu = nn.ReLU()
    self.maxpool = nn.MaxPool2d(2)
    self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
    self.fc1 = nn.Linear(3136, 128)
    self.fc2 = nn.Linear(128, 10)
  def forward(self, x):
    x = self.relu(self.conv1(x))
    x = self.maxpool(x)
    x = self.relu(self.conv2(x))
    x = x.view(-1, 3136)
    x = self.relu(self.fc1(x))
    x = self.fc2(x)
    return x

In [25]:
transform = transforms.Compose([])
train_dataset = SpuCoMNIST(root = './data',split = 'train', transform=transform, spurious_feature_difficulty='medium', classes = [[0,1,2],[3,4,5],[6,7,8,9]], spurious_correlation_strength=0.9)

original_create_background = SpuCoMNIST.create_background

def patched_create_background(spurious_feature_difficulty, hex_code):
        all_points = torch.cartesian_prod(torch.arange(28), torch.arange(28))
        unmask_points = all_points[torch.randperm(len(all_points))[:9]]
        mask = SpuCoMNIST.compute_mask(unmask_points)

        background = train_dataset.rgb_to_mnist_background(hex_code)

        return background * mask

SpuCoMNIST.create_background = patched_create_background

In [24]:
train_dataset.load_data()

100%|██████████| 48004/48004 [00:09<00:00, 5208.45it/s]


(<spuco.datasets.base_spuco_dataset.SourceData at 0x7d98a4ce8dc0>,
 range(0, 3),
 range(0, 3))

In [21]:
print(train_dataset.__dict__.keys())
print(len(train_dataset))
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

dict_keys(['root', '_num_classes', 'split', 'transform', 'verbose', 'skip_group_validation', 'label_noise', 'core_feature_noise', 'spurious_correlation_strength', 'spurious_feature_difficulty', 'classes', 'colors', 'download', 'mnist', 'data', 'partition', 'is_noisy_label'])
48004


In [19]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = ConvNet().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [89]:
def train_erm(model, dataloader, criterion, optimizer, epochs=5):
    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        for images, labels in dataloader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)

            outputs = outputs.reshape(labels.shape[0], -1)

            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f'Epoch [{epoch + 1}/{epochs}], Loss: {total_loss / len(dataloader):.4f}')


In [90]:
train_erm(model, train_loader, criterion, optimizer)

Epoch [1/5], Loss: 0.3466
Epoch [2/5], Loss: 0.1032
Epoch [3/5], Loss: 0.0693
Epoch [4/5], Loss: 0.0476
Epoch [5/5], Loss: 0.0384


In [22]:
def collect_outputs(model, dataloader):
    model.eval()
    all_outputs = []
    all_labels = []
    with torch.no_grad():
        for images, labels in dataloader:
            images = images.to(device)
            outputs = model(images)
            all_outputs.append(outputs.cpu().numpy())
            all_labels.append(labels.cpu().numpy())
    return np.vstack(all_outputs), np.hstack(all_labels)

outputs, labels = collect_outputs(model, train_loader)

# Apply KMeans clustering on the model's outputs
num_clusters = 10  # You can adjust this
kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(outputs)

# Assign each sample to a cluster (group)
clusters = kmeans.labels_

In [97]:
class GroupBalancedDataset(Dataset):
    def __init__(self, dataset, clusters):
        self.dataset = dataset
        self.clusters = clusters
        self.num_groups = len(set(clusters))

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        return self.dataset[idx][0], self.dataset[idx][1], self.clusters[idx]

# Create a balanced data loader that ensures each batch has equal group representation
balanced_dataset = GroupBalancedDataset(train_dataset, clusters)
balanced_loader = DataLoader(balanced_dataset, batch_size=64, shuffle=True)


In [99]:
def train_group_balanced(model, dataloader, criterion, optimizer, epochs=5):
    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        group_counts = [0] * num_clusters
        for images, labels, group in dataloader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)

            outputs = outputs.reshape(labels.shape[0], -1)

            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            group_counts[group.numpy().tolist()[0]] += 1
        print(f'Epoch [{epoch + 1}/{epochs}], Loss: {total_loss / len(dataloader):.4f}')
        print(f'Group distribution in last batch: {group_counts}')

train_group_balanced(model, balanced_loader, criterion, optimizer)

Epoch [1/5], Loss: 0.0261
Group distribution in last batch: [546, 33, 22, 54, 12, 10, 24, 30, 5, 15]
Epoch [2/5], Loss: 0.0218
Group distribution in last batch: [542, 26, 20, 51, 15, 10, 26, 30, 6, 25]
Epoch [3/5], Loss: 0.0152
Group distribution in last batch: [543, 24, 19, 45, 13, 13, 26, 39, 5, 24]
Epoch [4/5], Loss: 0.0134
Group distribution in last batch: [527, 36, 13, 58, 13, 12, 30, 33, 7, 22]
Epoch [5/5], Loss: 0.0139
Group distribution in last batch: [533, 34, 16, 53, 14, 18, 28, 30, 5, 20]


In [44]:
!git config --global user.email "dhwanib9@gmail.com"
!git config --global user.name "Dhwani090"

In [110]:
%cd /content/drive/MyDrive/
!ls

/content/drive/MyDrive
 5L1A0343-2.jpg			 'Jake_s_Resume-2 (1).pdf'
 Biokind.ipynb			 'Jake_s_Resume-2 (2).pdf'
'Colab Notebooks'		 'Jake_s_Resume-2 (3).pdf'
'Copy of Resume.gdoc'		 'Jake_s_Resume-2 (4).pdf'
'Dhwani (1).pdf'		  Jake_s_Resume-2.pdf
'Dhwani_Beesanahalli (1).pdf'	  JE9A5032.JPG
'Dhwani_Beesanahalli (2).pdf'	 'Meta University Cover Letter.gdoc'
 Dhwani_Beesanahalli.pdf	  Microsoft_Cover_Letter.gdoc
 Dhwani_Beesanahalli_Resume.pdf  'Notes - Formula General Meeting.gdoc'
 Dhwani.pdf			 'Nvidia Application.gdoc'
'Dhwani_Resume (1).pdf'		 'Research Email.gdoc'
'Dhwani_Resume (2).pdf'		 'Resume (1).pdf'
'Dhwani_Resume (3).pdf'		  Resume-3.pdf
 Dhwani_Resume.pdf		  Resume-4.pdf
'DSU Intro Slide.gslides'	  resume.pdf
 DSU.ipynb			  Resume.pdf
'ECE Pre-Lab 2.gdoc'		  Roblox_Cover_Letter.gdoc
'Elfin Application.gdoc'	  SA708578.jpg
'Google STEP Cover Letter.gdoc'   SpuCo.ipynb
 Headshot.jpg			 'Student Academic Profile blank 2024.gdoc'
 IMG_0074.HEIC			  Unknown.png
 IMG_3079.jpg		

In [111]:
!mv /content/drive/MyDrive/SpuCo.ipynb /content/SpuCo/

In [112]:
%cd /content/SpuCo/
!ls

/content/SpuCo
README.md  SpuCo.ipynb


In [117]:
!git filter-branch --force --index-filter "git rm --cached --ignore-unmatch SpuCo.ipynb" --prune-empty --tag-name-filter cat -- --all

	 rewrites.  Hit Ctrl-C before proceeding to abort, then use an
	 alternative filtering tool such as 'git filter-repo'
	 (https://github.com/newren/git-filter-repo/) instead.  See the
Proceeding with filter-branch...

Rewrite 40d9911a5ac9710fb827d698d20cc94bc36fa6e4 (2/2) (0 seconds passed, remaining 0 predicted)    rm 'SpuCo.ipynb'

Ref 'refs/heads/main' was rewritten
Ref 'refs/remotes/origin/main' was rewritten


In [118]:
!git reset --soft HEAD~1

fatal: ambiguous argument 'HEAD~1': unknown revision or path not in the working tree.
Use '--' to separate paths from revisions, like this:
'git <command> [<revision>...] -- [<file>...]'


In [120]:
!ls

README.md


In [119]:
!git add SpuCo.ipynb
!git commit -m "updated"
!git push origin main

fatal: pathspec 'SpuCo.ipynb' did not match any files
On branch main
Your branch is up to date with 'origin/main'.

nothing to commit, working tree clean
To https://github.com/Dhwani090/SpuCo.git
 [31m! [rejected]       [m main -> main (non-fast-forward)
[31merror: failed to push some refs to 'https://github.com/Dhwani090/SpuCo.git'
[m[33mhint: Updates were rejected because the tip of your current branch is behind[m
[33mhint: its remote counterpart. Integrate the remote changes (e.g.[m
[33mhint: 'git pull ...') before pushing again.[m
[33mhint: See the 'Note about fast-forwards' in 'git push --help' for details.[m
