In [None]:
# Install required packages
!pip install torchvision pillow scikit-learn accelerate pandas opencv-python
!pip uninstall -y torch torchvision torchaudio
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 # Example for CUDA 11.8, change as needed
!pip install sympy

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.6.0->torchvision)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch==2.6.0->torchvision)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch==2.6.0->torchvision)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch==2.6.0->torchvision)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch==2.6.0->torchvision)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch==2.6.0->torchvision)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86

In [1]:
import torch
from torchvision import transforms, models
from torch.utils.data import Dataset, DataLoader, ConcatDataset
import os
from PIL import Image
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from tqdm import tqdm
import cv2
from google.colab import files
import zipfile
import torch.nn as nn # Import the torch.nn module
import torch.optim as optim # Import the torch.optim module for the optimizer
import torch.nn.functional as F

In [2]:
# Check GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [3]:
# Upload your kaggle.json
from google.colab import files
files.upload()  # Choose kaggle.json

!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json


In [4]:
# Download dataset
!kaggle competitions download -c ucsc-cse-164-spring-2025-final-project
!unzip -q ucsc-cse-164-spring-2025-final-project.zip -d data/

Downloading ucsc-cse-164-spring-2025-final-project.zip to /content
 98% 662M/676M [00:03<00:00, 45.0MB/s]
100% 676M/676M [00:03<00:00, 181MB/s] 


In [5]:
# Dataset and Model Configuration
BASE_PATH = "./data/ImageNet-Subset"
NUM_CLASSES = 50
IMAGE_SIZE = 224
BATCH_SIZE = 32
EPOCHS = 20
LEARNING_RATE = 3e-4

In [6]:
# Data Transforms
# Enhanced transforms
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224, scale=(0.8, 1.0)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

test_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(IMAGE_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

In [7]:
# Custom Dataset Class that handles both classification and segmentation
class CustomImageDataset(Dataset):
    def __init__(self, data_dir, transform=None, is_labeled=True, class_to_idx=None, is_train=True):
        self.data_dir = data_dir
        self.transform = transform
        self.is_labeled = is_labeled
        self.is_train = is_train
        self.images = []
        self.labels = []

        # For labeled data
        if is_labeled:
            class_dirs = [d for d in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, d))]
            if class_to_idx is None:
                self.class_to_idx = {class_name: idx for idx, class_name in enumerate(sorted(class_dirs))}
            else:
                self.class_to_idx = class_to_idx

            for class_dir in sorted(class_dirs):
                class_path = os.path.join(data_dir, class_dir)
                image_files = [f for f in os.listdir(class_path) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]

                for img_file in image_files:
                    img_path = os.path.join(class_path, img_file)
                    self.images.append(img_path)
                    self.labels.append(self.class_to_idx[class_dir])
        else:
            # For test data
            image_files = [f for f in os.listdir(data_dir) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
            for img_file in image_files:
                img_path = os.path.join(data_dir, img_file)
                self.images.append(img_path)
                self.labels.append(-1)  # Dummy label

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
      if idx < len(self.images) and not self.images[idx].startswith("pseudo_"):
          # Original labeled data
          img_path = self.images[idx]
          image = Image.open(img_path).convert("RGB")
          label = torch.tensor(self.labels[idx], dtype=torch.long)
      else:
          # Pseudolabeled data (stored as tensors)
          image, label = self.pseudo_data[idx - len(self.images)]

      mask = torch.zeros((IMAGE_SIZE, IMAGE_SIZE), dtype=torch.long)  # Dummy mask

      if self.transform:
          if isinstance(image, torch.Tensor):  # Pseudolabeled tensor
              image = self.transform(image)  # May need adjustment for tensor input
          else:  # PIL Image
              image = self.transform(image)

      return image, label, mask

In [8]:
import torch.nn as nn
import torchvision.models as models
import torch.nn.functional as F

class EnhancedModel(nn.Module):
    def __init__(self, num_classes=50):
        super().__init__()
        # ‚îÄ‚îÄ pretrained ResNet-18 ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
        backbone = models.resnet18(pretrained=True)
        # chop off its final pooling + fc:
        self.encoder = nn.Sequential(
            backbone.conv1, backbone.bn1, backbone.relu, backbone.maxpool,
            backbone.layer1, backbone.layer2, backbone.layer3, backbone.layer4,
        )
        # freeze pretrained layers
        for p in self.encoder.parameters():
            p.requires_grad = False

        # ‚îÄ‚îÄ classification head ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
        self.avgpool   = nn.AdaptiveAvgPool2d((1,1))
        self.classifier = nn.Sequential(
          nn.Dropout(0.5),
          nn.Linear(backbone.fc.in_features, num_classes)
      )

        # ‚îÄ‚îÄ segmentation head ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
        # a simple 1√ó1 conv to map to your 50 classes
        self.seg_head  = nn.Conv2d(backbone.fc.in_features, num_classes, kernel_size=1)

    def forward(self, x):
        feats = self.encoder(x)                  # [B,512,H,W]
        # classification
        p     = self.avgpool(feats).view(x.size(0), -1)  # [B,512]
        cls   = self.classifier(p)              # [B,50]
        # segmentation
        seg   = self.seg_head(feats)            # [B,50,H,W]
        seg   = F.interpolate(seg, size=(224,224),
                              mode='bilinear', align_corners=False)
        return cls, seg


In [9]:
# Loss functions
classification_criterion = nn.CrossEntropyLoss()
segmentation_criterion = nn.CrossEntropyLoss()

In [10]:
# Create class-to-index mapping
class_dirs = [d for d in os.listdir(os.path.join(BASE_PATH, "train-semi")) if os.path.isdir(os.path.join(BASE_PATH, "train-semi", d))]
class_to_idx = {class_name: idx for idx, class_name in enumerate(sorted(class_dirs))}
print("Class-to-index mapping:", class_to_idx)

Class-to-index mapping: {'n01443537': 0, 'n01491361': 1, 'n01531178': 2, 'n01644373': 3, 'n02104029': 4, 'n02119022': 5, 'n02123597': 6, 'n02133161': 7, 'n02165456': 8, 'n02281406': 9, 'n02325366': 10, 'n02342885': 11, 'n02396427': 12, 'n02483362': 13, 'n02504458': 14, 'n02510455': 15, 'n02690373': 16, 'n02747177': 17, 'n02783161': 18, 'n02814533': 19, 'n02859443': 20, 'n02917067': 21, 'n02992529': 22, 'n03014705': 23, 'n03047690': 24, 'n03095699': 25, 'n03197337': 26, 'n03201208': 27, 'n03445777': 28, 'n03452741': 29, 'n03584829': 30, 'n03630383': 31, 'n03775546': 32, 'n03791053': 33, 'n03874599': 34, 'n03891251': 35, 'n04026417': 36, 'n04335435': 37, 'n04380533': 38, 'n04404412': 39, 'n04447861': 40, 'n04507155': 41, 'n04522168': 42, 'n04557648': 43, 'n04562935': 44, 'n04612504': 45, 'n06794110': 46, 'n07749582': 47, 'n07831146': 48, 'n12998815': 49}


In [11]:
# Initialize datasets
train_dataset = CustomImageDataset(
    os.path.join(BASE_PATH, "train-semi"),
    transform=train_transform,
    is_labeled=True,
    class_to_idx=class_to_idx,
    is_train=True
)

unlabeled_dataset = CustomImageDataset(
    os.path.join(BASE_PATH, "train-unlabeled"),
    transform=train_transform,
    is_labeled=False,
    is_train=False
)

test_dataset = CustomImageDataset(
    os.path.join(BASE_PATH, "test"),
    transform=test_transform,
    is_labeled=False,
    is_train=False
)

# Initialize data loaders
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
unlabeled_loader = DataLoader(unlabeled_dataset, batch_size=BATCH_SIZE, shuffle=False)
# Define the test_loader
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [12]:
# Pseudolabeling function
def pseudolabel(model, unlabeled_loader, threshold=0.7):
    model.eval()
    pseudolabeled_images = []
    pseudolabeled_labels = []

    with torch.no_grad():
        for images, _, _ in unlabeled_loader:  # Ignore the dummy labels/masks
            images = images.to(device)
            logits = model(images)[0]  # Get classification logits
            probs = torch.softmax(logits, dim=1)
            confidences, labels = torch.max(probs, dim=1)

            # Only keep high-confidence predictions
            mask = confidences > threshold
            pseudolabeled_images.append(images[mask].cpu())
            pseudolabeled_labels.append(labels[mask].cpu())

    if pseudolabeled_images:  # Only concatenate if we have data
        return torch.cat(pseudolabeled_images), torch.cat(pseudolabeled_labels)
    return None, None

In [13]:
# Function to augment dataset with pseudolabels
def augment_dataset(train_dataset, pseudo_images, pseudo_labels):
    if pseudo_images is not None:
        # Convert pseudolabeled data to list of (image_path, label)
        # Since we can't modify the original dataset easily, we'll create a new one
        new_images = []
        new_labels = []

        # Add original data
        new_images.extend(train_dataset.images)
        new_labels.extend(train_dataset.labels)

        # Add pseudolabeled data (using dummy paths since we have tensors)
        for i in range(len(pseudo_images)):
            new_images.append(f"pseudo_{i}")  # Dummy path
            new_labels.append(pseudo_labels[i].item())

        # Create new dataset (modify your CustomImageDataset to handle this)
        # Alternatively, create a new Dataset class that can handle both paths and tensors
        # This is a simplified version - you may need to adapt based on your dataset class
        train_dataset.images = new_images
        train_dataset.labels = new_labels
        train_dataset.pseudo_data = list(zip(pseudo_images, pseudo_labels))

In [14]:
def accuracy(model, data_loader, device):
    """
    Calculate model accuracy on a given dataset
    Args:
        model: Your PyTorch model
        data_loader: DataLoader for the test/validation set
        device: 'cuda' or 'cpu'
    Returns:
        Accuracy percentage
    """
    correct = 0
    total = 0
    model.eval()  # Set model to evaluation mode
    with torch.no_grad():  # Disable gradient calculation
        # Modify the loop to unpack the three items returned by the dataset
        for data, labels, _ in data_loader:  # Added '_' to ignore the mask
            # Move data to the same device as model
            data, labels = data.to(device), labels.to(device)
            # Assuming the model returns classification logits first
            outputs = model(data)[0]  # Take only the classification logits
            _, predicted = torch.max(outputs.data, 1)  # Get predicted class
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    return 100 * correct / total  # Return percentage

In [15]:
def train_model(model, train_loader, unlabeled_loader, test_loader, EPOCHS, device='cpu'):
    best_acc = 0.0
    optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-4)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)

    for epoch in range(EPOCHS):
        model.train()
        running_loss = 0.0

        # ‚Äî‚Äî‚Äî Normal training loop ‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî
        for images, labels, masks in tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}"):
            images, labels, masks = images.to(device), labels.to(device), masks.to(device)
            optimizer.zero_grad()
            cls_logits, seg_logits = model(images)

            # classification loss
            loss_cls    = classification_criterion(cls_logits, labels)
            # segmentation loss
            loss_seg_ce = segmentation_criterion(seg_logits, masks)
            # dice loss
            probs   = F.softmax(seg_logits, dim=1)
            one_hot = F.one_hot(masks, num_classes=seg_logits.shape[1])\
                         .permute(0,3,1,2).float()
            inter   = (probs * one_hot).sum((2,3))
            union   = probs.sum((2,3)) + one_hot.sum((2,3))
            loss_dice = 1 - (2*inter+1e-6)/(union+1e-6)
            loss_dice = loss_dice.mean()
            loss_seg  = loss_seg_ce + loss_dice

            # **de-emphasize segmentation early** (so cls head really learns):
            if epoch < 5:
                loss = loss_cls
            else:
                loss = loss_cls + loss_seg * 0.5

            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        print(f"Epoch {epoch+1}/{EPOCHS} ‚Äî Train loss: {running_loss/len(train_loader):.4f}")

        # ‚Äî‚Äî‚Äî Now at epoch 5 inject pseudo-labels ‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî
        if epoch == 4:
            print("üîÆ Generating pseudo-labels on unlabeled pool‚Ä¶")
            pseudo_imgs, pseudo_lbls = pseudolabel(model, unlabeled_loader, threshold=0.8)
            if pseudo_imgs is not None:
                print(f"   ‚Üí got {len(pseudo_imgs)} high-confidence images, augmenting dataset")
                augment_dataset(train_dataset, pseudo_imgs, pseudo_lbls)
                # rebuild loader on the enlarged train_dataset
                train_loader = DataLoader(
                    train_dataset,
                    batch_size=BATCH_SIZE,
                    shuffle=True,
                    num_workers=4,
                    pin_memory=True
                )

        # ‚Äî‚Äî‚Äî Every 5 epochs evaluate & save ‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî
        if (epoch+1) % 5 == 0:
            model.eval()
            with torch.no_grad():
                test_acc = accuracy(model, test_loader, device)
            print(f"‚Üí Val Accuracy: {test_acc:.2f}%")
            if test_acc > best_acc:
                best_acc = test_acc
                torch.save(model.state_dict(), "best_model.pth")
                print(f"‚úî Saved best_model.pth (acc {test_acc:.2f}%)")
            model.train()

        scheduler.step()

    return model

#Training

In [16]:
# Initialize model
model = EnhancedModel(NUM_CLASSES).to(device)

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 44.7M/44.7M [00:00<00:00, 190MB/s]


In [17]:
# Stronger regularization
optimizer = optim.AdamW(model.parameters(), lr=3e-4, weight_decay=0.01)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)

In [18]:
import os
from PIL import Image
import torch
from torch.utils.data import Dataset
import numpy as np
from torchvision import transforms

class SegClassDataset(Dataset):
    """
    Expects a folder structure like:
      train-semi/
         classA/
           00001.jpg
           00002.jpg
         classB/
           00003.jpg
           ...
      train-semi-segmentation/
         classA/
           00001.png
           00002.png
         classB/
           00003.png
           ...
    """
    def __init__(self, img_root, msk_root, transform):
        self.tf = transform
        # discover class subfolders
        classes = sorted(d for d in os.listdir(img_root)
                         if os.path.isdir(os.path.join(img_root, d)))
        self.class_to_idx = {c:i for i,c in enumerate(classes)}

        # build a list of (img_path, mask_path, class_idx)
        self.samples = []
        for cls in classes:
            imdir = os.path.join(img_root, cls)
            msdir = os.path.join(msk_root, cls)
            for fn in os.listdir(imdir):
                if not fn.lower().endswith((".jpg",".jpeg",".png")):
                    continue
                img_p = os.path.join(imdir, fn)
                msk_p = os.path.join(msdir, fn.rsplit(".",1)[0] + ".png")
                if not os.path.exists(msk_p):
                    continue
                lbl = self.class_to_idx[cls]
                self.samples.append((img_p, msk_p, lbl))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        img_p, msk_p, lbl = self.samples[idx]
        img = Image.open(img_p).convert("RGB")
        x   = self.tf(img)

        m = Image.open(msk_p).convert("L")
        m = transforms.Resize((224,224))(m)
        y_mask = torch.from_numpy(np.array(m, dtype=np.int64))

        return x, lbl, y_mask

In [19]:
import torch
import torch.backends.cudnn as cudnn
from torch.utils.data import DataLoader

# ‚Äî‚Äî‚Äî CPU‚Äêside speed tweaks ‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî
torch.set_num_threads(8)
# (drop set_num_interop_threads)
cudnn.benchmark = True

# ‚Äî‚Äî‚Äî Paths ‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî
BASE     = "./data/ImageNet-Subset"
IMG_ROOT = f"{BASE}/train-semi"
MSK_ROOT = f"{BASE}/train-semi-segmentation"

# ‚Äî‚Äî‚Äî Build dataset ‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî
train_ds = SegClassDataset(IMG_ROOT, MSK_ROOT, train_transform)
val_ds   = train_ds  # or split

# ‚Äî‚Äî‚Äî DataLoaders ‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî
train_ld = DataLoader(
    train_ds,
    batch_size=8,
    shuffle=True,
    num_workers=8,
    pin_memory=False,
    prefetch_factor=2
)
val_ld = DataLoader(
    val_ds,
    batch_size=8,
    shuffle=False,
    num_workers=8,
    pin_memory=False,
    prefetch_factor=2
)

print(f"Datasets ready: {len(train_ds)} train samples")

Datasets ready: 500 train samples




In [20]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device in use:", device)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU name:", torch.cuda.get_device_name(0))


Device in use: cuda
CUDA available: True
GPU name: Tesla T4


In [21]:
trained_model = train_model(
    model=model,
    train_loader=train_loader,
    unlabeled_loader=unlabeled_loader,
    test_loader=test_loader,  # Add this
    EPOCHS=20,
    device=device
)

Epoch 1/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16/16 [00:07<00:00,  2.18it/s]


Epoch 1/20 ‚Äî Train loss: 4.3406


Epoch 2/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16/16 [00:06<00:00,  2.61it/s]


Epoch 2/20 ‚Äî Train loss: 3.9891


Epoch 3/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16/16 [00:05<00:00,  2.95it/s]


Epoch 3/20 ‚Äî Train loss: 3.7500


Epoch 4/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16/16 [00:06<00:00,  2.63it/s]


Epoch 4/20 ‚Äî Train loss: 3.6125


Epoch 5/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16/16 [00:05<00:00,  2.97it/s]


Epoch 5/20 ‚Äî Train loss: 3.4593
üîÆ Generating pseudo-labels on unlabeled pool‚Ä¶
   ‚Üí got 0 high-confidence images, augmenting dataset




‚Üí Val Accuracy: 0.00%


Epoch 6/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16/16 [00:07<00:00,  2.01it/s]


Epoch 6/20 ‚Äî Train loss: 5.1737


Epoch 7/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16/16 [00:07<00:00,  2.22it/s]


Epoch 7/20 ‚Äî Train loss: 4.0305


Epoch 8/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16/16 [00:07<00:00,  2.28it/s]


Epoch 8/20 ‚Äî Train loss: 3.6904


Epoch 9/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16/16 [00:07<00:00,  2.11it/s]


Epoch 9/20 ‚Äî Train loss: 3.4869


Epoch 10/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16/16 [00:07<00:00,  2.22it/s]


Epoch 10/20 ‚Äî Train loss: 3.3662
‚Üí Val Accuracy: 0.00%


Epoch 11/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16/16 [00:06<00:00,  2.29it/s]


Epoch 11/20 ‚Äî Train loss: 3.2759


Epoch 12/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16/16 [00:07<00:00,  2.11it/s]


Epoch 12/20 ‚Äî Train loss: 3.1592


Epoch 13/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16/16 [00:07<00:00,  2.18it/s]


Epoch 13/20 ‚Äî Train loss: 3.1012


Epoch 14/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16/16 [00:07<00:00,  2.26it/s]


Epoch 14/20 ‚Äî Train loss: 3.0552


Epoch 15/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16/16 [00:07<00:00,  2.12it/s]


Epoch 15/20 ‚Äî Train loss: 3.0189
‚Üí Val Accuracy: 0.00%


Epoch 16/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16/16 [00:07<00:00,  2.08it/s]


Epoch 16/20 ‚Äî Train loss: 3.0735


Epoch 17/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16/16 [00:07<00:00,  2.17it/s]


Epoch 17/20 ‚Äî Train loss: 2.9759


Epoch 18/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16/16 [00:07<00:00,  2.04it/s]


Epoch 18/20 ‚Äî Train loss: 2.9862


Epoch 19/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16/16 [00:07<00:00,  2.19it/s]


Epoch 19/20 ‚Äî Train loss: 2.8820


Epoch 20/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16/16 [00:07<00:00,  2.24it/s]


Epoch 20/20 ‚Äî Train loss: 2.9889
‚Üí Val Accuracy: 0.00%


In [22]:
print(f"Test dataset contains {len(test_dataset.images)} images")  # Must be 752
print("First 5 image paths:", test_dataset.images[:5])

Test dataset contains 752 images
First 5 image paths: ['./data/ImageNet-Subset/test/00655.JPEG', './data/ImageNet-Subset/test/00123.JPEG', './data/ImageNet-Subset/test/00593.JPEG', './data/ImageNet-Subset/test/00267.JPEG', './data/ImageNet-Subset/test/00253.JPEG']


#Submission Generation


In [23]:
from tqdm import tqdm
import torch.nn as nn
import torch.nn.functional as F

# 1) Freeze all but seg_head
for n, p in model.named_parameters():
    p.requires_grad = False
for p in model.seg_head.parameters():
    p.requires_grad = True

optimizer = torch.optim.AdamW(
    filter(lambda p: p.requires_grad, model.parameters()),
    lr=5e-4, weight_decay=1e-4
)
ce_loss   = nn.CrossEntropyLoss()

# 2) 20 epochs of seg-only training
for epoch in range(20):
    model.train()
    tot = 0.0
    for imgs, _, masks in tqdm(train_loader, desc=f"Seg-Epoch {epoch+1}/20"):
        imgs, masks = imgs.to(device), masks.to(device)
        optimizer.zero_grad()
        _, seg_logits = model(imgs)

        # CE
        Lce = ce_loss(seg_logits, masks)
        # Dice
        probs    = torch.softmax(seg_logits, dim=1)
        one_hot  = F.one_hot(masks, num_classes=50).permute(0,3,1,2).float()
        inter    = (probs * one_hot).sum((2,3))
        union    = probs.sum((2,3)) + one_hot.sum((2,3))
        Ldice    = 1 - (2*inter + 1e-6)/(union + 1e-6)
        Ldice    = Ldice.mean()

        loss = Lce + Ldice
        loss.backward()
        optimizer.step()
        tot += loss.item()
    print(f"  ‚Üí Avg mask loss: {tot/len(train_loader):.3f}")

Seg-Epoch 1/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16/16 [00:09<00:00,  1.65it/s]


  ‚Üí Avg mask loss: 1.038


Seg-Epoch 2/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16/16 [00:09<00:00,  1.67it/s]


  ‚Üí Avg mask loss: 0.997


Seg-Epoch 3/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16/16 [00:09<00:00,  1.74it/s]


  ‚Üí Avg mask loss: 0.991


Seg-Epoch 4/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16/16 [00:09<00:00,  1.66it/s]


  ‚Üí Avg mask loss: 0.988


Seg-Epoch 5/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16/16 [00:09<00:00,  1.67it/s]


  ‚Üí Avg mask loss: 0.986


Seg-Epoch 6/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16/16 [00:09<00:00,  1.68it/s]


  ‚Üí Avg mask loss: 0.985


Seg-Epoch 7/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16/16 [00:09<00:00,  1.68it/s]


  ‚Üí Avg mask loss: 0.984


Seg-Epoch 8/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16/16 [00:09<00:00,  1.74it/s]


  ‚Üí Avg mask loss: 0.983


Seg-Epoch 9/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16/16 [00:09<00:00,  1.67it/s]


  ‚Üí Avg mask loss: 0.982


Seg-Epoch 10/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16/16 [00:09<00:00,  1.66it/s]


  ‚Üí Avg mask loss: 0.980


Seg-Epoch 11/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16/16 [00:09<00:00,  1.67it/s]


  ‚Üí Avg mask loss: 0.974


Seg-Epoch 12/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16/16 [00:09<00:00,  1.73it/s]


  ‚Üí Avg mask loss: 0.960


Seg-Epoch 13/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16/16 [00:09<00:00,  1.69it/s]


  ‚Üí Avg mask loss: 0.916


Seg-Epoch 14/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16/16 [00:09<00:00,  1.66it/s]


  ‚Üí Avg mask loss: 0.810


Seg-Epoch 15/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16/16 [00:09<00:00,  1.67it/s]


  ‚Üí Avg mask loss: 0.648


Seg-Epoch 16/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16/16 [00:09<00:00,  1.67it/s]


  ‚Üí Avg mask loss: 0.461


Seg-Epoch 17/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16/16 [00:09<00:00,  1.75it/s]


  ‚Üí Avg mask loss: 0.306


Seg-Epoch 18/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16/16 [00:09<00:00,  1.66it/s]


  ‚Üí Avg mask loss: 0.245


Seg-Epoch 19/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16/16 [00:09<00:00,  1.67it/s]


  ‚Üí Avg mask loss: 0.180


Seg-Epoch 20/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16/16 [00:09<00:00,  1.66it/s]

  ‚Üí Avg mask loss: 0.149





In [24]:
from tqdm.auto import tqdm

In [25]:
# ‚îÄ‚îÄ RLE encoder from the prof ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
def rle_encode(values: np.ndarray) -> str:
    """Run-Length Encode a 1D numpy array of class IDs."""
    encoded  = []
    prev_val = int(values[0])
    count    = 1
    for v in values[1:]:
        v = int(v)
        if v == prev_val:
            count += 1
        else:
            encoded.extend([count, prev_val])
            count, prev_val = 1, v
    encoded.extend([count, prev_val])
    return " ".join(map(str, encoded))

In [26]:
from tqdm import tqdm
import torch
import numpy as np
import pandas as pd
import os
import cv2
from PIL import Image
# ‚îÄ‚îÄ submission generator ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
def generate_submission(
    model,
    test_dir,
    transform,
    device
):
    model.eval()
    files = sorted(os.listdir(test_dir), key=lambda f: int(os.path.splitext(f)[0]))
    rows = []
    for fn in tqdm(files, desc="TTA Submission"):
        img = Image.open(os.path.join(test_dir, fn)).convert("RGB")
        x0  = transform(img).unsqueeze(0).to(device)
        x1  = transform(img.transpose(Image.FLIP_LEFT_RIGHT)).unsqueeze(0).to(device)

        with torch.no_grad():
            _, s0 = model(x0)
            _, s1 = model(x1)

        # resize & un-flip s1
        s0 = F.interpolate(s0, size=(224,224), mode='bilinear', align_corners=False)
        s1 = F.interpolate(s1, size=(224,224), mode='bilinear', align_corners=False).flip(-1)

        seg = (s0 + s1) * 0.5
        mask = seg.argmax(1).squeeze().cpu().numpy().astype(np.uint8)
        # median smooth stray pixels
        import cv2
        mask = cv2.medianBlur(mask, 3)

        flat = mask.flatten()
        # your existing rle_encode
        rle  = rle_encode(flat)
        # classification from aggregated features
        with torch.no_grad():
            c0, _ = model(x0)
            c1, _ = model(x1)
        label = int(((c0 + c1)*0.5).argmax(1))

        rows.append([fn, label, rle])

    df = pd.DataFrame(rows, columns=["ID","Label","MASK_RLE"])
    df.to_csv("submission.csv", index=False)
    print("Wrote submission.csv")
    return df

In [27]:
# Generating the submission using the function
generate_submission(model,
    os.path.join(BASE_PATH, "test"),
    test_transform,
    device
)

TTA Submission: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 752/752 [00:25<00:00, 29.63it/s]


Wrote submission.csv


Unnamed: 0,ID,Label,MASK_RLE
0,00000.JPEG,24,50176 0
1,00001.JPEG,11,50176 0
2,00002.JPEG,11,50176 0
3,00003.JPEG,33,50176 0
4,00004.JPEG,15,50176 0
...,...,...,...
747,00747.JPEG,32,50176 0
748,00748.JPEG,38,50176 0
749,00749.JPEG,10,50176 0
750,00750.JPEG,36,50176 0


#debugging submission generation


In [28]:
# Check file exists
print(f"File exists: {os.path.exists('submission.csv')}")

# Verify content
df = pd.read_csv("submission.csv")
print("\nFirst 5 entries:")
print(df.head())
print(f"\nTotal: {len(df)} rows")
print(f"Unique IDs: {df['ID'].nunique()}")

File exists: True

First 5 entries:
           ID  Label MASK_RLE
0  00000.JPEG     24  50176 0
1  00001.JPEG     11  50176 0
2  00002.JPEG     11  50176 0
3  00003.JPEG     33  50176 0
4  00004.JPEG     15  50176 0

Total: 752 rows
Unique IDs: 752


In [30]:
# Save model weights
def save_model():
    torch.save(model.state_dict(), "best_model.pth")
    print("Model weights saved as best_model.pth")

    # Zip the weights file for submission
    with zipfile.ZipFile('model_weights.zip', 'w') as zipf:
        zipf.write('best_model.pth')
    print("Zipped model weights saved as model_weights.zip")

save_model()

Model weights saved as best_model.pth
Zipped model weights saved as model_weights.zip


#Downloading the submission


In [31]:
# Downloading submission file and model weights
files.download('submission.csv')
files.download('model_weights.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>