In [None]:
import os
import sys
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from PIL import Image
from dataclasses import dataclass
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torchvision.models import efficientnet_b0
from sklearn.metrics import f1_score

In [None]:
def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Integer handed to Python's ``random``, NumPy's global RNG and
            the PyTorch CPU/CUDA generators.

    Side effects:
        Sets ``torch.backends.cudnn.deterministic`` to True and
        ``torch.backends.cudnn.benchmark`` to False.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True

seed_everything()

In [None]:
@dataclass
class CFG:
    """Hyper-parameters and file locations shared by the cells of this notebook.

    The notebook never instantiates this class: every cell reads the values as
    class attributes (``CFG.batch_size``), and the later training cells
    overwrite ``CFG.lr`` in place before building a new optimizer.
    """
    batch_size: int = 32  # batch size passed to every DataLoader
    lr: float = 5e-4  # Adam learning rate for the first phase; reassigned by later training cells
    epochs: int = 10  # number of epochs run by the first training loop
    # Evaluated once, when the class body runs: GPU if available, else CPU.
    device: str = 'cuda' if torch.cuda.is_available() else 'cpu'
    img_size: int = 224  # images are resized to (img_size, img_size)
    base_path: str = 'dsm-2024-competition/train'  # directory holding the 'Benign' and 'Malignant' sub-folders
    test_path: str = 'dsm-2024-competition'  # directory joined with the test CSV's 'ID' column to locate test images
    test_csv_path: str = 'dsm-2024-competition/test.csv'  # CSV read at inference time; must have an 'ID' column
    weight_decay: float = 1e-4  # weight_decay argument given to Adam
    best_model_path: str = 'best_model.pth'  # where the state_dict with the lowest validation loss is saved
    submission_path: str = 'submission.csv'  # output CSV with the 'ID' and 'label' columns

In [None]:
# Extract the competition archive.
#   -q  keep the cell output short instead of printing one line per extracted file
#   -n  never overwrite files that already exist, so re-running the cell is harmless
!unzip -q -n dsm-2024-competition.zip -d dsm-2024-competition

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: dsm-2024-competition/train/Unlabeled/040002.jpg  
  inflating: dsm-2024-competition/train/Unlabeled/040003.jpg  
  inflating: dsm-2024-competition/train/Unlabeled/040004.jpg  
  inflating: dsm-2024-competition/train/Unlabeled/040005.jpg  
  inflating: dsm-2024-competition/train/Unlabeled/040006.jpg  
  inflating: dsm-2024-competition/train/Unlabeled/040007.jpg  
  inflating: dsm-2024-competition/train/Unlabeled/040008.jpg  
  inflating: dsm-2024-competition/train/Unlabeled/040009.jpg  
  inflating: dsm-2024-competition/train/Unlabeled/040010.jpg  
  inflating: dsm-2024-competition/train/Unlabeled/040011.jpg  
  inflating: dsm-2024-competition/train/Unlabeled/040012.jpg  
  inflating: dsm-2024-competition/train/Unlabeled/040013.jpg  
  inflating: dsm-2024-competition/train/Unlabeled/040014.jpg  
  inflating: dsm-2024-competition/train/Unlabeled/040015.jpg  
  inflating: dsm-2024-competition/train/Unlabeled/040

In [None]:
class MyDataset(Dataset):
    """Image dataset driven by a DataFrame with an 'ID' (file name) column.

    Args:
        df: DataFrame with an 'ID' column; for ``stage='train'`` it must also
            have a 'label' column (0 is read from the 'Benign' folder, any
            other value from the 'Malignant' folder).
        stage: ``'train'`` to load labelled images from
            ``CFG.base_path/<Benign|Malignant>/<ID>``, or ``'test'`` to load
            unlabelled images from ``CFG.test_path/<ID>``.
        transform: Optional callable applied to the RGB PIL image.

    Raises:
        ValueError: If ``stage`` is not ``'train'`` or ``'test'``.
    """

    _STAGES = ('train', 'test')

    def __init__(self, df, stage, transform=None):
        # Fail at construction time; previously an unknown stage only showed
        # up as an UnboundLocalError on the first __getitem__ call.
        if stage not in self._STAGES:
            raise ValueError(f"stage must be one of {self._STAGES}, got {stage!r}")
        self.df = df
        self.transform = transform
        self.stage = stage

    def __len__(self):
        """Return the number of rows in the backing DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load one sample.

        Returns:
            ``(image, label)`` when ``stage == 'train'``, otherwise ``image``.

        Raises:
            FileNotFoundError: If the resolved image path does not exist.
        """
        row = self.df.iloc[idx]  # fetch the row once instead of once per column
        img_name = row['ID']
        if self.stage == 'train':
            label = row['label']
            subdir = 'Benign' if label == 0 else 'Malignant'
            image_path = os.path.join(CFG.base_path, subdir, img_name)
        else:
            image_path = os.path.join(CFG.test_path, img_name)

        if not os.path.exists(image_path):
            raise FileNotFoundError(f"File not found: {image_path}")

        # convert() returns an independent image, so the file handle can be
        # released as soon as the conversion is done.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert('RGB')
        if self.transform:
            image = self.transform(image)
        return (image, label) if self.stage == 'train' else image

# Prepare data with oversampling for the Malignant class
def prepare_data_with_oversampling(base_path=None, malignant_factor=17):
    """Build a shuffled training DataFrame with the Malignant class repeated.

    Args:
        base_path: Directory containing the 'Benign' and 'Malignant'
            sub-folders. Defaults to ``CFG.base_path``.
        malignant_factor: Total number of times each Malignant file appears in
            the result (the original row plus ``malignant_factor - 1`` copies).
            Defaults to 17.

    Returns:
        DataFrame with columns 'ID' (file name) and 'label' (0 = Benign,
        1 = Malignant), shuffled with pandas' default (global NumPy) RNG and
        re-indexed from 0.

    Raises:
        ValueError: If ``malignant_factor`` is smaller than 1.
    """
    if malignant_factor < 1:
        raise ValueError(f"malignant_factor must be >= 1, got {malignant_factor}")
    if base_path is None:
        base_path = CFG.base_path

    # os.listdir returns entries in arbitrary order; sorting makes the seeded
    # shuffle below (and any later split) reproducible across machines.
    benign = sorted(os.listdir(os.path.join(base_path, 'Benign')))
    malignant = sorted(os.listdir(os.path.join(base_path, 'Malignant')))

    # Create dataframes for each class
    benign_df = pd.DataFrame({'ID': benign, 'label': 0})
    malignant_df = pd.DataFrame({'ID': malignant, 'label': 1})

    # Oversample malignant examples: each row is repeated consecutively
    # malignant_factor times, without a Python-level row loop.
    oversampled_malignant_df = malignant_df.loc[malignant_df.index.repeat(malignant_factor)]

    # Combine and shuffle the dataset
    train_df = pd.concat([benign_df, oversampled_malignant_df]).reset_index(drop=True)
    train_df = train_df.sample(frac=1).reset_index(drop=True)  # Shuffle data

    return train_df

class EfficientNetClassifier(nn.Module):
    """EfficientNet-B0 with its final linear layer replaced and a sigmoid on top.

    ``efficientnet_b0`` is called without arguments here, so no weights
    argument is passed to it.

    Args:
        num_classes: Output width of the replacement linear layer
            (default 1, i.e. a single sigmoid output per image).
    """

    def __init__(self, num_classes=1):
        super().__init__()
        backbone = efficientnet_b0()
        # Swap the last classifier layer for one of the requested width.
        head_inputs = backbone.classifier[1].in_features
        backbone.classifier[1] = nn.Linear(head_inputs, num_classes)
        self.efficientnet = backbone
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the sigmoid of the backbone output for the batch ``x``."""
        return self.sigmoid(self.efficientnet(x))

In [None]:
def train_one_epoch(model, optimizer, criterion, train_loader, device):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: Module to train; switched to training mode here.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(outputs, labels)`` where labels
            are cast to float and given a trailing dimension of size 1.
        train_loader: Iterable yielding ``(images, labels)`` batches.
        device: Device the batches are moved to.

    Returns:
        The per-sample average loss over all samples processed this epoch.
    """
    model.train()
    running_loss = 0.0
    samples_seen = 0
    bar = tqdm(train_loader, total=len(train_loader), colour='cyan', file=sys.stdout)
    for images, labels in bar:
        images, labels = images.to(device), labels.float().unsqueeze(1).to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        batch_count = images.size(0)
        running_loss += loss.item() * batch_count
        samples_seen += batch_count
        # Normalise by the samples processed so far: dividing the partial sum
        # by the full dataset size made the bar under-report the loss until
        # the very last batch.
        bar.set_postfix(loss=f'{running_loss / samples_seen:.4f}')
    return running_loss / samples_seen

def validate(model, val_loader, criterion, device):
    """Evaluate ``model`` on ``val_loader`` without tracking gradients.

    Args:
        model: Module to evaluate; switched to eval mode here.
        val_loader: Loader yielding ``(images, labels)`` batches and exposing
            a ``dataset`` attribute with a length.
        criterion: Loss called as ``criterion(outputs, labels)``.
        device: Device the batches are moved to.

    Returns:
        Tuple ``(mean_loss, accuracy)``: the summed per-sample loss divided by
        ``len(val_loader.dataset)``, and the fraction of samples whose output
        thresholded at 0.5 equals the label.
    """
    model.eval()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0
    with torch.no_grad():
        for batch_images, batch_labels in val_loader:
            batch_images = batch_images.to(device)
            targets = batch_labels.float().unsqueeze(1).to(device)

            probs = model(batch_images)
            batch_loss = criterion(probs, targets)
            loss_sum += batch_loss.item() * batch_images.size(0)

            # Hard 0/1 decisions, compared element-wise with the targets.
            hard_preds = (probs.cpu().numpy() > 0.5).astype(int).flatten()
            truth = targets.cpu().numpy().flatten()
            n_correct += (hard_preds == truth).sum()
            n_seen += targets.size(0)

    accuracy = n_correct / n_seen
    mean_loss = loss_sum / len(val_loader.dataset)
    return mean_loss, accuracy

def predict(model, test_loader, device, threshold=0.5):
    """Produce hard 0/1 predictions for every image in ``test_loader``.

    Args:
        model: Module to run; switched to eval mode here. It is expected to
            emit one probability per image.
        test_loader: Iterable yielding image batches (no labels).
        device: Device the batches are moved to.
        threshold: Probability strictly above which an image is labelled 1.
            Defaults to 0.5.

    Returns:
        List of ints (0 or 1), one per model output, in loader order.
    """
    model.eval()
    probabilities = []
    with torch.no_grad():
        for images in tqdm(test_loader, desc="Predicting", colour="green"):
            outputs = model(images.to(device))
            # Flatten to plain Python floats so the comparison below works on
            # scalars instead of relying on the truth value of 1-element arrays.
            probabilities.extend(outputs.reshape(-1).cpu().tolist())
    return [1 if prob > threshold else 0 for prob in probabilities]

In [None]:
# Training pipeline: resize to a square of CFG.img_size, apply random
# flips / rotation (up to 30 degrees) / colour jitter as augmentation, then
# convert to a tensor and normalise each channel with mean 0.5 and std 0.5.
train_transform = transforms.Compose([
    transforms.Resize((CFG.img_size, CFG.img_size)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.RandomRotation(30),
    transforms.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

# Validation pipeline: same resize and normalisation as training, but with no
# random augmentation.
val_transform = transforms.Compose([
    transforms.Resize((CFG.img_size, CFG.img_size)),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

train_df = prepare_data_with_oversampling()

# Split per unique image, not per row. train_df contains many copies of every
# Malignant file, so a row-level split puts copies of the same image on both
# sides and the validation metrics end up measured on images the model was
# trained on.
image_key = train_df['label'].astype(str) + '/' + train_df['ID']
unique_images = train_df.drop_duplicates(subset=['ID', 'label'])
_, val_images = train_test_split(
    unique_images, test_size=0.2, stratify=unique_images['label']
)
val_keys = set(val_images['label'].astype(str) + '/' + val_images['ID'])
in_val = image_key.isin(val_keys)

# Every copy of an image follows that image's assignment, so both sets keep
# the oversampled class balance while sharing no file.
train_set = train_df[~in_val].reset_index(drop=True)
val_set = train_df[in_val].reset_index(drop=True)
assert not set(image_key[~in_val]) & val_keys, "train/validation sets share an image"

train_dataset = MyDataset(train_set, stage='train', transform=train_transform)
val_dataset = MyDataset(val_set, stage='train', transform=val_transform)

train_loader = DataLoader(train_dataset, batch_size=CFG.batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=CFG.batch_size, shuffle=False)

In [None]:
# Build the classifier on the configured device.
model = EfficientNetClassifier().to(CFG.device)
# BCELoss is applied to the model's sigmoid outputs.
criterion = nn.BCELoss()
# Reads CFG.lr at the moment this cell runs; later cells overwrite CFG.lr, so
# re-running this cell after them picks up the lowered learning rate.
optimizer = optim.Adam(model.parameters(), lr=CFG.lr, weight_decay=CFG.weight_decay)

In [None]:
# Lowest validation loss seen so far; the later training cells keep updating
# this same variable, so the checkpoint on disk is the best across all phases.
best_val_loss = float('inf')
# First training phase
for epoch in range(CFG.epochs):
    print(f"Epoch {epoch + 1}/{CFG.epochs}")
    train_loss = train_one_epoch(model, optimizer, criterion, train_loader, CFG.device)
    val_loss, val_accuracy = validate(model, val_loader, criterion, CFG.device)  # Unpack the tuple
    print(f"Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

    # Checkpoint only when the validation loss improves.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), CFG.best_model_path)
        print("Saved Best Model!")

Epoch 1/10
100%|[36m██████████[0m| 250/250 [01:14<00:00,  3.36it/s, loss=0.3753]
Train Loss: 0.3753, Validation Loss: 0.3527, Validation Accuracy: 0.8468
Saved Best Model!
Epoch 2/10
100%|[36m██████████[0m| 250/250 [01:14<00:00,  3.36it/s, loss=0.3475]
Train Loss: 0.3475, Validation Loss: 0.3139, Validation Accuracy: 0.8698
Saved Best Model!
Epoch 3/10
100%|[36m██████████[0m| 250/250 [01:14<00:00,  3.36it/s, loss=0.3400]
Train Loss: 0.3400, Validation Loss: 0.2948, Validation Accuracy: 0.8758
Saved Best Model!
Epoch 4/10
100%|[36m██████████[0m| 250/250 [01:13<00:00,  3.38it/s, loss=0.3207]
Train Loss: 0.3207, Validation Loss: 0.3356, Validation Accuracy: 0.8523
Epoch 5/10
100%|[36m██████████[0m| 250/250 [01:14<00:00,  3.34it/s, loss=0.3003]
Train Loss: 0.3003, Validation Loss: 0.2947, Validation Accuracy: 0.8893
Saved Best Model!
Epoch 6/10
100%|[36m██████████[0m| 250/250 [01:14<00:00,  3.35it/s, loss=0.2971]
Train Loss: 0.2971, Validation Loss: 0.2654, Validation Accuracy:

In [None]:
# Update learning rate for second training phase
# NOTE: this overwrites the shared CFG.lr and replaces the optimizer with a
# new Adam instance instead of changing the lr of the existing one.
CFG.lr = 1e-4
optimizer = optim.Adam(model.parameters(), lr=CFG.lr, weight_decay=CFG.weight_decay)

# Second training phase
# The epoch range is hard-coded; the printed numbering lines up with the first
# phase only while CFG.epochs is 10.
for epoch in range(10, 20):
    print(f"Epoch {epoch + 1}/{20}")
    train_loss = train_one_epoch(model, optimizer, criterion, train_loader, CFG.device)
    val_loss, val_accuracy = validate(model, val_loader, criterion, CFG.device)  # Unpack the tuple
    print(f"Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

    # best_val_loss carries over from the previous training cell.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), CFG.best_model_path)
        print("Saved Best Model!")

Epoch 11/20
100%|[36m██████████[0m| 250/250 [01:13<00:00,  3.39it/s, loss=0.2138]
Train Loss: 0.2138, Validation Loss: 0.2009, Validation Accuracy: 0.9304
Saved Best Model!
Epoch 12/20
100%|[36m██████████[0m| 250/250 [01:14<00:00,  3.35it/s, loss=0.2029]
Train Loss: 0.2029, Validation Loss: 0.2079, Validation Accuracy: 0.9269
Epoch 13/20
100%|[36m██████████[0m| 250/250 [01:14<00:00,  3.35it/s, loss=0.1915]
Train Loss: 0.1915, Validation Loss: 0.2152, Validation Accuracy: 0.9409
Epoch 14/20
100%|[36m██████████[0m| 250/250 [01:14<00:00,  3.35it/s, loss=0.1808]
Train Loss: 0.1808, Validation Loss: 0.2141, Validation Accuracy: 0.9434
Epoch 15/20
100%|[36m██████████[0m| 250/250 [01:14<00:00,  3.37it/s, loss=0.1778]
Train Loss: 0.1778, Validation Loss: 0.2434, Validation Accuracy: 0.9274
Epoch 16/20
100%|[36m██████████[0m| 250/250 [01:14<00:00,  3.37it/s, loss=0.1689]
Train Loss: 0.1689, Validation Loss: 0.1680, Validation Accuracy: 0.9379
Saved Best Model!
Epoch 17/20
100%|[36m

In [None]:
# Update learning rate for third training phase
# NOTE: this overwrites the shared CFG.lr and replaces the optimizer with a
# new Adam instance instead of changing the lr of the existing one.
CFG.lr = 5e-5
optimizer = optim.Adam(model.parameters(), lr=CFG.lr, weight_decay=CFG.weight_decay)

# Third training phase
# Hard-coded continuation of the epoch numbering used by the previous cells.
for epoch in range(20, 30):
    print(f"Epoch {epoch + 1}/{30}")
    train_loss = train_one_epoch(model, optimizer, criterion, train_loader, CFG.device)
    val_loss, val_accuracy = validate(model, val_loader, criterion, CFG.device)  # Unpack the tuple
    print(f"Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

    # best_val_loss carries over from the previous training cells.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), CFG.best_model_path)
        print("Saved Best Model!")

Epoch 21/30
100%|[36m██████████[0m| 250/250 [01:14<00:00,  3.38it/s, loss=0.1487]
Train Loss: 0.1487, Validation Loss: 0.2124, Validation Accuracy: 0.9379
Epoch 22/30
100%|[36m██████████[0m| 250/250 [01:13<00:00,  3.39it/s, loss=0.1438]
Train Loss: 0.1438, Validation Loss: 0.2107, Validation Accuracy: 0.9419
Epoch 23/30
100%|[36m██████████[0m| 250/250 [01:14<00:00,  3.38it/s, loss=0.1316]
Train Loss: 0.1316, Validation Loss: 0.1663, Validation Accuracy: 0.9399
Epoch 24/30
100%|[36m██████████[0m| 250/250 [01:13<00:00,  3.41it/s, loss=0.1342]
Train Loss: 0.1342, Validation Loss: 0.1529, Validation Accuracy: 0.9444
Saved Best Model!
Epoch 25/30
100%|[36m██████████[0m| 250/250 [01:14<00:00,  3.37it/s, loss=0.1327]
Train Loss: 0.1327, Validation Loss: 0.1413, Validation Accuracy: 0.9479
Saved Best Model!
Epoch 26/30
100%|[36m██████████[0m| 250/250 [01:14<00:00,  3.37it/s, loss=0.1301]
Train Loss: 0.1301, Validation Loss: 0.1408, Validation Accuracy: 0.9509
Saved Best Model!
Epoc

In [None]:
# Update learning rate for fourth training phase
# NOTE: this overwrites the shared CFG.lr and replaces the optimizer with a
# new Adam instance instead of changing the lr of the existing one.
CFG.lr = 1e-5
optimizer = optim.Adam(model.parameters(), lr=CFG.lr, weight_decay=CFG.weight_decay)

# Fourth training phase
# Hard-coded continuation of the epoch numbering used by the previous cells.
for epoch in range(30, 40):
    print(f"Epoch {epoch + 1}/{40}")
    train_loss = train_one_epoch(model, optimizer, criterion, train_loader, CFG.device)
    val_loss, val_accuracy = validate(model, val_loader, criterion, CFG.device)  # Unpack the tuple
    print(f"Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

    # best_val_loss carries over from the previous training cells.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), CFG.best_model_path)
        print("Saved Best Model!")

Epoch 31/40
100%|[36m██████████[0m| 250/250 [01:14<00:00,  3.37it/s, loss=0.1155]
Train Loss: 0.1155, Validation Loss: 0.1323, Validation Accuracy: 0.9554
Epoch 32/40
100%|[36m██████████[0m| 250/250 [01:13<00:00,  3.40it/s, loss=0.1157]
Train Loss: 0.1157, Validation Loss: 0.1357, Validation Accuracy: 0.9519
Epoch 33/40
100%|[36m██████████[0m| 250/250 [01:14<00:00,  3.37it/s, loss=0.1144]
Train Loss: 0.1144, Validation Loss: 0.1480, Validation Accuracy: 0.9524
Epoch 34/40
100%|[36m██████████[0m| 250/250 [01:14<00:00,  3.36it/s, loss=0.1108]
Train Loss: 0.1108, Validation Loss: 0.1849, Validation Accuracy: 0.9534
Epoch 35/40
100%|[36m██████████[0m| 250/250 [01:14<00:00,  3.37it/s, loss=0.1096]
Train Loss: 0.1096, Validation Loss: 0.1739, Validation Accuracy: 0.9594
Epoch 36/40
100%|[36m██████████[0m| 250/250 [01:13<00:00,  3.40it/s, loss=0.1129]
Train Loss: 0.1129, Validation Loss: 0.1736, Validation Accuracy: 0.9559
Epoch 37/40
100%|[36m██████████[0m| 250/250 [01:13<00:00

In [None]:
# # Update learning rate for fifth training phase (disabled)
# CFG.lr = 1e-6
# optimizer = optim.Adam(model.parameters(), lr=CFG.lr, weight_decay=CFG.weight_decay)

# # Fifth training phase (disabled)
# for epoch in range(40, 50):
#     print(f"Epoch {epoch + 1}/{50}")
#     train_loss = train_one_epoch(model, optimizer, criterion, train_loader, CFG.device)
#     val_loss, val_accuracy = validate(model, val_loader, criterion, CFG.device)  # Unpack the tuple
#     print(f"Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

#     if val_loss < best_val_loss:
#         best_val_loss = val_loss
#         torch.save(model.state_dict(), CFG.best_model_path)
#         print("Saved Best Model!")

Epoch 41/40
100%|[36m██████████[0m| 250/250 [01:14<00:00,  3.35it/s, loss=0.1213]
Train Loss: 0.1213, Validation Loss: 0.1269, Validation Accuracy: 0.9574
Saved Best Model!
Epoch 42/40
100%|[36m██████████[0m| 250/250 [01:14<00:00,  3.36it/s, loss=0.1220]
Train Loss: 0.1220, Validation Loss: 0.1370, Validation Accuracy: 0.9554
Epoch 43/40
100%|[36m██████████[0m| 250/250 [01:13<00:00,  3.38it/s, loss=0.1189]
Train Loss: 0.1189, Validation Loss: 0.1352, Validation Accuracy: 0.9564
Epoch 44/40
100%|[36m██████████[0m| 250/250 [01:13<00:00,  3.39it/s, loss=0.1198]
Train Loss: 0.1198, Validation Loss: 0.1333, Validation Accuracy: 0.9549
Epoch 45/40
100%|[36m██████████[0m| 250/250 [01:14<00:00,  3.37it/s, loss=0.1143]
Train Loss: 0.1143, Validation Loss: 0.1843, Validation Accuracy: 0.9544
Epoch 46/40
100%|[36m██████████[0m| 250/250 [01:14<00:00,  3.38it/s, loss=0.1227]
Train Loss: 0.1227, Validation Loss: 0.1487, Validation Accuracy: 0.9529
Epoch 47/40
100%|[36m██████████[0m| 25

In [None]:
# Load the test listing; it must provide the 'ID' column MyDataset reads.
test_df = pd.read_csv(CFG.test_csv_path)
# Same deterministic preprocessing as validation: resize, tensor, normalise.
test_transform = transforms.Compose([
    transforms.Resize((CFG.img_size, CFG.img_size)),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

test_dataset = MyDataset(test_df, stage='test', transform=test_transform)
# shuffle=False keeps predictions aligned with the rows of test_df.
test_loader = DataLoader(test_dataset, batch_size=CFG.batch_size, shuffle=False)

# Restore the checkpoint with the lowest validation loss.
#   weights_only=True  loads tensors only (no arbitrary pickled objects) and
#                      silences the FutureWarning torch.load emits otherwise
#   map_location       lets a checkpoint saved on GPU load on a CPU-only machine
best_state = torch.load(CFG.best_model_path, map_location=CFG.device, weights_only=True)
model.load_state_dict(best_state)
predictions = predict(model, test_loader, CFG.device)
test_df['label'] = predictions
test_df[['ID', 'label']].to_csv(CFG.submission_path, index=False)
print(f"Submission file saved to {CFG.submission_path}")

  model.load_state_dict(torch.load(CFG.best_model_path))
Predicting: 100%|[32m██████████[0m| 66/66 [00:07<00:00,  9.23it/s]

Submission file saved to submission.csv



