In [1]:
!pip install datasets -q

In [2]:
import datasets

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, Subset, WeightedRandomSampler
from torchvision import transforms, models
from PIL import Image
import random
import numpy as np
from datasets import load_dataset
from sklearn.metrics import f1_score
from copy import deepcopy
import os

# Set a fixed random seed for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

class CustomHFDataset(Dataset):
    def __init__(self, hf_dataset, transform=None, test_flag=False):
        self.hf_dataset = hf_dataset
        self.transform = transform
        self.test_flag = test_flag

    def __len__(self):
        return len(self.hf_dataset)

    def __getitem__(self, idx):
        example = self.hf_dataset[idx]
        image = example['image']
        label = example['label']
        if self.transform:
            image = self.transform(image)
        if self.test_flag:
            return image, example['idx']
        else:
            return image, label

# -------------------------------------------
# Load dataset
# -------------------------------------------
dataset = load_dataset('hmdliu/ACAC-4K')

# Compute mean and std of the training set
temp_transform = transforms.ToTensor()
temp_dataset = CustomHFDataset(dataset['train'], transform=temp_transform, test_flag=False)

def compute_mean_std(dset):
    loader = DataLoader(dset, batch_size=32, shuffle=False)
    mean = 0.0
    var = 0.0
    total_pixels = 0
    for imgs, _ in loader:
        # imgs: (B, C, H, W)
        imgs = imgs.view(imgs.size(0), imgs.size(1), -1)
        batch_pixels = imgs.size(0)*imgs.size(2)
        mean += imgs.sum(dim=[0,2])
        var += (imgs**2).sum(dim=[0,2])
        total_pixels += batch_pixels
    mean = mean / total_pixels
    var = var / total_pixels
    std = torch.sqrt(var - mean**2)
    return mean.tolist(), std.tolist()

mean, std = compute_mean_std(temp_dataset)

print("Computed mean:", mean)
print("Computed std:", std)

# -------------------------------------------
# Define transforms
# -------------------------------------------
base_transform = transforms.Compose([
    transforms.Resize(512),
    transforms.ToTensor(),
    transforms.Normalize(mean=mean, std=std)
])

# Stronger augmentations + Mixup or CutMix will be handled in training loop
train_augment = transforms.Compose([
    transforms.RandomResizedCrop(512, scale=(0.8, 1.0)),
    transforms.ToTensor(),
    transforms.Normalize(mean=mean, std=std)
])

val_transform = transforms.Compose([
    transforms.Resize(512),
    transforms.ToTensor(),
    transforms.Normalize(mean=mean, std=std)
])

# -------------------------------------------
# Create base dataset and split train/val
# -------------------------------------------
train_dataset_raw = CustomHFDataset(dataset['train'], transform=base_transform, test_flag=False)
test_dataset = CustomHFDataset(dataset['test'], transform=val_transform, test_flag=True)

num_train = len(train_dataset_raw)
indices = list(range(num_train))
random.shuffle(indices)
val_ratio = 0.1
split = int(val_ratio * num_train)
val_indices = indices[:split]
train_indices = indices[split:]

val_dataset = Subset(train_dataset_raw, val_indices)

all_labels = [train_dataset_raw[i][1] for i in range(len(train_dataset_raw))]

Computed mean: [0.6204590201377869, 0.5430712699890137, 0.44747450947761536]
Computed std: [0.1726725846529007, 0.1666186898946762, 0.1685081124305725]


In [10]:
# -------------------------------------------
# Stage 1: Binary AI vs Non-AI classification for initial label correction
# -------------------------------------------
ai_indices = [i for i, l in enumerate(all_labels) if l == 5]
non_ai_indices = [i for i, l in enumerate(all_labels) if l in [0,1,2,3,4]]

# Balance binary training set
sample_size = min(len(ai_indices), len(non_ai_indices))
random.shuffle(non_ai_indices)
balanced_non_ai_indices = non_ai_indices[:sample_size]
binary_train_indices = ai_indices + balanced_non_ai_indices
random.shuffle(binary_train_indices)

class BinaryAIDataset(Dataset):
    def __init__(self, dataset, indices):
        self.dataset = dataset
        self.indices = indices
    def __len__(self):
        return len(self.indices)
    def __getitem__(self, idx):
        img, label = self.dataset[self.indices[idx]]
        bin_label = 1 if label == 5 else 0
        return img, bin_label

binary_train_dataset = BinaryAIDataset(train_dataset_raw, binary_train_indices)
binary_train_loader = DataLoader(binary_train_dataset, batch_size=32, shuffle=True)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# A stronger binary model, say a wider resnet (no pretrained)
binary_model = models.resnet50(pretrained=False)
binary_model.fc = nn.Linear(binary_model.fc.in_features, 2)
binary_model = binary_model.to(device)

criterion_bin = nn.CrossEntropyLoss()
optimizer_bin = optim.Adam(binary_model.parameters(), lr=1e-4)
epochs_bin = 10

binary_model.train()
for epoch in range(epochs_bin):
    total_loss = 0.0
    for imgs, lbs in binary_train_loader:
        imgs, lbs = imgs.to(device), lbs.to(device)
        optimizer_bin.zero_grad()
        logits = binary_model(imgs)
        loss = criterion_bin(logits, lbs)
        loss.backward()
        optimizer_bin.step()
        total_loss += loss.item()
    print(f"[Binary Model] Epoch {epoch+1}/{epochs_bin}, Loss: {total_loss/len(binary_train_loader):.4f}")

# Inference for AI probabilities
binary_model.eval()
ai_probs = []
with torch.no_grad():
    for i in range(len(train_dataset_raw)):
        img, lb = train_dataset_raw[i]
        img = img.unsqueeze(0).to(device)
        logit = binary_model(img)
        prob = F.softmax(logit, dim=1)[0,1].item()
        ai_probs.append(prob)

# Initial threshold-based correction
threshold = 0.5
corrected_labels = deepcopy(all_labels)
for i in range(len(corrected_labels)):
    if corrected_labels[i] != 5 and ai_probs[i] > threshold:
        corrected_labels[i] = 5

[Binary Model] Epoch 1/15, Loss: 0.4646
[Binary Model] Epoch 2/15, Loss: 0.3857
[Binary Model] Epoch 3/15, Loss: 0.3700
[Binary Model] Epoch 4/15, Loss: 0.3364
[Binary Model] Epoch 5/15, Loss: 0.3196
[Binary Model] Epoch 6/15, Loss: 0.3350
[Binary Model] Epoch 7/15, Loss: 0.3587
[Binary Model] Epoch 8/15, Loss: 0.3005
[Binary Model] Epoch 9/15, Loss: 0.2986
[Binary Model] Epoch 10/15, Loss: 0.2448
[Binary Model] Epoch 11/15, Loss: 0.2214
[Binary Model] Epoch 12/15, Loss: 0.2370
[Binary Model] Epoch 13/15, Loss: 0.2294
[Binary Model] Epoch 14/15, Loss: 0.1986
[Binary Model] Epoch 15/15, Loss: 0.2251


  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():


AssertionError: Wrong image height! Expected 224 but got 512!

In [None]:
# -------------------------------------------
# Prepare the corrected dataset for the main classifier
# -------------------------------------------
class CorrectedDataset(Dataset):
    def __init__(self, hf_dataset, corrected_labels, transform=None):
        self.hf_dataset = hf_dataset
        self.corrected_labels = corrected_labels
        self.transform = transform
    def __len__(self):
        return len(self.hf_dataset)
    def __getitem__(self, idx):
        example = self.hf_dataset[idx]
        image = example['image']
        label = self.corrected_labels[idx]
        if self.transform:
            image = self.transform(image)
        return image, label

corrected_train_dataset = CorrectedDataset(dataset['train'], corrected_labels, transform=train_augment)
corrected_val_dataset = CorrectedDataset(dataset['train'], corrected_labels, transform=val_transform)

train_dataset_final = Subset(corrected_train_dataset, train_indices)
val_dataset_final = Subset(corrected_val_dataset, val_indices)

# Compute class weights for WeightedRandomSampler due to imbalance
final_labels_train = [corrected_labels[i] for i in train_indices]
class_counts = np.bincount(final_labels_train, minlength=6)
class_weights = 1.0 / (class_counts + 1e-6)
weights = [class_weights[l] for l in final_labels_train]
sampler = WeightedRandomSampler(weights, num_samples=len(weights), replacement=True)

train_loader = DataLoader(train_dataset_final, batch_size=16, sampler=sampler)
val_loader = DataLoader(val_dataset_final, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [23]:
train_augment = transforms.Compose([
    transforms.RandomResizedCrop(512, scale=(0.8, 1.0)),
    transforms.ToTensor(),
    transforms.Normalize(mean=mean, std=std)
])

val_transform = transforms.Compose([
    transforms.Resize(512),
    transforms.ToTensor(),
    transforms.Normalize(mean=mean, std=std)
])

In [25]:
# -------------------------------------------
# Define a Vision Transformer model from scratch
# -------------------------------------------
vit_model = models.vit_b_16(weights=None,image_size=512)  # no pretrained weights
num_features = vit_model.heads[0].in_features
vit_model.heads[0] = nn.Linear(num_features,6)
vit_model = vit_model.to(device)

# Label smoothing
class LabelSmoothingCrossEntropy(nn.Module):
    def __init__(self, smoothing=0.1):
        super(LabelSmoothingCrossEntropy, self).__init__()
        self.smoothing = smoothing
        self.confidence = 1.0 - smoothing
    def forward(self, x, target):
        log_probs = F.log_softmax(x, dim=-1)
        with torch.no_grad():
            true_dist = torch.zeros_like(log_probs)
            true_dist.fill_(self.smoothing / (x.size(-1) - 1))
            true_dist.scatter_(1, target.unsqueeze(1), self.confidence)
        return torch.mean(torch.sum(-true_dist * log_probs, dim=-1))

criterion = LabelSmoothingCrossEntropy(smoothing=0.1)
optimizer = optim.AdamW(vit_model.parameters(), lr=1e-4, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10, eta_min=1e-6)

scaler = torch.cuda.amp.GradScaler()

# Mixup function
def mixup_data(x, y, alpha=0.4):
    if alpha > 0:
        lam = np.random.beta(alpha, alpha)
    else:
        lam = 1
    batch_size = x.size()[0]
    index = torch.randperm(batch_size).to(x.device)
    mixed_x = lam * x + (1 - lam) * x[index, :]
    y_a, y_b = y, y[index]
    return mixed_x, y_a, y_b, lam

def mixup_criterion(criterion, pred, y_a, y_b, lam):
    return lam * criterion(pred, y_a) + (1 - lam) * criterion(pred, y_b)

def evaluate(model, loader):
    model.eval()
    preds = []
    trues = []
    with torch.no_grad():
        for img, lb in loader:
            img, lb = img.to(device), lb.to(device)
            out = model(img)
            pred = out.argmax(dim=1).cpu().numpy()
            preds.extend(pred)
            trues.extend(lb.cpu().numpy())
    preds = np.array(preds)
    trues = np.array(trues)
    accuracy = (preds == trues).mean()
    f1_scores = f1_score(trues, preds, labels=range(6), average=None, zero_division=0)
    f1_non_ai = np.mean(f1_scores[:5])
    f1_ai = f1_scores[5]
    weighted_metric = (accuracy + f1_non_ai + f1_ai) / 3
    return accuracy, f1_non_ai, f1_ai, weighted_metric

# -------------------------------------------
# Train final model (first round)
# -------------------------------------------
epochs = 20
best_wm = 0.0
for epoch in range(epochs):
    vit_model.train()
    total_loss = 0
    for img, lb in train_loader:
        img, lb = img.to(device), lb.to(device)
        optimizer.zero_grad()
        with torch.cuda.amp.autocast():
            if random.random() < 0.5:
                # apply mixup
                img, y_a, y_b, lam = mixup_data(img, lb)
                logits = vit_model(img)
                loss = mixup_criterion(criterion, logits, y_a, y_b, lam)
            else:
                logits = vit_model(img)
                loss = criterion(logits, lb)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        total_loss += loss.item()
    scheduler.step()

    acc, f1_non_ai, f1_ai, wm = evaluate(vit_model, val_loader)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader):.4f}, Acc: {acc:.4f}, "
          f"F1_non_AI: {f1_non_ai:.4f}, F1_AI: {f1_ai:.4f}, WM: {wm:.4f}")
    if wm > best_wm:
        best_wm = wm
        best_model_state = deepcopy(vit_model.state_dict())

# Load best model from first training round
vit_model.load_state_dict(best_model_state)

# -------------------------------------------
# Additional Noise Refinement Step:
# Use the trained model to re-check labels. If a sample strongly disagrees, correct again.
# For example, if model predicts AI with probability > 0.9 but label is not AI, relabel as AI.
# -------------------------------------------
vit_model.eval()
second_ai_probs = []
with torch.no_grad():
    train_loader_for_check = DataLoader(train_dataset_raw, batch_size=32, shuffle=False)
    # Note: The dataset here returns original images and labels (before transform),
    # so we apply the base_transform to images again
    # We'll just do a quick inline transform
    for imgs, lbs in train_loader_for_check:
        imgs = imgs.to(device)
        # apply base_transform again is tricky here, we already have in train_dataset_raw,
        # but it's already transformed. Let's assume train_dataset_raw is with base_transform.
        # If not, we need to re-apply. We'll trust that train_dataset_raw is with base_transform done.
        out = vit_model(imgs)
        probs = F.softmax(out, dim=1)
        second_ai_probs.extend(probs[:,5].cpu().tolist())

new_corrected_labels = deepcopy(corrected_labels)
for idx, label in enumerate(new_corrected_labels):
    if label != 5 and second_ai_probs[idx] > 0.9:
        new_corrected_labels[idx] = 5

# Retrain with second round corrected labels
final_corrected_train_dataset = CorrectedDataset(dataset['train'], new_corrected_labels, transform=train_augment)
final_corrected_val_dataset = CorrectedDataset(dataset['train'], new_corrected_labels, transform=val_transform)
final_train_dataset = Subset(final_corrected_train_dataset, train_indices)
final_val_dataset = Subset(final_corrected_val_dataset, val_indices)

final_labels_train = [new_corrected_labels[i] for i in train_indices]
class_counts = np.bincount(final_labels_train, minlength=6)
class_weights = 1.0 / (class_counts + 1e-6)
weights = [class_weights[l] for l in final_labels_train]
sampler = WeightedRandomSampler(weights, num_samples=len(weights), replacement=True)

final_train_loader = DataLoader(final_train_dataset, batch_size=16, sampler=sampler)
final_val_loader = DataLoader(final_val_dataset, batch_size=16, shuffle=False)

# Reinitialize and train again (fresh model or re-use the improved model)
# Let's reuse the improved model weights to save time
vit_model.load_state_dict(best_model_state)
optimizer = optim.AdamW(vit_model.parameters(), lr=5e-5, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10, eta_min=1e-6)
best_wm = 0.0

for epoch in range(epochs):
    vit_model.train()
    total_loss = 0
    for img, lb in final_train_loader:
        img, lb = img.to(device), lb.to(device)
        optimizer.zero_grad()
        with torch.cuda.amp.autocast():
            if random.random() < 0.5:
                img, y_a, y_b, lam = mixup_data(img, lb)
                logits = vit_model(img)
                loss = mixup_criterion(criterion, logits, y_a, y_b, lam)
            else:
                logits = vit_model(img)
                loss = criterion(logits, lb)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        total_loss += loss.item()
    scheduler.step()

    acc, f1_non_ai, f1_ai, wm = evaluate(vit_model, final_val_loader)
    print(f"[Refined] Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(final_train_loader):.4f}, "
          f"Acc: {acc:.4f}, F1_non_AI: {f1_non_ai:.4f}, F1_AI: {f1_ai:.4f}, WM: {wm:.4f}")
    if wm > best_wm:
        best_wm = wm
        best_model_state = deepcopy(vit_model.state_dict())

vit_model.load_state_dict(best_model_state)

# -------------------------------------------
# Inference on Test set
# -------------------------------------------
vit_model.eval()
test_preds = []
test_indices = []
with torch.no_grad():
    for img, idxs in test_loader:
        img = img.to(device)
        out = vit_model(img)
        pred = out.argmax(dim=1).cpu().numpy().tolist()
        test_preds.extend(pred)
        test_indices.extend(idxs.numpy().tolist())

output_path = "submission.csv"
with open(output_path, "w") as f:
    f.write("idx,predicted_label\n")
    for i, p in zip(test_indices, test_preds):
        f.write(f"{i},{p}\n")

print(f"Submission saved to {output_path}")

  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():


Epoch 1/20, Loss: 1.7000, Acc: 0.5062, F1_non_AI: 0.3699, F1_AI: 0.5263, WM: 0.4675


  with torch.cuda.amp.autocast():


Epoch 2/20, Loss: 1.4879, Acc: 0.5219, F1_non_AI: 0.4236, F1_AI: 0.5417, WM: 0.4957


  with torch.cuda.amp.autocast():


Epoch 3/20, Loss: 1.3883, Acc: 0.4656, F1_non_AI: 0.3811, F1_AI: 0.3902, WM: 0.4123


  with torch.cuda.amp.autocast():


Epoch 4/20, Loss: 1.3516, Acc: 0.4156, F1_non_AI: 0.3656, F1_AI: 0.4878, WM: 0.4230


  with torch.cuda.amp.autocast():


Epoch 5/20, Loss: 1.2985, Acc: 0.5344, F1_non_AI: 0.4844, F1_AI: 0.5424, WM: 0.5204


  with torch.cuda.amp.autocast():


Epoch 6/20, Loss: 1.2462, Acc: 0.5531, F1_non_AI: 0.4953, F1_AI: 0.4878, WM: 0.5121


  with torch.cuda.amp.autocast():


Epoch 7/20, Loss: 1.2177, Acc: 0.5469, F1_non_AI: 0.5054, F1_AI: 0.5417, WM: 0.5313


  with torch.cuda.amp.autocast():


Epoch 8/20, Loss: 1.1870, Acc: 0.5719, F1_non_AI: 0.5401, F1_AI: 0.5778, WM: 0.5632


  with torch.cuda.amp.autocast():


Epoch 9/20, Loss: 1.1553, Acc: 0.5969, F1_non_AI: 0.5572, F1_AI: 0.5000, WM: 0.5513


  with torch.cuda.amp.autocast():


Epoch 10/20, Loss: 1.1287, Acc: 0.5844, F1_non_AI: 0.5519, F1_AI: 0.5532, WM: 0.5632


  with torch.cuda.amp.autocast():


Epoch 11/20, Loss: 1.1144, Acc: 0.6062, F1_non_AI: 0.5653, F1_AI: 0.5532, WM: 0.5749


  with torch.cuda.amp.autocast():


Epoch 12/20, Loss: 1.0883, Acc: 0.6062, F1_non_AI: 0.5598, F1_AI: 0.5532, WM: 0.5731


  with torch.cuda.amp.autocast():


Epoch 13/20, Loss: 1.0975, Acc: 0.5844, F1_non_AI: 0.5471, F1_AI: 0.4878, WM: 0.5398


  with torch.cuda.amp.autocast():


Epoch 14/20, Loss: 1.0972, Acc: 0.5469, F1_non_AI: 0.5303, F1_AI: 0.5532, WM: 0.5434


  with torch.cuda.amp.autocast():


Epoch 15/20, Loss: 1.2030, Acc: 0.5906, F1_non_AI: 0.5310, F1_AI: 0.6222, WM: 0.5813


  with torch.cuda.amp.autocast():


Epoch 16/20, Loss: 1.1647, Acc: 0.5344, F1_non_AI: 0.4952, F1_AI: 0.5574, WM: 0.5290


  with torch.cuda.amp.autocast():


Epoch 17/20, Loss: 1.1775, Acc: 0.5687, F1_non_AI: 0.5019, F1_AI: 0.5532, WM: 0.5413


  with torch.cuda.amp.autocast():


Epoch 18/20, Loss: 1.2212, Acc: 0.6031, F1_non_AI: 0.5413, F1_AI: 0.5926, WM: 0.5790


  with torch.cuda.amp.autocast():


Epoch 19/20, Loss: 1.1951, Acc: 0.5750, F1_non_AI: 0.4956, F1_AI: 0.5714, WM: 0.5474


  with torch.cuda.amp.autocast():


Epoch 20/20, Loss: 1.2120, Acc: 0.5938, F1_non_AI: 0.5360, F1_AI: 0.6122, WM: 0.5806


  with torch.cuda.amp.autocast():


[Refined] Epoch 1/20, Loss: 1.1845, Acc: 0.5563, F1_non_AI: 0.4959, F1_AI: 0.5455, WM: 0.5325


  with torch.cuda.amp.autocast():


[Refined] Epoch 2/20, Loss: 1.1729, Acc: 0.6156, F1_non_AI: 0.5672, F1_AI: 0.6667, WM: 0.6165


  with torch.cuda.amp.autocast():


[Refined] Epoch 3/20, Loss: 1.1066, Acc: 0.5500, F1_non_AI: 0.5296, F1_AI: 0.5909, WM: 0.5568


  with torch.cuda.amp.autocast():


[Refined] Epoch 4/20, Loss: 1.0790, Acc: 0.5656, F1_non_AI: 0.5087, F1_AI: 0.5581, WM: 0.5441


  with torch.cuda.amp.autocast():


[Refined] Epoch 5/20, Loss: 1.0261, Acc: 0.5813, F1_non_AI: 0.5865, F1_AI: 0.5500, WM: 0.5726


  with torch.cuda.amp.autocast():


[Refined] Epoch 6/20, Loss: 0.9913, Acc: 0.6344, F1_non_AI: 0.5959, F1_AI: 0.5714, WM: 0.6006


  with torch.cuda.amp.autocast():


[Refined] Epoch 7/20, Loss: 0.9863, Acc: 0.6281, F1_non_AI: 0.5773, F1_AI: 0.6222, WM: 0.6092


  with torch.cuda.amp.autocast():


[Refined] Epoch 8/20, Loss: 0.9606, Acc: 0.6281, F1_non_AI: 0.5882, F1_AI: 0.6047, WM: 0.6070


  with torch.cuda.amp.autocast():


[Refined] Epoch 9/20, Loss: 0.9407, Acc: 0.6094, F1_non_AI: 0.5766, F1_AI: 0.5581, WM: 0.5814


  with torch.cuda.amp.autocast():


[Refined] Epoch 10/20, Loss: 0.9414, Acc: 0.6562, F1_non_AI: 0.6159, F1_AI: 0.6047, WM: 0.6256


  with torch.cuda.amp.autocast():


[Refined] Epoch 11/20, Loss: 0.8712, Acc: 0.6625, F1_non_AI: 0.6183, F1_AI: 0.6667, WM: 0.6491


  with torch.cuda.amp.autocast():


[Refined] Epoch 12/20, Loss: 0.9480, Acc: 0.6719, F1_non_AI: 0.6132, F1_AI: 0.6957, WM: 0.6602


  with torch.cuda.amp.autocast():


[Refined] Epoch 13/20, Loss: 0.9295, Acc: 0.6969, F1_non_AI: 0.6382, F1_AI: 0.7111, WM: 0.6820


  with torch.cuda.amp.autocast():


[Refined] Epoch 14/20, Loss: 0.9402, Acc: 0.6719, F1_non_AI: 0.6144, F1_AI: 0.5714, WM: 0.6192


  with torch.cuda.amp.autocast():


[Refined] Epoch 15/20, Loss: 0.9649, Acc: 0.6625, F1_non_AI: 0.6405, F1_AI: 0.6341, WM: 0.6457


  with torch.cuda.amp.autocast():


[Refined] Epoch 16/20, Loss: 0.9664, Acc: 0.6406, F1_non_AI: 0.5879, F1_AI: 0.6047, WM: 0.6110


  with torch.cuda.amp.autocast():


[Refined] Epoch 17/20, Loss: 0.9650, Acc: 0.6156, F1_non_AI: 0.5583, F1_AI: 0.5581, WM: 0.5774


  with torch.cuda.amp.autocast():


[Refined] Epoch 18/20, Loss: 0.9913, Acc: 0.6156, F1_non_AI: 0.5611, F1_AI: 0.6222, WM: 0.5997


  with torch.cuda.amp.autocast():


[Refined] Epoch 19/20, Loss: 1.0358, Acc: 0.6562, F1_non_AI: 0.6006, F1_AI: 0.5714, WM: 0.6094


  with torch.cuda.amp.autocast():


[Refined] Epoch 20/20, Loss: 0.9989, Acc: 0.5844, F1_non_AI: 0.5524, F1_AI: 0.6809, WM: 0.6059
Submission saved to submission.csv


In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

vit_model = models.vit_b_16(weights=None,image_size=512)  # no pretrained weights
num_features = vit_model.heads[0].in_features
vit_model.heads[0] = nn.Linear(num_features,6)
vit_model.load_state_dict(torch.load("vit_model.pth", map_location=device))
vit_model.eval()# Make sure the model is in evaluation mode

all_preds = []
all_trues = []
all_probs = []  # To store softmax probabilities

with torch.no_grad():
    for img, lb in val_loader:
        img = img.to(device)
        lb = lb.to(device)
        outputs = vit_model(img)  # Raw logits
        probs = F.softmax(outputs, dim=1)  # Convert logits to probabilities
        
        preds = torch.argmax(probs, dim=1).cpu().numpy()
        labels = lb.cpu().numpy()
        probs_np = probs.cpu().numpy()  # Convert probabilities to numpy array
        
        all_preds.extend(preds)
        all_trues.extend(labels)
        all_probs.extend(probs_np)  # Each entry in all_probs is a list of probabilities for 6 classes

# Convert lists to numpy arrays
all_preds = np.array(all_preds)
all_trues = np.array(all_trues)
all_probs = np.array(all_probs)  # Shape: (num_samples, 6))

# Calculate confusion matrix
cm = confusion_matrix(all_trues, all_preds)
acc = accuracy_score(all_trues, all_preds)

print("Validation Accuracy:", acc)
print("Confusion Matrix:\n", cm)

# Optionally, you can visualize the confusion matrix:
plt.figure(figsize=(8,6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=range(6), yticklabels=range(6))
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix for Validation Set')
plt.show()

In [None]:
# Modification for the threshold for making better prediction for AI images (After Checking Confusion Matrix)

# -------------------------------------------
# Inference on Test set
# -------------------------------------------
vit_model = models.vit_b_16(weights=None, image_size=512)  # no pretrained weights
num_features = vit_model.heads[0].in_features
vit_model.heads[0] = nn.Linear(num_features, 6)
vit_model.load_state_dict(torch.load("vit_model.pth", map_location=device))
vit_model = vit_model.to(device)
vit_model.eval()

test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

threshold = 0.53

test_indices = []
test_top_predictions = []
# (top_class, top_class_prob, second_class, second_class_prob)

with torch.no_grad():
    for img, idxs in test_loader:
        img = img.to(device)
        out = vit_model(img)  # shape: (batch, 6)
        probs = F.softmax(out, dim=1)  # shape: (batch, 6)

        # Get top 2 predictions
        top2_values, top2_indices = torch.topk(probs, k=2, dim=1)  
        
        top2_values = top2_values.cpu().numpy()
        top2_indices = top2_indices.cpu().numpy()
        idxs = idxs.cpu().numpy()
        
        for i in range(len(idxs)):
            test_indices.append(idxs[i])
            test_top_predictions.append((top2_indices[i,0], top2_values[i,0],
                                         top2_indices[i,1], top2_values[i,1]))

# Apply threshold logic
final_predictions = []
for (c1, p1, c2, p2) in test_top_predictions:
    if c1 == 5:
        if p1 >= threshold:
            # Remain class 5
            final_predictions.append(5)
        else:
            # Assign second-best class
            final_predictions.append(c2)
    else:
        # Just assign top class
        final_predictions.append(c1)

# Save results to submission file
output_path = "submission.csv"
with open(output_path, "w") as f:
    f.write("idx,predicted_label\n")
    for idx, pred in zip(test_indices, final_predictions):
        f.write(f"{idx},{pred}\n")

print(f"Submission saved to {output_path}")
print(f"Threshold used: {threshold}")

In [30]:
# Replace 'MyDrive' and 'your-folder' with your actual Drive folders
save_path_final = "vit_model.pth"
save_path_binary = "final_binary_model.pth"

torch.save(vit_model.state_dict(), save_path_final)
torch.save(binary_model.state_dict(), save_path_binary)

print("Models saved to local")

Models saved to local


In [28]:
save_path_binary = "final_binary_model.pth"
torch.save(binary_model.state_dict(), save_path_binary)
print("binary model dowload!")

binary model dowload!
