### **Importing Libraries**

In [23]:
import numpy as np
import matplotlib.pyplot as plt
import cv2
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, ConcatDataset
from torch.utils.data import Dataset
import torchvision.models as models
from torchvision import transforms
import torchvision
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import torchvision.transforms as T
from PIL import Image
import math
import random
from timm.data.mixup import Mixup
import torchvision.transforms.functional as TF
from torchvision.transforms import autoaugment
from timm.data import RandAugment
from timm.scheduler.cosine_lr import CosineLRScheduler
import csv
import timm
from itertools import combinations
from timm import create_model


import os

# Debugging aid: forces synchronous CUDA kernel launches so a device-side error is
# reported at the call that caused it. It slows training noticeably; drop it once
# the pipeline is stable.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# Seed every RNG the pipeline draws from (Python, NumPy, PyTorch) so that
# augmentation, shuffling and weight initialisation are repeatable across
# "Restart Kernel -> Run All".
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Single device used by every training / evaluation call in this notebook.
device_1 = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device_1)

cuda:0


### **Dataset**

#### **Image Transforms**

In [2]:
# Channel-wise normalisation statistics shared by every pipeline below
# (the standard ImageNet mean / std values).
_NORM_MEAN = (0.485, 0.456, 0.406)
_NORM_STD = (0.229, 0.224, 0.225)


def _tensor_tail(with_erasing=True):
    """Return the steps that close a pipeline: tensor conversion, normalisation
    and, for training pipelines, random erasing. Fresh transform objects are
    built on every call so no pipeline shares state with another."""
    tail = [
        T.ToTensor(),
        T.Normalize(mean=list(_NORM_MEAN), std=list(_NORM_STD)),
    ]
    if with_erasing:
        tail.append(T.RandomErasing(p=0.25, scale=(0.02, 0.1), ratio=(0.3, 3.3)))
    return tail


# Training view 1: horizontal flip + RandAugment on a 256x256 resize.
transform_base = T.Compose([
    T.Resize((256, 256)),
    T.RandomHorizontalFlip(),
    autoaugment.RandAugment(num_ops=2, magnitude=14),
    *_tensor_tail(),
])

# Training view 2: photometric changes only (colour jitter + mild blur).
transform_color = T.Compose([
    T.Resize((256, 256)),
    T.ColorJitter(brightness=0.4, contrast=0.3, saturation=0.3, hue=0.1),
    T.GaussianBlur(kernel_size=3, sigma=(0.1, 1.0)),
    *_tensor_tail(),
])

# Training view 3: geometric changes; resized to 288 first so the random crop
# back down to 256 has some margin to work with.
transform_affine = T.Compose([
    T.Resize((288, 288)),
    T.RandomResizedCrop(256, scale=(0.8, 1.0), ratio=(0.9, 1.1)),
    T.RandomAffine(degrees=0, translate=(0.2, 0.2), scale=(0.85, 1.15), shear=10),
    T.RandomPerspective(distortion_scale=0.2, p=0.5),
    *_tensor_tail(),
])

# Deterministic pipeline used for validation and test images.
transform_val = T.Compose([
    T.Resize((256, 256)),
    *_tensor_tail(with_erasing=False),
])

#### **Dataset Class**

##### **Train & Val**

In [3]:
class RegionDataset(Dataset):
    """Labelled image dataset backed by a directory and a label table.

    Args:
        image_dir: Directory containing the image files.
        labels_df: DataFrame with one row per sample; the 'filename' and
            'Region_ID' columns are read.
        transform: Optional callable applied to the RGB PIL image.

    Each item is ``(image, region)`` where ``region`` is a 0-dim ``torch.long``
    tensor holding ``Region_ID - 1`` (the CSV ids are shifted down by one to
    give zero-based class indices).
    """

    def __init__(self, image_dir, labels_df, transform=None):
        self.image_dir = image_dir
        self.labels_df = labels_df
        self.transform = transform

    def __len__(self):
        return len(self.labels_df)

    def __getitem__(self, idx):
        row = self.labels_df.iloc[idx]
        img_path = os.path.join(self.image_dir, row['filename'])
        image = Image.open(img_path).convert("RGB")

        if self.transform is not None:
            image = self.transform(image)

        # The label is computed unconditionally. Previously it was only assigned
        # inside the transform branch, so transform=None raised UnboundLocalError.
        region = int(row['Region_ID']) - 1
        region_tensor = torch.tensor(region, dtype=torch.long)

        return image, region_tensor

##### **Test**

In [None]:
class TestDataset(Dataset):
    """Unlabelled image dataset: every entry of ``image_dir`` is one sample.

    Entries are ordered by sorted filename, so item ``i`` always refers to the
    same file. Each item is the RGB image, passed through ``transform`` when
    one is given.
    """

    def __init__(self, image_dir, transform=None):
        self.image_dir = image_dir
        self.transform = transform
        entries = os.listdir(image_dir)
        entries.sort()
        self.image_filenames = entries

    def __len__(self):
        return len(self.image_filenames)

    def __getitem__(self, idx):
        filename = self.image_filenames[idx]
        image = Image.open(os.path.join(self.image_dir, filename)).convert("RGB")
        return self.transform(image) if self.transform else image

##### **Augment Dataset**

In [5]:
def create_extended_dataset(image_dir, labels_df):
    """Build the training set as three augmented views of the same label table.

    The same rows are wrapped once per pipeline (base, colour, affine, in that
    order) and concatenated, so the result is three times as long as
    ``labels_df``.
    """
    pipelines = (transform_base, transform_color, transform_affine)
    views = [
        RegionDataset(image_dir=image_dir, labels_df=labels_df, transform=pipeline)
        for pipeline in pipelines
    ]
    return ConcatDataset(views)

#### **Training**

In [10]:
# Training images and the CSV label table; RegionDataset reads the table's
# 'filename' and 'Region_ID' columns.
image_dir_train = "../../Dataset/Train/images_train"
labels_path_train = "../../Dataset/Train/labels_train.csv"

labels_df = pd.read_csv(labels_path_train)

# create_extended_dataset concatenates three differently-augmented views of the
# same rows, so len(train_dataset) == 3 * len(labels_df).
train_dataset = create_extended_dataset(image_dir_train, labels_df)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

#### **Validation**

In [11]:
# Validation images and labels, loaded with the deterministic transform_val pipeline.
images_dir_val = "../../Dataset/Val/images_val"
labels_path_val = "../../Dataset/Val/labels_val.csv"
labels_df_val = pd.read_csv(labels_path_val)

val_dataset = RegionDataset(images_dir_val, labels_df_val, transform_val)
# shuffle=False keeps predictions in CSV row order; generate_submission writes
# them out sequentially and relies on that order.
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

#### **Testing**

In [17]:
# Unlabelled test images; TestDataset treats every entry of this directory as one
# sample, ordered by sorted filename.
images_dir_test = "../../Dataset/Test"

test_dataset = TestDataset(images_dir_test, transform_val)
# shuffle=False so prediction i corresponds to the i-th sorted filename.
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

### **Model**

#### **Model Implementation**

In [18]:
class DeitRegionClassifier(nn.Module):
    """timm DeiT backbone with its head replaced by a ``num_classes``-way linear layer.

    Args:
        num_classes: Number of output classes.
        model_name: timm model identifier passed to ``timm.create_model``.
        pretrained: Whether timm should load pretrained weights. Defaults to
            ``True`` (the previous hard-coded behaviour); pass ``False`` when a
            checkpoint will be loaded straight afterwards to skip the download.
    """

    # Spatial size the wrapped model is fed; other sizes are resized in forward().
    INPUT_SIZE = (224, 224)

    def __init__(self, num_classes=15, model_name='deit_base_patch16_224', pretrained=True):
        super().__init__()

        self.model = timm.create_model(model_name, pretrained=pretrained)
        # Replace the classification head in place; keeping the attribute name
        # `head` preserves the state_dict keys of existing checkpoints.
        self.model.head = nn.Linear(self.model.head.in_features, num_classes)

    def forward(self, x):
        """Return class logits for a batch of images, resizing to 224x224 if needed."""
        if tuple(x.shape[-2:]) != self.INPUT_SIZE:
            x = F.interpolate(x, size=self.INPUT_SIZE, mode='bilinear', align_corners=False)
        return self.model(x)

    
class GlobalContextModule(nn.Module):
    """Channel gating driven by a spatially-attended global descriptor.

    A 1x1 convolution scores every spatial position; the softmax of those
    scores weights the feature map, which is then summed and divided by H*W
    into one vector per sample. A small bottleneck MLP turns that vector into
    per-channel gates in (0, 1) that rescale the input.
    """

    def __init__(self, in_channels, reduction_ratio=16):
        super().__init__()
        bottleneck = in_channels // reduction_ratio

        self.conv_mask = nn.Conv2d(in_channels, 1, kernel_size=1)
        self.softmax = nn.Softmax(dim=2)
        self.channel_attn = nn.Sequential(
            nn.Linear(in_channels, bottleneck),
            nn.SiLU(),
            nn.Linear(bottleneck, in_channels),
            nn.Sigmoid(),
        )

    def forward(self, x):
        b, c, h, w = x.size()
        n_positions = h * w

        # Spatial attention map: softmax over the flattened H*W positions.
        scores = self.conv_mask(x).view(b, 1, n_positions)
        spatial_attn = self.softmax(scores).view(b, 1, h, w)

        # Attended global descriptor, one value per channel.
        pooled = torch.sum(x * spatial_attn, dim=(2, 3), keepdim=True) / n_positions
        descriptor = pooled.view(b, c)

        # Per-channel gates, broadcast back over the spatial grid.
        gates = self.channel_attn(descriptor).view(b, c, 1, 1)
        return x * gates

class FeaturePyramidModule(nn.Module):
    """Top-down feature pyramid over a list of backbone feature maps.

    Each level is projected to ``out_channels`` by a 1x1 lateral convolution.
    Starting at the last level, the running map is bilinearly upsampled when it
    is smaller than the next level, added to that level's lateral projection
    and smoothed by a 3x3 output convolution. Outputs are returned in the same
    order as the inputs.
    """

    def __init__(self, in_channels_list, out_channels):
        super().__init__()

        laterals = [nn.Conv2d(channels, out_channels, kernel_size=1) for channels in in_channels_list]
        self.lateral_convs = nn.ModuleList(laterals)

        smoothers = [
            nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1)
            for _ in in_channels_list
        ]
        self.output_convs = nn.ModuleList(smoothers)

    def forward(self, features):
        # Seed the top-down pass with the last feature map.
        running = self.lateral_convs[-1](features[-1])
        outputs = [self.output_convs[-1](running)]

        for level in reversed(range(len(features) - 1)):
            lateral = self.lateral_convs[level](features[level])

            target_hw = lateral.shape[2:]
            source_hw = running.shape[2:]
            needs_upsample = source_hw[0] < target_hw[0] or source_hw[1] < target_hw[1]
            if needs_upsample:
                running = F.interpolate(running, size=target_hw, mode='bilinear', align_corners=False)

            running = lateral + running
            outputs.append(self.output_convs[level](running))

        # Outputs were collected last level first; flip back to input order.
        outputs.reverse()
        return outputs

class SelfAttention2D(nn.Module):
    """Self-attention over the spatial positions of a feature map.

    Queries and keys are 1x1 projections to ``in_dim // 8`` channels; values keep
    ``in_dim`` channels. The attended result is scaled by the learnable scalar
    ``gamma`` (initialised to zero, so the layer starts as the identity) and
    added back to the input.
    """

    def __init__(self, in_dim):
        super().__init__()
        reduced = in_dim // 8
        self.query = nn.Conv2d(in_dim, reduced, 1)
        self.key = nn.Conv2d(in_dim, reduced, 1)
        self.value = nn.Conv2d(in_dim, in_dim, 1)
        self.gamma = nn.Parameter(torch.zeros(1))

    def forward(self, x):
        b, c, h, w = x.shape

        queries = self.query(x).flatten(2).transpose(1, 2)   # [B, N, C']
        keys = self.key(x).flatten(2)                        # [B, C', N]
        weights = torch.softmax(torch.bmm(queries, keys), dim=-1)  # [B, N, N]

        values = self.value(x).flatten(2)                    # [B, C, N]
        attended = torch.bmm(values, weights.transpose(1, 2))  # [B, C, N]
        attended = attended.view(b, c, h, w)

        return x + self.gamma * attended
    
class RegionClassifier(nn.Module):
    """Multi-scale CNN backbone with attention modules and an MLP classification head.

    Args:
        num_classes: Number of output classes.
        pretrained: Whether timm loads pretrained backbone weights. It is now
            honoured for both backbones; the EfficientNet branch used to ignore it.
        dropout_rate: Dropout after the first head layer (half of it after the second).
        use_fpn: If True, fuse the backbone stages with FeaturePyramidModule and
            classify from the first fused map; otherwise use the last stage.
        model: ``"convex"`` selects ``convnext_base``; any other value selects
            ``tf_efficientnetv2_m``.
    """

    def __init__(self, num_classes, pretrained=True, dropout_rate=0.3, use_fpn=True, model="convex"):

        super().__init__()

        if model == "convex":
            backbone_name, out_indices = 'convnext_base', (1, 2, 3)
        else:
            backbone_name, out_indices = 'tf_efficientnetv2_m', (2, 3, 4)

        self.backbone = timm.create_model(
            backbone_name,
            pretrained=pretrained,
            features_only=True,
            out_indices=out_indices
        )

        # Discover the channel count of each returned stage with a dummy forward.
        # The probe runs in eval mode so layers that keep running statistics
        # (e.g. BatchNorm) are not updated with an all-zero image; the previous
        # training flag is restored afterwards.
        was_training = self.backbone.training
        self.backbone.eval()
        with torch.no_grad():
            dummy_input = torch.zeros(1, 3, 224, 224)
            features = self.backbone(dummy_input)
            feature_channels = [f.shape[1] for f in features]
            print(f"Backbone feature channels: {feature_channels}")
        self.backbone.train(was_training)

        self.use_fpn = use_fpn
        if use_fpn:
            self.fpn = FeaturePyramidModule(feature_channels, 256)
            merged_channels = 256
        else:
            merged_channels = feature_channels[-1]

        self.attention = SelfAttention2D(merged_channels)
        self.gc_module = GlobalContextModule(merged_channels)

        self.classifier = nn.Sequential(
            nn.Linear(merged_channels, 512),
            nn.LayerNorm(512),
            nn.SiLU(),
            nn.Dropout(dropout_rate),

            nn.Linear(512, 256),
            nn.LayerNorm(256),
            nn.SiLU(),
            nn.Dropout(dropout_rate * 0.5),

            nn.Linear(256, num_classes)
        )

    def forward(self, x):
        """Return class logits of shape [batch, num_classes]."""
        features = self.backbone(x)

        if self.use_fpn:
            # The FPN returns maps in input order; index 0 is the first requested stage.
            features = self.fpn(features)
            feat = features[0]
        else:
            feat = features[-1]

        feat = self.attention(feat)
        feat = self.gc_module(feat)

        x = F.adaptive_avg_pool2d(feat, 1).flatten(1)
        out = self.classifier(x)
        return out


### **Training**

In [19]:
# timm Mixup/CutMix augmenter, applied to training batches inside
# train_classification_model. num_classes must match the 15-way classifier heads
# built in the training cells.
# NOTE(review): the exact meaning of mixup_alpha / cutmix_alpha / prob /
# switch_prob / label_smoothing is defined by timm's Mixup — confirm against its
# documentation before tuning.
mixup_fn = Mixup(
    mixup_alpha=0.8, cutmix_alpha=1.0, cutmix_minmax=None,
    prob=0.8, switch_prob=0.3, mode='batch',
    label_smoothing=0.1, num_classes=15
)

def train_classification_model(model, train_loader, val_loader, optimizer, num_epochs, device, model_path = "best_model.pth"):
    """Train ``model`` with Mixup/CutMix and keep the best-validation checkpoint.

    Args:
        model: Network to train; moved to ``device``.
        train_loader: Yields ``(images, class_index)`` batches.
        val_loader: Yields ``(images, class_index)`` batches for validation.
        optimizer: Optimizer already bound to ``model``'s parameters.
        num_epochs: Number of epochs; also the length of the cosine schedule.
        device: Device to train on.
        model_path: Where the state_dict with the best validation accuracy is saved.

    Training accuracy is measured against the arg-max of the (possibly mixed)
    targets, so it is only a rough progress indicator.
    """
    # model = nn.DataParallel(model, device_ids=[0, 1])  # use GPUs 0 and 1
    model = model.to(device)
    best_val_acc = 0.0

    loss_fn = nn.CrossEntropyLoss()
    # Cosine LR schedule with a 5-epoch linear warm-up.
    scheduler = CosineLRScheduler(
        optimizer,
        t_initial=num_epochs,
        lr_min=1e-5,
        cycle_mul=1.0,
        warmup_t=5,
        warmup_lr_init=1e-6,
    )

    for epoch in range(num_epochs):

        model.train()
        train_loss = 0.0
        correct_train = 0
        total_train = 0
        pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Train]")

        for images, targets in pbar:
            # Move first so the in-place mixing runs on the training device.
            images = images.to(device)
            targets = targets.to(device)

            # timm's Mixup asserts an even batch size; the final batch of an epoch
            # can be odd, so such a batch is trained with its plain hard labels.
            if images.size(0) % 2 == 0:
                images, targets = mixup_fn(images, targets)

            logits = model(images)

            loss = loss_fn(logits, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Targets are class indices (no mixup) or soft distributions (mixup).
            preds = logits.argmax(dim=1)
            hard_targets = targets if targets.dim() == 1 else targets.argmax(dim=1)
            correct_train += (preds == hard_targets).sum().item()
            total_train += targets.size(0)

            train_loss += loss.item() * images.size(0)
            # Iterating the bar already advances it; no manual update() needed.
            pbar.set_postfix(loss=loss.item(), acc=f"{100*correct_train/total_train:.2f}%")

        avg_train_loss = train_loss / len(train_loader.dataset)
        train_accuracy = 100 * correct_train / total_train

        model.eval()
        val_loss = 0.0
        correct_val = 0
        total_val = 0

        with torch.no_grad():
            pbar = tqdm(val_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Val]")
            for images, targets in pbar:
                images = images.to(device)
                targets = targets.to(device)

                logits = model(images)
                loss = loss_fn(logits, targets)

                predicted = logits.argmax(dim=1)
                total_val += targets.size(0)
                correct_val += (predicted == targets).sum().item()

                val_loss += loss.item() * images.size(0)
                pbar.set_postfix(loss=loss.item(), acc=f"{100*correct_val/total_val:.2f}%")

        avg_val_loss = val_loss / len(val_loader.dataset)
        val_accuracy = 100 * correct_val / total_val

        # Checkpoint only on strict improvement of validation accuracy.
        if val_accuracy > best_val_acc:
            best_val_acc = val_accuracy
            torch.save(model.state_dict(), model_path)

        print(f"Epoch {epoch+1}: Train Loss = {avg_train_loss:.4f}, Train Acc = {train_accuracy:.2f}%, \
                Val Loss = {avg_val_loss:.4f}, Val Acc = {val_accuracy:.2f}%")

        # timm epoch schedulers are stepped with the index of the *next* epoch.
        # Stepping with `epoch` replayed the epoch-0 learning rate and left the
        # whole schedule one epoch late.
        scheduler.step(epoch + 1)

### **Evaluate**

In [21]:
def ensemble_predict(models, image_tensor, device):
    """Predict with an ensemble by averaging the members' logits.

    Each model is switched to eval mode and run on the batch without gradient
    tracking. The mean logits are soft-maxed over dim 1.

    Returns:
        ``(preds, avg_prob)``: arg-max class indices and the class
        probabilities of the averaged logits.
    """
    batch = image_tensor.to(device)
    with torch.no_grad():
        member_logits = [member.eval()(batch) for member in models]
        mean_logits = torch.stack(member_logits).mean(dim=0)
        avg_prob = mean_logits.softmax(dim=1)
        preds = avg_prob.argmax(dim=1)
    return preds, avg_prob


def evaluate_classification_model(models, test_loader, device, num_regions=15):
    """Evaluate an ensemble on a labelled loader.

    Args:
        models: Models whose logits are averaged by ``ensemble_predict``.
        test_loader: Yields ``(inputs, labels)`` with zero-based class labels.
        device: Device the models are moved to and run on.
        num_regions: Number of classes (size of the confusion matrix).

    Returns:
        dict with
        ``accuracy`` (percent), ``adjacent_accuracy`` (percent of predictions
        equal to the label or to label +/- 1 modulo ``num_regions``),
        ``per_class_accuracy`` (float array; NaN for classes with no samples),
        ``predictions`` and ``ground_truths`` (lists of NumPy integers).

    Raises:
        ValueError: if ``test_loader`` yields no samples.
    """
    for model in models:
        model.to(device)
        model.eval()

    all_predictions = []
    all_ground_truths = []
    correct = 0

    # Rows are true classes, columns are predicted classes.
    confusion_matrix = torch.zeros(num_regions, num_regions)

    # ensemble_predict moves the batch to `device` and disables gradients itself.
    for inputs, labels in test_loader:
        preds, _ = ensemble_predict(models, inputs, device)

        preds_cpu = preds.view(-1).long().cpu()
        labels_cpu = labels.view(-1).long().cpu()

        correct += (preds_cpu == labels_cpu).sum().item()

        # One vectorised scatter-add per batch instead of a per-sample Python
        # loop. Out-of-range indices still raise, as direct indexing did.
        confusion_matrix.index_put_(
            (labels_cpu, preds_cpu),
            torch.ones(labels_cpu.numel()),
            accumulate=True,
        )

        all_predictions.extend(preds_cpu.numpy())
        all_ground_truths.extend(labels_cpu.numpy())

    total = len(all_ground_truths)
    if total == 0:
        raise ValueError("test_loader yielded no samples; nothing to evaluate.")

    accuracy = 100 * correct / total

    # Classes absent from the loader have a zero row sum and come out as NaN.
    per_class_accuracy = confusion_matrix.diag() / confusion_matrix.sum(1)
    per_class_accuracy = per_class_accuracy.cpu().numpy()

    # NOTE(review): adjacency wraps around, so class 0 and class num_regions - 1
    # count as neighbours — confirm that this is intended for the region layout.
    adjacent_correct = sum(
        1
        for pred, true in zip(all_predictions, all_ground_truths)
        if pred == true
        or pred == (true + 1) % num_regions
        or pred == (true - 1) % num_regions
    )
    adjacent_accuracy = 100 * adjacent_correct / total

    results = {
        'accuracy': accuracy,
        'adjacent_accuracy': adjacent_accuracy,
        'per_class_accuracy': per_class_accuracy,
        'predictions': all_predictions,
        'ground_truths': all_ground_truths
    }

    return results

def generate_submission(models, val_loader, test_loader, device, output_csv_name='YourRollNo_VersionofSubmission.csv', num_regions=15,
                        expected_val_count=369, expected_total_count=738):
    """Write ensemble predictions for the validation and test sets to one CSV.

    Rows are numbered from 0: validation predictions first (loader order), then
    test predictions. Predicted class indices are shifted by +1 to give the
    one-based ``Region_ID`` values.

    Args:
        models: Models whose logits are averaged by ``ensemble_predict``.
        val_loader: Yields ``(inputs, labels)``; the labels are ignored.
        test_loader: Yields ``inputs`` only.
        device: Device the models are moved to and run on.
        output_csv_name: Path of the CSV to write.
        num_regions: Unused; kept so existing calls keep working.
        expected_val_count: Required number of validation predictions, or
            ``None`` to skip the check (default 369, the previous hard-coded value).
        expected_total_count: Required total number of rows, or ``None`` to
            skip the check (default 738, the previous hard-coded value).

    Raises:
        AssertionError: if a count check fails. Nothing is written in that case.
    """
    for model in models:
        model.to(device)
        model.eval()

    predictions = []

    # ensemble_predict moves each batch to `device` and disables gradients itself.
    for inputs, _ in val_loader:
        preds, _ = ensemble_predict(models, inputs, device)
        predictions.extend((preds + 1).cpu().tolist())

    # Explicit raise instead of `assert` so the check survives `python -O`.
    if expected_val_count is not None and len(predictions) != expected_val_count:
        raise AssertionError(f"Expected {expected_val_count} validation predictions.")

    for inputs in test_loader:
        preds, _ = ensemble_predict(models, inputs, device)
        predictions.extend((preds + 1).cpu().tolist())

    if expected_total_count is not None and len(predictions) != expected_total_count:
        raise AssertionError(f"Total submission rows should be {expected_total_count} (excluding header).")

    with open(output_csv_name, mode='w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['id', 'Region_ID'])
        for idx, region_id in enumerate(predictions):
            writer.writerow([idx, region_id])

    print(f"Submission file '{output_csv_name}' created successfully with {len(predictions)} predictions.")


### **Train Models**

In [12]:
# Hyper-parameters shared by both training runs below.
num_epochs = 35        # epochs per run; also the cosine schedule length (t_initial)
learning_rate = 1e-4   # learning rate handed to AdamW

In [None]:
# ConvNeXt run: model="convex" selects the convnext_base backbone inside
# RegionClassifier (any other value selects tf_efficientnetv2_m).
model_convnext = RegionClassifier(num_classes=15, pretrained=True, dropout_rate=0.2, use_fpn=True, model = "convex")
optimizer = optim.AdamW(
    model_convnext.parameters(),
    lr=learning_rate,
    weight_decay=0.05
)
# The state_dict with the best validation accuracy is written to model_convnext.pth.
train_classification_model(
    model_convnext, train_loader, val_loader, optimizer, num_epochs, device_1, model_path="model_convnext.pth")


In [None]:
# DeiT run: same optimizer settings and schedule as the ConvNeXt run. Note that
# `optimizer` is rebound here to a fresh AdamW over the DeiT parameters.
model_deit = DeitRegionClassifier(num_classes=15, model_name='deit_base_patch16_224')
optimizer = optim.AdamW(
    model_deit.parameters(),
    lr=learning_rate,
    weight_decay=0.05
)
# The state_dict with the best validation accuracy is written to model_deit.pth.
train_classification_model(
    model_deit, train_loader, val_loader, optimizer, num_epochs, device_1, model_path="model_deit.pth")

### **Testing**

In [None]:
# Rebuild both architectures and restore the best checkpoints written during training.
# The ConvNeXt backbone is built with pretrained=False: load_state_dict overwrites
# every weight, so fetching ImageNet weights first would be wasted work.
model_convnext_ev = RegionClassifier(num_classes=15, pretrained=False, dropout_rate=0.2, use_fpn=True)
model_deit_ev = DeitRegionClassifier(num_classes=15, model_name='deit_base_patch16_224')
# map_location="cpu" lets checkpoints saved from a GPU run load on any machine;
# the evaluation helpers move the models to the target device afterwards.
model_convnext_ev.load_state_dict(torch.load("model_convnext.pth", map_location="cpu"))
model_deit_ev.load_state_dict(torch.load("model_deit.pth", map_location="cpu"))

In [None]:
# NOTE: `combinations` is already imported in the first cell; this import is only
# needed by the optional experiment that is commented out below.
from itertools import combinations

# Model list used by the optional experiment below (rebound before the final evaluation).
models = [ model_deit_ev, model_convnext_ev] 
# --- Optional experiment (disabled): score every non-empty subset of `models`
# --- on the validation set and report the most accurate combination.
# best_result = {
#     'models': None,
#     'accuracy': 0,
#     'adjacent_accuracy': 0
# }

# # Try every non-empty combination of models
# for r in range(1, len(models) + 1):
#     for combo in combinations(models, r):
#         combo_list = list(combo)
#         results = evaluate_classification_model(combo_list, val_loader, device_1, num_regions=15)
#         print(f"Models: {[m.__class__.__name__ for m in combo_list]} | "
#               f"Accuracy: {results['accuracy']:.2f}% | "
#               f"Adjacent Accuracy: {results['adjacent_accuracy']:.2f}%")

#         if results['accuracy'] > best_result['accuracy']:
#             best_result = {
#                 'models': combo_list,
#                 'accuracy': results['accuracy'],
#                 'adjacent_accuracy': results['adjacent_accuracy']
#             }

# # Print best result
# print("\nBest Combination:")
# print(f"Models: {[m.__class__.__name__ for m in best_result['models']]}")
# print(f"Accuracy: {best_result['accuracy']:.2f}%")
# print(f"Adjacent Accuracy: {best_result['adjacent_accuracy']:.2f}%")

# Evaluate the two-model ensemble on the validation set, then write the submission
# file (validation predictions followed by test predictions).
models = [model_convnext_ev, model_deit_ev]
results = evaluate_classification_model(models, val_loader, device_1, num_regions=15)
print(f"Accuracy: {results['accuracy']:.2f}%")
print(f"Adjacent Accuracy: {results['adjacent_accuracy']:.2f}%")
generate_submission(models, val_loader, test_loader, device_1, output_csv_name='2022101096_5.csv', num_regions=15)