In [None]:
import numpy as np
from scipy.misc import face
from scipy.ndimage import zoom
from scipy.special import logsumexp
import torch
import os
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from sklearn.metrics import roc_auc_score
from scipy.stats import gaussian_kde
DEVICE = "cuda"

def calculate_auc(pred, gt):
    """Return the ROC AUC of ``pred`` scored against a binarised ``gt``.

    Pixels where ``gt > 0.5`` form the positive class; both arrays are
    flattened before being handed to ``roc_auc_score``.
    """
    labels = (gt > 0.5).astype(int).flatten()
    scores = pred.flatten()
    return roc_auc_score(labels, scores)

def calculate_nss(pred, gt):
    """Return the mean of the z-scored prediction weighted element-wise by ``gt``.

    Returns 0 when ``pred`` has zero standard deviation (z-scoring is undefined).
    """
    spread = np.std(pred)
    if spread == 0:
        return 0
    centre = np.mean(pred)
    z_scored = (pred - centre) / spread
    return np.mean(z_scored * gt)

def calculate_cc(pred, gt):
    """Return the Pearson correlation coefficient between ``pred`` and ``gt``.

    Both arrays are flattened first, so any matching shapes are accepted.
    """
    # Imported locally: this cell's import block does not bring in `pearsonr`,
    # so the function would raise NameError on a fresh top-to-bottom run.
    from scipy.stats import pearsonr

    pred = pred.flatten()
    gt = gt.flatten()
    cc, _ = pearsonr(pred, gt)
    return cc

def calculate_kld(pred, gt, epsilon=1e-10):
    """Return the KL divergence KL(gt || pred) between two maps.

    Args:
        pred: predicted map (non-negative array).
        gt: ground-truth map of the same shape.
        epsilon: small constant added to both maps so no bin is exactly zero.

    Each map is normalised to sum to 1 before the divergence is computed.
    """
    # Imported locally: this cell's import block does not bring in `entropy`,
    # so the function would raise NameError on a fresh top-to-bottom run.
    from scipy.stats import entropy

    pred = pred + epsilon
    gt = gt + epsilon

    pred = pred / np.sum(pred)
    gt = gt / np.sum(gt)

    # scipy's entropy(p, q) computes sum(p * log(p / q)); 1-D inputs keep the
    # result a scalar regardless of the maps' original shape.
    kld = entropy(gt.ravel(), pred.ravel())
    return kld

In [None]:
import cv2
import easyocr
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import os


# Folder holding the validation comic pages used for the OCR demo below.
images_folder = "/kaggle/input/saliency-data/images/val"

# File stems (no extension); ".png" is appended when each image is loaded.
image_ids = [
    "Vaillant_0480_1954_07_25-01",
    "Vaillant_0608_1957_01_06-06",
    "Vaillant_0525_1955_06_05-16"
]

# fr = french
# Module-level EasyOCR reader shared by the demo helpers in this cell.
reader = easyocr.Reader(['fr'], gpu=True)


def detect_text_boxes_with_easyocr(image):
    """Run the module-level EasyOCR ``reader`` on ``image`` and return its boxes.

    The recognised text and confidence of each detection are discarded; only
    the bounding boxes are returned, in detection order.
    """
    detections = reader.readtext(np.array(image))
    return [bbox for (bbox, _text, _prob) in detections]


def show_image_and_text_boxes(image_id):
    """Display one validation image with its detected text boxes outlined in green.

    Args:
        image_id: file stem of the image inside ``images_folder`` (".png" is appended).
    """
    image_path = os.path.join(images_folder, f"{image_id}.png")
    image = Image.open(image_path).convert('RGB')
    boxes = detect_text_boxes_with_easyocr(image)
    image_np = np.array(image)

    # Draw every box before showing anything, so the figure is rendered once.
    for box in boxes:
        # `np.int0` was removed in NumPy 2.0; int32 is also the point dtype
        # OpenCV's drawing functions expect.
        pts = np.asarray(box, dtype=np.int32)
        cv2.polylines(image_np, [pts], isClosed=True, color=(0, 255, 0), thickness=2)

    plt.figure(figsize=(10, 10))
    plt.imshow(image_np)
    plt.title("Text Detection with EasyOCR")
    plt.axis("off")
    plt.show()


# Render the OCR overlay for each hand-picked validation page.
for image_id in image_ids:
    show_image_and_text_boxes(image_id)


# Data

In [None]:
# Validation images and their saliency annotations (Kaggle input dataset paths).
comic_images_folder = '/kaggle/input/saliency-data/images/val/'
annotations_folder = '/kaggle/input/saliency-data/maps/val/'

In [None]:
# Sorted so that position i in both lists is expected to refer to the same page.
comic_images = sorted([f for f in os.listdir(comic_images_folder) if f.endswith('.png') or f.endswith('.jpg')])
annotations = sorted([f for f in os.listdir(annotations_folder) if f.endswith('.png') or f.endswith('.jpg')])

# Sanity check on the counts only: this does NOT verify that the file names
# actually pair up, it just catches a missing image or annotation.
assert len(comic_images) == len(annotations), "The number of images and annotations should be the same."

In [None]:
def display_images_side_by_side(comic_img_path, annotation_img_path):
    """Show a comic page next to its saliency annotation in a 1x2 figure.

    Args:
        comic_img_path: path of the comic image file.
        annotation_img_path: path of the matching saliency map (drawn in grayscale).
    """
    fig, axs = plt.subplots(1, 2, figsize=(15, 5))

    panels = (
        (mpimg.imread(comic_img_path), 'Comic Image', {}),
        (mpimg.imread(annotation_img_path), 'Saliency Annotation', {'cmap': 'gray'}),
    )
    for axis, (picture, heading, imshow_options) in zip(axs, panels):
        axis.imshow(picture, **imshow_options)
        axis.set_title(heading)
        axis.axis('off')

    plt.show()
    
# Preview loop: `i` counts displayed pairs so the loop can stop after the first.
i=0
for comic_img, annotation_img in zip(comic_images, annotations):
    display_images_side_by_side(os.path.join(comic_images_folder, comic_img), 
                                os.path.join(annotations_folder, annotation_img))
    i+=1
    if i==1:
        break # Remove or modify this line to display more images

In [None]:
from PIL import Image
import os

# Paths to your data directories
# Image / saliency-map folders for the train and validation splits.
train_images_folder = '/kaggle/input/saliency-data/images/train/'
train_annotations_folder = '/kaggle/input/saliency-data/maps/train/'
val_images_folder = '/kaggle/input/saliency-data/images/val/'
val_annotations_folder = '/kaggle/input/saliency-data/maps/val/'



def print_image_sizes(img_folder, map_folder):
    """Print the pixel size of every image and of its same-named saliency map.

    Args:
        img_folder: directory containing the images.
        map_folder: directory containing saliency maps; each map is looked up
            under the same file name as its image.
    """
    # Sorted so the report order is reproducible (os.listdir order is arbitrary).
    for img_id in sorted(os.listdir(img_folder)):
        img_path = os.path.join(img_folder, img_id)
        map_path = os.path.join(map_folder, img_id)

        # Context managers release the file handles immediately; Image.open is
        # lazy and would otherwise keep one handle open per file.
        with Image.open(img_path) as img, Image.open(map_path) as sal_map:
            img_size = img.size
            map_size = sal_map.size

        print(f"Image: {img_path}, Size: {img_size}")
        print(f"Saliency Map: {map_path}, Size: {map_size}")
        print("-" * 50)


# Report sizes for both splits to check that images and maps share dimensions.
print("Training Set Sizes:")
print_image_sizes(train_images_folder, train_annotations_folder)


print("Validation Set Sizes:")
print_image_sizes(val_images_folder, val_annotations_folder)


In [None]:
import os
import numpy as np
from PIL import Image
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
import easyocr
import cv2
import torch
import matplotlib.pyplot as plt

class SaliconDatasetWithTextChannel(Dataset):
    """Saliency dataset whose samples carry an extra OCR text-mask channel.

    Each item is ``(img_with_text_channel, gt)``: the first tensor stacks the
    normalised RGB image (3 channels) with a text mask (1 channel); ``gt`` is
    the single-channel saliency map. Everything is resized to 512x512.
    """

    def __init__(self, img_dir, gt_dir, img_ids, exten='.png', augment=False):
        """
        Args:
            img_dir: directory containing the input images.
            gt_dir: directory containing the saliency maps (same file names).
            img_ids: list of file stems (without extension).
            exten: file extension appended to every id.
            augment: when True, apply one random resized crop per sample,
                shared by the image, the saliency map and the text mask.
        """
        self.img_dir = img_dir
        self.gt_dir = gt_dir
        self.img_ids = img_ids
        self.exten = exten
        self.augment = augment
        
        self.img_transform = transforms.Compose([
            transforms.Resize((512, 512)),  # Resize to 512x512
            transforms.ToTensor(),
            transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
        ])
        
        self.gt_transform = transforms.Compose([
            transforms.Resize((512, 512)),  # Resize to 512x512
            transforms.ToTensor()
        ])
        
        # Kept as a public attribute for backward compatibility. __getitem__ no
        # longer calls it directly because every call samples a new random crop,
        # which would misalign the image, the saliency map and the text mask.
        self.augmentation_transform = transforms.Compose([
            transforms.RandomResizedCrop(512, scale=(0.8, 1.0)),  # Random crop and resize
        ])

        # Initialize EasyOCR reader for French
        self.reader = easyocr.Reader(['fr'], gpu=True)

    def detect_text_boxes_with_easyocr(self, image, area_threshold=7000):
        """
        Detect text boxes using EasyOCR and filter out large text boxes based on an area threshold.
        Args:
        - image: PIL image to perform text detection on.
        - area_threshold: the area size above which the text boxes are ignored (in pixels).

        Returns:
        - text_mask: a binary mask with text regions marked (as a PIL image).
        """

        image_np = np.array(image)
        results = self.reader.readtext(image_np)

        text_mask = np.zeros_like(image_np[:, :, 0])

        for (bbox, text, prob) in results:
            # bbox is a list of four points, each point is a list [x, y]
            # Extract the x and y coordinates for each corner of the bounding box
            x0, y0 = bbox[0]  # top-left corner
            x1, y1 = bbox[1]  # top-right corner
            x2, y2 = bbox[2]  # bottom-right corner
            x3, y3 = bbox[3]  # bottom-left corner

            # Calculate width and height of the bounding box
            width = np.linalg.norm([x1 - x0, y1 - y0]) 
            height = np.linalg.norm([x2 - x1, y2 - y1])  
            area = width * height

            # Only small boxes are painted into the mask; large ones are skipped.
            if area < area_threshold:
                pts = np.array(bbox, dtype=np.int32)
                cv2.fillPoly(text_mask, [pts], 255)

        return Image.fromarray(text_mask)  # Convert back to PIL Image

    def _augment_jointly(self, img, gt, text_mask):
        """Apply ONE random resized crop to the image, saliency map and text mask."""
        # Bring the saliency map onto the image's pixel grid so a single crop
        # box addresses the same region in every input.
        if gt.size != img.size:
            gt = transforms.functional.resize(gt, [img.height, img.width])

        # Same scale range as `self.augmentation_transform`; the ratio is
        # RandomResizedCrop's default range.
        top, left, height, width = transforms.RandomResizedCrop.get_params(
            img, scale=[0.8, 1.0], ratio=[3.0 / 4.0, 4.0 / 3.0]
        )
        return tuple(
            transforms.functional.resized_crop(pic, top, left, height, width, [512, 512])
            for pic in (img, gt, text_mask)
        )

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(4-channel image tensor, saliency tensor)``."""
        img_id = self.img_ids[idx]
        img_path = os.path.join(self.img_dir, img_id + self.exten)
        gt_path = os.path.join(self.gt_dir, img_id + self.exten)

        img = Image.open(img_path).convert('RGB')
        gt = Image.open(gt_path).convert('L')

        # OCR runs on the un-augmented image; the mask is cropped with it below.
        text_mask = self.detect_text_boxes_with_easyocr(img)

        if self.augment:
            img, gt, text_mask = self._augment_jointly(img, gt, text_mask)
        
        img = self.img_transform(img)
        gt = self.gt_transform(gt)
        
        text_mask = text_mask.resize((512, 512)) 
        text_mask = transforms.ToTensor()(text_mask)

        # Combine image and text mask into a multi-channel tensor (4 channels now)
        img_with_text_channel = torch.cat([img, text_mask], dim=0)

        return img_with_text_channel, gt

    def __len__(self):
        """Number of samples (one per image id)."""
        return len(self.img_ids)


# Same split folders as defined earlier in the notebook, repeated so this cell
# can be run on its own.
train_images_folder = '/kaggle/input/saliency-data/images/train/'
train_annotations_folder = '/kaggle/input/saliency-data/maps/train/'
val_images_folder = '/kaggle/input/saliency-data/images/val/'
val_annotations_folder = '/kaggle/input/saliency-data/maps/val/'


# Image ids are the file names with their extension stripped; the dataset
# re-appends its own `exten` ('.png' by default) when loading.
train_img_ids = os.listdir(train_images_folder)
train_img_ids = [os.path.splitext(f)[0] for f in train_img_ids]

val_img_ids = os.listdir(val_images_folder)
val_img_ids = [os.path.splitext(f)[0] for f in val_img_ids]


train_dataset = SaliconDatasetWithTextChannel(train_images_folder, train_annotations_folder, train_img_ids, augment=False)
val_dataset = SaliconDatasetWithTextChannel(val_images_folder, val_annotations_folder, val_img_ids, augment=False)


# The validation loader is not shuffled, so batches follow `val_img_ids` order;
# later cells rely on that to name the saved predictions.
train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True, num_workers=0, pin_memory=True)
val_dataloader = DataLoader(val_dataset, batch_size=1, shuffle=False, num_workers=0, pin_memory=True)


# Visual check of the text-mask channel. This iterates the WHOLE validation
# loader, so OCR runs on every validation image and one figure is shown per image.
for images, targets in val_dataloader:
    text_mask_np = images[0, 3, :, :].cpu().numpy()  # The 4th channel (index 3) corresponds to the text mask

    plt.imshow(text_mask_np, cmap='gray')
    plt.title("Text Detection Mask")
    plt.show()


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import timm

class ViTSaliencyModel(nn.Module):
    """Saliency predictor fusing ViT patch features with a small CNN branch.

    Input is a 4-channel 512x512 tensor (RGB + text mask); output is a
    single-channel 512x512 saliency map (raw values, no final activation).
    """

    def __init__(self, freeze_vit_layers=True):
        """
        Args:
            freeze_vit_layers: when True, all backbone weights are frozen except
                the 4-channel patch projection, which stays trainable.
        """
        super(ViTSaliencyModel, self).__init__()

        self.backbone = timm.create_model('vit_base_patch16_384', pretrained=True, img_size=512)
        self.backbone.head = nn.Identity()  # Removing the final classification head

        # Replace the 3-channel patch projection with a 4-channel one, keeping
        # the pretrained RGB filters. A freshly initialised projection would feed
        # the pretrained transformer blocks random embeddings.
        pretrained_proj = self.backbone.patch_embed.proj
        four_channel_proj = nn.Conv2d(4, 768, kernel_size=16, stride=16)  # For 4 input channels
        with torch.no_grad():
            four_channel_proj.weight[:, :3].copy_(pretrained_proj.weight)
            # The text-mask channel starts at zero so the initial embedding is
            # exactly the pretrained RGB one; its weights are learned in training.
            four_channel_proj.weight[:, 3:].zero_()
            if pretrained_proj.bias is not None:
                four_channel_proj.bias.copy_(pretrained_proj.bias)
        self.backbone.patch_embed.proj = four_channel_proj

        if freeze_vit_layers:
            for param in self.backbone.parameters():
                param.requires_grad = False
            # The new projection must remain trainable, otherwise the text-mask
            # channel can never influence the ViT features.
            for param in self.backbone.patch_embed.proj.parameters():
                param.requires_grad = True

        # CNN Feature Extraction for Local Features (modified to handle 4-channel input)
        self.cnn_extractor = nn.Sequential(
            nn.Conv2d(4, 64, kernel_size=7, stride=2, padding=3), 
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
        )

        # Upsampling layers to reconstruct the saliency map, ensuring output is 512x512
        self.upsample = nn.Sequential(
            nn.Conv2d(768 + 128, 512, kernel_size=3, padding=1),  # Concatenated ViT and CNN features
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.ConvTranspose2d(512, 256, kernel_size=2, stride=2),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.ConvTranspose2d(256, 128, kernel_size=2, stride=2),
            nn.ReLU(inplace=True),
            nn.Conv2d(128, 128, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.ConvTranspose2d(128, 64, kernel_size=2, stride=2),
            nn.ReLU(inplace=True),
            nn.Conv2d(64, 64, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.ConvTranspose2d(64, 32, kernel_size=2, stride=2),
            nn.ReLU(inplace=True),
            nn.Conv2d(32, 1, kernel_size=1),  # Output single-channel saliency map
            nn.Upsample(size=(512, 512), mode='bilinear', align_corners=False)
        )

        # Only the decoder is re-initialised; backbone and CNN branch keep their own init.
        self.upsample.apply(self.weights_init)

    def forward(self, x):
        """Return the saliency map for a batch of 4-channel images."""
        cnn_features = self.cnn_extractor(x)

        # NOTE(review): the backbone is driven stage by stage here, and no
        # positional embedding or class token is referenced anywhere in this
        # method. Confirm against the installed timm version whether
        # position information is meant to be skipped.
        x = self.backbone.patch_embed(x) 
        x = self.backbone.pos_drop(x)
        x = self.backbone.blocks(x) 
        x = self.backbone.norm(x) 

        # Reshape ViT output for upsampling
        batch_size, num_patches, embedding_dim = x.size()
        height = width = int(num_patches ** 0.5)

        x = x.permute(0, 2, 1).contiguous()  # Rearrange to [batch, embedding_dim, height, width]
        x = x.view(batch_size, embedding_dim, height, width)

        # Ensure the CNN feature map has the same spatial dimensions as the ViT output
        cnn_features = torch.nn.functional.interpolate(cnn_features, size=(height, width), mode='bilinear', align_corners=False)

        # Concatenate ViT and CNN features
        combined_features = torch.cat([x, cnn_features], dim=1)

        # Upsample to generate saliency map
        saliency_map = self.upsample(combined_features)
        return saliency_map

    # Weight initialization for upsampling layers
    def weights_init(self, m):
        """Kaiming-normal init for conv / transposed-conv weights, zero bias."""
        if isinstance(m, nn.Conv2d) or isinstance(m, nn.ConvTranspose2d):
            nn.init.kaiming_normal_(m.weight, nonlinearity='relu')
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)



In [None]:
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import torch.nn as nn

# Train the first model for 9 epochs with an L1 loss on the saliency maps.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = ViTSaliencyModel(freeze_vit_layers=True).to(device)

criterion = nn.L1Loss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Cuts the learning rate by 10x after more than 2 epochs without improvement in
# the monitored value.
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2, verbose=True)

num_epochs = 9
for epoch in range(num_epochs):
    model.train() 
    running_loss = 0.0
    
    for images, targets in train_dataloader:
        images, targets = images.to(device), targets.to(device)
        optimizer.zero_grad() 
        outputs = model(images)
        loss = criterion(outputs, targets)
        loss.backward() 
        optimizer.step()
        running_loss += loss.item()
    
    # Mean per-batch training loss for this epoch.
    epoch_loss = running_loss / len(train_dataloader)
    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {epoch_loss:.4f}')
    
    # The scheduler monitors the TRAINING loss; no validation loss is computed here.
    scheduler.step(epoch_loss)

print('Finished Training')



In [None]:
# Saves the whole module object (not just its state_dict) to the working directory.
torch.save(model, 'model.pth')

In [None]:
import os
import torch
import cv2
import numpy as np
from torchvision import transforms
import easyocr
from PIL import Image

def create_directory(directory):
    """Create ``directory`` (and any missing parents); do nothing if it already exists."""
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(directory, exist_ok=True)


# Rebinds the module-level `reader` with a new French EasyOCR instance.
reader = easyocr.Reader(['fr'], gpu=True)

def detect_text_boxes_with_easyocr(image, area_threshold=6000):
    """
    Build a text mask for ``image`` from the module-level EasyOCR ``reader``.

    Only detections whose box area (product of the top edge length and the
    right edge length) is below ``area_threshold`` are painted into the mask.

    Returns:
    - text_mask: numpy array shaped like one image channel, 255 inside kept boxes.
    """
    pixels = np.array(image)
    detections = reader.readtext(pixels)
    mask = np.zeros_like(pixels[:, :, 0])

    for (corners, _text, _prob) in detections:
        left_x, left_y = corners[0]
        right_x, right_y = corners[1]
        low_x, low_y = corners[2]
        _x3, _y3 = corners[3]  # unpacked only to keep the four-corner contract

        top_edge = np.linalg.norm([right_x - left_x, right_y - left_y])
        side_edge = np.linalg.norm([low_x - right_x, low_y - right_y])

        if not (top_edge * side_edge < area_threshold):
            continue
        cv2.fillPoly(mask, [np.array(corners, dtype=np.int32)], 255)

    return mask

def add_text_channel(images):
    """Return ``images`` with a text-mask channel appended to every 3-channel image.

    Images that do not have exactly 3 channels (e.g. already 4-channel tensors)
    are passed through unchanged. The result is re-stacked into one batch tensor.
    """
    processed_images = []
    for img in images:
        if img.shape[0] == 3:
            # NOTE(review): ToPILImage is applied to the tensor as-is; if the
            # loader normalised it (this notebook's dataset maps to [-1, 1]),
            # OCR sees a distorted image. Confirm callers pass [0, 1] tensors.
            img_pil = transforms.ToPILImage()(img.cpu())
            text_mask = detect_text_boxes_with_easyocr(img_pil)
            text_mask_resized = cv2.resize(text_mask, (img_pil.size[0], img_pil.size[1]))
            # Build the mask in the image's own floating dtype. The previous
            # `uint8 / 255.0` produced float64, so the concatenated tensor no
            # longer matched the model's float32 weights.
            mask_dtype = img.dtype if img.is_floating_point() else torch.float32
            text_mask_tensor = (
                torch.from_numpy(text_mask_resized.astype(np.float32) / 255.0)
                .unsqueeze(0)
                .to(device=img.device, dtype=mask_dtype)
            )
            img_with_text_channel = torch.cat([img, text_mask_tensor], dim=0)
        else:
            img_with_text_channel = img
        
        processed_images.append(img_with_text_channel)
    
    return torch.stack(processed_images)

# Function to apply Gaussian blur to the saliency map
def apply_gaussian_blur(saliency_map, kernel_size=(5, 5)):
    """Blur ``saliency_map`` with an OpenCV Gaussian kernel of size ``kernel_size``."""
    sigma_x = 0  # passed through unchanged as cv2.GaussianBlur's third argument
    blurred = cv2.GaussianBlur(saliency_map, kernel_size, sigma_x)
    return blurred

# Function to process and save saliency maps using the validation DataLoader
def process_validation_data(val_dataloader, output_folder, filenames):
    """Run the global ``model`` over ``val_dataloader`` and save each map as an image.

    Args:
        val_dataloader: un-shuffled loader yielding ``(images, targets)`` batches.
        output_folder: directory for the predictions (created if missing).
        filenames: output file names, in the same order as the loader's samples.
    """
    create_directory(output_folder)

    # Inference must run in eval mode so BatchNorm uses its running statistics;
    # the previous mode is restored afterwards so later training is unaffected.
    was_training = model.training
    model.eval()
    sample_index = 0  # running position in `filenames`, correct for short last batches
    try:
        for images, _ in val_dataloader:
            images = images.to(device)
            images_with_text_channel = add_text_channel(images)

            with torch.no_grad():
                saliency_maps = model(images_with_text_channel)

            # Process each image in the batch
            for j in range(images.size(0)):
                saliency_map = saliency_maps[j].squeeze().cpu().numpy()
                # Normalize the saliency map between 0 and 1; a constant map
                # becomes all zeros instead of dividing by zero.
                value_range = saliency_map.max() - saliency_map.min()
                if value_range > 0:
                    saliency_map = (saliency_map - saliency_map.min()) / value_range
                else:
                    saliency_map = np.zeros_like(saliency_map)
                # Apply Gaussian blur to the saliency map
                saliency_map = apply_gaussian_blur(saliency_map)
                # Convert to uint8 for saving
                saliency_map = (saliency_map * 255).astype("uint8")

                filename = filenames[sample_index]
                sample_index += 1
                output_path = os.path.join(output_folder, filename)
                success = cv2.imwrite(output_path, saliency_map)
                if success:
                    print(f'Successfully saved {filename} with shape {saliency_map.shape} to {output_path}')
                else:
                    print(f'Failed to save image for {filename}')
    finally:
        model.train(was_training)


# MODE selects the output folder; it is hard-coded to "PRED" in this cell.
MODE = "PRED"

if MODE == "PRED":
    output_folder = "/kaggle/working/preds9"
else:
    output_folder = "/kaggle/working/preds/val"

# Output names follow `val_img_ids`, which is also the (un-shuffled) loader order.
filenames = [f"{name}.png" for name in val_img_ids]


process_validation_data(val_dataloader, output_folder, filenames)



In [None]:
import os
import numpy as np
from skimage import io
from sklearn.metrics import roc_auc_score
from scipy.stats import pearsonr, entropy
import cv2  
import matplotlib.pyplot as plt 

def load_images_from_folder(folder):
    """Read every ``.png`` / ``.jpg`` file in ``folder`` with ``io.imread(..., as_gray=True)``.

    Returns a dict mapping file name to the loaded array.
    """
    loaded = {}
    for name in os.listdir(folder):
        if not (name.endswith(".png") or name.endswith(".jpg")):  # Adjust based on your image format
            continue
        picture = io.imread(os.path.join(folder, name), as_gray=True)
        if picture is not None:
            loaded[name] = picture
    return loaded

def normalize_map(saliency_map):
    """Min-max scale ``saliency_map`` towards [0, 1].

    A 1e-10 term in the denominator keeps a constant map from dividing by zero
    (such a map comes back as all zeros).
    """
    lowest = np.min(saliency_map)
    spread = np.max(saliency_map) - lowest
    return (saliency_map - lowest) / (spread + 1e-10)

def apply_post_processing(saliency_map, method="", kernel_size=(5, 5)):
    """
    Post-process a saliency map according to ``method``.

    - "gaussian": Gaussian blur with ``kernel_size``.
    - "normalize": min-max normalisation via ``normalize_map``.
    - "threshold": binary threshold at 0.8 (values above become 1.0).
    - anything else: the map is returned untouched.
    """
    if method == "gaussian":
        return cv2.GaussianBlur(saliency_map, kernel_size, 0)
    if method == "normalize":
        return normalize_map(saliency_map)
    if method == "threshold":
        thresholded = cv2.threshold(saliency_map, 0.8, 1.0, cv2.THRESH_BINARY)
        return thresholded[1]
    return saliency_map

def calculate_auc(pred, gt):
    """Return the ROC AUC of ``pred`` against the binarised ground truth.

    Both maps are min-max normalised first, so the 0.5 cut-off means "above
    half of the ground truth's value range" whether the map was loaded as
    0-255 integers or 0-1 floats.
    """
    pred = normalize_map(pred).flatten()
    # Thresholding the raw map at 0.5 would mark every non-zero pixel of a
    # 0-255 image as positive.
    gt_binary = (normalize_map(gt) > 0.5).astype(int).flatten()  # Binarize ground truth
    return roc_auc_score(gt_binary, pred)

def calculate_nss(pred, gt):
    """Return the Normalized Scanpath Saliency of ``pred`` for ``gt``.

    The prediction is z-scored and averaged over the fixated pixels, i.e. those
    where the min-max normalised ground truth exceeds 0.5. Returns 0 when the
    prediction is constant or no pixel is fixated.
    """
    # Normalising first makes the 0.5 cut-off independent of whether the map
    # was loaded as 0-255 integers or 0-1 floats.
    fixated = normalize_map(gt) > 0.5  # Binarize fixation map
    if not fixated.any():
        return 0
    pred = normalize_map(pred)
    mean_pred = np.mean(pred)
    std_pred = np.std(pred)
    if std_pred == 0:
        return 0
    pred_norm = (pred - mean_pred) / (std_pred + 1e-10)
    # NSS averages over fixated locations only; averaging over every pixel
    # scales the score by the fixated fraction of the image.
    return np.mean(pred_norm[fixated])

def calculate_cc(pred, gt):
    """Return the Pearson correlation between the min-max normalised maps."""
    pred_flat = normalize_map(pred).flatten()
    gt_flat = normalize_map(gt).flatten()
    coefficient = pearsonr(pred_flat, gt_flat)[0]
    return coefficient

def calculate_kld(pred, gt):
    """Return ``entropy(gt, pred)`` on the flattened, min-max normalised maps.

    1e-10 is added to both maps so neither contains an exact zero.
    """
    tiny = 1e-10
    pred_flat = normalize_map(pred).flatten()
    gt_flat = normalize_map(gt).flatten()
    return entropy(gt_flat + tiny, pred_flat + tiny)

def check_nan_or_zero(img, img_type):
    """Print a warning if ``img`` contains NaNs and another if it is entirely zero.

    ``img_type`` is the label used at the start of each message.
    """
    has_nan = np.isnan(img).any()
    is_blank = np.all(img == 0)
    if has_nan:
        print(f"{img_type} contains NaN values")
    if is_blank:
        print(f"{img_type} is all zeros")

def visualize_prediction(pred, gt, image_idx, filename):
    """Plot a prediction (left) and its ground truth (right) in grayscale."""
    plt.figure(figsize=(10, 5))
    panels = (
        (1, "Prediction", pred),
        (2, "Ground Truth", gt),
    )
    for position, label, picture in panels:
        plt.subplot(1, 2, position)
        plt.title(f"{label} {image_idx}: {filename}")
        plt.imshow(picture, cmap='gray')
    plt.show()

def evaluate_model(predictions_folder, gt_folder, post_process_method=None):
    """Score saved predictions against ground-truth maps and return mean metrics.

    Only file names present in both folders are evaluated, in sorted order. Each
    prediction is resized to its ground truth's shape if needed, optionally
    post-processed, plotted, and scored with AUC, NSS, CC and KLD.

    Returns:
        dict with keys 'AUC', 'NSS', 'CC', 'KLD' holding the per-image means.
    """
    predictions = load_images_from_folder(predictions_folder)
    ground_truths = load_images_from_folder(gt_folder)

    shared_names = sorted(set(predictions.keys()) & set(ground_truths.keys()))

    per_image = {'AUC': [], 'NSS': [], 'CC': [], 'KLD': []}

    for i, filename in enumerate(shared_names):
        pred = predictions[filename]
        gt = ground_truths[filename]

        check_nan_or_zero(pred, "Prediction")
        check_nan_or_zero(gt, "Ground Truth")

        if pred.shape != gt.shape:
            print(f"Resizing prediction {i} from shape {pred.shape} to {gt.shape}")
            # cv2.resize takes (width, height), hence the swapped indices.
            target_size = (gt.shape[1], gt.shape[0])
            pred = cv2.resize(pred, target_size)

        if post_process_method:
            pred = apply_post_processing(pred, method=post_process_method)

        visualize_prediction(pred, gt, i, filename)

        auc = calculate_auc(pred, gt)
        nss = calculate_nss(pred, gt)
        cc = calculate_cc(pred, gt)
        kld = calculate_kld(pred, gt)

        print(f"Image {i} ({filename}): AUC={auc}, NSS={nss}, CC={cc}, KLD={kld}")

        for key, value in (('AUC', auc), ('NSS', nss), ('CC', cc), ('KLD', kld)):
            per_image[key].append(value)

    return {key: np.mean(values) for key, values in per_image.items()}


predictions_folder = "/kaggle/working/preds9"  # Path where your model predictions are saved
gt_folder = "/kaggle/input/saliency-data/maps/val"  # Path to your ground truth saliency maps

# Evaluate the first model's predictions with an extra Gaussian blur applied at load time.
metrics = evaluate_model(predictions_folder, gt_folder, post_process_method="gaussian")
print(f"Model Evaluation Metrics with Post-Processing:\nAUC: {metrics['AUC']}\nNSS: {metrics['NSS']}\nCC: {metrics['CC']}\nKLD: {metrics['KLD']}")


In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from scipy.ndimage import gaussian_filter


# Folders used by the qualitative comparison below (input, ground truth, prediction).
images_folder = "/kaggle/input/saliency-data/images/val"
gt_folder = "/kaggle/input/saliency-data/maps/val"
predictions_folder = "/kaggle/working/preds9"


# Hand-picked validation pages (file stems); rebinds the earlier `image_ids` list.
image_ids = [
    "Vaillant_0480_1954_07_25-01",
    "Vaillant_0608_1957_01_06-06",
    "Vaillant_0525_1955_06_05-16",
    "Vaillant_0471_1954_05_23-14",
    "Vaillant_0479_1954_07_18-01",
    "Vaillant_0553_1955_12_18-06",
    "Vaillant_0485_1954_08_29-01"
]


def apply_gaussian_smoothing(image, sigma=2):
    """Return ``image`` smoothed by scipy's ``gaussian_filter`` with the given ``sigma``."""
    smoothed = gaussian_filter(image, sigma=sigma)
    return smoothed


def show_comic_gt_pred_smooth(image_id, sigma=2):
    """Show a page, its ground truth, the raw prediction and a smoothed prediction.

    Args:
        image_id: file stem shared by the image, ground-truth and prediction files.
        sigma: Gaussian sigma used for the smoothed panel.
    """
    # Load the images
    image_path = os.path.join(images_folder, f"{image_id}.png")
    gt_path = os.path.join(gt_folder, f"{image_id}.png")
    pred_path = os.path.join(predictions_folder, f"{image_id}.png")

    image = Image.open(image_path).convert("RGB")
    gt = Image.open(gt_path).convert("L")
    pred = Image.open(pred_path).convert("L")

    pred_smoothed = apply_gaussian_smoothing(np.array(pred), sigma=sigma)

    fig, axs = plt.subplots(1, 4, figsize=(20, 5))
    panels = (
        (image, "Original Comic", {}),
        (gt, "Ground Truth", {"cmap": "gray"}),
        (pred, "Prediction", {"cmap": "gray"}),
        (pred_smoothed, f"Prediction (Smoothed, σ={sigma})", {"cmap": "gray"}),
    )
    for axis, (picture, heading, imshow_options) in zip(axs, panels):
        axis.imshow(picture, **imshow_options)
        axis.set_title(heading)
        axis.axis("off")

    plt.show()


# One four-panel comparison figure per selected page.
for image_id in image_ids:
    show_comic_gt_pred_smooth(image_id, sigma=2)



In [None]:
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import torch.nn as nn

# Second training run: same architecture, loss and optimiser settings as the
# first model, but trained for 12 epochs and stored in `model1`.
device = "cuda" if torch.cuda.is_available() else "cpu"
model1 = ViTSaliencyModel(freeze_vit_layers=True).to(device)

# Note: `criterion`, `optimizer` and `scheduler` are rebound here, replacing the
# objects created for the first model.
criterion = nn.L1Loss()
optimizer = optim.Adam(model1.parameters(), lr=0.001)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2, verbose=True)

num_epochs = 12
for epoch in range(num_epochs):
    model1.train()  
    running_loss = 0.0
    
    for images, targets in train_dataloader:
        images, targets = images.to(device), targets.to(device)
        optimizer.zero_grad()  
        outputs = model1(images)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step() 
        running_loss += loss.item()
    
    # Mean per-batch training loss for this epoch.
    epoch_loss = running_loss / len(train_dataloader)
    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {epoch_loss:.4f}')
    
    # The scheduler monitors the TRAINING loss; no validation loss is computed here.
    scheduler.step(epoch_loss)

print('Finished Training')

In [None]:
import os
import torch
import cv2
import numpy as np
from torchvision import transforms
import easyocr
from PIL import Image

def create_directory(directory):
    """Create ``directory`` (and any missing parents); do nothing if it already exists."""
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(directory, exist_ok=True)

# Rebinds the module-level `reader` with a new French EasyOCR instance.
reader = easyocr.Reader(['fr'], gpu=True)

def detect_text_boxes_with_easyocr(image, area_threshold=6000):
    """
    Build a text mask for ``image`` from the module-level EasyOCR ``reader``.

    Only detections whose box area (product of the top edge length and the
    right edge length) is below ``area_threshold`` are painted into the mask.

    Returns:
    - text_mask: numpy array shaped like one image channel, 255 inside kept boxes.
    """
    pixels = np.array(image)
    detections = reader.readtext(pixels)
    mask = np.zeros_like(pixels[:, :, 0])

    for (corners, _text, _prob) in detections:
        left_x, left_y = corners[0]
        right_x, right_y = corners[1]
        low_x, low_y = corners[2]
        _x3, _y3 = corners[3]  # unpacked only to keep the four-corner contract

        top_edge = np.linalg.norm([right_x - left_x, right_y - left_y])
        side_edge = np.linalg.norm([low_x - right_x, low_y - right_y])

        if not (top_edge * side_edge < area_threshold):
            continue
        cv2.fillPoly(mask, [np.array(corners, dtype=np.int32)], 255)

    return mask

def add_text_channel(images):
    """Return ``images`` with a text-mask channel appended to every 3-channel image.

    Images that do not have exactly 3 channels (e.g. already 4-channel tensors)
    are passed through unchanged. The result is re-stacked into one batch tensor.
    """
    processed_images = []
    for img in images:
        if img.shape[0] == 3:
            # NOTE(review): ToPILImage is applied to the tensor as-is; if the
            # loader normalised it (this notebook's dataset maps to [-1, 1]),
            # OCR sees a distorted image. Confirm callers pass [0, 1] tensors.
            img_pil = transforms.ToPILImage()(img.cpu())
            text_mask = detect_text_boxes_with_easyocr(img_pil)
            text_mask_resized = cv2.resize(text_mask, (img_pil.size[0], img_pil.size[1]))
            # Build the mask in the image's own floating dtype. The previous
            # `uint8 / 255.0` produced float64, so the concatenated tensor no
            # longer matched the model's float32 weights.
            mask_dtype = img.dtype if img.is_floating_point() else torch.float32
            text_mask_tensor = (
                torch.from_numpy(text_mask_resized.astype(np.float32) / 255.0)
                .unsqueeze(0)
                .to(device=img.device, dtype=mask_dtype)
            )
            img_with_text_channel = torch.cat([img, text_mask_tensor], dim=0)
        else:
            img_with_text_channel = img
        
        processed_images.append(img_with_text_channel)
    
    return torch.stack(processed_images)


def apply_gaussian_blur(saliency_map, kernel_size=(5, 5)):
    """Blur ``saliency_map`` with an OpenCV Gaussian kernel of size ``kernel_size``."""
    sigma_x = 0  # passed through unchanged as cv2.GaussianBlur's third argument
    blurred = cv2.GaussianBlur(saliency_map, kernel_size, sigma_x)
    return blurred


def process_validation_data(val_dataloader, output_folder, filenames):
    """Run the global ``model1`` over ``val_dataloader`` and save each map as an image.

    Args:
        val_dataloader: un-shuffled loader yielding ``(images, targets)`` batches.
        output_folder: directory for the predictions (created if missing).
        filenames: output file names, in the same order as the loader's samples.
    """
    create_directory(output_folder)

    # Inference must run in eval mode so BatchNorm uses its running statistics;
    # the previous mode is restored afterwards so later training is unaffected.
    was_training = model1.training
    model1.eval()
    sample_index = 0  # running position in `filenames`, correct for short last batches
    try:
        for images, _ in val_dataloader:
            images = images.to(device)
            images_with_text_channel = add_text_channel(images)

            with torch.no_grad():
                saliency_maps = model1(images_with_text_channel)

            for j in range(images.size(0)):
                saliency_map = saliency_maps[j].squeeze().cpu().numpy()

                # Normalize the saliency map between 0 and 1; a constant map
                # becomes all zeros instead of dividing by zero.
                value_range = saliency_map.max() - saliency_map.min()
                if value_range > 0:
                    saliency_map = (saliency_map - saliency_map.min()) / value_range
                else:
                    saliency_map = np.zeros_like(saliency_map)

                # Apply Gaussian blur to the saliency map
                saliency_map = apply_gaussian_blur(saliency_map)

                # Convert to uint8 for saving
                saliency_map = (saliency_map * 255).astype("uint8")

                # Use the original filename
                filename = filenames[sample_index]
                sample_index += 1
                output_path = os.path.join(output_folder, filename)

                success = cv2.imwrite(output_path, saliency_map)
                if success:
                    print(f'Successfully saved {filename} with shape {saliency_map.shape} to {output_path}')
                else:
                    print(f'Failed to save image for {filename}')
    finally:
        model1.train(was_training)

# MODE selects the output folder; it is hard-coded to "PRED" in this cell.
MODE = "PRED"

if MODE == "PRED":
    output_folder = "/kaggle/working/preds8"
else:
    output_folder = "/kaggle/working/preds/val"

# Output names follow `val_img_ids`, which is also the (un-shuffled) loader order.
filenames = [f"{name}.png" for name in val_img_ids]

# Process the validation DataLoader
process_validation_data(val_dataloader, output_folder, filenames)


In [None]:
predictions_folder = "/kaggle/working/preds8"  
gt_folder = "/kaggle/input/saliency-data/maps/val" 

#threshold, normalize, gaussian
# An empty method string means the predictions are scored exactly as saved.
metrics = evaluate_model(predictions_folder, gt_folder, post_process_method="")
print(f"Model Evaluation Metrics with Post-Processing:\nAUC: {metrics['AUC']}\nNSS: {metrics['NSS']}\nCC: {metrics['CC']}\nKLD: {metrics['KLD']}")


In [None]:
import os
import numpy as np
import cv2

def create_directory(directory):
    """Create ``directory`` (and any missing parents); do nothing if it already exists."""
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(directory, exist_ok=True)

def load_images_from_folder(folder):
    """Load every ``.png`` / ``.jpg`` in ``folder`` as a grayscale OpenCV image.

    Files that ``cv2.imread`` cannot read (it returns None) are skipped.
    Returns a dict mapping file name to image array.
    """
    loaded = {}
    for name in os.listdir(folder):
        if not (name.endswith(".png") or name.endswith(".jpg")):  # Adjust for your image format
            continue
        picture = cv2.imread(os.path.join(folder, name), cv2.IMREAD_GRAYSCALE)
        if picture is None:
            continue
        loaded[name] = picture
    return loaded

def ensemble_predictions(folder1, folder2, output_folder, method="average"):
    """Combine same-named predictions from two folders and save the result.

    Args:
        folder1: first predictions folder (its image sizes are the reference).
        folder2: second predictions folder; images are resized to match folder1.
        output_folder: destination folder (created if missing).
        method: "average" for the pixel-wise mean, "max" for the pixel-wise maximum.

    Raises:
        ValueError: if ``method`` is not "average" or "max".
    """
    # Validate up front so a bad method fails before any file is read or written.
    if method not in ("average", "max"):
        raise ValueError(f"Unknown ensemble method: {method}")

    preds1 = load_images_from_folder(folder1)
    preds2 = load_images_from_folder(folder2)

    create_directory(output_folder)

    # Sorted for a reproducible processing / logging order.
    common_filenames = sorted(set(preds1.keys()).intersection(set(preds2.keys())))

    for filename in common_filenames:
        pred1 = preds1[filename]
        pred2 = preds2[filename]

        if pred1.shape != pred2.shape:
            print(f"Resizing {filename} predictions to match dimensions.")
            # cv2.resize takes (width, height), hence the swapped indices.
            pred2 = cv2.resize(pred2, (pred1.shape[1], pred1.shape[0]))

        if method == "average":
            ensemble_pred = (pred1.astype(np.float32) + pred2.astype(np.float32)) / 2
        else:
            ensemble_pred = np.maximum(pred1, pred2).astype(np.float32)

        # Rescale to 0-255; a constant map becomes all zeros instead of dividing by zero.
        lowest = ensemble_pred.min()
        value_range = ensemble_pred.max() - lowest
        if value_range > 0:
            ensemble_pred = (ensemble_pred - lowest) / value_range * 255
        else:
            ensemble_pred = np.zeros_like(ensemble_pred)
        ensemble_pred = ensemble_pred.astype(np.uint8)

        output_path = os.path.join(output_folder, filename)
        success = cv2.imwrite(output_path, ensemble_pred)
        if success:
            print(f'Successfully saved ensembled prediction: {filename} to {output_path}')
        else:
            print(f'Failed to save ensembled prediction for {filename}')


# Ensemble the two models' predictions (preds9 from `model`, preds8 from `model1`).
folder_path_1 = "/kaggle/working/preds9"
folder_path_2 = "/kaggle/working/preds8"
output_folder = "/kaggle/working/preds10"

#average,max
ensemble_predictions(folder_path_1, folder_path_2, output_folder, method="average")


In [None]:
"""


"""
import zipfile
import os

def compress_images_to_zip(folder_path, output_zip):
    """Write every file under ``folder_path`` into a deflate-compressed zip archive.

    Args:
        folder_path: root folder to archive; entries are stored relative to it.
        output_zip: path of the zip file to create (overwritten if present).
    """
    output_abs = os.path.abspath(output_zip)
    # ZIP_DEFLATED actually compresses; the default ZIP_STORED only bundles files.
    with zipfile.ZipFile(output_zip, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
        # Iterate over all the files in the directory
        for root, dirs, files in os.walk(folder_path):
            dirs.sort()  # in-place sort gives os.walk a deterministic traversal order
            for file in sorted(files):
                # Create the full file path
                file_path = os.path.join(root, file)
                # Never add the archive to itself when it lives inside folder_path.
                if os.path.abspath(file_path) == output_abs:
                    continue
                # Add file to the zip file
                zipf.write(file_path, os.path.relpath(file_path, folder_path))
    print(f'All images have been compressed into {output_zip}')


# Package the ensembled predictions into a single archive.
folder_path = "/kaggle/working/preds10"  # Folder containing the images to compress
output_zip = '/kaggle/working/saliency_maps20.zip'  # Name of the output zip file

# Compress the images
compress_images_to_zip(folder_path, output_zip)

