In [3]:
import pandas as pd

# ✅ Load CSV
df = pd.read_csv("/kaggle/input/datasett/test (1).csv")

# ✅ Ensure correct replacement of "/content" with "/kaggle/input/imagessss"
df["Image_path"] = df["Image_path"].str.replace("/content", "/kaggle/input/imagessss", regex=False)

# ✅ Also remove any unintended "/kaggle/working/" prefix
df["Image_path"] = df["Image_path"].str.replace("/kaggle/working/", "/", regex=False)

# ✅ Save the updated CSV
df.to_csv("valid_updated.csv", index=False)

# ✅ Check if the replacement worked
print(df["Image_path"].head())
print("✅ Image paths updated successfully!")


0    /kaggle/input/imagessss/Multimodal_images/skin...
1    /kaggle/input/imagessss/Multimodal_images/skin...
2    /kaggle/input/imagessss/Multimodal_images/skin...
3    /kaggle/input/imagessss/Multimodal_images/swol...
4    /kaggle/input/imagessss/Multimodal_images/skin...
Name: Image_path, dtype: object
✅ Image paths updated successfully!


In [4]:
import pandas as pd

# ✅ Load CSV
df = pd.read_csv("/kaggle/input/datasett/val (1).csv")

# ✅ Ensure correct replacement of "/content" with "/kaggle/input/imagessss"
df["Image_path"] = df["Image_path"].str.replace("/content", "/kaggle/input/imagessss", regex=False)

# ✅ Also remove any unintended "/kaggle/working/" prefix
df["Image_path"] = df["Image_path"].str.replace("/kaggle/working/", "/", regex=False)

# ✅ Save the updated CSV
df.to_csv("test_updated.csv", index=False)

# ✅ Check if the replacement worked
print(df["Image_path"].head())
print("✅ Image paths updated successfully!")


0    /kaggle/input/imagessss/Multimodal_images/skin...
1    /kaggle/input/imagessss/Multimodal_images/skin...
2    /kaggle/input/imagessss/Multimodal_images/swol...
3    /kaggle/input/imagessss/Multimodal_images/skin...
4    /kaggle/input/imagessss/Multimodal_images/skin...
Name: Image_path, dtype: object
✅ Image paths updated successfully!


In [5]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [6]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=7bea30252c2dfa5a82f95ad7523838ce70638ec9ad98f01f9b066876fae8bc3c
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [7]:
!pip install textstat


Collecting textstat
  Downloading textstat-0.7.5-py3-none-any.whl.metadata (15 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.17.2-py3-none-any.whl.metadata (3.2 kB)
Collecting cmudict (from textstat)
  Downloading cmudict-1.0.32-py3-none-any.whl.metadata (3.6 kB)
Downloading textstat-0.7.5-py3-none-any.whl (105 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.3/105.3 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cmudict-1.0.32-py3-none-any.whl (939 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m939.4/939.4 kB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hDownloading pyphen-0.17.2-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m54.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: pyphen, cmudict, textstat
Successfully installed cmudict-1.0.32 pyphen-0.17.2 textstat-0.7.5


In [None]:
# --------------------------
# Section 0: Imports
# --------------------------
import re
import os
import json
import pandas as pd
from PIL import Image
from sklearn.model_selection import train_test_split  # (if needed for further splits)

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler

import torchvision.transforms as transforms
import torchvision.models as models

from transformers import BartTokenizer, BartForConditionalGeneration, get_linear_schedule_with_warmup

import evaluate
from textstat import flesch_reading_ease  # For readability scores

# --------------------------
# Section 1: Data Preprocessing & Transforms
# --------------------------
def enhanced_clean_text(text):
    """
    Clean codemixed text by lowercasing, fixing common quirks, and removing extra spaces.
    """
    text = text.lower()
    text = re.sub(r'sabse normal', 'most normal', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Define image transformations for training and validation/test
train_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

val_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

# --------------------------
# Section 2: Dataset Definition
# --------------------------
class MultimodalDataset(Dataset):
    def __init__(self, data, tokenizer, transform):
        """
        data: pandas DataFrame with columns: 'Codemixed_Question', 'Image_path', 'summary'
        tokenizer: BartTokenizer.
        transform: Image transformation function.
        """
        self.data = data.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        # Clean text and retrieve image path & summary
        text = enhanced_clean_text(row['Codemixed_Question'])
        image_path = row['Image_path']
        summary = row['summary']

        # Tokenize the input text
        input_encoding = self.tokenizer(
            text,
            max_length=512,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        input_ids = input_encoding.input_ids.squeeze(0)
        attention_mask = input_encoding.attention_mask.squeeze(0)

        # Tokenize the summary (target)
        target_encoding = self.tokenizer(
            summary,
            max_length=150,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        labels = target_encoding.input_ids.squeeze(0)
        labels[labels == self.tokenizer.pad_token_id] = -100  # ignore padding in loss

        # Load and transform image
        image = Image.open(image_path).convert('RGB')
        if self.transform:
            image = self.transform(image)

        return {
            'input_ids': input_ids,         
            'attention_mask': attention_mask,  
            'image': image,                 
            'labels': labels                
        }

# --------------------------
# Section 3: Data Loading
# --------------------------
# Assuming CSV files are stored in the working directory.
train_csv_path = "/kaggle/working/train_updated.csv"
valid_csv_path = "/kaggle/working/valid_updated.csv"
test_csv_path  = "/kaggle/working/test_updated.csv"

train_df = pd.read_csv(train_csv_path)
valid_df = pd.read_csv(valid_csv_path)
test_df  = pd.read_csv(test_csv_path)

print(f"Train set: {len(train_df)} samples")
print(f"Validation set: {len(valid_df)} samples")
print(f"Test set: {len(test_df)} samples")

# Initialize tokenizer (using DistilBART weights)
tokenizer = BartTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")

# Create dataset instances using respective transforms
train_dataset = MultimodalDataset(train_df, tokenizer, train_transform)
valid_dataset = MultimodalDataset(valid_df, tokenizer, val_transform)
test_dataset  = MultimodalDataset(test_df, tokenizer, val_transform)

# DataLoaders
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=2)
valid_dataloader = DataLoader(valid_dataset, batch_size=8, shuffle=False, num_workers=2)
test_dataloader  = DataLoader(test_dataset, batch_size=8, shuffle=False, num_workers=2)

# --------------------------
# Section 4: Model Definition - MultimodalDistilBart
# --------------------------
class MultimodalDistilBart(nn.Module):
    def __init__(self, tokenizer):
        super(MultimodalDistilBart, self).__init__()
        # Initialize Vision Transformer (ViT)
        self.vit = models.vit_b_16(pretrained=True)
        # Remove classification head to get raw features
        self.vit.heads = nn.Identity()
        # Freeze ViT parameters initially
        for param in self.vit.parameters():
            param.requires_grad = False

        # Load DistilBART weights for summarization
        self.distilbart = BartForConditionalGeneration.from_pretrained("sshleifer/distilbart-cnn-12-6")
        self.distilbart.resize_token_embeddings(len(tokenizer))
        
        # Projection layer: map ViT features (768) to DistilBART embedding space (1024)
        self.projection = nn.Sequential(
            nn.Linear(768, 1024),
            nn.ReLU(),
            nn.Dropout(0.1)
        )
        
    def forward(self, input_ids, attention_mask, image, labels=None):
        # Get text embeddings from DistilBART's shared layer.
        text_embeddings = self.distilbart.model.shared(input_ids)  # [B, seq_len, 1024]
        
        # Extract image features with ViT (use no_grad for frozen parameters)
        with torch.no_grad():
            image_features = self.vit(image)  # [B, 768]
        projected_features = self.projection(image_features)  # [B, 1024]
        projected_features = projected_features.unsqueeze(1)  # [B, 1, 1024]
        
        # Concatenate the image token with text embeddings
        combined_embeddings = torch.cat([projected_features, text_embeddings], dim=1)  # [B, seq_len+1, 1024]
        
        # Adjust attention mask to account for extra image token
        batch_size = attention_mask.size(0)
        img_mask = torch.ones(batch_size, 1, dtype=attention_mask.dtype, device=attention_mask.device)
        combined_attention_mask = torch.cat([img_mask, attention_mask], dim=1)
        
        # Forward pass through DistilBART
        outputs = self.distilbart(
            inputs_embeds=combined_embeddings,
            attention_mask=combined_attention_mask,
            labels=labels
        )
        return outputs

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = MultimodalDistilBart(tokenizer).to(device)

# --------------------------
# Section 5: Training Loop with Warmup & Differential Learning Rates
# --------------------------
optimizer = torch.optim.AdamW([
    {'params': model.distilbart.parameters(), 'lr': 5e-5},
    {'params': model.projection.parameters(), 'lr': 5e-5},
    {'params': model.vit.parameters(), 'lr': 1e-5},
])

num_epochs = 10
num_training_steps = len(train_dataloader) * num_epochs
num_warmup_steps = int(0.1 * num_training_steps)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps,
                                              num_training_steps=num_training_steps)

scaler = GradScaler()
accumulation_steps = 4  # Effective batch size = batch_size * accumulation_steps

import matplotlib.pyplot as plt

# Initialize lists to record epoch losses
train_loss_history = []
valid_loss_history = []

best_val_loss = float('inf')
patience = 2
no_improve = 0

for epoch in range(num_epochs):
    # Unfreeze ViT parameters after 3 epochs for joint fine-tuning
    if epoch == 3:
        for param in model.vit.parameters():
            param.requires_grad = True
        print(f"Epoch {epoch+1}: Unfreezing ViT parameters for fine-tuning.")
    
    model.train()
    total_loss = 0
    optimizer.zero_grad()
    
    for step, batch in enumerate(train_dataloader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        image = batch['image'].to(device)
        labels = batch['labels'].to(device)
        
        with autocast():
            outputs = model(input_ids, attention_mask, image, labels)
            loss = outputs.loss / accumulation_steps
        
        scaler.scale(loss).backward()
        
        if (step + 1) % accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()
            optimizer.zero_grad()
        
        total_loss += loss.item() * accumulation_steps
        if (step + 1) % 10 == 0:
            print(f"Epoch {epoch+1}, Step {step+1}, Batch Loss: {loss.item() * accumulation_steps:.4f}")
    
    avg_train_loss = total_loss / len(train_dataloader)
    train_loss_history.append(avg_train_loss)
    print(f"Epoch {epoch+1} Training Loss: {avg_train_loss:.4f}")
    
    # Validation loop
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for batch in valid_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            image = batch['image'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids, attention_mask, image, labels)
            total_val_loss += outputs.loss.item()
    
    avg_val_loss = total_val_loss / len(valid_dataloader)
    valid_loss_history.append(avg_val_loss)
    print(f"Epoch {epoch+1} Validation Loss: {avg_val_loss:.4f}")
    
    
torch.save(model.state_dict(), "/kaggle/working/best_multimodal_distilbart.pth")
print("Model improved; checkpoint saved.")
    

# --------------------------
# Plot Training and Validation Loss
# --------------------------
epochs = range(1, len(train_loss_history) + 1)
plt.figure(figsize=(10, 6))
plt.plot(epochs, train_loss_history, marker='o', label='Training Loss')
plt.plot(epochs, valid_loss_history, marker='o', label='Validation Loss')
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.title("Training vs. Validation Loss")
plt.legend()
plt.savefig("/kaggle/working/loss_plot.png")
plt.show()

# --------------------------
# Section 6: Inference Routine
# --------------------------



In [10]:
!pip install bert_score
def generate_summary(model, tokenizer, codemixed_question, image_path, device, transform):
    """
    Generate a summary for a given codemixed question and image.
    """
    cleaned_text = enhanced_clean_text(codemixed_question)
    input_encoding = tokenizer(
        cleaned_text,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    input_ids = input_encoding.input_ids.to(device)
    attention_mask = input_encoding.attention_mask.to(device)
    
    # Process image
    image = Image.open(image_path).convert('RGB')
    image = transform(image).unsqueeze(0).to(device)
    
    text_embeddings = model.distilbart.model.shared(input_ids)
    
    with torch.no_grad():
        image_features = model.vit(image)
        projected_features = model.projection(image_features)
    projected_features = projected_features.unsqueeze(1)
    
    combined_embeddings = torch.cat([projected_features, text_embeddings], dim=1)
    batch_size = attention_mask.size(0)
    img_mask = torch.ones(batch_size, 1, dtype=attention_mask.dtype, device=attention_mask.device)
    combined_attention_mask = torch.cat([img_mask, attention_mask], dim=1)
    
    outputs = model.distilbart.generate(
        inputs_embeds=combined_embeddings,
        attention_mask=combined_attention_mask,
        max_length=150,
        num_beams=6,
        temperature=0.7,
        early_stopping=True
    )
    
    generated_summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_summary

# --------------------------
# Section 7: Advanced Evaluation Routine
# --------------------------
def compute_all_bleu(predictions, references):
    """
    Compute BLEU-1 through BLEU-4 scores.
    """
    bleu = evaluate.load('bleu')
    bleu_scores = {}
    for max_order in range(1, 5):
        result = bleu.compute(predictions=predictions, references=[[ref] for ref in references], max_order=max_order)
        bleu_scores[f"BLEU-{max_order}"] = result['bleu']
    return bleu_scores

def compute_readability(text):
    """
    Compute Flesch Reading Ease score.
    """
    try:
        score = flesch_reading_ease(text)
    except Exception as e:
        score = None
    return score

def evaluate_model(model, tokenizer, dataloader, device, transform):
    model.eval()
    rouge = evaluate.load('rouge')
    bertscore = evaluate.load('bertscore')
    
    predictions = []
    references = []
    readability_pred = []
    readability_ref = []
    
    with torch.no_grad():
        for batch in dataloader:
            # Generate predictions batch-wise
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            images = batch['image'].to(device)
            labels = batch['labels']
            
            # Decode reference summaries
            for label in labels:
                label_tokens = label[label != -100]  # Remove ignore tokens
                ref_text = tokenizer.decode(label_tokens, skip_special_tokens=True)
                references.append(ref_text)
                readability_ref.append(compute_readability(ref_text))
            
            # Get text embeddings
            text_embeddings = model.distilbart.model.shared(input_ids)
            image_features = model.vit(images)
            projected_features = model.projection(image_features).unsqueeze(1)
            combined_embeddings = torch.cat([projected_features, text_embeddings], dim=1)
            
            batch_size = attention_mask.size(0)
            img_mask = torch.ones(batch_size, 1, dtype=attention_mask.dtype, device=attention_mask.device)
            combined_attention_mask = torch.cat([img_mask, attention_mask], dim=1)
            
            outputs = model.distilbart.generate(
                inputs_embeds=combined_embeddings,
                attention_mask=combined_attention_mask,
                max_length=150,
                num_beams=6,
                temperature=0.7,
                early_stopping=True
            )
            
            for output in outputs:
                pred_text = tokenizer.decode(output, skip_special_tokens=True)
                predictions.append(pred_text)
                readability_pred.append(compute_readability(pred_text))
    
    # Compute metrics
    rouge_scores = rouge.compute(predictions=predictions, references=references, use_stemmer=True)
    bleu_scores = compute_all_bleu(predictions, references)
    bertscore_result = bertscore.compute(predictions=predictions, references=references, lang="en")
    avg_bertscore_f1 = sum(bertscore_result['f1']) / len(bertscore_result['f1'])
    
    avg_readability_pred = sum(r for r in readability_pred if r is not None) / len(readability_pred)
    avg_readability_ref  = sum(r for r in readability_ref if r is not None) / len(readability_ref)
    
    print("\n===== Evaluation Results =====")
    print("ROUGE:", rouge_scores)
    for k, v in bleu_scores.items():
        print(f"{k}: {v:.4f}")
    print(f"BERTScore (F1): {avg_bertscore_f1:.4f}")
    print(f"Average Readability (Prediction): {avg_readability_pred:.2f}")
    print(f"Average Readability (Reference): {avg_readability_ref:.2f}")
    
    results = {
        "ROUGE": rouge_scores,
        **bleu_scores,
        "BERTScore_F1": avg_bertscore_f1,
        "Avg_Readability_Pred": avg_readability_pred,
        "Avg_Readability_Ref": avg_readability_ref
    }
    return results

# Example: Evaluate on Test Dataset
eval_results = evaluate_model(model, tokenizer, test_dataloader, device, val_transform)

# Optionally, save the evaluation results to a JSON file
with open("/kaggle/working/evaluation_metrics.json", "w") as f:
    json.dump(eval_results, f, indent=4)

print("Evaluation complete. Metrics saved to evaluation_metrics.json")

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: bert_score
Successfully installed bert_score-0.3.13




Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



===== Evaluation Results =====
ROUGE: {'rouge1': 0.6009162280259486, 'rouge2': 0.3827452012001286, 'rougeL': 0.5169120825192468, 'rougeLsum': 0.5168313539047731}
BLEU-1: 0.5753
BLEU-2: 0.4665
BLEU-3: 0.4038
BLEU-4: 0.3620
BERTScore (F1): 0.9192
Average Readability (Prediction): 69.85
Average Readability (Reference): 66.39
Evaluation complete. Metrics saved to evaluation_metrics.json


In [40]:

separator = "=" * 80

print(separator)
print("Generating summaries for the first few test samples:\n")

for i in range(70,78):
    row = test_df.iloc[i]
    codemixed_question = row['Codemixed_Question']
    image_path = row['Image_path']
    reference_summary = row['summary']
    
    generated_summary = generate_summary(model, tokenizer, codemixed_question, image_path, device, val_transform)
    
    print(f"Sample {i+1}")
    print("-" * 80)
    print("Codemixed Question:")
    print(codemixed_question)
    print("\nReference Summary:")
    print(reference_summary)
    print("\nGenerated Summary:")
    print(generated_summary)
    print(separator + "\n")


Generating summaries for the first few test samples:

Sample 71
--------------------------------------------------------------------------------
Codemixed Question:
Mere dental surgeon ne Thursday ko ek dental procedure conduct kiya tha. Humne socha ki wahan koi tumor hai, lekin wo use remove kar diya. Mere face par 9 stitches the. Procedure ke dauran mujhe IV antibiotics diye gaye aur ghar par Amoxicillin 500mg aur Hydrocodone 10/325 di gayi hai. Mai antibiotics ko bilkul sahi tarah le raha hoon. Dard bahut jyada ho raha hai, mai apne chehre ke us side ko bina yelping ke naahi chhu sakta. Kripya mere mouth mein kuch hua hai wo dekhiye. Surgical site yellowish dikh rahi hai lekin mujhe fever nahi hai. Dard ki dawa sirf kuch minutes ke liye hi relief de rahi hai. Kya ye infected ho raha hai?

Reference Summary:
Is the patient's excruciating pain and swelling, along with a yellowish surgical site, indicative of an infection? The image here shows the condition of mouth ulcers

Generated S