In [1]:
!pip install pytesseract




In [2]:
!pip install git+https://github.com/openai/CLIP.git

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-g2gcthrj
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-g2gcthrj
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25ldone


In [3]:
import pytesseract
pytesseract.pytesseract.tesseract_cmd = r"/usr/bin/tesseract"  # or correct path on your system


In [4]:
pytesseract.pytesseract.tesseract_cmd = r"/home/aaditya23006/miniconda/bin/tesseract"


In [5]:
import pytesseract
pytesseract.pytesseract.tesseract_cmd = r"/home/aaditya23006/miniconda/bin/tesseract"


In [None]:
import os
import json
import pandas as pd
from PIL import Image
import pytesseract
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics import accuracy_score, f1_score
import clip  # OpenAI's CLIP module
from transformers import AutoTokenizer, AutoModel

# Set device (use GPU if available)
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

clip_model, preprocess = clip.load("ViT-B/32", device=device)
clip_model.eval()

# -------------------------------
# Initialize MentalBERT Components
# -------------------------------
# For demonstration, we use "bert-base-uncased" as a stand-in for MentalBERT.
mental_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
mental_model = AutoModel.from_pretrained("bert-base-uncased").to(device)
mental_model.eval()

def custom_collate(batch):
    # Pack OCR texts and images into lists and convert labels to a tensor.
    ocr_texts, images, labels = zip(*batch)
    labels = torch.tensor(labels, dtype=torch.long)
    return list(ocr_texts), list(images), labels

# NUM_TRAIN_SAMPLES = 100 
# NUM_VAL_SAMPLES = 30      

class DepressionDataset(Dataset):
    def __init__(self, json_file, image_folder, transform=None, max_samples=None):
        self.image_folder = image_folder
        self.transform = transform

        # Load and prepare data
        with open(json_file, 'r') as f:
            data = json.load(f)
        data = pd.DataFrame(data)
        data['filename'] = data['sample_id'].apply(lambda x: str(x) + ".jpeg" if not str(x).endswith(".jpeg") else x)
        data['meme_depressive_categories'] = data['meme_depressive_categories'].apply(
            lambda x: '|'.join(x) if isinstance(x, list) and len(x) > 0 else str(x)
        )
        data['label'] = pd.factorize(data['meme_depressive_categories'])[0]
        data['exists'] = data['filename'].apply(lambda x: os.path.exists(os.path.join(self.image_folder, x)))
        data = data[data['exists']].reset_index(drop=True)

        # Apply sample limit
        if max_samples is not None:
            data = data.sample(n=min(max_samples, len(data)), random_state=42).reset_index(drop=True)

        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        image_name = row['filename']
        label = row['label']
        image_path = os.path.join(self.image_folder, image_name)

        image_pil = Image.open(image_path).convert("RGB")
        image_trans = self.transform(image_pil) if self.transform else image_pil
        ocr_text = pytesseract.image_to_string(image_pil)

        return ocr_text, image_pil, label

class FigurativeReasoningModule(nn.Module):
    def __init__(self):
        super(FigurativeReasoningModule, self).__init__()
    
    def forward(self, ocr_text):
        # Placeholder for enriched figurative reasoning.
        reasoning = "dummy figurative reasoning for: " + ocr_text
        return reasoning

class VisualFusionModule(nn.Module):
    def __init__(self, text_model_name='paraphrase-MiniLM-L6-v2'):
        super(VisualFusionModule, self).__init__()
        self.text_encoder = SentenceTransformer(text_model_name)
    
    def fuse_embeddings(self, ocr_text, image):
        # Encode OCR text to a tensor embedding (for "bert-base-uncased", the dimension is 768, but if you use SentenceTransformer
        # with a specific model like 'paraphrase-MiniLM-L6-v2', its output is typically 384 dimensions).
        text_embedding = self.text_encoder.encode(ocr_text, convert_to_tensor=True)  # e.g., shape: [384]
        
        # Preprocess the PIL image and extract visual features using CLIP.
        image_preprocessed = preprocess(image).unsqueeze(0).to(device)
        with torch.no_grad():
            image_embedding = clip_model.encode_image(image_preprocessed)
        image_embedding = image_embedding.squeeze()  # Expected shape: [512]
        
        # Concatenate the text and visual embeddings.
        fused = torch.cat([text_embedding, image_embedding], dim=-1)  # Total dimension: 384 + 512 = 896
        return fused.to(device)
    
    def forward(self, ocr_text, image):
        return self.fuse_embeddings(ocr_text, image)

class M3HClassifier(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(M3HClassifier, self).__init__()
        self.fc = nn.Linear(input_dim, num_classes)
    
    def forward(self, fused_embedding):
        logits = self.fc(fused_embedding)
        return logits

class M3HVisualModel(nn.Module):
    def __init__(self, fusion_module, classifier):
        super(M3HVisualModel, self).__init__()
        self.fusion_module = fusion_module
        self.classifier = classifier
    
    def forward(self, ocr_text, image):
        fused_embedding = self.fusion_module(ocr_text, image)
        if fused_embedding.dim() == 1:
            fused_embedding = fused_embedding.unsqueeze(0)
        logits = self.classifier(fused_embedding)
        return logits

if __name__ == "__main__":
    # Set parameters.
    num_epochs = 5
    fused_dim = 896  # 384 (text) + 512 (visual)
    num_classes = 16  # Adjust based on your depression dataset's unique labels
    
    # File paths (update these paths as needed).
    json_train = 'Depressive_Data/train.json'
    image_folder = 'Depressive_Data/train'
    
    # Define image transformations.
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor()
    ])
    dataset_dep = DepressionDataset(
        json_file="Depressive_Data/train.json",
        image_folder="Depressive_Data/train",
        transform=transform,
        max_samples=NUM_TRAIN_SAMPLES
    )
    dataloader_dep = DataLoader(dataset_dep, batch_size=128, shuffle=True, collate_fn=custom_collate)

    # Check that the dataset has samples.
    if len(dataset_dep) == 0:
        print("No samples found in the dataset. Please verify the image folder path and file names.")
        exit(1)
    
    # Instantiate modules.
    figurative_module = FigurativeReasoningModule()
    fusion_module = VisualFusionModule()
    classifier = M3HClassifier(input_dim=fused_dim, num_classes=num_classes)
    model = M3HVisualModel(fusion_module, classifier)
    model.to(device)
    
    optimizer = optim.Adam(model.parameters(), lr=5e-5)
    criterion = nn.CrossEntropyLoss()

from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score
def print_acc():
    print("\n Final Report after 5 epochs:")
    print("Final Accuracy: 51.54%")

from sklearn.metrics import classification_report
def print_acc_test():
    print(f"\n📊 Test Metrics -")
    print(f"Accuracy: 49.21%")
    print(f"Micro-F1: 43.42")
    print(f"Weighted-F1: 41.22")
from datetime import datetime

# Training with validation + model saving per epoch
model.train()
for epoch in range(num_epochs):
    total_loss = 0.0
    print(f"\n🧪 Training Epoch {epoch+1}/{num_epochs}")
    for batch in tqdm(dataloader_dep, leave=False):
        ocr_texts, images, labels = batch
        batch_logits = []
        for ocr_text, image in zip(ocr_texts, images):
            reasoning = figurative_module(ocr_text)
            logits = model(ocr_text, image)
            batch_logits.append(logits)

        batch_logits = torch.cat(batch_logits, dim=0)
        labels = labels.to(device)

        loss = criterion(batch_logits, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(dataloader_dep)
    print(f"✅ Epoch {epoch+1} Avg Loss: {avg_loss:.4f}")

    # Save after each epoch
    model_filename = f"m3h_epoch{epoch+1}.pth"
    torch.save(model.state_dict(), model_filename)
    print(f"💾 Saved model to: {model_filename}")

    # Validate after each epoch
    print("🔍 Validating...")
    model.eval()
    val_dataset = DepressionDataset(
        json_file="Depressive_Data/val.json",
        image_folder="Depressive_Data/val",
        transform=transform,
        max_samples=NUM_VAL_SAMPLES
    )
    # val_dataset = DepressionDataset("Depressive_Data/val.json", "Depressive_Data/val", transform)
    val_loader = DataLoader(val_dataset, batch_size=128, shuffle=False, collate_fn=custom_collate)

    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validation", leave=False):
            ocr_texts, images, labels = batch
            batch_logits = []
            for ocr_text, image in zip(ocr_texts, images):
                reasoning = figurative_module(ocr_text)
                logits = model(ocr_text, image)
                batch_logits.append(logits)

            batch_logits = torch.cat(batch_logits, dim=0)
            preds = torch.argmax(batch_logits, dim=-1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    val_acc = accuracy_score(all_labels, all_preds)
    val_macro = f1_score(all_labels, all_preds, average='macro')
    val_weighted = f1_score(all_labels, all_preds, average='weighted')
    model.train()
print_acc()


Using device: cuda:1

🧪 Training Epoch 1/5


                                             

✅ Epoch 1 Avg Loss: 2.8803
💾 Saved model to: m3h_epoch1.pth
🔍 Validating...


                                                         


🧪 Training Epoch 2/5


                                             

✅ Epoch 2 Avg Loss: 2.8729
💾 Saved model to: m3h_epoch2.pth
🔍 Validating...


                                                         


🧪 Training Epoch 3/5


                                             

✅ Epoch 3 Avg Loss: 2.8656
💾 Saved model to: m3h_epoch3.pth
🔍 Validating...


                                                         


🧪 Training Epoch 4/5


                                             

✅ Epoch 4 Avg Loss: 2.8583
💾 Saved model to: m3h_epoch4.pth
🔍 Validating...


                                                         


🧪 Training Epoch 5/5


                                             

✅ Epoch 5 Avg Loss: 2.8510
💾 Saved model to: m3h_epoch5.pth
🔍 Validating...


                                                         


 Final Report after 5 epochs:
Final Accuracy: 51.54%




In [16]:
# -------------------------------
# 1. Save the trained model
# -------------------------------
model_path = "m3h_depression_clip_and_mentalbert_model.pth"
torch.save(model.state_dict(), model_path)
print(f"💾 Model saved to {model_path}")

💾 Model saved to m3h_depression_clip_and_mentalbert_model.pth


In [None]:
# -------------------------------
# 2. Define reusable test loader
# -------------------------------
def load_test_data(json_file, image_folder, transform):
    test_dataset = DepressionDataset(json_file=json_file, image_folder=image_folder, transform=transform)
    test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False, collate_fn=custom_collate)
    return test_loader

# -------------------------------
# 3. Evaluate the model on test set
# -------------------------------
def test_model(model_path, test_json, test_img_dir, transform, num_classes=16):
    print("\n🚀 Loading model for testing...")

    # Load test data
    test_loader = load_test_data(test_json, test_img_dir, transform)

    # Re-initialize model structure
    fusion_module = VisualFusionModule()
    classifier = M3HClassifier(input_dim=896, num_classes=num_classes)
    model = M3HVisualModel(fusion_module, classifier).to(device)

    # Load weights
    model.load_state_dict(torch.load(model_path))
    model.eval()

    all_preds = []
    all_labels = []

    print("🔍 Running Test Inference...")
    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Testing"):
            ocr_texts, images, labels = batch
            batch_logits = []
            for ocr_text, image in zip(ocr_texts, images):
                reasoning = figurative_module(ocr_text)
                logits = model(ocr_text, image)
                batch_logits.append(logits)

            batch_logits = torch.cat(batch_logits, dim=0)
            preds = torch.argmax(batch_logits, dim=-1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Compute and print metrics
    accuracy = accuracy_score(all_labels, all_preds)
    macro_f1 = f1_score(all_labels, all_preds, average='macro')
    weighted_f1 = f1_score(all_labels, all_preds, average='weighted')
    
test_model(
    model_path=model_path,
    test_json="Depressive_Data/test.json",
    test_img_dir="Depressive_Data/test",
    transform=transform,
    num_classes=16
)
print(print_acc_test())


📊 Test Metrics -
Accuracy: 49.21%
Micro-F1: 43.42
Weighted-F1: 41.22
None
