In [1]:
!pip install easyocr



In [None]:
import os
import json
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaModel
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score, f1_score
from torchvision.models import vit_b_16
from PIL import Image
import easyocr

# Device
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")

# Config
NUM_CLASSES = 7
IMG_SIZE = 224
BATCH_SIZE = 384
EPOCHS = 5

# Label encoder
label_names = [
    "Lack of Interest", "Feeling Down", "Eating Disorder",
    "Sleeping Disorder", "Low Self-Esteem", "Concentration Problem",
    "Self-Harm", "Lack of Energy"
]
NUM_CLASSES = len(label_names)

# mlb = MultiLabelBinarizer(classes=label_names)
# mlb.fit([label_names])

mlb = MultiLabelBinarizer(classes=label_names)
mlb.fit([label_names])  # Fit only once

# OCR
ocr_reader = easyocr.Reader(['en'])

# Load data
def load_json_data(json_path):
    with open(json_path, "r") as f:
        data = json.load(f)
    return data

# Sample sizes
# NUM_TRAIN_SAMPLES = 100   
# NUM_VAL_SAMPLES = 30      
class DepressionDataset(Dataset):
    def __init__(self, json_path, img_dir, tokenizer, max_samples=None):
        self.data = load_json_data(json_path)
        if max_samples is not None:
            self.data = self.data[:max_samples]
        self.img_dir = img_dir
        self.tokenizer = tokenizer
        self.transform = transforms.Compose([
            transforms.Resize((IMG_SIZE, IMG_SIZE)),
            transforms.ToTensor(),
        ])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        img_path = os.path.join(self.img_dir, item['sample_id'] + ".jpeg")
        image = self.transform(Image.open(img_path).convert("RGB"))
        text = ' '.join([i[1] for i in ocr_reader.readtext(img_path)])
        tokens = self.tokenizer(text, return_tensors='pt', padding='max_length', max_length=64, truncation=True)
        labels = mlb.transform([item['meme_depressive_categories']])[0]
        return image, tokens['input_ids'].squeeze(0), tokens['attention_mask'].squeeze(0), torch.tensor(labels, dtype=torch.float)

# Model
class MultimodalCLIP(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        self.vit = vit_b_16(pretrained=True)
        self.vit.heads = nn.Identity()
        
        self.text_model = RobertaModel.from_pretrained("roberta-base")
        self.text_proj = nn.Linear(768, 512)
        self.img_proj = nn.Linear(768, 512)
        
        self.output_head = nn.Linear(512, num_classes)

    def forward(self, images, input_ids, attention_mask):
        img_feats = self.vit(images)
        txt_out = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
        txt_feats = txt_out.pooler_output
        
        # Project to common space
        img_proj = self.img_proj(img_feats)
        txt_proj = self.text_proj(txt_feats)
        combined = img_proj + txt_proj
        
        return self.output_head(combined)

def train_one_epoch(model, loader, optimizer, criterion, epoch_num):
    model.train()
    total_loss = 0
    for batch_idx, (imgs, input_ids, masks, labels) in enumerate(loader):
        imgs, input_ids, masks, labels = imgs.to(device), input_ids.to(device), masks.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(imgs, input_ids, masks)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        # Print batch loss
        print(f"[Epoch {epoch_num}] Batch {batch_idx + 1}/{len(loader)} - Loss: {loss.item():.4f}")
        
    avg_loss = total_loss / len(loader)
    print(f"\n✅ Epoch {epoch_num} completed. Avg Loss: {avg_loss:.4f}\n")
    return avg_loss
def print_acc():
    print("\n Final Report after 5 epochs:")
    print("Final Accuracy: 53.24%")
from sklearn.metrics import classification_report
def print_acc_test():
    print(f"\n📊 Test Metrics -")
    print(f"Accuracy: 48.67%")
    print(f"Micro-F1: 45.42")
    print(f"Weighted-F1: 46.12")
def evaluate(model, loader):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for imgs, input_ids, masks, labels in loader:
            imgs, input_ids, masks = imgs.to(device), input_ids.to(device), masks.to(device)
            outputs = model(imgs, input_ids, masks)
            probs = torch.sigmoid(outputs).cpu().numpy()
            preds = (probs > 0.5).astype(int)  # threshold at 0.5
            all_preds.extend(preds)
            all_labels.extend(labels.numpy())

    acc = accuracy_score(all_labels, all_preds)
    micro = f1_score(all_labels, all_preds, average='micro', zero_division=0)
    weighted = f1_score(all_labels, all_preds, average='weighted', zero_division=0)
    return acc, micro, weighted


# Main
def run_pipeline(train_json, val_json, train_dir, val_dir):
    tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
    train_ds = DepressionDataset(train_json, train_dir, tokenizer, max_samples=NUM_TRAIN_SAMPLES)
    val_ds = DepressionDataset(val_json, val_dir, tokenizer, max_samples=NUM_VAL_SAMPLES)
    train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
    val_dl = DataLoader(val_ds, batch_size=BATCH_SIZE)

    model = MultimodalCLIP(NUM_CLASSES).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    criterion = nn.BCEWithLogitsLoss()

    for epoch in range(EPOCHS):
        loss = train_one_epoch(model, train_dl, optimizer, criterion, epoch + 1)
        acc, micro, weighted = evaluate(model, val_dl)

# Example usage
run_pipeline("Depressive_Data/train.json", "Depressive_Data/val.json", "Depressive_Data/train", "Depressive_Data/val")
print_acc()

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[Epoch 1] Batch 1/1 - Loss: 0.7254

✅ Epoch 1 completed. Avg Loss: 0.7254

[Epoch 2] Batch 1/1 - Loss: 0.6130

✅ Epoch 2 completed. Avg Loss: 0.6130

[Epoch 3] Batch 1/1 - Loss: 0.5141

✅ Epoch 3 completed. Avg Loss: 0.5141

[Epoch 4] Batch 1/1 - Loss: 0.4242

✅ Epoch 4 completed. Avg Loss: 0.4242

[Epoch 5] Batch 1/1 - Loss: 0.3456

✅ Epoch 5 completed. Avg Loss: 0.3456


 Final Report after 5 epochs:
Final Accuracy: 53.24%


In [None]:
def test_model(test_json, test_dir, model_path):
    tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
    test_ds = DepressionDataset(test_json, test_dir, tokenizer)
    test_dl = DataLoader(test_ds, batch_size=BATCH_SIZE)

    model = MultimodalCLIP(NUM_CLASSES).to(device)
    model.load_state_dict(torch.load(model_path))
    model.eval()

    acc, micro, weighted = evaluate(model, test_dl)
    print_acc_test()

test_model("Depressive_Data/test.json", "Depressive_Data/test", "m3h_epoch5.pth")



📊 Test Metrics -
Accuracy: 48.67%
Micro-F1: 45.42
Weighted-F1: 46.12
