In [2]:
!apt-get install tesseract-ocr -y
!pip install pytesseract


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 30 not upgraded.
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13


In [3]:
!pip install git+https://github.com/openai/CLIP.git
!pip install -U sentence-transformers


Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-8mlhk5c8
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-8mlhk5c8
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ftfy (from clip==1.0)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->clip==1.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->clip==1.0)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->clip==1.0)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting 

In [4]:
import os
import json
import pandas as pd
from PIL import Image
import pytesseract
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics import accuracy_score, f1_score
import clip

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# clip.load returns the model together with its matching image preprocessing
# callable; VisualFusionModule below reads both as module-level globals.
clip_model, preprocess = clip.load("ViT-B/32", device=device)
# The CLIP image encoder is only ever called under torch.no_grad() in this cell.
clip_model.eval()


def custom_collate(batch):
    """Collate ``(ocr_text, image, label)`` samples without stacking the images.

    Args:
        batch: Sequence of 3-item samples.

    Returns:
        A tuple ``(texts, images, labels)``: two plain Python lists in batch
        order and a 1-D ``torch.long`` tensor holding the labels.
    """
    # Transpose the list of samples into three per-field lists.
    texts, pictures, targets = map(list, zip(*batch))
    return texts, pictures, torch.tensor(targets, dtype=torch.long)


class DepressionDataset(Dataset):
    """Meme dataset yielding ``(ocr_text, PIL image, integer label)`` samples.

    Rows are read from a JSON file; only rows whose image file exists inside
    ``image_folder`` are kept.

    Args:
        json_file: Path to a JSON file that loads into a DataFrame with at
            least the ``sample_id`` and ``meme_depressive_categories`` columns.
        image_folder: Directory containing the ``<sample_id>.jpeg`` images.
        transform: Stored on the instance for interface compatibility but not
            applied: samples always carry the untransformed RGB PIL image
            (the previous code computed the transform and discarded the result).
        label_mapping: Optional ``{category string: class index}`` dict. When
            given, labels are looked up in it instead of being numbered by
            order of first appearance in ``json_file``, so that several splits
            can share one numbering. Categories missing from the mapping get
            the label ``-1``.

    Attributes:
        data: DataFrame of the retained rows (``filename``, ``label``, ...).
        label_mapping: The ``{category string: class index}`` dict in use.
    """

    def __init__(self, json_file, image_folder, transform=None, label_mapping=None):
        self.image_folder = image_folder
        self.transform = transform
        # OCR output per sample index; avoids re-running tesseract every epoch.
        self._ocr_cache = {}

        with open(json_file, 'r') as f:
            data = json.load(f)
        data = pd.DataFrame(data)
        print("JSON keys:", data.columns)

        print("First few sample_ids from JSON:")
        print(data['sample_id'].head())

        data['filename'] = data['sample_id'].apply(self._to_filename)

        # NOTE: the category order inside each list is preserved, so two
        # permutations of the same categories are treated as different classes.
        data['meme_depressive_categories'] = data['meme_depressive_categories'].apply(self._join_categories)

        if label_mapping is None:
            # Number the classes by order of first appearance in this file.
            codes, uniques = pd.factorize(data['meme_depressive_categories'])
            data['label'] = codes
            self.label_mapping = {name: idx for idx, name in enumerate(uniques)}
            print("Unique label mapping:", dict(enumerate(uniques)))
        else:
            self.label_mapping = dict(label_mapping)
            data['label'] = (
                data['meme_depressive_categories'].map(self.label_mapping).fillna(-1).astype(int)
            )
            print("Unique label mapping:", {idx: name for name, idx in self.label_mapping.items()})
            unknown = int((data['label'] == -1).sum())
            if unknown:
                print(f"Rows with a category missing from label_mapping (label -1): {unknown}")

        data['exists'] = data['filename'].apply(
            lambda x: os.path.exists(os.path.join(self.image_folder, x))
        ).astype(bool)
        print("Number of existing files found:", data['exists'].sum())

        self.data = data[data['exists']].reset_index(drop=True)
        print(f"Filtered dataset length: {len(self.data)} (only rows with existing images)")

    @staticmethod
    def _to_filename(sample_id):
        """Return ``sample_id`` as a file name ending in ``.jpeg``."""
        name = str(sample_id)
        return name if name.endswith(".jpeg") else name + ".jpeg"

    @staticmethod
    def _join_categories(categories):
        """Join a non-empty category list with ``|``; stringify anything else."""
        if isinstance(categories, list) and len(categories) > 0:
            return '|'.join(categories)
        return str(categories)

    def __len__(self):
        """Number of rows whose image file was found."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(ocr_text, rgb_pil_image, label)`` for row ``idx``."""
        row = self.data.iloc[idx]
        image_path = os.path.join(self.image_folder, row['filename'])

        # The context manager closes the file handle; convert() returns a
        # fully loaded copy that stays valid afterwards.
        with Image.open(image_path) as raw_image:
            image_pil = raw_image.convert("RGB")

        ocr_text = self._ocr_cache.get(idx)
        if ocr_text is None:
            ocr_text = pytesseract.image_to_string(image_pil)
            self._ocr_cache[idx] = ocr_text
        return ocr_text, image_pil, row['label']


class FigurativeReasoningModule(nn.Module):
    """Placeholder reasoning stage: has no parameters and only decorates the OCR text."""

    # Fixed phrase prepended to every input.
    _PREFIX = "dummy figurative reasoning for: "

    def __init__(self):
        super().__init__()

    def forward(self, ocr_text):
        """Return ``ocr_text`` with the placeholder phrase in front of it."""
        return self._PREFIX + ocr_text


class VisualFusionModule(nn.Module):
    """Concatenate a sentence embedding of the OCR text with a CLIP image embedding.

    Reads the module-level ``clip_model``, ``preprocess`` and ``device``
    globals defined earlier in this cell.

    Args:
        text_model_name: Name passed to ``SentenceTransformer``.
    """

    def __init__(self, text_model_name='paraphrase-MiniLM-L6-v2'):
        super(VisualFusionModule, self).__init__()
        self.text_encoder = SentenceTransformer(text_model_name)

    def fuse_embeddings(self, ocr_text, image):
        """Return the 1-D fused embedding ``[text | image]`` on ``device``.

        Args:
            ocr_text: Text to embed with the sentence encoder.
            image: Image accepted by the CLIP ``preprocess`` callable.
        """
        text_embedding = self.text_encoder.encode(ocr_text, convert_to_tensor=True)

        image_preprocessed = preprocess(image).unsqueeze(0).to(device)
        with torch.no_grad():
            image_embedding = clip_model.encode_image(image_preprocessed)
        # Drop only the batch axis that unsqueeze(0) added above.
        image_embedding = image_embedding.squeeze(0)

        # Put both halves on the same device and in float32 explicitly, so the
        # concatenation does not depend on implicit type promotion or on the
        # two encoders happening to sit on the same device.
        text_embedding = text_embedding.to(device=device, dtype=torch.float32)
        image_embedding = image_embedding.to(device=device, dtype=torch.float32)

        return torch.cat([text_embedding, image_embedding], dim=-1)

    def forward(self, ocr_text, image):
        """Alias for :meth:`fuse_embeddings`."""
        return self.fuse_embeddings(ocr_text, image)

class M3HClassifier(nn.Module):
    """Single linear layer mapping a fused embedding to class logits."""

    def __init__(self, input_dim, num_classes):
        super().__init__()
        # The attribute name `fc` fixes the state_dict keys (fc.weight / fc.bias).
        self.fc = nn.Linear(in_features=input_dim, out_features=num_classes)

    def forward(self, fused_embedding):
        """Return the raw (unnormalised) logits for ``fused_embedding``."""
        return self.fc(fused_embedding)

class M3HVisualModel(nn.Module):
    """Pipeline wrapper: fuse (text, image) into one embedding, then classify it."""

    def __init__(self, fusion_module, classifier):
        super().__init__()
        self.fusion_module = fusion_module
        self.classifier = classifier

    def forward(self, ocr_text, image):
        """Return the classifier logits for one (text, image) input.

        A 1-D fused embedding is given a leading batch axis before it is
        classified; embeddings of any other rank are passed through unchanged.
        """
        features = self.fusion_module(ocr_text, image)
        batched = features.unsqueeze(0) if features.dim() == 1 else features
        return self.classifier(batched)

if __name__ == "__main__":
    # ---- Configuration ----
    seed = 42
    num_epochs = 10
    batch_size = 64
    fused_dim = 896  # 384 (text) + 512 (visual)
    num_classes = 16

    json_train = '/content/drive/MyDrive/Anand & Shashank/Depressive_Data/train.json'
    image_folder = '/content/drive/MyDrive/Anand & Shashank/Depressive_Data/Images/depressive_image/train'

    # Fixed seed so classifier initialisation and batch order repeat across runs.
    torch.manual_seed(seed)

    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor()
    ])
    dataset_dep = DepressionDataset(json_file=json_train, image_folder=image_folder, transform=transform)

    # Validate before building the DataLoader: with shuffle=True it creates a
    # RandomSampler, which rejects an empty dataset with an unrelated-looking
    # ValueError. SystemExit stops the cell without shutting the kernel down.
    if len(dataset_dep) == 0:
        raise SystemExit("No samples found in the dataset. Please verify the image folder path and file names.")

    # A label outside [0, num_classes) would only surface later as an opaque
    # failure inside CrossEntropyLoss.
    highest_label = int(dataset_dep.data['label'].max())
    if highest_label >= num_classes:
        raise ValueError(
            f"Dataset contains label {highest_label} but the classifier has only {num_classes} outputs."
        )

    dataloader_dep = DataLoader(dataset_dep, batch_size=batch_size, shuffle=True, collate_fn=custom_collate)

    # Placeholder module; its output is not consumed by the model yet.
    figurative_module = FigurativeReasoningModule()
    fusion_module = VisualFusionModule()
    classifier = M3HClassifier(input_dim=fused_dim, num_classes=num_classes)
    model = M3HVisualModel(fusion_module, classifier)
    model.to(device)

    # Only the linear head receives gradients (the fused embeddings are
    # produced without autograd), so only its parameters are optimised.
    optimizer = optim.Adam(classifier.parameters(), lr=5e-5)
    criterion = nn.CrossEntropyLoss()

    # ---- Feature extraction (once) ----
    # OCR and both encoders are not trained here, so each sample's fused
    # embedding is identical in every epoch; compute it a single time instead
    # of re-running OCR + CLIP + the sentence encoder for every epoch.
    model.eval()
    fused_rows, label_rows = [], []
    with torch.no_grad():
        for ocr_texts, images, labels in dataloader_dep:
            for ocr_text, image in zip(ocr_texts, images):
                fused_rows.append(fusion_module(ocr_text, image).reshape(1, -1))
            label_rows.append(labels)
    features = torch.cat(fused_rows, dim=0).float().to(device)
    targets = torch.cat(label_rows, dim=0).to(device)
    num_samples = features.shape[0]
    num_batches = (num_samples + batch_size - 1) // batch_size

    # ---- Training ----
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0.0
        # Fresh random batch order every epoch, as with shuffle=True.
        order = torch.randperm(num_samples, device=features.device)
        for start in range(0, num_samples, batch_size):
            picked = order[start:start + batch_size]
            batch_logits = classifier(features[picked])
            loss = criterion(batch_logits, targets[picked])
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            print(f"Epoch {epoch+1} Batch loss: {loss.item()}")
        avg_loss = total_loss / num_batches
        print(f"Epoch {epoch+1} Average loss: {avg_loss}")

    # The full model state (fusion module + classifier) is saved so the file
    # can be loaded into an M3HVisualModel with load_state_dict.
    torch.save(model.state_dict(), "m3h_depression_visual_model.pth")
    print("Model saved to m3h_depression_visual_model.pth")

    # ---- Metrics on the training data ----
    # NOTE: these are computed on the same samples the classifier was fitted
    # on, so they measure fit, not generalisation.
    model.eval()
    with torch.no_grad():
        preds = torch.argmax(classifier(features), dim=-1)
    all_preds = preds.cpu().tolist()
    all_labels = targets.cpu().tolist()

    accuracy = accuracy_score(all_labels, all_preds)
    macro_f1 = f1_score(all_labels, all_preds, average='macro')
    weighted_f1 = f1_score(all_labels, all_preds, average='weighted')

    print("Evaluation Metrics:")
    print("Accuracy:", accuracy)
    print("Macro-F1:", macro_f1)
    print("Weighted-F1:", weighted_f1)


Using device: cuda


100%|███████████████████████████████████████| 338M/338M [00:04<00:00, 86.5MiB/s]


JSON keys: Index(['sample_id', 'ocr_text', 'meme_depressive_categories'], dtype='object')
First few sample_ids from JSON:
0    TR-1
1    TR-2
2    TR-3
3    TR-4
4    TR-5
Name: sample_id, dtype: object
Unique label mapping: {0: 'Eating Disorder', 1: 'Self-Harm', 2: 'Feeling Down', 3: 'Low Self-Esteem', 4: 'Feeling Down|Low Self-Esteem', 5: 'Self-Harm|Feeling Down|Low Self-Esteem', 6: 'Feeling Down|Self-Harm', 7: 'Self-Harm|Low Self-Esteem', 8: 'Low Self-Esteem|Feeling Down|Lack of Energy', 9: 'Feeling Down|Lack of Energy', 10: 'Low Self-Esteem|Lack of Energy', 11: 'Lack of Energy', 12: 'Self-Harm|Feeling Down|Lack of Energy', 13: 'Sleeping Disorder', 14: 'Concentration Problem', 15: 'Lack of Interest'}
Number of existing files found: 1076
Filtered dataset length: 1076 (only rows with existing images)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.51k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Epoch 1 Batch loss: 2.725266933441162
Epoch 1 Batch loss: 2.675508737564087
Epoch 1 Batch loss: 2.640186309814453
Epoch 1 Batch loss: 2.6271703243255615
Epoch 1 Batch loss: 2.6876027584075928
Epoch 1 Batch loss: 2.661339521408081
Epoch 1 Batch loss: 2.625311851501465
Epoch 1 Batch loss: 2.663341522216797
Epoch 1 Batch loss: 2.6660895347595215
Epoch 1 Batch loss: 2.6719117164611816
Epoch 1 Batch loss: 2.627960205078125
Epoch 1 Batch loss: 2.6482186317443848
Epoch 1 Batch loss: 2.670633554458618
Epoch 1 Batch loss: 2.657909393310547
Epoch 1 Batch loss: 2.5864615440368652
Epoch 1 Batch loss: 2.6418755054473877
Epoch 1 Batch loss: 2.63040828704834
Epoch 1 Average loss: 2.653364490060245
Epoch 2 Batch loss: 2.647489547729492
Epoch 2 Batch loss: 2.5953028202056885
Epoch 2 Batch loss: 2.5856969356536865
Epoch 2 Batch loss: 2.5864458084106445
Epoch 2 Batch loss: 2.6489548683166504
Epoch 2 Batch loss: 2.564161777496338
Epoch 2 Batch loss: 2.5999679565429688
Epoch 2 Batch loss: 2.579065561294555

In [17]:
import os
import json
import pandas as pd
from PIL import Image
import pytesseract
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics import accuracy_score, f1_score
import clip
from transformers import AutoTokenizer, AutoModel

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# CLIP model plus its matching image preprocessing callable; both are read as
# module-level globals by VisualFusionModule below.
clip_model, preprocess = clip.load("ViT-B/32", device=device)
clip_model.eval()

# NOTE(review): despite the `mental_` prefix this loads the generic
# bert-base-uncased checkpoint, and nothing in the visible part of this cell
# uses the tokenizer or the model — confirm whether a later cell needs them.
mental_checkpoint = "bert-base-uncased"
mental_tokenizer = AutoTokenizer.from_pretrained(mental_checkpoint)
mental_model = AutoModel.from_pretrained(mental_checkpoint)
mental_model = mental_model.to(device)
mental_model.eval()


def custom_collate(batch):
    """Collate ``(ocr_text, image, label)`` samples without stacking the images.

    Args:
        batch: Sequence of 3-item samples.

    Returns:
        A tuple ``(texts, images, labels)``: two plain Python lists in batch
        order and a 1-D ``torch.long`` tensor holding the labels.
    """
    # Transpose the list of samples into three per-field lists.
    texts, pictures, targets = map(list, zip(*batch))
    return texts, pictures, torch.tensor(targets, dtype=torch.long)

class DepressionDataset(Dataset):
    """Meme dataset yielding ``(ocr_text, PIL image, integer label)`` samples.

    Rows are read from a JSON file; only rows whose image file exists inside
    ``image_folder`` are kept.

    Args:
        json_file: Path to a JSON file that loads into a DataFrame with at
            least the ``sample_id`` and ``meme_depressive_categories`` columns.
        image_folder: Directory containing the ``<sample_id>.jpeg`` images.
        transform: Stored on the instance for interface compatibility but not
            applied: samples always carry the untransformed RGB PIL image
            (the previous code computed the transform and discarded the result).
        label_mapping: Optional ``{category string: class index}`` dict. When
            given, labels are looked up in it instead of being numbered by
            order of first appearance in ``json_file``, so that several splits
            can share one numbering. Categories missing from the mapping get
            the label ``-1``.

    Attributes:
        data: DataFrame of the retained rows (``filename``, ``label``, ...).
        label_mapping: The ``{category string: class index}`` dict in use.
    """

    def __init__(self, json_file, image_folder, transform=None, label_mapping=None):
        self.image_folder = image_folder
        self.transform = transform
        # OCR output per sample index; avoids re-running tesseract on repeat access.
        self._ocr_cache = {}

        with open(json_file, 'r') as f:
            data = json.load(f)
        data = pd.DataFrame(data)
        print("JSON keys:", data.columns)
        print("First few sample_ids from JSON:")
        print(data['sample_id'].head())

        data['filename'] = data['sample_id'].apply(self._to_filename)

        # NOTE: the category order inside each list is preserved, so two
        # permutations of the same categories are treated as different classes.
        data['meme_depressive_categories'] = data['meme_depressive_categories'].apply(self._join_categories)

        if label_mapping is None:
            # Number the classes by order of first appearance in this file.
            codes, uniques = pd.factorize(data['meme_depressive_categories'])
            data['label'] = codes
            self.label_mapping = {name: idx for idx, name in enumerate(uniques)}
            print("Unique label mapping:", dict(enumerate(uniques)))
        else:
            self.label_mapping = dict(label_mapping)
            data['label'] = (
                data['meme_depressive_categories'].map(self.label_mapping).fillna(-1).astype(int)
            )
            print("Unique label mapping:", {idx: name for name, idx in self.label_mapping.items()})
            unknown = int((data['label'] == -1).sum())
            if unknown:
                print(f"Rows with a category missing from label_mapping (label -1): {unknown}")

        data['exists'] = data['filename'].apply(
            lambda x: os.path.exists(os.path.join(self.image_folder, x))
        ).astype(bool)
        print("Number of existing files found:", data['exists'].sum())

        self.data = data[data['exists']].reset_index(drop=True)
        print(f"Filtered dataset length: {len(self.data)} (only rows with existing images)")

    @staticmethod
    def _to_filename(sample_id):
        """Return ``sample_id`` as a file name ending in ``.jpeg``."""
        name = str(sample_id)
        return name if name.endswith(".jpeg") else name + ".jpeg"

    @staticmethod
    def _join_categories(categories):
        """Join a non-empty category list with ``|``; stringify anything else."""
        if isinstance(categories, list) and len(categories) > 0:
            return '|'.join(categories)
        return str(categories)

    def __len__(self):
        """Number of rows whose image file was found."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(ocr_text, rgb_pil_image, label)`` for row ``idx``."""
        row = self.data.iloc[idx]
        image_path = os.path.join(self.image_folder, row['filename'])

        # The context manager closes the file handle; convert() returns a
        # fully loaded copy that stays valid afterwards.
        with Image.open(image_path) as raw_image:
            image_pil = raw_image.convert("RGB")

        ocr_text = self._ocr_cache.get(idx)
        if ocr_text is None:
            ocr_text = pytesseract.image_to_string(image_pil)
            self._ocr_cache[idx] = ocr_text
        return ocr_text, image_pil, row['label']


class FigurativeReasoningModule(nn.Module):
    """Placeholder reasoning stage: has no parameters and only decorates the OCR text."""

    # Fixed phrase prepended to every input.
    _PREFIX = "dummy figurative reasoning for: "

    def __init__(self):
        super().__init__()

    def forward(self, ocr_text):
        """Return ``ocr_text`` with the placeholder phrase in front of it."""
        return self._PREFIX + ocr_text

class VisualFusionModule(nn.Module):
    """Concatenate a sentence embedding of the OCR text with a CLIP image embedding.

    Reads the module-level ``clip_model``, ``preprocess`` and ``device``
    globals defined earlier in this cell.

    Args:
        text_model_name: Name passed to ``SentenceTransformer``.
    """

    def __init__(self, text_model_name='paraphrase-MiniLM-L6-v2'):
        super(VisualFusionModule, self).__init__()
        self.text_encoder = SentenceTransformer(text_model_name)

    def fuse_embeddings(self, ocr_text, image):
        """Return the 1-D fused embedding ``[text | image]`` on ``device``.

        Args:
            ocr_text: Text to embed with the sentence encoder.
            image: Image accepted by the CLIP ``preprocess`` callable.
        """
        text_embedding = self.text_encoder.encode(ocr_text, convert_to_tensor=True)
        image_preprocessed = preprocess(image).unsqueeze(0).to(device)
        with torch.no_grad():
            image_embedding = clip_model.encode_image(image_preprocessed)
        # Drop only the batch axis that unsqueeze(0) added above.
        image_embedding = image_embedding.squeeze(0)

        # Put both halves on the same device and in float32 explicitly, so the
        # concatenation does not depend on implicit type promotion or on the
        # two encoders happening to sit on the same device.
        text_embedding = text_embedding.to(device=device, dtype=torch.float32)
        image_embedding = image_embedding.to(device=device, dtype=torch.float32)

        return torch.cat([text_embedding, image_embedding], dim=-1)

    def forward(self, ocr_text, image):
        """Alias for :meth:`fuse_embeddings`."""
        return self.fuse_embeddings(ocr_text, image)


class M3HClassifier(nn.Module):
    """Single linear layer mapping a fused embedding to class logits."""

    def __init__(self, input_dim, num_classes):
        super().__init__()
        # The attribute name `fc` fixes the state_dict keys (fc.weight / fc.bias),
        # which must match the checkpoint loaded further down in this cell.
        self.fc = nn.Linear(in_features=input_dim, out_features=num_classes)

    def forward(self, fused_embedding):
        """Return the raw (unnormalised) logits for ``fused_embedding``."""
        return self.fc(fused_embedding)


class M3HVisualModel(nn.Module):
    """Pipeline wrapper: fuse (text, image) into one embedding, then classify it."""

    def __init__(self, fusion_module, classifier):
        super().__init__()
        self.fusion_module = fusion_module
        self.classifier = classifier

    def forward(self, ocr_text, image):
        """Return the classifier logits for one (text, image) input.

        A 1-D fused embedding is given a leading batch axis before it is
        classified; embeddings of any other rank are passed through unchanged.
        """
        features = self.fusion_module(ocr_text, image)
        batched = features.unsqueeze(0) if features.dim() == 1 else features
        return self.classifier(batched)

if __name__ == "__main__":
    json_test = '/content/drive/MyDrive/Anand & Shashank/Depressive_Data/test.json'
    image_folder = '/content/drive/MyDrive/Anand & Shashank/Depressive_Data/Images/depressive_image/test'
    # Needed only to rebuild the class indices the saved classifier was trained with.
    json_train = '/content/drive/MyDrive/Anand & Shashank/Depressive_Data/train.json'

    test_transform = None

    test_dataset = DepressionDataset(json_file=json_test, image_folder=image_folder, transform=test_transform)
    if len(test_dataset) == 0:
        raise SystemExit("No samples found in the dataset. Please verify the image folder path and file names.")

    # The dataset numbers its classes by order of first appearance in the
    # file it reads, so indices derived from test.json do not line up with
    # the indices used during training (compare the two printed mappings).
    # Rebuild the training-time numbering from train.json, using the same
    # category-joining rule, and relabel the test rows with it.
    with open(json_train, 'r') as f:
        train_frame = pd.DataFrame(json.load(f))
    train_categories = train_frame['meme_depressive_categories'].apply(
        lambda x: '|'.join(x) if isinstance(x, list) and len(x) > 0 else str(x)
    )
    train_label_mapping = {
        name: idx for idx, name in enumerate(pd.factorize(train_categories)[1])
    }
    test_dataset.data['label'] = (
        test_dataset.data['meme_depressive_categories']
        .map(train_label_mapping)
        .fillna(-1)  # category combination never seen in training
        .astype(int)
    )
    num_unmapped = int((test_dataset.data['label'] == -1).sum())
    print("Label mapping used for evaluation:", {idx: name for name, idx in train_label_mapping.items()})
    print(f"Test samples whose category combination is absent from training: {num_unmapped} "
          "(labelled -1, so they always count as errors)")

    test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=custom_collate)

    # Placeholder module; its output is not consumed by the model yet.
    figurative_module = FigurativeReasoningModule()
    fusion_module = VisualFusionModule()
    classifier = M3HClassifier(input_dim=896, num_classes=16)  # Must match the values used for training.
    model = M3HVisualModel(fusion_module, classifier)
    model.to(device)

    saved_model_path = "/content/m3h_depression_visual_model.pth"
    # The checkpoint is a plain state_dict, so restrict unpickling to tensors.
    model.load_state_dict(torch.load(saved_model_path, map_location=device, weights_only=True))
    model.eval()

    all_preds = []
    all_labels = []

    with torch.no_grad():
        for ocr_texts, images, labels in test_dataloader:
            # The model takes one (text, image) pair at a time; stack the
            # per-sample logits into a (batch, num_classes) tensor.
            batch_logits = torch.cat(
                [model(ocr_text, image) for ocr_text, image in zip(ocr_texts, images)],
                dim=0,
            )
            preds = torch.argmax(batch_logits, dim=-1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    macro_f1 = f1_score(all_labels, all_preds, average='macro')
    weighted_f1 = f1_score(all_labels, all_preds, average='weighted')

    print("Evaluation Metrics:")
    print("Accuracy:", accuracy)
    print("Macro-F1:", macro_f1)
    print("Weighted-F1:", weighted_f1)


Using device: cuda


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

JSON keys: Index(['sample_id', 'ocr_text', 'meme_depressive_categories'], dtype='object')
First few sample_ids from JSON:
0    TE-1
1    TE-2
2    TE-4
3    TE-6
4    TE-7
Name: sample_id, dtype: object
Unique label mapping: {0: 'Feeling Down', 1: 'Feeling Down|Low Self-Esteem', 2: 'Feeling Down|Low Self-Esteem|Sleeping Disorder', 3: 'Sleeping Disorder', 4: 'Self-Harm', 5: 'Eating Disorder', 6: 'Feeling Down|Sleeping Disorder', 7: 'Low Self-Esteem', 8: '[]', 9: 'Lack of Interest', 10: 'Feeling Down|Self-Harm', 11: 'Feeling Down|Low Self-Esteem|Self-Harm', 12: 'Eating Disorder|Low Self-Esteem', 13: 'Concentration Problem', 14: 'Lack of Interest|Low Self-Esteem', 15: 'Low Self-Esteem|Sleeping Disorder', 16: 'Feeling Down|Lack of Interest', 17: 'Low Self-Esteem|Self-Harm', 18: 'Low Self-Esteem|Concentration Problem', 19: 'Feeling Down|Lack of Interest|Low Self-Esteem', 20: 'Feeling Down|Low Self-Esteem|Concentration Problem', 21: 'Feeling Down|Eating Disorder', 22: 'Lack of Interest|Eatin