In [None]:
import zipfile

zip_path = "/content/drive/MyDrive/Anand & Shashank/Anxiety_Data.zip"
extract_to = "/content/Anxiety"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_to)


In [None]:
pip install  pytesseract


Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13


In [None]:
pip install git+https://github.com/openai/CLIP.git


Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-16kjmfet
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-16kjmfet
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ftfy (from clip==1.0)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->clip==1.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->clip==1.0)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->clip==1.0)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting 

# Clip

In [5]:
import os
import pandas as pd
from PIL import Image
import pytesseract
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics import accuracy_score, f1_score
import clip

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

clip_model, preprocess = clip.load("ViT-B/32", device=device)

def custom_collate(batch):
    ocr_texts, images, labels = zip(*batch)
    labels = torch.tensor(labels, dtype=torch.long)
    return list(ocr_texts), list(images), labels

class AnxietyDataset(Dataset):
    def __init__(self, csv_file, image_folder):
        self.image_folder = image_folder

        data = pd.read_csv(csv_file)
        print("CSV columns:", data.columns)

        print("First few sample_ids from CSV:")
        print(data['sample_id'].head())

        data['filename'] = data['sample_id'].apply(lambda x: str(x) + ".jpg" if not str(x).endswith(".jpg") else x)

        data['label'] = pd.factorize(data['meme_anxiety_categories'])[0]
        print("Unique label mapping:", dict(enumerate(pd.factorize(data['meme_anxiety_categories'])[1])))

        data['exists'] = data['filename'].apply(lambda x: os.path.exists(os.path.join(self.image_folder, x)))
        print("Number of existing files found:", data['exists'].sum())

        self.data = data[data['exists']].reset_index(drop=True)
        print(f"Filtered dataset length: {len(self.data)} (only rows with existing images)")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        image_name = row['filename']
        label = row['label']
        image_path = os.path.join(self.image_folder, image_name)

        image_pil = Image.open(image_path).convert("RGB")

        ocr_text = pytesseract.image_to_string(image_pil)
        return ocr_text, image_pil, label

class FigurativeReasoningModule(nn.Module):
    def __init__(self):
        super(FigurativeReasoningModule, self).__init__()

    def forward(self, ocr_text):
        reasoning = "dummy figurative reasoning for: " + ocr_text
        return reasoning

class VisualFusionModule(nn.Module):
    def __init__(self, text_model_name='paraphrase-MiniLM-L6-v2'):
        super(VisualFusionModule, self).__init__()
        self.text_encoder = SentenceTransformer(text_model_name)

    def fuse_embeddings(self, ocr_text, image):
        text_embedding = self.text_encoder.encode(ocr_text, convert_to_tensor=True)
        image_preprocessed = preprocess(image).unsqueeze(0).to(device)
        with torch.no_grad():
            image_embedding = clip_model.encode_image(image_preprocessed)
        image_embedding = image_embedding.squeeze()
        fused = torch.cat([text_embedding, image_embedding], dim=-1)
        return fused.to(device)

    def forward(self, ocr_text, image):
        return self.fuse_embeddings(ocr_text, image)

class M3HClassifier(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(M3HClassifier, self).__init__()
        self.fc = nn.Linear(input_dim, num_classes)

    def forward(self, fused_embedding):
        logits = self.fc(fused_embedding)
        return logits

class M3HVisualModel(nn.Module):
    def __init__(self, fusion_module, classifier):
        super(M3HVisualModel, self).__init__()
        self.fusion_module = fusion_module
        self.classifier = classifier

    def forward(self, ocr_text, image):
        fused_embedding = self.fusion_module(ocr_text, image)
        if fused_embedding.dim() == 1:
            fused_embedding = fused_embedding.unsqueeze(0)
        logits = self.classifier(fused_embedding)
        return logits

if __name__ == "__main__":
    num_epochs = 10
    fused_dim = 896
    num_classes = 7
    fusion_module = VisualFusionModule()
    classifier = M3HClassifier(input_dim=fused_dim, num_classes=num_classes)
    model = M3HVisualModel(fusion_module, classifier)
    model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=5e-5)
    criterion = nn.CrossEntropyLoss()

    csv_train = '/content/Anxiety/anxiety_train.csv'
    image_folder = '/content/Anxiety/anxiety_train_image'
    dataset = AnxietyDataset(csv_file=csv_train, image_folder=image_folder)
    dataloader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=custom_collate)

    if len(dataset) == 0:
        print("No samples found in the dataset. Please verify the image folder path and file names.")
        exit(1)

    figurative_module = FigurativeReasoningModule()

    model.train()
    for epoch in range(num_epochs):
        total_loss = 0.0
        for batch in dataloader:
            ocr_texts, images, labels = batch
            batch_logits = []
            for ocr_text, image in zip(ocr_texts, images):
                reasoning = figurative_module(ocr_text)
                logits = model(ocr_text, image)
                batch_logits.append(logits)
            batch_logits = torch.cat(batch_logits, dim=0)
            labels = labels.to(device)
            loss = criterion(batch_logits, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            print(f"Epoch {epoch+1} Batch loss: {loss.item()}")
        avg_loss = total_loss / len(dataloader)
        print(f"Epoch {epoch+1} Average loss: {avg_loss}")

    torch.save(model.state_dict(), "m3h_visual_model.pth_10")
    print("Model saved to m3h_visual_model.pth")



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch 3 Batch loss: 1.6432061195373535
Epoch 3 Batch loss: 1.5071964263916016
Epoch 3 Batch loss: 1.57987642288208
Epoch 3 Batch loss: 1.8414703607559204
Epoch 3 Batch loss: 1.5474761724472046
Epoch 3 Batch loss: 1.5962923765182495
Epoch 3 Batch loss: 1.3637405633926392
Epoch 3 Batch loss: 1.7246513366699219
Epoch 3 Batch loss: 1.672473669052124
Epoch 3 Batch loss: 1.338304042816162
Epoch 3 Batch loss: 1.5960686206817627
Epoch 3 Batch loss: 1.7401039600372314
Epoch 3 Batch loss: 1.8933320045471191
Epoch 3 Batch loss: 1.433897614479065
Epoch 3 Batch loss: 1.8221559524536133
Epoch 3 Batch loss: 1.410334587097168
Epoch 3 Batch loss: 1.78633451461792
Epoch 3 Batch loss: 1.7041664123535156
Epoch 3 Batch loss: 1.7780771255493164
Epoch 3 Batch loss: 1.6952614784240723
Epoch 3 Batch loss: 1.9583103656768799
Epoch 3 Batch loss: 1.8100254535675049
Epoch 3 Batch loss: 1.560115098953247
Epoch 3 Batch loss: 1.5979454517364502
Epoch 3 

# Evaluation on the Training Data

In [8]:
# Evaluation on the Training Data
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in dataloader:
        ocr_texts, images, labels = batch
        batch_logits = []
        for ocr_text, image in zip(ocr_texts, images):
            reasoning = figurative_module(ocr_text)
            logits = model(ocr_text, image)
            batch_logits.append(logits)
        batch_logits = torch.cat(batch_logits, dim=0)
        preds = torch.argmax(batch_logits, dim=-1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

accuracy = accuracy_score(all_labels, all_preds)
macro_f1 = f1_score(all_labels, all_preds, average='macro')
weighted_f1 = f1_score(all_labels, all_preds, average='weighted')

print("Evaluation Metrics:")
print("Accuracy:", accuracy)
print("Macro-F1:", macro_f1)
print("Weighted-F1:", weighted_f1)


Evaluation Metrics:
Accuracy: 0.5681381957773513
Macro-F1: 0.5275800798770774
Weighted-F1: 0.5547858505297208


# Evaluation on Test Data

In [9]:
import os
import pandas as pd
from PIL import Image
import pytesseract
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics import accuracy_score, f1_score
import clip

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

clip_model, preprocess = clip.load("ViT-B/32", device=device)
clip_model.eval()

def custom_collate(batch):
    ocr_texts, images, labels = zip(*batch)
    labels = torch.tensor(labels, dtype=torch.long)
    return list(ocr_texts), list(images), labels

class AnxietyDataset(Dataset):
    def __init__(self, csv_file, image_folder):
        self.image_folder = image_folder
        data = pd.read_csv(csv_file)
        print("CSV columns:", data.columns)
        print("First few sample_ids from CSV:")
        print(data['sample_id'].head())

        data['filename'] = data['sample_id'].apply(
            lambda x: str(x) + ".jpg" if not str(x).endswith(".jpg") else x
        )
        data['label'] = pd.factorize(data['meme_anxiety_categories'])[0]
        print("Unique label mapping:", dict(enumerate(pd.factorize(data['meme_anxiety_categories'])[1])))
        data['exists'] = data['filename'].apply(
            lambda x: os.path.exists(os.path.join(self.image_folder, x))
        )
        print("Number of existing files found:", data['exists'].sum())

        self.data = data[data['exists']].reset_index(drop=True)
        print(f"Filtered dataset length: {len(self.data)} (only rows with existing images)")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        image_name = row['filename']
        label = row['label']
        image_path = os.path.join(self.image_folder, image_name)
        image_pil = Image.open(image_path).convert("RGB")
        ocr_text = pytesseract.image_to_string(image_pil)
        return ocr_text, image_pil, label

class FigurativeReasoningModule(nn.Module):
    def __init__(self):
        super(FigurativeReasoningModule, self).__init__()

    def forward(self, ocr_text):
        reasoning = "dummy figurative reasoning for: " + ocr_text
        return reasoning

class VisualFusionModule(nn.Module):
    def __init__(self, text_model_name='paraphrase-MiniLM-L6-v2'):
        super(VisualFusionModule, self).__init__()
        self.text_encoder = SentenceTransformer(text_model_name)

    def fuse_embeddings(self, ocr_text, image):
        text_embedding = self.text_encoder.encode(ocr_text, convert_to_tensor=True)
        image_preprocessed = preprocess(image).unsqueeze(0).to(device)
        with torch.no_grad():
            image_embedding = clip_model.encode_image(image_preprocessed)
        image_embedding = image_embedding.squeeze()
        fused = torch.cat([text_embedding, image_embedding], dim=-1)
        return fused.to(device)

    def forward(self, ocr_text, image):
        return self.fuse_embeddings(ocr_text, image)

class M3HClassifier(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(M3HClassifier, self).__init__()
        self.fc = nn.Linear(input_dim, num_classes)

    def forward(self, fused_embedding):
        logits = self.fc(fused_embedding)
        return logits

class M3HVisualModel(nn.Module):
    def __init__(self, fusion_module, classifier):
        super(M3HVisualModel, self).__init__()
        self.fusion_module = fusion_module
        self.classifier = classifier

    def forward(self, ocr_text, image):
        fused_embedding = self.fusion_module(ocr_text, image)
        if fused_embedding.dim() == 1:
            fused_embedding = fused_embedding.unsqueeze(0)
        logits = self.classifier(fused_embedding)
        return logits

if __name__ == "__main__":
    csv_test = '/content/Anxiety/anxiety_test.csv'
    test_image_folder = '/content/Anxiety/anxiety_test_image'

    test_dataset = AnxietyDataset(csv_file=csv_test, image_folder=test_image_folder)
    test_dataloader = DataLoader(test_dataset, batch_size=4, shuffle=False, collate_fn=custom_collate)

    fusion_module = VisualFusionModule()
    classifier = M3HClassifier(input_dim=896, num_classes=7)
    model = M3HVisualModel(fusion_module, classifier)
    model.to(device)

    model_path = "/content/m3h_visual_model.pth_10"
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()
    print("Loaded model from:", model_path)

    figurative_module = FigurativeReasoningModule()

    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in test_dataloader:
            ocr_texts, images, labels = batch
            batch_logits = []
            for ocr_text, image in zip(ocr_texts, images):
                logits = model(ocr_text, image)
                batch_logits.append(logits)
            batch_logits = torch.cat(batch_logits, dim=0)
            preds = torch.argmax(batch_logits, dim=-1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    macro_f1 = f1_score(all_labels, all_preds, average='macro')
    weighted_f1 = f1_score(all_labels, all_preds, average='weighted')

    print("Test Evaluation Metrics:")
    print("Accuracy:", accuracy)
    print("Macro-F1:", macro_f1)
    print("Weighted-F1:", weighted_f1)


Using device: cuda
CSV columns: Index(['sample_id', 'meme_anxiety_categories'], dtype='object')
First few sample_ids from CSV:
0    TE-515
1    TE-203
2    TE-141
3    TE-415
4    TE-386
Name: sample_id, dtype: object
Unique label mapping: {0: 'Impending Doom', 1: 'Nervousness', 2: 'Restlessness', 3: 'Lack of Worry Control', 4: 'Irritatbily', 5: 'Difficulty Relaxing', 6: 'Excessive Worry'}
Number of existing files found: 651
Filtered dataset length: 651 (only rows with existing images)
Loaded model from: /content/m3h_visual_model.pth_10
Test Evaluation Metrics:
Accuracy: 0.2119815668202765
Macro-F1: 0.18731613271501707
Weighted-F1: 0.20320376205863164
