In [None]:
import os
import torch
import random
from PIL import Image
import pandas as pd
import torch.nn as nn
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torch.nn.functional import one_hot
from torchvision import transforms
from torchvision.models import resnet152, mobilenet_v3_small
from torchsummary import summary
import torch.optim as optim
from torch.optim import lr_scheduler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import numpy as np
from IPython.display import clear_output

In [None]:
# Mount Google Drive so the dataset and checkpoint paths under /content/drive resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# One seed for every random number generator the notebook touches, so that
# shuffling and weight initialisation repeat across runs.
SEED = 1234

for _seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_rng(SEED)

# Ask cuDNN to stick to deterministic kernels.
torch.backends.cudnn.deterministic = True

In [None]:
# Google Drive locations used by the training section of the notebook.
img_path = '/content/drive/MyDrive/EXIST 2024/memes'  # directory holding the meme image files
label_path = '/content/drive/MyDrive/EXIST 2024/final_meme_dataset.csv'  # CSV with texts, file names and gold labels
model_save_path = '/content/drive/MyDrive/EXIST 2024/meme_save'  # directory the trained weights are written to

In [None]:
# Prefer the GPU when the runtime exposes one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Filter Data before training

In [None]:
# Read the full annotation table (one row per meme) and preview the first rows.
all_df = pd.read_csv(filepath_or_buffer=label_path)
all_df.head(5)

Unnamed: 0.1,Unnamed: 0,id,lang,text,meme,path_memes,number_annotators,annotators,gender_annotators,age_annotators,...,labels_task4,labels_task5,labels_task6,split,task4_gold_hard,task4_gold_soft,task5_gold_hard,task5_gold_soft,task6_gold_hard,task6_gold_soft
0,0,110001,es,2+2=5 MITO Albert Einstein tenía bajo rendimie...,110001.jpeg,memes/110001.jpeg,6,Annotator_1;Annotator_2;Annotator_3;Annotator_...,F;F;F;M;M;M,18-22;23-45;46+;46+;18-22;23-45,...,YES;YES;YES;YES;YES;YES,DIRECT;DIRECT;DIRECT;DIRECT;DIRECT;DIRECT,IDEOLOGICAL-INEQUALITY|STEREOTYPING-DOMINANCE|...,TRAIN-MEME_ES,YES,"{'YES': 1.0, 'NO': 0.0}",DIRECT,"{'DIRECT': 1.0, 'NO': 0.0, 'JUDGEMENTAL': 0.0}",['IDEOLOGICAL-INEQUALITY'],"{'IDEOLOGICAL-INEQUALITY': 1.0, 'STEREOTYPING-..."
1,1,110002,es,CUANDO UNA MUJER VA A LUCHAR POR SUS DERECHOS,110002.jpeg,memes/110002.jpeg,6,Annotator_1;Annotator_2;Annotator_3;Annotator_...,F;F;F;M;M;M,18-22;23-45;46+;46+;18-22;23-45,...,YES;YES;YES;YES;YES;YES,DIRECT;DIRECT;DIRECT;DIRECT;DIRECT;JUDGEMENTAL,IDEOLOGICAL-INEQUALITY|STEREOTYPING-DOMINANCE|...,TRAIN-MEME_ES,YES,"{'YES': 1.0, 'NO': 0.0}",DIRECT,"{'DIRECT': 0.8333333333333334, 'JUDGEMENTAL': ...","['IDEOLOGICAL-INEQUALITY', 'MISOGYNY-NON-SEXUA...","{'IDEOLOGICAL-INEQUALITY': 0.6666666666666666,..."
2,2,110003,es,ІЯ ЕГЕЯ Е MOA ¿El Partido Republicano busca pe...,110003.jpeg,memes/110003.jpeg,6,Annotator_1;Annotator_2;Annotator_3;Annotator_...,F;F;F;M;M;M,18-22;23-45;46+;46+;18-22;23-45,...,YES;YES;NO;NO;NO;NO,DIRECT;DIRECT;-;-;-;-,STEREOTYPING-DOMINANCE|OBJECTIFICATION|MISOGYN...,TRAIN-MEME_ES,NO,"{'YES': 0.3333333333333333, 'NO': 0.6666666666...",NO,"{'DIRECT': 0.3333333333333333, 'NO': 0.6666666...",['NO'],"{'STEREOTYPING-DOMINANCE': 0.3333333333333333,..."
3,3,110004,es,"Paises que ""apoyan"" los derechos de la mujer A...",110004.jpeg,memes/110004.jpeg,6,Annotator_1;Annotator_2;Annotator_3;Annotator_...,F;F;F;M;M;M,18-22;23-45;46+;46+;18-22;23-45,...,YES;YES;NO;NO;YES;NO,JUDGEMENTAL;JUDGEMENTAL;-;-;JUDGEMENTAL;-,IDEOLOGICAL-INEQUALITY;IDEOLOGICAL-INEQUALITY;...,TRAIN-MEME_ES,-,"{'YES': 0.5, 'NO': 0.5}",-,"{'JUDGEMENTAL': 0.5, 'NO': 0.5, 'DIRECT': 0.0}",-,"{'IDEOLOGICAL-INEQUALITY': 0.3333333333333333,..."
4,4,110005,es,Ya verás como este 8 de marzo hay uno que te s...,110005.jpeg,memes/110005.jpeg,6,Annotator_1;Annotator_2;Annotator_3;Annotator_...,F;F;F;M;M;M,18-22;23-45;46+;46+;18-22;23-45,...,NO;YES;NO;NO;YES;NO,-;JUDGEMENTAL;-;-;DIRECT;-,-;IDEOLOGICAL-INEQUALITY;-;-;IDEOLOGICAL-INEQU...,TRAIN-MEME_ES,NO,"{'NO': 0.6666666666666666, 'YES': 0.3333333333...",NO,"{'NO': 0.6666666666666666, 'JUDGEMENTAL': 0.16...",['NO'],"{'NO': 0.6666666666666666, 'IDEOLOGICAL-INEQUA..."


In [None]:
# Drop the rows whose task-5 hard gold label is '-' (no hard label available),
# keeping only rows labelled DIRECT, JUDGEMENTAL or NO.
unlabeled_rows = all_df.loc[all_df['task5_gold_hard'] == '-'].index
all_df = all_df.drop(index=unlabeled_rows)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Turn the task-5 string labels into integer class ids in a new 'hard_label'
# column; the fitted encoder is kept so ids can be mapped back to names.
task5_encoder = LabelEncoder()
all_df['hard_label'] = task5_encoder.fit_transform(all_df['task5_gold_hard'])

In [None]:

# Side-by-side view of the integer ids and the string labels they encode.
all_df.loc[:, ['hard_label', 'task5_gold_hard']]

Unnamed: 0,hard_label,task5_gold_hard
0,0,DIRECT
1,0,DIRECT
2,2,NO
4,2,NO
5,0,DIRECT
...,...,...
4039,0,DIRECT
4040,1,JUDGEMENTAL
4041,1,JUDGEMENTAL
4042,2,NO


In [None]:
# Preview the filtered table, now including the 'hard_label' column.
all_df.head(5)

Unnamed: 0.1,Unnamed: 0,id,lang,text,meme,path_memes,number_annotators,annotators,gender_annotators,age_annotators,...,labels_task5,labels_task6,split,task4_gold_hard,task4_gold_soft,task5_gold_hard,task5_gold_soft,task6_gold_hard,task6_gold_soft,hard_label
0,0,110001,es,2+2=5 MITO Albert Einstein tenía bajo rendimie...,110001.jpeg,memes/110001.jpeg,6,Annotator_1;Annotator_2;Annotator_3;Annotator_...,F;F;F;M;M;M,18-22;23-45;46+;46+;18-22;23-45,...,DIRECT;DIRECT;DIRECT;DIRECT;DIRECT;DIRECT,IDEOLOGICAL-INEQUALITY|STEREOTYPING-DOMINANCE|...,TRAIN-MEME_ES,YES,"{'YES': 1.0, 'NO': 0.0}",DIRECT,"{'DIRECT': 1.0, 'NO': 0.0, 'JUDGEMENTAL': 0.0}",['IDEOLOGICAL-INEQUALITY'],"{'IDEOLOGICAL-INEQUALITY': 1.0, 'STEREOTYPING-...",0
1,1,110002,es,CUANDO UNA MUJER VA A LUCHAR POR SUS DERECHOS,110002.jpeg,memes/110002.jpeg,6,Annotator_1;Annotator_2;Annotator_3;Annotator_...,F;F;F;M;M;M,18-22;23-45;46+;46+;18-22;23-45,...,DIRECT;DIRECT;DIRECT;DIRECT;DIRECT;JUDGEMENTAL,IDEOLOGICAL-INEQUALITY|STEREOTYPING-DOMINANCE|...,TRAIN-MEME_ES,YES,"{'YES': 1.0, 'NO': 0.0}",DIRECT,"{'DIRECT': 0.8333333333333334, 'JUDGEMENTAL': ...","['IDEOLOGICAL-INEQUALITY', 'MISOGYNY-NON-SEXUA...","{'IDEOLOGICAL-INEQUALITY': 0.6666666666666666,...",0
2,2,110003,es,ІЯ ЕГЕЯ Е MOA ¿El Partido Republicano busca pe...,110003.jpeg,memes/110003.jpeg,6,Annotator_1;Annotator_2;Annotator_3;Annotator_...,F;F;F;M;M;M,18-22;23-45;46+;46+;18-22;23-45,...,DIRECT;DIRECT;-;-;-;-,STEREOTYPING-DOMINANCE|OBJECTIFICATION|MISOGYN...,TRAIN-MEME_ES,NO,"{'YES': 0.3333333333333333, 'NO': 0.6666666666...",NO,"{'DIRECT': 0.3333333333333333, 'NO': 0.6666666...",['NO'],"{'STEREOTYPING-DOMINANCE': 0.3333333333333333,...",2
4,4,110005,es,Ya verás como este 8 de marzo hay uno que te s...,110005.jpeg,memes/110005.jpeg,6,Annotator_1;Annotator_2;Annotator_3;Annotator_...,F;F;F;M;M;M,18-22;23-45;46+;46+;18-22;23-45,...,-;JUDGEMENTAL;-;-;DIRECT;-,-;IDEOLOGICAL-INEQUALITY;-;-;IDEOLOGICAL-INEQU...,TRAIN-MEME_ES,NO,"{'NO': 0.6666666666666666, 'YES': 0.3333333333...",NO,"{'NO': 0.6666666666666666, 'JUDGEMENTAL': 0.16...",['NO'],"{'NO': 0.6666666666666666, 'IDEOLOGICAL-INEQUA...",2
5,5,110006,es,"Princesa, hoy es tu día, no laves ningún plato...",110006.jpeg,memes/110006.jpeg,6,Annotator_1;Annotator_2;Annotator_3;Annotator_...,F;F;F;M;M;M,18-22;23-45;46+;46+;18-22;23-45,...,DIRECT;DIRECT;DIRECT;DIRECT;DIRECT;DIRECT,IDEOLOGICAL-INEQUALITY|STEREOTYPING-DOMINANCE|...,TRAIN-MEME_ES,YES,"{'YES': 1.0, 'NO': 0.0}",DIRECT,"{'DIRECT': 1.0, 'NO': 0.0, 'JUDGEMENTAL': 0.0}","['IDEOLOGICAL-INEQUALITY', 'STEREOTYPING-DOMIN...","{'IDEOLOGICAL-INEQUALITY': 0.3333333333333333,...",0


In [None]:
# Hold out 10% of the labelled rows as a test split; random_state is fixed so the split repeats.
# NOTE(review): the split is not stratified on 'hard_label' and uses 42 rather than SEED — confirm this is intended.
train_df, test_df = train_test_split(all_df, test_size=0.1, random_state=42)

In [None]:
import torch
import torch.nn as nn
from transformers import CLIPProcessor, CLIPModel
from torch import optim

# Load the Hugging Face CLIP model and processor
device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "openai/clip-vit-large-patch14"  # Choose the desired variant
processor = CLIPProcessor.from_pretrained(model_name)
clip_model = CLIPModel.from_pretrained(model_name).to(device)

# CLIP is only used as a fixed feature extractor: the optimizer below is built
# from the classifier head's parameters alone. Freezing the backbone stops
# autograd from building a graph through it on every training step.
clip_model = clip_model.eval().requires_grad_(False)

# Mini-batch size shared by the train and test data loaders.
batch_size = 8

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/961k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.52k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

# CLIP Alternative

In [None]:
class CustomDataset(Dataset):
    """Labelled meme dataset yielding ``(text, image, label)`` triples.

    Args:
        root_dir: Directory containing the image files named in the ``meme`` column.
        label_df: DataFrame with ``text`` and ``meme`` columns plus ``label_column``.
        label_column: Name of the column in ``label_df`` used as the target.
        transform: Optional callable applied to the RGB PIL image.
    """

    def __init__(self, root_dir, label_df, label_column, transform=None):
        self.root_dir = root_dir
        self.label_df = label_df
        self.transform = transform
        # Materialise the rows once so __getitem__ is a plain list lookup.
        self.data = self.read_annotations(label_column)

    def __len__(self):
        """Return the number of annotated samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(text, image, label)``."""
        text, img_name, label = self.data[idx]
        img_path = os.path.join(self.root_dir, img_name)
        # convert() returns a fully loaded copy, so the underlying file can be
        # closed immediately instead of lingering until garbage collection.
        with Image.open(img_path) as img_file:
            image = img_file.convert('RGB')

        if self.transform:
            image = self.transform(image)

        return text, image, label

    def read_annotations(self, label_column):
        """Return a list of ``(text, image file name, label)`` tuples, one per row."""
        frame = self.label_df
        return list(zip(frame['text'], frame['meme'], frame[label_column]))

In [None]:
class ImagePersuasivenessClassifier(nn.Module):
    """Linear classification head on top of CLIP text and image embeddings.

    Relies on the notebook-level ``processor``, ``clip_model`` and ``device``
    globals. Only ``self.linear`` is registered on the module, so it is the
    only thing trained by an optimizer built from ``parameters()`` and the
    only thing stored in ``state_dict()``.

    Args:
        num_classes: Number of output logits.
        embedding_dim: Size of the concatenated image+text embedding. Defaults
            to 1536, the value that was previously hard-coded (presumably
            2 x 768 for ``clip-vit-large-patch14`` — confirm before switching
            CLIP variants).
    """

    def __init__(self, num_classes, embedding_dim=1536):
        super(ImagePersuasivenessClassifier, self).__init__()
        # Linear layer for classification
        self.linear = nn.Linear(embedding_dim, num_classes)

    def forward(self, text, image):
        """Return class logits for a text (or sequence of texts) and image tensor(s).

        ``image`` is expected to be already scaled to [0, 1] (e.g. by
        ``transforms.ToTensor``), so the image processor is told not to
        rescale it a second time. Heads trained with the previous
        double-rescaled inputs must be retrained to match.
        """
        # Tokenizer and image processor are called separately so each only
        # receives the keyword arguments it understands.
        text_inputs = processor.tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        # do_rescale=False: pixel values are already in [0, 1]; dividing by 255
        # again would feed CLIP almost-black images.
        image_inputs = processor.image_processor(image, return_tensors="pt", do_rescale=False)

        # The CLIP backbone is not part of this module's parameters, so no
        # gradient is needed through it; this avoids storing its activations.
        with torch.no_grad():
            outputs = clip_model(**text_inputs.to(device), **image_inputs.to(device))

        # Concatenate embeddings (image first, then text)
        combined_embedding = torch.cat((outputs.image_embeds, outputs.text_embeds), dim=1)

        # Classification
        logits = self.linear(combined_embedding)
        return logits

# Example usage
num_classes = 3  # task-5 hard labels: DIRECT, JUDGEMENTAL, NO
model = ImagePersuasivenessClassifier(num_classes).to(device)

# Define transformations
transform = transforms.Compose([
        transforms.Resize((224, 224)),  # Resize images to fit the pre-trained model input size
        transforms.ToTensor(),
    ])

# Load datasets
train_dataset = CustomDataset(root_dir=img_path, label_df=train_df, label_column='hard_label', transform=transform)
test_dataset = CustomDataset(root_dir=img_path, label_df=test_df, label_column='hard_label', transform=transform)

# Define data loaders
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Make sure the checkpoint directory exists before spending time on training,
# so the final torch.save cannot fail on a missing folder.
os.makedirs(model_save_path, exist_ok=True)

# Training loop
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0  # sum of the per-batch losses over the epoch
    for text, image, labels in train_loader:
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = model(text, image)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    print(f"Epoch [{epoch+1}/{num_epochs}]: Loss = {running_loss}")

    # Evaluate model on test data (eval mode; switched back by model.train() next epoch)
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for text, image, labels in test_loader:
            labels = labels.to(device)
            predicted = model(text, image).argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    print("Accuracy: ", correct / total)


# Save the trained model (only the linear head lives in the state dict)
torch.save(model.state_dict(), model_save_path + "/clip_task5.pth")


It looks like you are trying to rescale already rescaled images. If the input images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again.


Epoch [1/5]: Loss = 358.06670784950256
Accuracy:  0.5125
Epoch [2/5]: Loss = 344.00928527116776
Accuracy:  0.53125
Epoch [3/5]: Loss = 334.4566608071327
Accuracy:  0.553125
Epoch [4/5]: Loss = 326.9057405591011
Accuracy:  0.56875
Epoch [5/5]: Loss = 321.32852280139923
Accuracy:  0.56875


In [None]:
# Accuracy of the current model over the whole test loader.
correct, total = 0, 0
with torch.no_grad():
    for text, image, labels in test_loader:
        labels = labels.to(device)
        # Index of the largest logit is the predicted class.
        predicted = model(text, image).argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(correct/total)

0.7105263157894737


## Prediction section

In [None]:
import torch
import torch.nn as nn
from transformers import CLIPProcessor, CLIPModel
from torch import optim

# Load the Hugging Face CLIP model and processor
device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "openai/clip-vit-large-patch14"  # Choose the desired variant
processor = CLIPProcessor.from_pretrained(model_name)
clip_model = CLIPModel.from_pretrained(model_name).to(device)

# CLIP is only used as a fixed feature extractor here, so put it in eval mode
# and turn off gradient tracking for its parameters.
clip_model = clip_model.eval().requires_grad_(False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/961k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.52k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

In [None]:
class ImagePersuasivenessClassifier(nn.Module):
    """Linear classification head on top of CLIP text and image embeddings.

    Relies on the notebook-level ``processor``, ``clip_model`` and ``device``
    globals. Only ``self.linear`` is registered on the module, so it is the
    only thing stored in (and loaded from) ``state_dict()``.

    Args:
        num_classes: Number of output logits.
        embedding_dim: Size of the concatenated image+text embedding. Defaults
            to 1536, the value that was previously hard-coded (presumably
            2 x 768 for ``clip-vit-large-patch14`` — confirm before switching
            CLIP variants).
    """

    def __init__(self, num_classes, embedding_dim=1536):
        super(ImagePersuasivenessClassifier, self).__init__()
        # Linear layer for classification
        self.linear = nn.Linear(embedding_dim, num_classes)

    def forward(self, text, image):
        """Return class logits for a text (or sequence of texts) and image tensor(s).

        ``image`` is expected to be already scaled to [0, 1] (e.g. by
        ``transforms.ToTensor``), so the image processor is told not to
        rescale it a second time. Heads trained with the previous
        double-rescaled inputs must be retrained to match.
        """
        # Tokenizer and image processor are called separately so each only
        # receives the keyword arguments it understands.
        text_inputs = processor.tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        # do_rescale=False: pixel values are already in [0, 1]; dividing by 255
        # again would feed CLIP almost-black images.
        image_inputs = processor.image_processor(image, return_tensors="pt", do_rescale=False)

        # The CLIP backbone is not part of this module's parameters, so no
        # gradient is needed through it; this avoids storing its activations.
        with torch.no_grad():
            outputs = clip_model(**text_inputs.to(device), **image_inputs.to(device))

        # Concatenate embeddings (image first, then text)
        combined_embedding = torch.cat((outputs.image_embeds, outputs.text_embeds), dim=1)

        # Classification
        logits = self.linear(combined_embedding)
        return logits

In [None]:
# NOTE(review): the training cell in this notebook saves a 3-class head as
# "clip_task5.pth", while this cell loads a 2-class head from
# "clip_model_task5.pth" — confirm which checkpoint / class count is intended.
num_classes = 2
model = ImagePersuasivenessClassifier(num_classes).to(device)
# map_location lets a checkpoint saved on a GPU load on a CPU-only runtime too.
model.load_state_dict(torch.load(model_save_path + "/clip_model_task5.pth", map_location=device))
model.eval()

ImagePersuasivenessClassifier(
  (linear): Linear(in_features=1536, out_features=2, bias=True)
)

In [None]:
# Google Drive locations of the unlabeled test set.
test_img_path = "/content/drive/MyDrive/EXIST 2024/Final/Testsets/Memes/memes"  # directory holding the test meme images
test_dataset_path = "/content/drive/MyDrive/EXIST 2024/Final/Testsets/Memes/EXIST2024_test_clean.json"  # JSON file describing the test entries

In [None]:
import json
# JSON is UTF-8 by specification; state the encoding explicitly so the meme
# texts decode the same way regardless of the runtime's locale default.
with open(test_dataset_path, encoding="utf-8") as f:
    file = json.load(f)

In [None]:
# Flatten the loaded JSON mapping into two parallel lists (image file names
# and meme texts), keeping the order of the entries in the file.
test_dict = {
    "meme": [record["meme"] for record in file.values()],
    "text": [record["text"] for record in file.values()],
}

In [None]:
# Image preprocessing applied before the model sees a meme:
# resize to 224x224, then convert the PIL image to a tensor.
transform = transforms.Compose(
    [
        transforms.Resize(size=(224, 224)),
        transforms.ToTensor(),
    ]
)

In [None]:
# Run the classifier over every test meme, one sample at a time, collecting
# both the class probabilities (soft) and the most likely label (hard).
count = len(test_dict["meme"])
hard_results = []
soft_results = []
with torch.no_grad():
    for idx in range(count):
        text, img_name = test_dict["text"][idx], test_dict["meme"][idx]
        img_id = img_name.split(".")[0]
        # Use a loop-local name: assigning to `img_path` here would overwrite
        # the notebook-level training image directory of the same name.
        meme_file = os.path.join(test_img_path, img_name)
        # Close the file as soon as the pixels have been converted.
        with Image.open(meme_file) as raw_image:
            image = transform(raw_image.convert('RGB'))
        outputs = model(text, image)
        # soft label: class 0 is reported as "NO", class 1 as "YES"
        probs = torch.softmax(outputs, dim=1)
        soft_results.append({"test_case": "EXIST2024",
                             "id": img_id,
                             "value": {
                                 "YES": probs[0][1].item(),
                                 "NO": probs[0][0].item(),
                             }})
        predicted = probs.argmax(dim=1)
        # hard results
        hard_label = "NO" if predicted[0].item() == 0 else "YES"
        hard_results.append({"test_case": "EXIST2024",
                             "id": img_id,
                             "value": hard_label})


In [None]:
# The context manager flushes and closes the file even if json.dump raises.
with open("task4_hard.json", "w") as save_file:
    json.dump(hard_results, save_file)

In [None]:
# The context manager flushes and closes the file even if json.dump raises.
with open("task4_soft.json", "w") as save_file:
    json.dump(soft_results, save_file)

In [None]:
class TestCLIPDataset(Dataset):
    """Unlabeled meme dataset yielding ``(text, image)`` pairs.

    Args:
        root_dir: Directory containing the image files.
        data_dict: Mapping with parallel ``"text"`` and ``"meme"`` sequences
            (meme text and image file name for each sample).
        transform: Optional callable applied to the RGB PIL image.
    """

    def __init__(self, root_dir, data_dict, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.data_dict = data_dict

    def __len__(self):
        """Return the number of samples (length of the ``"text"`` list)."""
        return len(self.data_dict["text"])

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(text, image)``; the file name is not returned."""
        text, img_name = self.data_dict["text"][idx], self.data_dict["meme"][idx]
        img_path = os.path.join(self.root_dir, img_name)
        # convert() returns a fully loaded copy, so the underlying file can be
        # closed immediately instead of lingering until garbage collection.
        with Image.open(img_path) as img_file:
            image = img_file.convert('RGB')

        if self.transform:
            image = self.transform(image)

        return text, image

In [None]:
# Wrap the unlabeled test memes in a dataset ...
test_dataset = TestCLIPDataset(test_img_path, test_dict, transform)

# ... and iterate over it in file order, batch_size samples at a time.
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Batched prediction over the test loader; class index 0 -> 'NO', 1 -> 'YES'.
mapping_dict = {0: 'NO', 1: 'YES'}
predicted_labels = []  # 'YES'/'NO' strings for every sample seen so far, in loader order
with torch.no_grad():
        for data in test_loader:
            # TestCLIPDataset yields (text, image) pairs, so take the first two
            # items instead of unpacking three values (which raised ValueError).
            text, image = data[0], data[1]
            outputs = model(text, image)
            probs = torch.softmax(outputs.data, dim=1)
            _, predicted = torch.max(probs, 1)
            # 0-dim tensors hash by identity and never match the int keys of
            # mapping_dict, so convert to plain Python ints before the lookup.
            mapped_strings = [mapping_dict[val] for val in predicted.tolist()]
            predicted_labels.extend(mapped_strings)