In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
nlp = spacy.load("en_core_web_lg")

from tqdm import tqdm
tqdm.pandas()   

from src import config
from src.datasets import TextConcatFactCheck, TextConcatPosts
from src.utils import cleaning_spacy, cleaning_spacy_batch

tasks_path = config.TASKS_PATH
posts_path = config.POSTS_PATH
fact_checks_path = config.FACT_CHECKS_PATH
gs_path = config.GS_PATH
lang = 'tha'
task_name = "monolingual"

print("Loading Fact Checks...")
fc = TextConcatFactCheck(fact_checks_path, tasks_path=tasks_path, task_name=task_name, lang=lang, version="english")
print("Loading Fact Checks (English + Clean)...")
# fc_eng = TextConcatFactCheck(fact_checks_path, tasks_path=tasks_path, task_name=task_name, lang=lang, version="english", cleaning_function=lambda x: cleaning_spacy_batch(x, nlp))

print("Loading Posts...")
posts = TextConcatPosts(posts_path, tasks_path=tasks_path, task_name=task_name, lang=lang, gs_path=gs_path, version="english")
print("Loading Posts (English + Clean)...")
# posts_eng = TextConcatPosts(posts_path, tasks_path=tasks_path, task_name=task_name, lang=lang, gs_path=gs_path, version="english", cleaning_function=lambda x: cleaning_spacy_batch(x, nlp))

  from .autonotebook import tqdm as notebook_tqdm


Loading Fact Checks...
Loading Fact Checks (English + Clean)...
Loading Posts...
Loading Posts (English + Clean)...


In [2]:
df_fc = fc.df
df_train_posts = posts.df_train
df_dev_posts = posts.df_dev

In [3]:
print(f"TRAIN: {df_train_posts.shape}, DEV: {df_dev_posts.shape}, FC: {df_fc.shape}")

TRAIN: (418, 9), DEV: (47, 9), FC: (382, 4)


In [11]:
import torch
from transformers import pipeline

model_id = "meta-llama/Llama-3.2-3B-Instruct"
pipe = pipeline(
    "text-generation",
    model=model_id,
    torch_dtype=torch.bfloat16,
    # device="cuda",
    device_map="auto",
)

terminators = [
    pipe.tokenizer.eos_token_id,
    pipe.tokenizer.convert_tokens_to_ids("<|eot_id|>")
]


Loading checkpoint shards: 100%|██████████| 2/2 [00:07<00:00,  4.00s/it]
Device set to use cuda:0


In [14]:

ocr = "First with a hood and stick in hand... After arresting protesters... What is it called??? or Carlos Roman Hamu"
text = ""

messages = [
        {"role": "system", "content": "You are an expert recovering the image of a post from a social media platform only using the text and OCR of the post. do it in less than 512 tokens."\
                "Complete the post with the information from the image. Be concise and provide proper names if necessary. Be 100% confident in your answer. Provide only the description of the image."\
                 "Provide always a description even if you don't know the answer without justifying your decision just as if it was real. Talk only about the content not your reasoning.",},
        {"role": "user", "content": "The OCR of the post is: " + ocr + " The text of the post is: " + text},]
outputs = pipe(
            messages,
            max_new_tokens=32,
            temperature=0.1,
            eos_token_id=terminators,
            pad_token_id=pipe.tokenizer.eos_token_id,
        )

outputs

[{'generated_text': [{'role': 'system',
    'content': "You are an expert recovering the image of a post from a social media platform only using the text and OCR of the post. do it in less than 512 tokens.Complete the post with the information from the image. Be concise and provide proper names if necessary. Be 100% confident in your answer. Provide only the description of the image.Provide always a description even if you don't know the answer without justifying your decision just as if it was real. Talk only about the content not your reasoning."},
   {'role': 'user',
    'content': 'The OCR of the post is: First with a hood and stick in hand... After arresting protesters... What is it called??? or Carlos Roman Hamu The text of the post is: '},
   {'role': 'assistant', 'content': 'Police brutality'}]}]

In [58]:
def describe_image(ocr, text):
    messages = [
{
  "role": "system",
  "content": 
  "You are an expert in recovering the image of a social media post based on its OCR and textual content. Use the given text and OCR to describe the post's visual content concisely. Be confident in your descriptions.\n"
  "If you cannot deduce anything meaningful about the image, return <NO ANSWER>. Do not explain your reasoning. Focus only on describing the content.\n"
  "\n"
  "Example 0:\n"
  "OCR: years in 2022. That makes him the oldest land animal known alive today."
  "Text: #ascoisasinteressantes #sabiocurioso #like4like #food #instalike #likeforlike #family #travel #fitness #igers #tagsforlikes #follow4follow #nofilter #life #filosofia #estoicismo #beauty #amazing #instamood #instagram #sun #photo #mphotographyusic #beach #followforfollow #bestoftheday"
  "Output: 'Jonathan the tortoise, the oldest living land animal, who turned 190 in 2022.'\n"
  "Example 1:\n"
  "OCR: '50% OFF SALE NOW'\n"
  "Text: 'Huge discounts on electronics.'\n"
  "Output: 'An advertisement banner with bold text offering a 50% discount.'\n"
  "\n"
  "Example 2:\n"
  "OCR: 'John Doe for Mayor'\n"
  "Text: 'Vote for a better future!'\n"
  "Output: 'A political campaign poster featuring John Doe.'\n"
  "\n"
  "Example 3:\n"
  "OCR: 'Unreadable'\n"
  "Text: 'Important meeting tomorrow.'\n"
  "Output: '<NO ANSWER>'\n"
  "\n"
  "Example 4:\n"
  "OCR: She set a world record in 2023 for the longest free dive by a woman, reaching a depth of 123 meters.\n"
  "Text: 'This is so inspiring! She broke the record and reached 123 meters underwater. Women are unstoppable! 🌊 #freediving #worldrecord'\n"
  "Output: 'A female freediver who set a world record in 2023 by diving 123 meters.'\n"
  "\n"
  "Example 5:\n"
  "OCR: This structure, completed in 1889, was once the tallest man-made structure in the world.\n"
  "Text: 'Visited Paris last summer and finally saw the Eiffel Tower! Can't believe it was the tallest building back in 1889. 🗼❤️'\n"
  "Output: 'The Eiffel Tower, completed in 1889, and once the tallest man-made structure.'\n"
  "\n"
  "Example 6:\n"
  "OCR: In 1969, humanity took its first steps on the Moon, marking a historic milestone in space exploration.\n"
  "Text: 'The Moon landing in 1969 still gives me chills. Imagine watching that live! Truly one of humankind's greatest achievements. 🌕🚀'\n"
  "Output: 'The first human Moon landing in 1969, a milestone in space exploration.'\n"
  "\n"
  "Example 7:\n"
  "OCR: Known as the King of Pop, his album Thriller remains the best-selling album of all time.\n"
  "Text: 'Listening to Michael Jackson's Thriller on repeat today. It’s crazy how it’s still the best-selling album of all time. 👑🎶'\n"
  "Output: 'Michael Jackson, the King of Pop, known for Thriller, the best-selling album ever.'\n"
  "\n"
  "Example 8:\n"
  "OCR: This majestic animal is the largest land mammal and is known for its intelligence and memory.\n"
  "Text: 'Saw an elephant up close during my safari trip. They're so intelligent and graceful. Truly the giants of the animal kingdom. 🐘❤️'\n"
  "Output: 'The elephant, the largest land mammal, celebrated for its intelligence and memory.'\n"
},

        {"role": "user", "content": f"The OCR of the post is: {ocr} The text of the post is: {text}"}
    ]
    outputs = pipe(
        messages,
        max_new_tokens=128,
        temperature=0.01,
        eos_token_id=pipe.tokenizer.eos_token_id,
        pad_token_id=pipe.tokenizer.eos_token_id
    )

    return outputs[0]["generated_text"][-1]["content"]

In [59]:
df_train_posts_mini = df_train_posts.iloc[:20]

In [60]:
df_train_posts_mini["description"] = df_train_posts_mini.progress_apply(lambda x: describe_image(x["ocr"], x["text"]), axis=1)

100%|██████████| 20/20 [00:13<00:00,  1.51it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train_posts_mini["description"] = df_train_posts_mini.progress_apply(lambda x: describe_image(x["ocr"], x["text"]), axis=1)


In [61]:
print(df_train_posts_mini["description"].to_markdown())

|   post_id | description                                                                                                                                                                                                                                                                                                                                                                               |
|----------:|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|        72 | A vintage advertisement for a futuristic car, predicting a world with advanced technology and minimal pollution.                                                                                      

In [62]:
from src.models import EmbeddingModel
model_name = "/gpfs/projects/bsc14/abecerr1/hub/models--intfloat--multilingual-e5-large/snapshots/ab10c1a7f42e74530fe7ae5be82e6d4f11a719eb"
model = EmbeddingModel(model_name=model_name, df_fc=df_fc, device="cuda")

Batches: 100%|██████████| 3/3 [00:00<00:00,  7.28it/s]


In [63]:
df_dev_orig = df_dev_posts.copy()
df_dev_desc = df_dev_posts.copy()
df_dev_desc["description"] = df_dev_desc.progress_apply(lambda x: describe_image(x["ocr"], x["text"]), axis=1)

100%|██████████| 47/47 [00:38<00:00,  1.22it/s]


In [77]:
df_dev_desc["full_desc"] = df_dev_desc["description"] + " " + df_dev_desc["ocr"] + " " + df_dev_desc["text"]

In [65]:
# df_dev_desc.loc[df_dev_desc["description"] == "<NO ANSWER>", "description"] = df_dev_desc.loc[df_dev_desc["description"] == "<NO ANSWER>", "full_text"]

In [79]:
df_dev_orig["preds"] = model.predict(df_dev_orig["full_text"].tolist()).tolist()
df_dev_desc["preds"] = model.predict(df_dev_desc["full_desc"].tolist()).tolist()

Batches: 100%|██████████| 1/1 [00:00<00:00,  2.55it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.55it/s]


In [80]:
print(model.evaluate(df_dev_orig, task_name=task_name, lang=lang))
print(model.evaluate(df_dev_desc, task_name=task_name, lang=lang))

{'monolingual': {'tha': {1: np.float64(0.7659574468085106), 3: np.float64(0.9148936170212766), 5: np.float64(0.9361702127659575), 10: np.float64(0.9787234042553191)}}}
{'monolingual': {'tha': {1: np.float64(0.6808510638297872), 3: np.float64(0.8723404255319149), 5: np.float64(0.9361702127659575), 10: np.float64(0.9574468085106383)}}}


In [81]:
df_dev_desc["correct"] = df_dev_desc.progress_apply(lambda x: len(np.intersect1d(x["preds"][:10], x["gs"])) > 0, axis=1)

100%|██████████| 47/47 [00:00<00:00, 31068.92it/s]


In [82]:
# Get position in which the correct answer is
df_dev_desc["pos_true"] = df_dev_desc.progress_apply(lambda x: [x["preds"].index(gs) for gs in x["gs"] if gs in x["preds"]], axis=1)

100%|██████████| 47/47 [00:00<00:00, 65666.98it/s]


In [83]:
print(df_dev_desc[~df_dev_desc["correct"]].to_markdown())

|   post_id | ocr                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  | verdicts                  | text                                                                                                                   

In [84]:
print(df_fc.loc[[193430, 193487], :].to_markdown())

|   fact_check_id | claim                                                                                                | instances                                                                                                                                                        | title                                                                                                   | full_text                                                                                                                                                                                                    |
|----------------:|:-----------------------------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------|:--------------------------------------------------------------------------------------------------------|:---------