In [1]:
from tqdm import tqdm
import os
from datetime import datetime
from time import time
import pandas as pd
from src.datasets import TextConcatFactCheck, TextConcatPosts
from src.models import EmbeddingModel
from src import config
from src.utils import log_info

task_name = "monolingual"
langs = ["eng"]
output_path = None
model_name = '/home/bsc/bsc830651/.cache/huggingface/hub/models--intfloat--multilingual-e5-large/snapshots/ab10c1a7f42e74530fe7ae5be82e6d4f11a719eb'
tasks_path = config.TASKS_PATH
posts_path = config.POSTS_PATH
fact_checks_path = config.FACT_CHECKS_PATH

"""
Run the task with the given parameters.
"""
current_time = datetime.now().strftime("%Y%m%d-%H%M%S")

if output_path is not None:
    output_path = os.path.join(output_path, task_name, current_time)

langs = ["eng"] if task_name == "crosslingual" else langs

log_info(f"Task: {task_name}")
log_info(f"Tasks path: {tasks_path}")
log_info(f"Languages: {langs}")
log_info(f"Model: {model_name}")
log_info(f"Output path: {output_path}\n")

# Paths from config
posts_path = config.POSTS_PATH
fact_checks_path = config.FACT_CHECKS_PATH
gs_path = config.GS_PATH

# tasks_path = "data/splits/tasks_local_dev.json"
ls_k = [1, 3, 5, 10]

d_out = {}
df_eval = pd.DataFrame(index=ls_k)
df_eval.index.name = "k"

for lang in tqdm(langs, desc="Languages"):
    log_info(f"Lang: {lang}")
    time_start_lang = time()
    
    log_info("Loading posts...")
    time_start = time()
    posts = TextConcatPosts(posts_path, tasks_path, task_name=task_name, gs_path=gs_path, lang=lang)
    log_info(f"Loaded {len(posts)}")
    log_info(f"Time taken: {time() - time_start:.2f}s\n")
    
    log_info("Loading fact checks..")
    time_start = time()
    fact_checks = TextConcatFactCheck(fact_checks_path, tasks_path, task_name=task_name, lang=lang)
    log_info(f"Loaded {len(fact_checks)}")
    log_info(f"Time taken: {time() - time_start:.2f}s\n")

    df_fc = fact_checks.df
    # df_posts_train = posts.df_train
    df_posts_dev = posts.df_dev
    log_info("Loading model...")
    # time_start = time()
    # model = EmbeddingModel(model_name, df_fc, batch_size=512)
    # log_info(f"Time taken: {time() - time_start:.2f}s\n")

  from tqdm.autonotebook import tqdm, trange
2024-10-25 20:19:17,411 - INFO - Task: monolingual
2024-10-25 20:19:17,411 - INFO - Tasks path: data/splits/tasks_no_gs_overlap.json
2024-10-25 20:19:17,412 - INFO - Languages: ['eng']
2024-10-25 20:19:17,412 - INFO - Model: /home/bsc/bsc830651/.cache/huggingface/hub/models--intfloat--multilingual-e5-large/snapshots/ab10c1a7f42e74530fe7ae5be82e6d4f11a719eb
2024-10-25 20:19:17,412 - INFO - Output path: None

Languages:   0%|          | 0/1 [00:00<?, ?it/s]2024-10-25 20:19:17,415 - INFO - Lang: eng
2024-10-25 20:19:17,415 - INFO - Loading posts...
2024-10-25 20:19:19,309 - INFO - Loaded 24431
2024-10-25 20:19:19,310 - INFO - Time taken: 1.89s

2024-10-25 20:19:19,310 - INFO - Loading fact checks..
2024-10-25 20:19:26,154 - INFO - Loaded 85734
2024-10-25 20:19:26,155 - INFO - Time taken: 6.85s

2024-10-25 20:19:26,156 - INFO - Loading model...
Languages: 100%|██████████| 1/1 [00:08<00:00,  8.74s/it]


In [2]:
# from transformers import pipeline
# from transformers import AutoTokenizer, AutoModelForTokenClassification
# ner_model_path = "/gpfs/projects/bsc14/abecerr1/hub/models--FacebookAI--xlm-roberta-large-finetuned-conll03-english/snapshots/18f95e9924f3f452df09cc90945073906ef18f1e/"
# ner_tokenizer = AutoTokenizer.from_pretrained(ner_model_path)
# ner_model = AutoModelForTokenClassification.from_pretrained(ner_model_path)
# ner_classifier = pipeline("ner", model=ner_model, tokenizer=ner_tokenizer, device="cuda")
# ner_out = ner_classifier("Alya told Jasmine that Andrew could pay with cash..")

In [3]:
import spacy

spacy.prefer_gpu()
# nlp = spacy.load("xx_ent_wiki_sm")
nlp = spacy.load("en_core_web_sm")

from tqdm import tqdm
import pandas as pd
tqdm.pandas()

df_posts_train = posts.df_train
# df_posts_train["entities"] = df_posts_train["full_text"].progress_apply(lambda x: nlp(x).ents)
df_posts_train["lemmas"] = df_posts_train["full_text"].progress_apply(lambda x: [y.lemma_ for y in nlp(x)])


100%|██████████| 4012/4012 [00:51<00:00, 77.80it/s]


In [9]:
df_posts_train["full_text"].iloc[:3].progress_apply(lambda x: [y.lemma_ for y in nlp(x)])

100%|██████████| 3/3 [00:00<00:00, 99.21it/s]


post_id
2     [", actually, ,, he, be, a, damn, sight, well,...
5     [", cigarette, smoking, do, not, cause, cancer...
13    [", environmentalist, ", Say, Fracking, be, ev...
Name: full_text, dtype: object

In [18]:
[y.lemma_ for y in nlp(df_posts_train["full_text"].iloc[0]) if y.is_stop == False and y.is_punct == False and y.is_space == False and y.pos_ in ["NOUN", "PROPN"]]

['sight',
 'president',
 'Miss',
 'Ardern',
 'Judith',
 'Collins',
 'Donald',
 'Trump',
 'thanks[SEP']

In [5]:
nlp("Alya told Jasmine that Andrew could pay with cash..").ents

(Jasmine, Andrew)

In [6]:
df_posts_train[df_posts_train["entities"].apply(lambda x: len(x) > 0)]

KeyError: 'entities'