In [1]:
from src.datasets import TextConcatPosts, TextConcatFactCheck
from src.models import EmbeddingModel

tasks_path = "data/splits/tasks_no_gs_overlap.json" # This is the file that will be replaced in the final version with new split
posts_path = "data/complete_data/posts.csv"
fact_checks_path = "data/complete_data/fact_checks.csv"
gs_path = "data/complete_data/pairs.csv"
langs = ['fra', 'spa', 'eng', 'por', 'tha', 'deu', 'msa', 'ara']

def succ_at_k(df, k, group=True):
    if group:
        return df.apply(lambda x: len(list((set(x["preds"][:k]) & set(x["gs"])))) > 0, axis=1).mean()
    else:
        return df.explode("gs").apply(lambda x: x["gs"] in x["preds"][:k], axis=1).mean()
    
def print_succ_at_k(df, k, group=True):
    print(f"S@{k} (group)", succ_at_k(df, k, group=True))
    print(f"S@{k} (explode)", succ_at_k(df, k, group=False))


lang = "fra"
    
print("\n\nProcessing", lang)
posts = TextConcatPosts(posts_path, tasks_path, task_name="monolingual", gs_path=gs_path, lang=lang)
fact_checks = TextConcatFactCheck(fact_checks_path, tasks_path, task_name="monolingual", lang=lang)

df_fc = fact_checks.df
df_posts_train = posts.df_train
df_posts_dev = posts.df_dev

model_name = '/home/bsc/bsc830651/.cache/huggingface/hub/models--intfloat--multilingual-e5-large/snapshots/ab10c1a7f42e74530fe7ae5be82e6d4f11a719eb'

model = EmbeddingModel(model_name, df_fc, batch_size=512)

df_posts_dev["preds"] = model.predict(df_posts_dev["full_text"].values).tolist()

print_succ_at_k(df_posts_dev, 1)
print_succ_at_k(df_posts_dev, 5)
print_succ_at_k(df_posts_dev, 10)


  from tqdm.autonotebook import tqdm, trange




Processing fra


Batches: 100%|██████████| 9/9 [00:06<00:00,  1.50it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.23s/it]

S@1 (group) 0.7320261437908496
S@1 (explode) 0.7225806451612903
S@5 (group) 0.8627450980392157
S@5 (explode) 0.864516129032258
S@10 (group) 0.8758169934640523
S@10 (explode) 0.8774193548387097





In [2]:
from src import config
from src.datasets import TextConcatPosts, TextConcatFactCheck
from src.models import EmbeddingModel

posts = TextConcatPosts(config.POSTS_PATH, "data/splits/tasks_no_gs_overlap.json", task_name="monolingual", gs_path=config.GS_PATH, lang="spa")
fact_checks = TextConcatFactCheck(config.FACT_CHECKS_PATH, "data/splits/tasks_no_gs_overlap.json", task_name="monolingual", lang="spa")
model_name = '/home/bsc/bsc830651/.cache/huggingface/hub/models--intfloat--multilingual-e5-large/snapshots/ab10c1a7f42e74530fe7ae5be82e6d4f11a719eb'



  from tqdm.autonotebook import tqdm, trange


In [3]:
model = EmbeddingModel(model_name, fact_checks.df, batch_size=512)
df_posts_dev = posts.df_dev
df_posts_dev["preds"] = model.predict(df_posts_dev["full_text"].values).tolist()



Batches: 100%|██████████| 28/28 [00:15<00:00,  1.86it/s]
Batches: 100%|██████████| 1/1 [00:04<00:00,  4.00s/it]


In [4]:
d_eval = model.evaluate(df_posts_dev, posts.task_name, posts.lang)

In [7]:
d_eval.prettify()

AttributeError: 'dict' object has no attribute 'prettify'

In [6]:
import pandas as pd
pd.DataFrame(d_eval)

Unnamed: 0,monolingual
spa,"{'group': {1: 0.5928853754940712, 3: 0.8043478..."


In [36]:
fact_checks.df.iloc[[28005, 11307, 27919, 13709, 4286, 359, 1655, 4455, 20362, 13703], :].index

IndexError: positional indexers are out-of-bounds

In [30]:
df_posts_dev.reset_index().post_id

0         30
1        163
2        167
3        189
4        403
       ...  
380    27511
381    27708
382    27731
383    27955
384    27969
Name: post_id, Length: 385, dtype: int64

In [23]:
d_eval

{'crosslingual': {'group': {1: np.float64(0.0),
   3: np.float64(0.0),
   5: np.float64(0.0),
   10: np.float64(0.0)},
  'individual': {1: np.float64(0.0),
   3: np.float64(0.0),
   5: np.float64(0.0),
   10: np.float64(0.0)}}}

In [27]:
posts.tasks["monolingual"]["fra"]["posts_dev"]

[291,
 675,
 1073,
 1292,
 1299,
 1300,
 1301,
 1349,
 3191,
 3442,
 3569,
 3599,
 4104,
 4437,
 4619,
 4655,
 5264,
 5329,
 5330,
 5353,
 5504,
 5730,
 6220,
 6228,
 6258,
 6343,
 6344,
 6456,
 6474,
 6737,
 6762,
 6890,
 7087,
 7176,
 7178,
 7268,
 7404,
 7478,
 7503,
 7506,
 7507,
 7512,
 7541,
 7647,
 7648,
 8035,
 8036,
 8143,
 8267,
 8412,
 8413,
 9612,
 10269,
 10431,
 10502,
 10526,
 10569,
 11021,
 11158,
 11324,
 11454,
 12149,
 12150,
 12151,
 12162,
 12469,
 12936,
 12937,
 12942,
 13090,
 13091,
 13100,
 13244,
 13412,
 13413,
 13425,
 13533,
 13537,
 13553,
 13567,
 13608,
 13609,
 13623,
 13664,
 13671,
 13695,
 13707,
 13708,
 14020,
 14235,
 14637,
 16665,
 16727,
 16892,
 17085,
 17099,
 17192,
 17196,
 17197,
 17198,
 17782,
 18054,
 18434,
 18459,
 18865,
 19789,
 19846,
 20261,
 20277,
 20283,
 20289,
 20359,
 20364,
 20419,
 20824,
 20864,
 20906,
 20912,
 20917,
 21359,
 21733,
 21994,
 22011,
 22256,
 22412,
 25729,
 25924,
 25941,
 25946,
 26008,
 26768,
 26872

In [19]:
type(df_posts_dev.iloc[0]["preds"][0])

int

In [20]:
type(df_posts_dev.iloc[0]["gs"][0])

int

In [11]:
df_posts_dev.apply(lambda x: len(list((set(x["preds"]).intersection(set(x["gs"]))))) > 0, axis=1).mean()

np.float64(0.0)

In [3]:
d_eval

{'monolingual': {'group': {1: np.float64(0.0),
   3: np.float64(0.0),
   5: np.float64(0.0),
   10: np.float64(0.0)},
  'individual': {1: np.float64(0.0),
   3: np.float64(0.0),
   5: np.float64(0.0),
   10: np.float64(0.0)}}}

In [2]:
lang = "por"
    
print("\n\nProcessing", lang)
posts = TextConcatPosts(posts_path, tasks_path, task_name="monolingual", gs_path=gs_path, lang=lang)
fact_checks = TextConcatFactCheck(fact_checks_path, tasks_path, task_name="monolingual", lang=lang)

df_fc = fact_checks.df
df_posts_train = posts.df_train
df_posts_dev = posts.df_dev

model_name = '/home/bsc/bsc830651/.cache/huggingface/hub/models--intfloat--multilingual-e5-large/snapshots/ab10c1a7f42e74530fe7ae5be82e6d4f11a719eb'

model = EmbeddingModel(model_name, df_fc, batch_size=512)

df_posts_dev["preds"] = model.predict(df_posts_dev["full_text"].values).tolist()

print_succ_at_k(df_posts_dev, 1)
print_succ_at_k(df_posts_dev, 5)
print_succ_at_k(df_posts_dev, 10)



Processing por


Batches: 100%|██████████| 43/43 [00:23<00:00,  1.81it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.33s/it]

S@1 (group) 0.39759036144578314
S@1 (explode) 0.38372093023255816
S@5 (group) 0.7951807228915663
S@5 (explode) 0.7906976744186046
S@10 (group) 0.8493975903614458
S@10 (explode) 0.8430232558139535





In [3]:
lang = "deu"
    
print("\n\nProcessing", lang)
posts = TextConcatPosts(posts_path, tasks_path, task_name="monolingual", gs_path=gs_path, lang=lang)
fact_checks = TextConcatFactCheck(fact_checks_path, tasks_path, task_name="monolingual", lang=lang)

df_fc = fact_checks.df
df_posts_train = posts.df_train
df_posts_dev = posts.df_dev

model_name = '/home/bsc/bsc830651/.cache/huggingface/hub/models--intfloat--multilingual-e5-large/snapshots/ab10c1a7f42e74530fe7ae5be82e6d4f11a719eb'

model = EmbeddingModel(model_name, df_fc, batch_size=512)

df_posts_dev["preds"] = model.predict(df_posts_dev["full_text"].values).tolist()

print_succ_at_k(df_posts_dev, 1)
print_succ_at_k(df_posts_dev, 5)
print_succ_at_k(df_posts_dev, 10)



Processing deu


Batches: 100%|██████████| 10/10 [00:08<00:00,  1.14it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.97it/s]

S@1 (group) 0.32786885245901637
S@1 (explode) 0.31746031746031744
S@5 (group) 0.5901639344262295
S@5 (explode) 0.6031746031746031
S@10 (group) 0.6721311475409836
S@10 (explode) 0.6825396825396826





In [4]:
len(df_posts_train), len(df_posts_dev)

(606, 61)