In [None]:
import duckdb
import pandas as pd

# Location of the parquet shard holding the articles.
parquet_path = '/home/dhia/Downloads/0000.parquet'

# Pull the first 20 articles; `reactions` is split on commas into a list column.
articles_sql = f"""
    SELECT abstract,
--     concat('[',reactions, '] in this order of preference from most inportant to least important. Anything else that is not in this list is not relevant and should be discarded') as fulltext,
    fulltext,
    string_split(reactions, ',') AS reactions_list
    FROM parquet_scan('{parquet_path}')
    LIMIT 20
"""
df = duckdb.query(articles_sql).to_df()

# One entry per distinct label: flatten the lists, drop missing entries,
# trim whitespace, de-duplicate, then reverse the first-seen order.
distinct_labels = (
    df['reactions_list']
      .explode()
      .dropna()
      .str.strip()
      .unique()
)
df2 = pd.DataFrame({'car_types': distinct_labels[::-1]})

df2

In [None]:
import pandas as pd

import lotus
from lotus.models import LM, SentenceTransformersRM
from lotus.types import CascadeArgs
from lotus.vector_store import FaissVS
# Language model, retrieval model and vector store used by every LOTUS operator.
lm = LM(model="gpt-4o-mini")
rm = SentenceTransformersRM(model="intfloat/e5-base-v2")
vs = FaissVS()
lotus.settings.configure(lm=lm, rm=rm, vs=vs)

# Join predicate: left side is the article text, right side a candidate label.
expr = "Does {fulltext:left} precisely describe or report {car_types:right} ? It needs to be central, super relevant and not generic."

# Cascade join with recall and precision targets of 0.8 each.
cascade_args = CascadeArgs(recall_target=0.8, precision_target=0.8)
res, stats = df.sem_join(
    df2,
    expr,
    cascade_args=cascade_args,
    return_stats=True,
)

# Report how much work the cascade did compared with a naive all-pairs join.
n_left = df.shape[0]
n_right = df2.shape[0]
print(f"Joined {n_left} rows from df1 with {n_right} rows from df2")
print(f"    Join cascade took {stats['join_resolved_by_large_model']} LM calls")
print(f"    Helper resolved {stats['join_resolved_by_helper_model']} LM calls")
print(f"Join cascade used {stats['total_LM_calls']} LM calls in total")
print(f"Naive join would require {n_left*n_right} LM calls")
print(res)


In [None]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# ————————————————
# 1) Ground-truth mapping
# `df` has one row per article.  The loading cell stores the comma-split labels
# in `reactions_list` (it never selects a plain `reactions` column), so that is
# the column that has to be read here.
TRUE_COL = 'reactions_list'
# The candidate table joined on the right names its label column `car_types`.
# That name does not clash with any column of `df`, so it is not expected to
# carry a ":right" suffix in `res`.  NOTE(review): confirm with `res.columns`.
PRED_COL = 'car_types'


def _clean_labels(labels):
    """Return the whitespace-stripped labels of one article ([] if the list is missing)."""
    if labels is None or isinstance(labels, float):  # NULL / NaN instead of a list
        return []
    return [r.strip() for r in labels]


true_map = (
    df.set_index('fulltext')[TRUE_COL]
      .apply(_clean_labels)
      .to_dict()
)

# ————————————————
# 2) Predicted mapping
# `res` has one row per accepted (article, candidate label) pair.
pred_map = (
    res.groupby('fulltext')[PRED_COL]
       .apply(list)
       .to_dict()
)

# ————————————————
# 3) Align into parallel lists
# Articles without any accepted pair get an empty prediction list.
abstracts = list(true_map.keys())
y_true = [ true_map[a] for a in abstracts ]
y_pred = [ pred_map.get(a, []) for a in abstracts ]

# ————————————————
# 4) Binarize and compute metrics
# Fit on the union of both label sets: a predicted label that never occurs in
# the ground truth must still count as a false positive instead of being
# dropped by `transform` as an unknown class.
mlb    = MultiLabelBinarizer()
mlb.fit(y_true + y_pred)
Y_true = mlb.transform(y_true)
Y_pred = mlb.transform(y_pred)

print("Micro-precision: ", precision_score(Y_true, Y_pred, average='micro'))
print("Micro-recall:    ", recall_score(Y_true, Y_pred, average='micro'))
print("Micro-F1:        ", f1_score(Y_true, Y_pred, average='micro'))
print("Subset accuracy:", accuracy_score(Y_true, Y_pred))


In [None]:
# — Debugging outputs —

# Headline counts: number of articles and of (article, label) pairs on each side.
n_true_pairs = sum(map(len, y_true))
n_pred_pairs = sum(map(len, y_pred))
print(f"Num abstracts: {len(abstracts)}")
print(f"Total ground‐truth pairs: {n_true_pairs}")
print(f"Total predicted pairs:    {n_pred_pairs}\n")

# Show truth vs. prediction for the first three articles.
for idx in range(min(3, len(abstracts))):
    text = abstracts[idx]
    print(f"--- Abstract #{idx} ---")
    print(text)
    print("  True reactions:     ", true_map[text])
    print("  Predicted reactions:", pred_map.get(text, []), "\n")

# Collect every disagreement as an (article, label) tuple.
false_positives, false_negatives = [], []
for text in abstracts:
    expected = set(true_map[text])
    predicted = set(pred_map.get(text, []))
    false_positives.extend((text, label) for label in predicted - expected)
    false_negatives.extend((text, label) for label in expected - predicted)


def _print_samples(samples, limit=10):
    """Print up to `limit` (article, label) tuples with a 20-character article preview."""
    for text, label in samples[:limit]:
        print(f"  • [{label}] in abstract: {text[:20]}...")


print("Sample false positives (predicted but not true):")
_print_samples(false_positives)

print("\nSample false negatives (true but not predicted):")
_print_samples(false_negatives)


In [None]:
# Full-text dump of the first five articles with their true and predicted labels.
rule = "---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"
for text, true_labels in list(true_map.items())[:5]:
    print("\n \n\n \n")
    # Three identical rules form the visual separator between articles.
    for _ in range(3):
        print(rule)
    print(text)

    print("------->", true_labels)
    print("--xx--->", pred_map.get(text, []))

In [None]:
# Display the pairs DataFrame returned by sem_join (first element of its result).
res

# ---- Top-k ranking: the following cells rank the joined pairs per article ----

In [None]:
# `ranked` is first assigned by the sem_topk cell further down, so on a fresh
# kernel (Restart & Run All) a bare `ranked` here raised NameError.  Show it
# only when an earlier run has already created it; otherwise display nothing.
ranked_preview = globals().get("ranked")
ranked_preview

In [None]:
from lotus.types import ReasoningStrategy

# -------------------------------------------
# 1.  Grab the join output and normalise names
# -------------------------------------------
# `res` is normally already the pairs DataFrame because the join cell unpacks
# the (DataFrame, stats) tuple; the tuple branch covers a non-unpacked result.
pairs = res[0] if isinstance(res, tuple) else res

# Same model setup as the join cell, repeated so this cell configures LOTUS
# itself before ranking.
lm = LM(model="gpt-4o-mini")
rm = SentenceTransformersRM(model="intfloat/e5-base-v2")
vs = FaissVS()

lotus.settings.configure(lm=lm, rm=rm, vs=vs)

# NB: sem_join() is only expected to add a ":left" / ":right" suffix to column
# names that exist in BOTH input frames.  The article frame and the candidate
# frame share no column name here, so the plain names are used.
# NOTE(review): confirm against `pairs.columns` if the inputs change.
ART_COL = "fulltext"          # identifies the article rows
LAB_COL = "car_types"         # candidate label column built in the loading cell

# Fail early, with a readable message, if the join output is not shaped as
# assumed above (otherwise the error only surfaces inside the LM ranking call).
missing_cols = [c for c in (ART_COL, LAB_COL) if c not in pairs.columns]
if missing_cols:
    raise KeyError(
        f"Columns {missing_cols} not found in the join output; "
        f"available columns: {list(pairs.columns)}"
    )

# -------------------------------------------
# 2.  Add LOTUS rankings (top-k per article)
# -------------------------------------------
# The placeholders must name real columns of `pairs`, so they are derived from
# the constants above: "Given {fulltext}, which {car_types} is most ...".
rank_expr = (
    f"Given {{{ART_COL}}}, which {{{LAB_COL}}} is most precisely relevant? "
)

ranked, topk_stats = pairs.sem_topk(
    rank_expr,
    K = 5,                      # keep at most 5 labels per article
    group_by=[ART_COL],        # rank within each article
    method="quick",
    strategy = ReasoningStrategy.ZS_COT,

    return_explanations = True,
    return_stats=True
)

# -------------------------------------------
# 3.  Build the gold-standard (exploded) label table
# -------------------------------------------


In [None]:
# Gold-standard table: one row per (article, true label) pair.
# (A bare `ranked` statement used to open this cell; it displayed nothing,
# because only a cell's last expression is rendered, and it raised NameError
# whenever the ranking cell had not been run, so it is dropped.)
gold = (
    df[["fulltext", "reactions_list"]]   # df is the original articles DF you loaded
      .explode("reactions_list")         # one row per label
      .dropna(subset=["reactions_list"]) # articles without labels contribute no rows
      .assign(reactions_list=lambda d: d["reactions_list"].str.strip())
      .rename(columns={"fulltext": ART_COL, "reactions_list": "label_id"})
)

# -------------------------------------------
# 4.  Helper to compute RP@k exactly as in §5.2
# -------------------------------------------
def rp_at_k(pred_df, gold_df, k=5):
    """Mean R-Precision@k of the predictions against the gold labels.

    For every article in ``gold_df`` the first ``k`` distinct predicted labels
    (in ``pred_df`` row order) are compared with that article's gold label
    set.  The article score is ``hits / min(k, number_of_gold_labels)`` and
    the function returns the mean score over all gold articles.

    Parameters
    ----------
    pred_df : pandas.DataFrame
        Predictions with the columns named by the module-level ``ART_COL``
        (article key) and ``LAB_COL`` (predicted label).  Rows of one article
        are taken in the order they appear; this assumes they are already
        sorted best-first (TODO confirm for the ranking that produced them).
    gold_df : pandas.DataFrame
        Gold pairs with the columns ``ART_COL`` and ``"label_id"``.
    k : int, default 5
        Cut-off rank.

    Returns
    -------
    float
        Mean RP@k, or ``0.0`` when ``gold_df`` contains no articles.
    """
    gold = gold_df.groupby(ART_COL)["label_id"].apply(set).to_dict()
    if not gold:
        return 0.0

    # Group the predictions once instead of re-filtering the whole frame for
    # every article.  `dict.fromkeys` removes repeated labels while keeping
    # their first-seen order, so a label predicted twice for the same article
    # cannot be counted as two hits.
    preds_by_article = {
        art: list(dict.fromkeys(labels))
        for art, labels in pred_df.groupby(ART_COL, sort=False)[LAB_COL]
    }

    per_article = []
    for art, true_set in gold.items():
        preds = preds_by_article.get(art, [])[:k]
        hits = sum(p in true_set for p in preds)
        # R-Precision normalises by the best achievable number of hits.
        denom = min(k, len(true_set))
        per_article.append(hits / denom)

    return sum(per_article) / len(per_article)

# -------------------------------------------
# 5.  Evaluate
# -------------------------------------------
# Score the ranking at two cut-offs.
# NOTE(review): the sem_topk call that builds `ranked` uses K = 5, so at most
# five labels per article are available.  RP@10 is then computed on those same
# five labels and can only under-estimate the true RP@10 for articles with
# more than five gold labels; raise K to 10 if RP@10 is to be reported.
rp5  = rp_at_k(ranked, gold, k=5)
rp10 = rp_at_k(ranked, gold, k=10)

print(f"RP@5  = {rp5:.3f}")
print(f"RP@10 = {rp10:.3f}")


In [None]:
# Zero-based position of every row inside its article group, in current row order.
rank_within_article = ranked.groupby(ART_COL).cumcount()
rank_within_article