In [None]:
!pip install sacrebleu



In [None]:
!pip install sentence_transformers



In [None]:
import json

import numpy as np
import pandas as pd
from sacrebleu import CHRF
from scipy.stats import spearmanr
from sentence_transformers import SentenceTransformer
from sklearn.metrics import roc_auc_score
from tqdm.auto import tqdm, trange

In [None]:
# Sentence encoder used further down to embed the `tgt` and `hyp` texts.
MODEL_NAME = 'sentence-transformers/sentence-t5-base'
model = SentenceTransformer(MODEL_NAME)

## Import the dataset
* Download it from Drive and upload it through the prompt below (this refers to the paper's dataset)
* Then try with our dataset

In [None]:
# Optional, Google Colab only: uncomment these two lines to upload a file from
# the local machine through the browser prompt (presumably `binary_df.json`,
# which the next cell reads — confirm).
#from google.colab import files
#uploaded = files.upload()

In [None]:
# Read the annotated examples from the JSON file and flatten them into a table.
# The encoding is given explicitly because the default used by `open` depends on
# the platform/locale, which can garble or reject non-ASCII text in the data.
with open("binary_df.json", "r", encoding="utf-8") as f:
    trial_data = json.load(f)

# One row per record; nested keys (if any) become dotted column names.
df = pd.json_normalize(trial_data)

In [None]:
# Cast each annotation column to int so it can be used as a numeric label by the
# AUC and Spearman computations further down.
for label_col in ('is_hallucination', 'named_entities', 'full_hallu', 'strong_hallu'):
    df[label_col] = df[label_col].astype(int)

In [None]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0,hyp,src,tgt,task,word_to_define,is_hallucination,named_entities,strong_hallu,full_hallu
0,Resembling or characteristic of a weasel.,The writer had just entered into his eighteent...,Resembling a weasel (in appearance).,DM,weaselly,0,0,0,0
1,Alternative form of sheath knife,Sailors ' and fishermen 's <define> sheath - k...,.,DM,sheath - knives,1,0,1,0
2,(obsolete) A short period of time.,"As to age , Bead could not form any clear impr...","(poetic) An instant, a short moment.",DM,eyewink,0,0,0,0
3,(slang) An incel.,Because redpillers are usually normies or <def...,"(incel, _, slang) A man of a slightly lower ra...",DM,Chadlites,0,0,0,0
4,"An island in Lienchiang County, Taiwan.",On the second day of massive live - fire drill...,"An island in Dongyin, Lienchiang, Taiwan, in t...",DM,Xiyin,0,0,0,0


In [None]:
# Disabled experiment: build source-side embeddings one row at a time, encoding
# only `word_to_define` for rows whose task is 'DM' and the full `src` text for
# every other row. Nothing in this cell runs while it stays commented out.
#emb_src = np.array([
    #model.encode([row['word_to_define']], show_progress_bar=True)[0] if row['task'] == 'DM' else
    #model.encode([row['src']], show_progress_bar=True)[0]
    #for _, row in df.iterrows()
#])


In [None]:
# Embed the target texts (`tgt`) and the hypotheses (`hyp`); each call encodes the
# whole column as one list and shows a progress bar.
target_texts = df['tgt'].tolist()
hypothesis_texts = df['hyp'].tolist()
emb_tgt = model.encode(target_texts, show_progress_bar=True)
emb_hyp = model.encode(hypothesis_texts, show_progress_bar=True)
# Source-side alternative, currently disabled:
#emb_src = model.encode(df.src.tolist(), show_progress_bar=True)

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

In [None]:
# Display the shape of the target embedding array returned by `model.encode`.
emb_tgt.shape

(499, 768)

In [None]:
# Row-wise dot product between each target embedding and the hypothesis embedding
# of the same row.
# NOTE(review): this equals cosine similarity only if the encoder returns
# unit-length vectors — confirm for the chosen model.
sims = np.sum(emb_tgt * emb_hyp, axis=1)
# Source-side alternative, currently disabled:
#sims = (emb_tgt * emb_src).sum(1)
sims.shape

(499,)

In [None]:
# chrF scorer with word n-grams up to order 2 enabled (the chrF++ configuration).
chrfpp = CHRF(word_order=2)

# Sentence-level score of every hypothesis against its single target reference.
# The two columns are iterated directly: `iterrows()` would build a Series per
# row and yield an index that is never used.
ref_chrfpp = [
    chrfpp.sentence_score(hyp, [tgt]).score
    for hyp, tgt in tqdm(zip(df['hyp'], df['tgt']), total=len(df))
]

  0%|          | 0/499 [00:00<?, ?it/s]

In [None]:
# Label columns of `df` that every score is evaluated against.
target_columns = [
    'is_hallucination',
    'named_entities',
    'full_hallu',
    'strong_hallu',
]

In [None]:
# Keep the similarity next to the labels, together with its complement
# (1 - similarity) under `full_prob`.
dissimilarity = 1 - sims
df["full_prob"] = dissimilarity
df["sims"] = sims

In [None]:
# One column per detector score. Both similarity measures are negated —
# presumably so that a larger value means "more likely a hallucination" when
# compared against the 0/1 labels (confirm).
metric_columns = {
    'model similarity': -sims,
    'tgt_chrfpp_neg': -np.array(ref_chrfpp),
}
all_metrics = pd.DataFrame(metric_columns)

# **HYP - TGT no preprocessing**

In [None]:
# ROC AUC of every score column in `all_metrics` against every label column:
# rows are the scores, columns are the entries of `target_columns`.
auc_by_label = {}
for label_name in target_columns:
    label_values = df[label_name]
    auc_by_label[label_name] = {
        metric_name: roc_auc_score(label_values, all_metrics[metric_name])
        for metric_name in all_metrics.columns
    }
aucs = pd.DataFrame(auc_by_label)

# Show the scores ordered by how well they detect `is_hallucination`.
aucs.sort_values(by='is_hallucination', ascending=False)

Unnamed: 0,is_hallucination,named_entities,full_hallu,strong_hallu
model similarity,0.816955,0.860161,0.805245,0.748185
tgt_chrfpp_neg,0.696889,0.965795,0.643157,0.684749


# **Spearman Correlation**

In [None]:
# Spearman rank correlation between every label column and every score column:
# rows are the scores in `all_metrics`, columns are the entries of `target_columns`.
rho_by_label = {}
for label_name in target_columns:
    per_metric = {}
    for metric_name in all_metrics.columns:
        result = spearmanr(df[label_name], all_metrics[metric_name])
        per_metric[metric_name] = result.correlation
    rho_by_label[label_name] = per_metric
spearmans = pd.DataFrame(rho_by_label)

# Show the scores ordered by their correlation with `is_hallucination`.
spearmans.sort_values(by='is_hallucination', ascending=False)

Unnamed: 0,is_hallucination,named_entities,full_hallu,strong_hallu
model similarity,0.54459,0.078828,0.389916,0.298569
tgt_chrfpp_neg,0.338293,0.101948,0.182866,0.222254
