In [1]:
import math
import pandas as pd
from mixture_model.MLE import MLE
from mixture_model.estimation import estimate_text_distribution
from mixture_model.tokenizer import tokenize as tokenize_text
from mixture_model.evaluate_parameter_search import fn_to_setting
from IPython.display import display, HTML
from pathlib import Path

In [2]:
def most_probable(distrib, n):
    return sorted(distrib.items(), key = lambda x:x[1], reverse=True)[:n]

def more_probable_retracted(ret_log_vals, ref_log_vals):
    diff_pq_hat = {}
    for word in ret_log_vals:
        diff_pq_hat[word] =  ret_log_vals[word] - ref_log_vals[word] 
    return diff_pq_hat


In [3]:
def get_model(trainset, pos_tags, reason, domain):
    save_path = f"mixture_model/distribution/eval_{reason}_{domain}.parquet"
    if Path(save_path).exists():
        pass
    else:
        sentences_retracted = []
        sentences_reference = []
        train_retracted = [tokenize_text(text) for text in trainset[trainset["label"]==1]["text"]]
        train_reference = [tokenize_text(text) for text in trainset[trainset["label"]==0]["text"]]
        for abstract in train_retracted:
            sentences_retracted += [[w.lower() for w, pos in sentence if pos in pos_tags] for sentence in abstract]
        for abstract in train_reference:
            sentences_reference += [[w.lower() for w, pos in sentence if pos in pos_tags] for sentence in abstract]
        train_df1 = pd.DataFrame(pd.Series(sentences_retracted), columns=["sentences"])
        train_df2 = pd.DataFrame(pd.Series(sentences_reference), columns=["sentences"])
        mixture_distribution = estimate_text_distribution(train_df1, train_df2, save_path)

    mixture_distribution = pd.read_parquet(save_path)
    model = MLE(mixture_distribution, workers=2)
    return model

In [4]:
def make_inference_set(testset, pos_tags):
    sentences = []
    for abstract in testset:
        sentences += [[w.lower() for w, pos in sentence if pos in pos_tags] for sentence in abstract]
    inference_sents = pd.DataFrame(pd.Series(sentences), columns=["inference_sentence"])
    return inference_sents

In [7]:
for reason, reason_testset in [
    ("Paper-Mill", "paper_mill"),
    ("Randomly-Generated-Content", "falsification"),
    ("Falsification|Fabrication-of-Data", "random_content"),
    # ("Fake-Peer-Review_Concerns|Issues-with-Peer-Review",
]:
    testset = pd.read_parquet(f"test_{reason_testset}_samples.gzip")
    trainset = pd.read_parquet(f"train_{reason_testset}_samples.gzip")
    
    test_tokenized = [tokenize_text(text) for text in testset["text"]]
    df = pd.read_csv(f"case_study_results_{reason}.csv")
    print()
    for domain, model_name in df.groupby("domain")["model"].unique().items():
        pos_tags = fn_to_setting(model_name[0]+".....")["pos_tags"]
        mixture_distribution = get_model(trainset, pos_tags, reason, domain)
        alpha, ci =  mixture_distribution.inference(make_inference_set(test_tokenized, pos_tags), n_bootstrap=10_000)
        print(f"evaluation {reason} in {domain}: alpha: {1-alpha} ({ci}), error: {abs((1-alpha) - (len(testset[testset["label"]==1])/len(testset)))}")


time elapsed in precompute_log_probabilities: 1.204s
time elapsed on bootstrapping: 14.311s
evaluation Paper-Mill in Health Sciences: alpha: 0.503 (0.015), error: 0.0005124378109452321
time elapsed in precompute_log_probabilities: 1.211s
time elapsed on bootstrapping: 14.351s
evaluation Paper-Mill in Life Sciences: alpha: 0.503 (0.015), error: 0.0005124378109452321
time elapsed in precompute_log_probabilities: 0.774s
time elapsed on bootstrapping: 15.816s
evaluation Paper-Mill in Physical Sciences: alpha: 0.472 (0.018), error: 0.030487562189054795
time elapsed in precompute_log_probabilities: 1.195s
time elapsed on bootstrapping: 14.327s
evaluation Paper-Mill in Social Sciences: alpha: 0.503 (0.015), error: 0.0005124378109452321

time elapsed in precompute_log_probabilities: 0.322s
time elapsed on bootstrapping: 8.869s
evaluation Randomly-Generated-Content in Health Sciences: alpha: 0.43300000000000005 (0.034), error: 0.06699999999999995
time elapsed in precompute_log_probabilities: 0

log p hat is the probability that the token occurs in a collection of retracted works means retracted and q that is occurs in the reference set

In [8]:
for reason in [
    "Paper-Mill",
    "Randomly-Generated-Content",
    "Falsification|Fabrication-of-Data",
    # "Fake-Peer-Review_Concerns|Issues-with-Peer-Review"
]:
    df = pd.read_csv(f"case_study_results_{reason}.csv")
    for domain in df["domain"].unique():
        mixture_distribution = pd.read_parquet(f"mixture_model/distribution/eval_{reason}_{domain}.parquet")
        model = MLE(mixture_distribution, workers=2)
        print("-"*30)
        print(domain)
        print("-"*30)
        for word, probability in most_probable(more_probable_retracted(model.log_p_hat, model.log_q_hat), 20):
            print(word, probability)
        print()
        for word, probability in most_probable(more_probable_retracted(model.log_one_minus_p_hat, model.log_one_minus_q_hat), 20):
            print(word, probability)

------------------------------
Physical Sciences
------------------------------
english 3.997459059037209
nsclc 3.2965297380352085
western 3.039619324250182
vocational 2.9443091444458576
hepatocellular 2.8012083008051842
nasopharyngeal 2.6640071792916986
foreign 2.558646663633872
big 2.5553733382889035
noncoding 2.4816856224977446
ovarian 2.4816856224977446
adjacent 2.411010664827808
postoperative 2.346472143690237
catenin 2.3157004850234832
fuzzy 2.3157004850234824
coding 2.1661687510525196
apoptotic 2.1271093152159324
gastric 2.082085633841978
suppressive 2.060808235394693
mesenchymal 2.0280184125717025
reverse 2.0280184125717016

such 0.029312793281751785
available 0.023472410928417488
other 0.017896406768355928
future 0.014306577399590447
single 0.014226679649756966
clinical 0.013638773082912978
global 0.013313706358297482
recent 0.012336085572708103
most 0.012304402082801296
many 0.011969974077888173
specific 0.011418700170737013
major 0.011031541114105148
large 0.0106990395442070

In [9]:
for reason in [
    "Paper-Mill",
    "Randomly-Generated-Content",
    "Falsification|Fabrication-of-Data",
    "Fake-Peer-Review_Concerns|Issues-with-Peer-Review"
]:
    df = pd.read_csv(f"case_study_results_{reason}.csv")
    for domain, model in df.groupby("domain")["model"].unique().items():
        mixture_distribution = pd.read_parquet(f"mixture_model/distribution/{model[0]}.parquet")
        model = MLE(mixture_distribution, workers=2)
        print("-"*30)
        print(domain)
        print("-"*30)
        for word, probability in most_probable(more_probable_retracted(model.log_p_hat, model.log_q_hat), 20):
            print(word, probability)
        print()
        for word, probability in most_probable(more_probable_retracted(model.log_one_minus_p_hat, model.log_one_minus_q_hat), 20):
            print(word, probability)

------------------------------
Health Sciences
------------------------------
divided 2.3598136446548406
inhibiting 1.9543485365466768
promoting 1.941103309796656
suppressed 1.900281315276401
aims 1.9002813152764002
measure 1.7179597584824462
proved 1.7179597584824462
besides 1.7179597584824462
randomly 1.6489668869954945
signaling 1.6489668869954945
effectively 1.555440828984671
explore 1.5299075269795068
stimulated 1.4948162071682365
inhibit 1.4948162071682365
targeting 1.4948162071682365
promote 1.4948162071682365
analyze 1.4630675088536558
learning 1.4078048301786064
understood 1.4078048301786064
detect 1.3814875218612332

including 0.015032246731648883
associated 0.014782876024442552
related 0.011986426147919935
however 0.010574633259855838
only 0.010285792413603674
included 0.008763811742170158
known 0.008419178701420399
described 0.0077501562184989475
respectively 0.0070914045290829345
assess 0.007024203373549915
followed 0.006220667773866679
more 0.005707637569418469
reported 0

In [10]:
def highlight_text(words, scores):
    colored_text = " ".join([
        f'<span style="background-color: rgba({255 if score < 0 else 0}, {255 if score > 0 else 0}, 0, {abs(score)}); padding: 2px;">{word}</span>'
        for word, score in zip(words, scores)
    ])
    display(HTML(f"<p>{colored_text}</p>"))

In [11]:
for reason, reason_testset in [
    ("Paper-Mill", "paper_mill"),
    ("Randomly-Generated-Content", "falsification"),
    ("Falsification|Fabrication-of-Data", "random_content"),
    # ("Fake-Peer-Review_Concerns|Issues-with-Peer-Review",
]:
    testset = pd.read_parquet(f"test_{reason_testset}_samples.gzip")
    trainset = pd.read_parquet(f"train_{reason_testset}_samples.gzip")
    df = pd.read_csv(f"case_study_results_{reason}.csv")
    for domain, model_name in df.groupby("domain")["model"].unique().items():
        print(model_name[0])
        if "Health-Sciences" in model_name[0] or "Physical-Sciences" in model_name[0]:
            mixture_distribution = pd.read_parquet(f"mixture_model/distribution/eval_{reason}_{domain}.parquet")
            model = MLE(mixture_distribution, workers=2)
            pos_tags = fn_to_setting(model_name[0]+".....")["pos_tags"]
            sent_probs = []
            p_minus_q = more_probable_retracted(model.log_p_hat, model.log_q_hat)
            not_p_minus_not_q = more_probable_retracted(model.log_one_minus_p_hat, model.log_one_minus_q_hat)
            max_p_minus_q = max(p_minus_q.values())
            max_not_p_minus_not_q = max(not_p_minus_not_q.values())
            for text, label in zip(testset["text"], testset["label"]):
                for sentence in tokenize_text(text):
                    words = [w.lower() for w, pos in sentence if pos in pos_tags]
                    p_val_sent = model.calculate_log_p_vals(words)
                    q_val_sent = model.calculate_log_q_vals(words)
                    sent_probs.append((sentence, words, p_val_sent, q_val_sent, p_val_sent - q_val_sent, label))
            print("-"*30)
            print(f"{domain} - {model_name[0]} RETRACTED P - Q")
            print("-"*30)
            for s,w,p,q,pq,l in sorted(sent_probs, key=lambda x: x[4], reverse=True)[:10]:
                scores = [p_minus_q[word.lower()] / max_p_minus_q if word.lower() in p_minus_q and pos in pos_tags and word in w else 0 for word, pos in s]
                print(f"{('not' if l==0 else '')} retracted: {pq:.2f}")
                highlight_text([w for w,_ in s], scores)
            print("-"*30)
            print(f"{domain} - {model_name[0]} NOT RETRACTED ~P - ~Q")
            print("-"*30)
            for s,w,p,q,pq,l in sorted(sent_probs, key=lambda x: x[4])[:10]:
                scores = [not_p_minus_not_q[word.lower()] / max_not_p_minus_not_q if word.lower() in not_p_minus_not_q and pos in pos_tags and word in w else 0 for word, pos in s]
                print(f"{('not' if l==0 else '')} retracted: {pq:.2f}")
                highlight_text([w for w,_ in s], scores)

vc_0.5_al_sentence_s_Introduction_Abstract_r_Paper-Mill_d_Health-Sciences_f_all_p_VERB_ADV
------------------------------
Health Sciences - vc_0.5_al_sentence_s_Introduction_Abstract_r_Paper-Mill_d_Health-Sciences_f_all_p_VERB_ADV RETRACTED P - Q
------------------------------
 retracted: 15.20


 retracted: 15.03


 retracted: 12.98


 retracted: 12.50


 retracted: 11.94


 retracted: 11.92


 retracted: 11.57


 retracted: 11.04


 retracted: 10.74


 retracted: 10.68


------------------------------
Health Sciences - vc_0.5_al_sentence_s_Introduction_Abstract_r_Paper-Mill_d_Health-Sciences_f_all_p_VERB_ADV NOT RETRACTED ~P - ~Q
------------------------------
not retracted: -13.78


not retracted: -8.82


not retracted: -8.75


not retracted: -8.33


not retracted: -8.13


not retracted: -8.03


not retracted: -7.92


not retracted: -7.92


not retracted: -7.87


not retracted: -7.74


vc_0.5_al_sentence_s_Introduction_Abstract_r_Paper-Mill_d_Life-Sciences_f_all_p_VERB_ADV
vc_0.5_al_sentence_s_all_r_Paper-Mill_d_Physical-Sciences_f_all_p_ADJ
------------------------------
Physical Sciences - vc_0.5_al_sentence_s_all_r_Paper-Mill_d_Physical-Sciences_f_all_p_ADJ RETRACTED P - Q
------------------------------
 retracted: 24.97


 retracted: 17.08


 retracted: 14.16


 retracted: 12.92


 retracted: 12.78


 retracted: 11.39


 retracted: 10.97


 retracted: 10.94


 retracted: 10.47


 retracted: 10.40


------------------------------
Physical Sciences - vc_0.5_al_sentence_s_all_r_Paper-Mill_d_Physical-Sciences_f_all_p_ADJ NOT RETRACTED ~P - ~Q
------------------------------
not retracted: -12.50


not retracted: -9.61


not retracted: -7.38


not retracted: -7.37


not retracted: -7.35


not retracted: -7.05


not retracted: -6.96


not retracted: -6.85


not retracted: -6.83


not retracted: -6.82


vc_0.5_al_sentence_s_Introduction_Abstract_r_Paper-Mill_d_Social-Sciences_f_all_p_VERB_ADV
vc_0.5_al_sentence_s_Introduction_Abstract_r_Randomly-Generated-Content_d_Health-Sciences_f_all_p_VERB_ADV
------------------------------
Health Sciences - vc_0.5_al_sentence_s_Introduction_Abstract_r_Randomly-Generated-Content_d_Health-Sciences_f_all_p_VERB_ADV RETRACTED P - Q
------------------------------
 retracted: 11.06


 retracted: 8.65


 retracted: 8.32


 retracted: 7.75


 retracted: 7.22


not retracted: 6.96


 retracted: 6.56


 retracted: 6.30


 retracted: 5.73


 retracted: 5.73


------------------------------
Health Sciences - vc_0.5_al_sentence_s_Introduction_Abstract_r_Randomly-Generated-Content_d_Health-Sciences_f_all_p_VERB_ADV NOT RETRACTED ~P - ~Q
------------------------------
not retracted: -7.54


not retracted: -6.82


not retracted: -6.03


 retracted: -5.78


not retracted: -5.25


not retracted: -5.08


not retracted: -4.86


not retracted: -4.49


not retracted: -4.49


not retracted: -4.47


vc_0.5_al_sentence_s_Introduction_Abstract_r_Randomly-Generated-Content_d_Life-Sciences_f_all_p_VERB_ADV
vc_0.5_al_sentence_s_all_r_Randomly-Generated-Content_d_Physical-Sciences_f_all_p_ADJ
------------------------------
Physical Sciences - vc_0.5_al_sentence_s_all_r_Randomly-Generated-Content_d_Physical-Sciences_f_all_p_ADJ RETRACTED P - Q
------------------------------
 retracted: 9.14


 retracted: 6.02


 retracted: 6.02


 retracted: 5.10


 retracted: 5.09


 retracted: 4.97


 retracted: 4.93


 retracted: 4.77


 retracted: 4.72


 retracted: 4.47


------------------------------
Physical Sciences - vc_0.5_al_sentence_s_all_r_Randomly-Generated-Content_d_Physical-Sciences_f_all_p_ADJ NOT RETRACTED ~P - ~Q
------------------------------
 retracted: -12.49


 retracted: -10.78


not retracted: -7.38


not retracted: -7.31


not retracted: -6.87


not retracted: -6.70


 retracted: -6.70


not retracted: -6.56


not retracted: -6.48


not retracted: -6.07


vc_0.5_al_sentence_s_all_r_Randomly-Generated-Content_d_Social-Sciences_f_all_p_ADJ
vc_0.5_al_sentence_s_Introduction_Abstract_r_Falsification|Fabrication-of-Data_d_Health-Sciences_f_all_p_VERB_ADV
------------------------------
Health Sciences - vc_0.5_al_sentence_s_Introduction_Abstract_r_Falsification|Fabrication-of-Data_d_Health-Sciences_f_all_p_VERB_ADV RETRACTED P - Q
------------------------------
 retracted: 17.25


 retracted: 15.34


 retracted: 15.15


 retracted: 14.47


 retracted: 13.51


 retracted: 13.24


 retracted: 12.09


 retracted: 11.85


 retracted: 11.66


 retracted: 11.65


------------------------------
Health Sciences - vc_0.5_al_sentence_s_Introduction_Abstract_r_Falsification|Fabrication-of-Data_d_Health-Sciences_f_all_p_VERB_ADV NOT RETRACTED ~P - ~Q
------------------------------
not retracted: -10.66


not retracted: -8.53


not retracted: -7.74


not retracted: -7.49


not retracted: -7.32


not retracted: -7.30


not retracted: -7.28


not retracted: -7.19


not retracted: -7.14


not retracted: -7.02


vc_0.5_al_sentence_s_all_r_Falsification|Fabrication-of-Data_d_Life-Sciences_f_all_p_ADJ
vc_0.5_al_sentence_s_all_r_Falsification|Fabrication-of-Data_d_Physical-Sciences_f_all_p_ADJ
------------------------------
Physical Sciences - vc_0.5_al_sentence_s_all_r_Falsification|Fabrication-of-Data_d_Physical-Sciences_f_all_p_ADJ RETRACTED P - Q
------------------------------
 retracted: 12.82


 retracted: 12.20


 retracted: 12.16


 retracted: 11.77


 retracted: 11.58


 retracted: 10.60


 retracted: 10.49


 retracted: 10.39


 retracted: 10.20


 retracted: 10.15


------------------------------
Physical Sciences - vc_0.5_al_sentence_s_all_r_Falsification|Fabrication-of-Data_d_Physical-Sciences_f_all_p_ADJ NOT RETRACTED ~P - ~Q
------------------------------
not retracted: -8.77


not retracted: -8.08


not retracted: -7.99


not retracted: -7.88


not retracted: -7.81


 retracted: -7.20


not retracted: -6.96


not retracted: -6.74


not retracted: -6.48


not retracted: -6.32


vc_0.5_al_sentence_s_all_r_Falsification|Fabrication-of-Data_d_Social-Sciences_f_all_p_ADJ
