In [3]:
from src.estimation import estimate_text_distribution
from src.MLE import MLE

In [5]:
# for each subject, estimate the distribution of human-written text and AI-generated text
for name in ["CS","EESS","Math","Phys","Stat"]:
    estimate_text_distribution(f"data/training_data/{name}/human_data.parquet",f"data/training_data/{name}/ai_data.parquet",f"distribution/{name}.parquet")

In [6]:
# for each subject, estimate the alpha value of mixed text and calculate the error
for name in ["CS","EESS","Math","Phys","Stat"]:
    # load the framework
    model=MLE(f"distribution/{name}.parquet")
    for alpha in [0,0.025,0.05,0.075,0.1,0.125,0.15,0.175,0.2,0.225,0.25]:
        estimated,ci=model.inference(f"data/validation_data/{name}/ground_truth_alpha_{alpha}.parquet")
        error=abs(estimated-alpha)
        print(f"{'Ground Truth':>10},{'Prediction':>10},{'CI':>10},{'Error':>10}")
        print(f"{alpha:10.3f},{estimated:10.3f},{ci:10.3f},{error:10.3f}")
    print("=====================================")

Ground Truth,Prediction,        CI,     Error
     0.000,     0.025,     0.002,     0.025
Ground Truth,Prediction,        CI,     Error
     0.025,     0.054,     0.003,     0.029
Ground Truth,Prediction,        CI,     Error
     0.050,     0.080,     0.004,     0.030
Ground Truth,Prediction,        CI,     Error
     0.075,     0.101,     0.004,     0.026
Ground Truth,Prediction,        CI,     Error
     0.100,     0.128,     0.004,     0.028
Ground Truth,Prediction,        CI,     Error
     0.125,     0.151,     0.005,     0.026
Ground Truth,Prediction,        CI,     Error
     0.150,     0.174,     0.005,     0.024
Ground Truth,Prediction,        CI,     Error
     0.175,     0.194,     0.005,     0.019
Ground Truth,Prediction,        CI,     Error
     0.200,     0.223,     0.005,     0.023
Ground Truth,Prediction,        CI,     Error
     0.225,     0.244,     0.005,     0.019
Ground Truth,Prediction,        CI,     Error
     0.250,     0.270,     0.006,     0.020
Ground Tru

KeyboardInterrupt: 

In [7]:
import pandas as pd
data = pd.read_parquet("data/training_data/CS/human_data.parquet")

In [8]:
data.columns

Index(['human_sentence'], dtype='object')

In [9]:
data["human_sentence"]

0        [particularly, the, proposed, spreading, curve...
1        [this, mixed, approach, has, rarely, been, app...
2        [firstly, observing, the, limited, rotation, i...
3        [however, existing, methods, either, require, ...
4        [in, this, task, a, fused, image, containing, ...
                               ...                        
37855    [active, learning, shows, promise, to, decreas...
37856    [finally, simulations, using, flashflow, for, ...
37857    [in, this, paper, we, model, this, intention, ...
37858    [these, results, help, to, identify, the, most...
37859    [intuitively, a, shorter, tree, with, pure, le...
Name: human_sentence, Length: 37860, dtype: object

In [10]:
len(data)

37860

In [11]:
val_data = pd.read_parquet("data/validation_data/CS/ground_truth_alpha_0.25.parquet")

In [12]:
len(val_data)

30000

In [13]:
retraction_df = pd.read_parquet("../retraction_fulltext_dataset/24_08_22_retraction_with_text.gzip")
reference_df = pd.read_parquet("../retraction_fulltext_dataset/24_11_30_reference_articles.gzip")

In [14]:
retraction_df["Field"].value_counts()

Field
Medicine                                        7658
Biochemistry, Genetics and Molecular Biology    4914
Computer Science                                3346
Engineering                                     3337
Environmental Science                           1564
Social Sciences                                 1419
Materials Science                                983
Agricultural and Biological Sciences             968
Business, Management and Accounting              822
Neuroscience                                     723
Chemistry                                        633
Psychology                                       631
Immunology and Microbiology                      554
Earth and Planetary Sciences                     404
Health Professions                               378
Physics and Astronomy                            332
Economics, Econometrics and Finance              326
Decision Sciences                                278
Mathematics                             

In [15]:
retraction_df["Domain"].value_counts()

Domain
Physical Sciences    11143
Health Sciences       8368
Life Sciences         7334
Social Sciences       3697
Name: count, dtype: int64

In [16]:
retraction_df["OriginalPaperDate"] = pd.to_datetime(retraction_df["OriginalPaperDate"], format='%m/%d/%Y %H:%M')


In [17]:
df = retraction_df[retraction_df["OriginalPaperDate"]>pd.to_datetime('29.11.2022', format='%d.%m.%Y')]

In [18]:
from src.data_loader import dataset_gpt_inference

In [19]:
retraction_gpr_inferenece_dataset = dataset_gpt_inference(retraction_df)

In [20]:
retraction_gpt_inference_data = pd.DataFrame(pd.Series(retraction_gpr_inferenece_dataset), columns=["inference_sentence"])

In [21]:
retraction_gpt_inference_data["inference_sentence"]

0       [photodynamic, therapy, pdt, has, been, demons...
1       [however, tumour, regrowth, may, occur, after,...
2       [previous, research, has, confirmed, the, inhi...
3       [therefore, the, current, study, intends, to, ...
4       [the, combined, treatment, significantly, supp...
                              ...                        
4598    [by, november, december, ba, had, replaced, th...
4599    [polymerase, chain, reaction, and, near, full,...
4600    [mutations, altering, viral, tropism, replicat...
4601    [omicron, ancestors, were, therefore, present,...
4602    [these, data, also, indicate, that, travel, ba...
Name: inference_sentence, Length: 4603, dtype: object

In [22]:
retraction_gpt_inference_data.to_parquet("data/validation_data/retracted/all_from_29_11_2022.parquet")

In [23]:
model=MLE(f"distribution/CS.parquet")

In [24]:
estimated, ci=model.inference("data/validation_data/retracted/all_from_29_11_2022.parquet")

In [25]:
estimated

0.043

In [44]:
ci

0.009

In [26]:
reference_gpr_inferenece_dataset = dataset_gpt_inference(reference_df)
reference_gpt_inference_data = pd.DataFrame(pd.Series(reference_gpr_inferenece_dataset), columns=["inference_sentence"])
reference_gpt_inference_data.to_parquet("data/validation_data/retracted/reference_from_29_11_2022.parquet")

KeyError: 'OriginalPaperDate'

In [None]:
estimated, ci=model.inference("data/validation_data/retracted/reference_from_29_11_2022.parquet")
print(estimated)

0.042
