In [1]:
from src.estimation import estimate_text_distribution
from src.MLE import MLE
from src.HC import calculate_hc
import pandas as pd

In [2]:
# Per-subject estimation of the word distributions of human-written versus
# AI-generated text. Each subject has its own pair of training corpora and its
# own result path under distribution/.
for name in ("CS", "EESS", "Math", "Phys", "Stat"):
    human_path = f"data/training_data/{name}/human_data.parquet"
    ai_path = f"data/training_data/{name}/ai_data.parquet"
    output_path = f"distribution/{name}.parquet"
    # Argument order is (human corpus, AI corpus, result path).
    estimate_text_distribution(human_path, ai_path, output_path)

            Word      logP    log1-P      logQ    log1-Q
0          realm -8.456354 -0.000213 -3.296252 -0.037724
1      intricate -8.338571 -0.000239 -3.813718 -0.022313
2     showcasing -8.589886 -0.000186 -4.158771 -0.015750
3        pivotal -8.589886 -0.000186 -4.440337 -0.011862
4         stands -8.589886 -0.000186 -4.585943 -0.010246
..           ...       ...       ...       ...       ...
95     elucidate -8.926358 -0.000133 -6.726793 -0.001199
96       crucial -5.417802 -0.004447 -3.256069 -0.039302
97  capabilities -5.790864 -0.003060 -3.629708 -0.026882
98    foundation -7.203592 -0.000744 -5.035468 -0.006524
99    invaluable -8.926358 -0.000133 -6.753821 -0.001167

[100 rows x 5 columns]
          Word      logP    log1-P      logQ    log1-Q
0    intricate -8.735043 -0.000161 -4.094154 -0.016810
1       stands -8.735043 -0.000161 -4.797862 -0.008282
2   innovative -7.268706 -0.000697 -3.371828 -0.034930
3    advancing -8.917364 -0.000134 -5.021972 -0.006613
4       paving -8

In [None]:
# NOTE(review): a bare name as the last expression of a cell only makes the
# notebook display the function object's repr; it does not call the function.
# This looks like a leftover inspection cell -- confirm and remove.
estimate_text_distribution

In [None]:
# For each subject, estimate the alpha value of the mixed validation texts and
# report the absolute error against the known ground-truth alpha.
for name in ["CS","EESS","Math","Phys","Stat"]:
    # load the framework from this subject's distribution file
    model=MLE(f"distribution/{name}.parquet")
    # Identify the subject and print the column header once per table instead
    # of once per row. The field width is 12 so the longest label
    # ('Ground Truth', 12 characters) lines up with the numeric columns below.
    print(f"Subject: {name}")
    print(f"{'Ground Truth':>12},{'Prediction':>12},{'CI':>12},{'Error':>12}")
    # The alpha values stay as literals: each one is interpolated verbatim into
    # the validation file name, so computing them arithmetically could change
    # their string form (floating-point rounding) and break the path.
    for alpha in [0,0.025,0.05,0.075,0.1,0.125,0.15,0.175,0.2,0.225,0.25]:
        # inference returns the estimate plus a second value that is formatted
        # below as a single float (presumably a confidence-interval
        # half-width -- TODO confirm against src.MLE).
        estimated,ci=model.inference(f"data/validation_data/{name}/ground_truth_alpha_{alpha}.parquet")
        error=abs(estimated-alpha)
        print(f"{alpha:12.3f},{estimated:12.3f},{ci:12.3f},{error:12.3f}")
    print("=====================================")

In [3]:
# For every subject: load both training corpora, check that each carries its
# sentence column, then run the HC-discrepancy calculation and print the result.
for name in ("CS", "EESS", "Math", "Phys", "Stat"):
    human_data = pd.read_parquet(f"data/training_data/{name}/human_data.parquet")
    ai_data = pd.read_parquet(f"data/training_data/{name}/ai_data.parquet")

    # Table-driven schema check: (frame, required column, error message).
    # The human corpus is checked first, so its error wins if both are wrong.
    for frame, column, message in (
        (human_data, 'human_sentence', "human_sentence column not found in human data"),
        (ai_data, 'ai_sentence', "ai_sentence column not found in ai data"),
    ):
        if column not in frame.columns:
            raise ValueError(message)

    # calculate_hc yields a pair that is unpacked into a count and a word list.
    num_words, hc_words = calculate_hc(human_data, ai_data)

    # One print call, newline-separated: the count, the words, then a separator.
    print(
        f"Number of words: {num_words}",
        f"List of words: {hc_words}",
        "=====================================",
        sep="\n",
    )

26589
Number of words: 26974
List of words: ['dql', 'aps', 'mentoring', 'directly', 'quantifiers', 'captured', 'unleashed', 'chat', 'parametrizing', 'ou', 'i_g', 'ase', 'pact', 'linepytorchocr', 'finegrained', 'concealing', 'emotive', 'bayes', 'naturally', 'uct', 'acl20', 'delay', 'interweaved', 'incident', 'nelson', 's3d', 'tips', 'preprocessed', 'younger', 'subcategory', 'revise', 'triple', 'mmn', 'choroid', 'mi2datalab', 'adjacent', 'movielens', 'always', 'defects', 'hgsampling', 'spray', 'resunets', 'ice', 'debugging', 'grounds', 'rgbt', 'unoriginal', 'protruding', 'coveted', 'subterranean', 'powerful', 'fetal', 'liking', 'shutter', 'detection', 'vstreamdrls', 'hoesel', '_x', 'switcher', 'testability', 'burdens', 'multiply', 'anchor', 'dip', 'epilepsiae', 'surges', 'permutation', 'constraint', 'nen', 'adeptness', 'deter', 'royalty', 'porcine', 'meshfree', 'siw', 'biodigitdb', 'heteroscedasticity', 'epidemics', 'sertanejo', 'acknowledges', 'dbb', 'possess', 'wrapping', 'ck', 'system