In [2]:
import os
import sys

import numpy as np
import pandas as pd

In [3]:
sys.path.append(os.path.abspath('../src'))

from read_and_write_docs import read_jsonl, write_jsonl, read_rds
from utils import apply_temp_doc_id, build_metadata_df
from lambdaG import lambdaG, lambdaG_v2, lambdaG_no_ref
from performance import performance
from POSNoise import apply_posnoise
from kneser_ney import KneserNeyLanguageModel, ngrams

In [4]:
def compute_perplexity(logprobs):
    """
    Compute sentence-level perplexity from token log-probabilities.
    Assumes log-probs are natural logs (base e), as provided by Qwen‑2.5.
    """
    return np.exp(-np.mean(logprobs))

In [5]:
base_loc = "/Volumes/BCross/datasets/author_verification"

data_type = "test"
corpus = "Wiki"

model = "Qwen2.5-1.5B-Instruct"
model_name = model.lower().replace("-", "_")
token_type = model

known_loc = f"{base_loc}/sentence_log_probs_datasets/{data_type}/{corpus}/known_sentence_logprobs_{model_name}.jsonl"
known_loc = "/Users/user/Documents/test_data/known_sentence_logprobs_qwen2.5_1.5b_instruct.jsonl"
known = read_jsonl(known_loc)
known.rename(columns={'sentence': 'text'}, inplace=True)
known = apply_temp_doc_id(known)
known = known[known['num_tokens'] > 0]
known['perplexity'] = known['log_probs'].apply(compute_perplexity)

unknown_loc = f"{base_loc}/sentence_log_probs_datasets/{data_type}/{corpus}/unknown_sentence_logprobs_{model_name}.jsonl"
unknown_loc = "/Users/user/Documents/test_data/unknown_sentence_logprobs_qwen2.5_1.5b_instruct.jsonl"
unknown = read_jsonl(unknown_loc)
unknown.rename(columns={'sentence': 'text'}, inplace=True)
unknown = apply_temp_doc_id(unknown)
unknown = unknown[unknown['num_tokens'] > 0]
unknown['perplexity'] = unknown['log_probs'].apply(compute_perplexity)

metadata_loc = f"{base_loc}/{data_type}/metadata.rds"
metadata_loc = "/Users/user/Documents/test_data/metadata.rds"
metadata = read_rds(metadata_loc)
filtered_metadata = metadata[metadata['corpus'] == corpus]
agg_metadata = build_metadata_df(filtered_metadata, known, unknown)

In [6]:
agg_metadata

Unnamed: 0,sample_id,problem,corpus,known_author,unknown_author,unknown_doc_id,known_doc_id
0,1,142.196.88.228 vs 142.196.88.228,Wiki,142.196.88.228,142.196.88.228,142_196_88_228_text_2,142_196_88_228_text_1
1,2,142.196.88.228 vs 142.196.88.228,Wiki,142.196.88.228,142.196.88.228,142_196_88_228_text_2,142_196_88_228_text_3
2,3,142.196.88.228 vs 142.196.88.228,Wiki,142.196.88.228,142.196.88.228,142_196_88_228_text_2,142_196_88_228_text_4
3,4,142.196.88.228 vs Aban1313,Wiki,142.196.88.228,Aban1313,aban1313_text_4,142_196_88_228_text_1
4,5,142.196.88.228 vs Aban1313,Wiki,142.196.88.228,Aban1313,aban1313_text_4,142_196_88_228_text_3
...,...,...,...,...,...,...,...
445,446,Hipocrite vs Hipocrite,Wiki,Hipocrite,Hipocrite,hipocrite_text_4,hipocrite_text_3
446,447,Hipocrite vs Hipocrite,Wiki,Hipocrite,Hipocrite,hipocrite_text_4,hipocrite_text_5
447,448,Hipocrite vs Hodja_Nasreddin,Wiki,Hipocrite,Hodja_Nasreddin,hodja_nasreddin_text_3,hipocrite_text_2
448,449,Hipocrite vs Hodja_Nasreddin,Wiki,Hipocrite,Hodja_Nasreddin,hodja_nasreddin_text_3,hipocrite_text_3


In [17]:
known.head(2)

Unnamed: 0,doc_id,orig_doc_id,corpus,chunk_id,author,texttype,text,tokens,log_probs,med_log_prob,differences,abs_differences,num_tokens,sum_log_prob,avg_log_prob,mean_diff,mean_abs_diff,perplexity
0,142_196_88_228_text_1,known [142.196.88.228 - Text-1].txt,Wiki,1,142.196.88.228,known,The article that is being referred to via the ...,"[The, article, that, is, being, referred, to, ...","[-16.99749755859375, -9.235851287841797, -4.50...","[-20.143911361694336, -20.143911361694336, -19...","[3.146413803100586, 10.908060073852539, 15.058...","[3.146413803100586, 10.908060073852539, 15.058...",51,-232.244278,-4.553809,16.524808,16.524808,94.993586
1,142_196_88_228_text_1,known [142.196.88.228 - Text-1].txt,Wiki,2,142.196.88.228,known,"This study provides little to nothing, which j...","[This, study, provides, little, to, nothing, ,...","[-14.410652160644531, -6.185196876525879, -3.5...","[-20.189170837402344, -20.189170837402344, -20...","[5.7785186767578125, 14.003973960876465, 17.35...","[5.7785186767578125, 14.003973960876465, 17.35...",21,-99.708021,-4.748001,15.963466,15.963466,115.353463


In [18]:
known_filtered = known[known['author'].isin(['142.196.88.228', 'A_Man_In_Black'])]

In [19]:
known_filtered_pos = apply_posnoise(known_filtered, 'text')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_col + "_POSNoise"] = df[text_col].apply(posnoise_transform)


In [20]:
known_filtered_pos.head()

Unnamed: 0,doc_id,orig_doc_id,corpus,chunk_id,author,texttype,text,tokens,log_probs,med_log_prob,differences,abs_differences,num_tokens,sum_log_prob,avg_log_prob,mean_diff,mean_abs_diff,perplexity,text_POSNoise
0,142_196_88_228_text_1,known [142.196.88.228 - Text-1].txt,Wiki,1,142.196.88.228,known,The article that is being referred to via the ...,"[The, article, that, is, being, referred, to, ...","[-16.99749755859375, -9.235851287841797, -4.50...","[-20.143911361694336, -20.143911361694336, -19...","[3.146413803100586, 10.908060073852539, 15.058...","[3.146413803100586, 10.908060073852539, 15.058...",51,-232.244278,-4.553809,16.524808,16.524808,94.993586,"The # that is being Ø to via the Ø #, Ø the # ..."
1,142_196_88_228_text_1,known [142.196.88.228 - Text-1].txt,Wiki,2,142.196.88.228,known,"This study provides little to nothing, which j...","[This, study, provides, little, to, nothing, ,...","[-14.410652160644531, -6.185196876525879, -3.5...","[-20.189170837402344, -20.189170837402344, -20...","[5.7785186767578125, 14.003973960876465, 17.35...","[5.7785186767578125, 14.003973960876465, 17.35...",21,-99.708021,-4.748001,15.963466,15.963466,115.353463,"This # Ø @ to nothing, which Ø the # that Ø ha..."
2,142_196_88_228_text_1,known [142.196.88.228 - Text-1].txt,Wiki,3,142.196.88.228,known,"The Georgian idea is completely untraceable, a...","[The, Georgian, idea, is, completely, un, trac...","[-16.997499465942383, -14.329171180725098, -10...","[-20.14389991760254, -20.14389991760254, -18.5...","[3.1464004516601562, 5.814728736877441, 8.1168...","[3.1464004516601562, 5.814728736877441, 8.1168...",21,-109.50929,-5.214728,15.34945,15.34945,183.961796,"The @ # is © @, as it is a @ # of some # #."
3,142_196_88_228_text_1,known [142.196.88.228 - Text-1].txt,Wiki,4,142.196.88.228,known,The ongoing perception among scholars is that ...,"[The, ongoing, perception, among, scholars, is...","[-16.997499465942383, -11.254472732543945, -9....","[-20.14389991760254, -20.14389991760254, -18.9...","[3.1464004516601562, 8.889427185058594, 9.8861...","[3.1464004516601562, 8.889427185058594, 9.8861...",44,-193.688556,-4.402013,17.165639,17.165639,81.614965,The @ # among # is that because @ § was in the...
4,142_196_88_228_text_1,known [142.196.88.228 - Text-1].txt,Wiki,5,142.196.88.228,known,It should therefore be considered part of Meso...,"[It, should, therefore, be, considered, part, ...","[-15.906315803527832, -5.273860931396484, -7.5...","[-21.922271728515625, -21.922271728515625, -21...","[6.015955924987793, 16.64841079711914, 14.1235...","[6.015955924987793, 16.64841079711914, 14.1235...",12,-53.89018,-4.490848,17.348327,17.348327,89.197086,It should therefore be Ø part of §.


In [41]:
lm = KneserNeyLanguageModel(10, 0.75)

In [42]:
text = known_filtered_pos.iloc[0,7]

In [43]:
text

['The',
 'article',
 'that',
 'is',
 'being',
 'referred',
 'to',
 'via',
 'the',
 'broken',
 'link',
 ',',
 'draws',
 'the',
 'conclusion',
 'that',
 'win',
 'em',
 'aking',
 'may',
 'have',
 'originated',
 'in',
 'South',
 'Eastern',
 'Turkey',
 'ref',
 'link',
 '',
 '2',
 '-',
 'although',
 'the',
 'authors',
 'did',
 'not',
 'rule',
 'out',
 'Georgia',
 ',',
 'Armenia',
 ',',
 'or',
 'NW',
 'Iran',
 ',',
 'because',
 'lack',
 'of',
 'sampling',
 '.']

In [44]:
lm.fit(text)

In [10]:
results = lambdaG_no_ref(unknown, known, agg_metadata)

    There are 75 known author(s) and 150 problem(s) in the dataset.
        Working on problem 1 of 150: 142.196.88.228 vs 142.196.88.228
        Working on problem 2 of 150: 142.196.88.228 vs Aban1313
        Working on problem 3 of 150: A_Man_In_Black vs A_Man_In_Black
        Working on problem 4 of 150: A_Man_In_Black vs Bankhallbretherton
        Working on problem 5 of 150: Aban1313 vs Aban1313
        Working on problem 6 of 150: Aban1313 vs Akuri
        Working on problem 7 of 150: Akuri vs Akuri
        Working on problem 8 of 150: Akuri vs AlanBarnet
        Working on problem 9 of 150: AlanBarnet vs AlanBarnet
        Working on problem 10 of 150: AlanBarnet vs Alanyst
        Working on problem 11 of 150: Alanyst vs Alanyst
        Working on problem 12 of 150: Alanyst vs AlasdairGreen27
        Working on problem 13 of 150: AlasdairGreen27 vs AlasdairGreen27
        Working on problem 14 of 150: AlasdairGreen27 vs Alienus
        Working on problem 15 of 150: Alienus vs A

In [12]:
results

Unnamed: 0,problem,known_author,unknown_author,target,score
0,142.196.88.228 vs 142.196.88.228,142.196.88.228,142.196.88.228,True,4.127460
1,142.196.88.228 vs Aban1313,142.196.88.228,Aban1313,False,21.480588
2,A_Man_In_Black vs A_Man_In_Black,A_Man_In_Black,A_Man_In_Black,True,8.091887
3,A_Man_In_Black vs Bankhallbretherton,A_Man_In_Black,Bankhallbretherton,False,16.690130
4,Aban1313 vs Aban1313,Aban1313,Aban1313,True,19.900013
...,...,...,...,...,...
145,Haymaker vs HeadleyDown,Haymaker,HeadleyDown,False,11.113534
146,HeadleyDown vs HeadleyDown,HeadleyDown,HeadleyDown,True,8.115137
147,HeadleyDown vs Hipocrite,HeadleyDown,Hipocrite,False,12.766875
148,Hipocrite vs Hipocrite,Hipocrite,Hipocrite,True,9.514822


In [13]:
results_agg = (
    results
    .groupby(['problem', 'target'], as_index=False)
    ['score']
    .mean()
)
score_col = 'score'
target_col = 'target'
performance(results_agg,
            score_col,
            target_col,
            additional_metadata={
                'corpus': corpus
            })

Unnamed: 0,corpus,Cllr,Cllr_min,EER,Mean_TRUE_LLR,Mean_FALSE_LLR,TRUE_trials,FALSE_trials,AUC,Balanced_Accuracy,Precision,Recall,F1,TP,FP,FN,TN
0,Wiki,0.854068,0.854068,0.293333,0.208301,-0.227459,75,75,0.761778,0.713333,0.7,0.746667,0.722581,56,24,19,51


In [65]:
results = lambdaG(unknown, known_filtered, known, agg_metadata)
results_agg = (
    results
    .groupby(['problem', 'target'], as_index=False)
    ['score']
    .mean()
)
score_col = 'score'
target_col = 'target'
performance(results_agg,
            score_col,
            target_col,
            additional_metadata={
                'corpus': corpus
            })

    There are 2 known author(s) and 4 problem(s) in the dataset.
        Working on problem 1 of 4: 142.196.88.228 vs 142.196.88.228
        Working on problem 2 of 4: 142.196.88.228 vs Aban1313
        Working on problem 3 of 4: A_Man_In_Black vs A_Man_In_Black
        Working on problem 4 of 4: A_Man_In_Black vs Bankhallbretherton


  llr = np.log10(pred_probs / (1 - pred_probs))


Unnamed: 0,corpus,Cllr,Cllr_min,EER,Mean_TRUE_LLR,Mean_FALSE_LLR,TRUE_trials,FALSE_trials,AUC,Balanced_Accuracy,Precision,Recall,F1,TP,FP,FN,TN
0,Wiki,2.231305e-07,4.176619e-08,0.0,inf,-6.582147,2,2,1.0,1.0,1.0,1.0,1.0,2,0,0,2


In [70]:
results

Unnamed: 0,problem,known_author,unknown_author,target,score
0,142.196.88.228 vs 142.196.88.228,142.196.88.228,142.196.88.228,True,24276.521243
1,142.196.88.228 vs Aban1313,142.196.88.228,Aban1313,False,-7456.964457
2,A_Man_In_Black vs A_Man_In_Black,A_Man_In_Black,A_Man_In_Black,True,8799.167861
3,A_Man_In_Black vs Bankhallbretherton,A_Man_In_Black,Bankhallbretherton,False,-7149.204521


In [68]:
results_v2 = lambdaG_v2(unknown, known_filtered, known, agg_metadata)
results_agg_v2 = (
    results_v2
    .groupby(['problem', 'target'], as_index=False)
    ['score']
    .mean()
)
score_col = 'score'
target_col = 'target'
performance(results_agg_v2,
            score_col,
            target_col,
            additional_metadata={
                'corpus': corpus
            })

    There are 2 known author(s) and 4 problem(s) in the dataset.
        Working on problem 1 of 4: 142.196.88.228 vs 142.196.88.228
        Working on problem 2 of 4: 142.196.88.228 vs Aban1313
        Working on problem 3 of 4: A_Man_In_Black vs A_Man_In_Black
        Working on problem 4 of 4: A_Man_In_Black vs Bankhallbretherton


Unnamed: 0,corpus,Cllr,Cllr_min,EER,Mean_TRUE_LLR,Mean_FALSE_LLR,TRUE_trials,FALSE_trials,AUC,Balanced_Accuracy,Precision,Recall,F1,TP,FP,FN,TN
0,Wiki,0.05796,0.011887,0.0,4.303298,-3.387815,2,2,1.0,1.0,1.0,1.0,1.0,2,0,0,2


In [69]:
results_v2

Unnamed: 0,problem,known_author,unknown_author,target,score
0,142.196.88.228 vs 142.196.88.228,142.196.88.228,142.196.88.228,True,286.485069
1,142.196.88.228 vs Aban1313,142.196.88.228,Aban1313,False,-17.415187
2,A_Man_In_Black vs A_Man_In_Black,A_Man_In_Black,A_Man_In_Black,True,164.991628
3,A_Man_In_Black vs Bankhallbretherton,A_Man_In_Black,Bankhallbretherton,False,-34.32805


In [71]:
all_results = []

for rep in range(1, 4): 
    print(f"Repetition {rep}")
    df = lambdaG_v2(unknown, known, known, agg_metadata)
    # Add the repetition column at the start:
    df.insert(0, 'repetition', rep)
    df.insert(1, 'corpus', corpus)      # move corpus next
    df.insert(2, 'data_type', data_type)
    df.insert(2, 'token_type', token_type) 
    all_results.append(df)

# Combine all repetitions into one DataFrame
results = pd.concat(all_results, ignore_index=True)

Repetition 1
    There are 75 known author(s) and 150 problem(s) in the dataset.
        Working on problem 1 of 150: 142.196.88.228 vs 142.196.88.228
        Working on problem 2 of 150: 142.196.88.228 vs Aban1313
        Working on problem 3 of 150: A_Man_In_Black vs A_Man_In_Black
        Working on problem 4 of 150: A_Man_In_Black vs Bankhallbretherton
        Working on problem 5 of 150: Aban1313 vs Aban1313
        Working on problem 6 of 150: Aban1313 vs Akuri
        Working on problem 7 of 150: Akuri vs Akuri
        Working on problem 8 of 150: Akuri vs AlanBarnet
        Working on problem 9 of 150: AlanBarnet vs AlanBarnet
        Working on problem 10 of 150: AlanBarnet vs Alanyst
        Working on problem 11 of 150: Alanyst vs Alanyst
        Working on problem 12 of 150: Alanyst vs AlasdairGreen27
        Working on problem 13 of 150: AlasdairGreen27 vs AlasdairGreen27
        Working on problem 14 of 150: AlasdairGreen27 vs Alienus
        Working on problem 15 of 150:

In [72]:
results_agg = (
    results
    .groupby(['problem', 'target'], as_index=False)
    ['score']
    .mean()
)
score_col = 'score'
target_col = 'target'
performance(results_agg,
            score_col,
            target_col,
            additional_metadata={
                'corpus': corpus
            })

  llr = np.log10(pred_probs / (1 - pred_probs))


Unnamed: 0,corpus,Cllr,Cllr_min,EER,Mean_TRUE_LLR,Mean_FALSE_LLR,TRUE_trials,FALSE_trials,AUC,Balanced_Accuracy,Precision,Recall,F1,TP,FP,FN,TN
0,Wiki,0.468696,0.468696,0.146667,inf,-0.916299,75,75,0.934044,0.853333,0.863014,0.84,0.851351,63,10,12,65


In [9]:
save_loc = f"{base_loc}/lambda_g_results/{corpus}_{data_type}_{model_name}_raw.jsonl"
write_jsonl(results, save_loc)

In [None]:
training_loc = f"{base_loc}/lambda_g_results/{corpus}_training_{model_name}_raw.jsonl"
training = read_jsonl(training_loc)

test_loc = f"{base_loc}/lambda_g_results/{corpus}_test_{model_name}_raw.jsonl"
test = read_jsonl(test_loc)

In [None]:
training_results_agg = (
    training
    .groupby(['problem', 'target'], as_index=False)
    ['score']
    .mean()
)

test_results_agg = (
    test
    .groupby(['problem', 'target'], as_index=False)
    ['score']
    .mean()
)

In [None]:
score_col = 'score'
target_col = 'target'

In [None]:
result_metrics = performance(training_results_agg,
                             score_col,
                             target_col,
                             df_test=test_results_agg,
                             additional_metadata={
                                 'corpus': corpus
                             })

In [None]:
result_metrics