In [1]:
import os
import sys

import numpy as np
import pandas as pd

In [2]:
sys.path.append(os.path.abspath('../src'))

from read_and_write_docs import read_jsonl, write_jsonl, read_rds
from utils import apply_temp_doc_id, build_metadata_df
from lambdaG import lambdaG, lambdaG_perplexity
from performance import performance

In [3]:
def compute_perplexity(logprobs):
    """
    Turn a sequence of per-token log-probabilities into a perplexity value.

    The log-probabilities are treated as natural logarithms (base e), so the
    perplexity is ``exp`` of the negated mean log-probability.

    Parameters
    ----------
    logprobs : sequence of float
        Token log-probabilities for one sentence.

    Returns
    -------
    float
        exp(-mean(logprobs)).
    """
    mean_log_prob = np.mean(logprobs)
    # Negating the mean log-probability gives the average negative
    # log-likelihood per token; exponentiating yields the perplexity.
    return np.exp(np.negative(mean_log_prob))

In [4]:
# --- Configuration -----------------------------------------------------------
# NOTE(review): absolute, machine-specific path; consider making it configurable.
base_loc = "/Volumes/BCross/datasets/author_verification"

data_type = "test"
corpus = "Wiki"

model = "Qwen2.5-1.5B-Instruct"
# File names use the lower-cased model name with hyphens replaced by underscores.
model_name = model.lower().replace("-", "_")
token_type = model


def load_sentence_logprobs(path):
    """
    Load one sentence-level log-prob JSONL file and prepare it for scoring.

    Steps (identical for the known and unknown sets):
      1. read the JSONL file and rename the 'sentence' column to 'text';
      2. pass the frame through ``apply_temp_doc_id``;
      3. drop rows with no tokens (``num_tokens`` <= 0);
      4. add a 'perplexity' column computed from each row's 'log_probs'.

    Parameters
    ----------
    path : str
        Location of the JSONL file to read.

    Returns
    -------
    pandas.DataFrame
        The prepared sentence-level frame, independent of the frame it was
        filtered from.
    """
    sentences = read_jsonl(path).rename(columns={'sentence': 'text'})
    sentences = apply_temp_doc_id(sentences)
    # .copy() detaches the filtered rows from the parent frame so that adding
    # the 'perplexity' column below does not raise SettingWithCopyWarning.
    sentences = sentences.loc[sentences['num_tokens'] > 0].copy()
    sentences['perplexity'] = sentences['log_probs'].apply(compute_perplexity)
    return sentences


# --- Sentence-level log-probabilities ---------------------------------------
known_loc = f"{base_loc}/sentence_log_probs_datasets/{data_type}/{corpus}/known_sentence_logprobs_{model_name}.jsonl"
known = load_sentence_logprobs(known_loc)

unknown_loc = f"{base_loc}/sentence_log_probs_datasets/{data_type}/{corpus}/unknown_sentence_logprobs_{model_name}.jsonl"
unknown = load_sentence_logprobs(unknown_loc)

# --- Problem metadata --------------------------------------------------------
metadata_loc = f"{base_loc}/{data_type}/metadata.rds"
metadata = read_rds(metadata_loc)
# Keep only the metadata rows for the corpus being evaluated.
filtered_metadata = metadata[metadata['corpus'] == corpus]
agg_metadata = build_metadata_df(filtered_metadata, known, unknown)

In [5]:
# Display the prepared `known` frame as a visual sanity check of its columns.
known

Unnamed: 0,doc_id,orig_doc_id,corpus,chunk_id,author,texttype,text,tokens,log_probs,med_log_prob,differences,abs_differences,num_tokens,sum_log_prob,avg_log_prob,mean_diff,mean_abs_diff,perplexity
0,hodja_nasreddin_text_1,known [Hodja_Nasreddin - Text-1].txt,Wiki,1,Hodja_Nasreddin,known,If they actually censor anything is another qu...,"[If, they, actually, censor, anything, is, ano...","[-16.8627872467041, -9.027709007263184, -6.257...","[-21.484634399414062, -21.484634399414062, -19...","[4.621847152709961, 12.456925392150879, 13.358...","[4.621847152709961, 12.456925392150879, 13.358...",9,-64.705513,-7.189501,13.161048,13.161048,1325.442305
1,hodja_nasreddin_text_1,known [Hodja_Nasreddin - Text-1].txt,Wiki,2,Hodja_Nasreddin,known,"Unlike others, Medvedev is an internationally ...","[Unlike, others, ,, Med, ved, ev, is, an, inte...","[-19.534250259399414, -7.3190765380859375, -0....","[-19.001110076904297, -19.001110076904297, -19...","[-0.5331401824951172, 11.68203353881836, 18.70...","[0.5331401824951172, 11.68203353881836, 18.706...",12,-63.955127,-5.329594,14.519564,14.608421,206.354155
2,hodja_nasreddin_text_1,known [Hodja_Nasreddin - Text-1].txt,Wiki,3,Hodja_Nasreddin,known,He tells that these people are governmental bu...,"[He, tells, that, these, people, are, governme...","[-12.041280746459961, -7.456192970275879, -4.8...","[-18.846717834472656, -18.846717834472656, -19...","[6.805437088012695, 11.390524864196777, 14.419...","[6.805437088012695, 11.390524864196777, 14.419...",15,-77.758858,-5.183924,14.607707,14.607707,178.381390
3,hodja_nasreddin_text_1,known [Hodja_Nasreddin - Text-1].txt,Wiki,4,Hodja_Nasreddin,known,"Main point this is a Can anyone clarify, pleas...","[Main, point, this, is, a, Can, anyone, clarif...","[-11.098122596740723, -9.430742263793945, -8.1...","[-18.113418579101562, -18.113418579101562, -18...","[7.01529598236084, 8.682676315307617, 9.983802...","[7.01529598236084, 8.682676315307617, 9.983802...",28,-131.723327,-4.704405,14.524282,14.524282,110.432508
4,hodja_nasreddin_text_1,known [Hodja_Nasreddin - Text-1].txt,Wiki,5,Hodja_Nasreddin,known,I hope you do not suggest to replace three lar...,"[I, hope, you, do, not, suggest, to, replace, ...","[-13.71926498413086, -6.178842544555664, -0.97...","[-20.256166458129883, -20.256166458129883, -20...","[6.536901473999023, 14.077323913574219, 19.066...","[6.536901473999023, 14.077323913574219, 19.066...",21,-129.736234,-6.177916,13.227157,13.227157,481.986392
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6655,zjarrirrethues_text_5,known [ZjarriRrethues - Text-5].txt,Wiki,11,ZjarriRrethues,known,You 'really' don't need votestacking so learn ...,"[You, ', really, ', don, 't, need, vot, est, a...","[-17.53136444091797, -11.629192352294922, -13....","[-23.94777488708496, -23.94777488708496, -19.8...","[6.416410446166992, 12.318582534790039, 6.8424...","[6.416410446166992, 12.318582534790039, 6.8424...",18,-139.576474,-7.754249,12.691492,12.691492,2331.456752
6656,zjarrirrethues_text_5,known [ZjarriRrethues - Text-5].txt,Wiki,12,ZjarriRrethues,known,"If I were to report you, I would have done so ...","[If, I, were, to, report, you, ,, I, would, ha...","[-16.8627872467041, -5.449200630187988, -4.011...","[-21.484634399414062, -21.484634399414062, -20...","[4.621847152709961, 16.035433769226074, 16.798...","[4.621847152709961, 16.035433769226074, 16.798...",18,-79.414373,-4.411910,15.903303,15.903303,82.426716
6657,zjarrirrethues_text_5,known [ZjarriRrethues - Text-5].txt,Wiki,13,ZjarriRrethues,known,"However, if in the meantime you wait for 24 ho...","[However, ,, if, in, the, meantime, you, wait,...","[-23.379371643066406, -0.09925549477338791, -3...","[-21.31941032409668, -21.31941032409668, -18.4...","[-2.0599613189697266, 21.220154829323292, 15.2...","[2.0599613189697266, 21.220154829323292, 15.23...",37,-160.728426,-4.344012,16.014461,16.125811,77.015871
6658,zjarrirrethues_text_5,known [ZjarriRrethues - Text-5].txt,Wiki,14,ZjarriRrethues,known,I've started an RM on the appropriate title of...,"[I, 've, started, an, RM, on, the, appropriate...","[-13.71926498413086, -4.819160461425781, -5.04...","[-20.256166458129883, -20.256166458129883, -21...","[6.536901473999023, 15.437005996704102, 16.663...","[6.536901473999023, 15.437005996704102, 16.663...",21,-124.845412,-5.945020,14.090888,14.090888,381.846857


In [6]:
# Subset of `known` restricted to two authors.
# NOTE(review): `known_filtered` is not referenced by any later cell visible
# here; confirm whether it was meant to be passed to the scoring call.
known_filtered = known.loc[
    known['author'].isin(['Hodja_Nasreddin', 'ZjarriRrethues'])
]

In [7]:
# Column names handed to performance().
score_col = 'score'
target_col = 'target'

# Score every problem; `known` is passed as both the second and third argument.
results = lambdaG_perplexity(unknown, known, known, agg_metadata)

# Collapse to one mean score per (problem, target) pair.
results_agg = results.groupby(['problem', 'target'], as_index=False)['score'].mean()

# NOTE(review): the stored output of this cell reports AUC ~ 0.01 and
# EER ~ 0.99, which looks like inverted score polarity -- confirm against
# lambdaG_perplexity / performance before relying on these metrics.
performance(
    results_agg,
    score_col,
    target_col,
    additional_metadata={'corpus': corpus},
)

    There are 112 known author(s) and 224 problem(s) in the dataset.
        Working on problem 1 of 224: Hodja_Nasreddin vs Hodja_Nasreddin

Document-level score (median of sentences): 0.0798
        Working on problem 2 of 224: Hodja_Nasreddin vs HonestopL

Document-level score (median of sentences): 0.0873
        Working on problem 3 of 224: HonestopL vs HonestopL

Document-level score (median of sentences): 0.2380
        Working on problem 4 of 224: HonestopL vs HOOTmag

Document-level score (median of sentences): 0.1605
        Working on problem 5 of 224: HOOTmag vs HOOTmag

Document-level score (median of sentences): 0.2355
        Working on problem 6 of 224: HOOTmag vs Iain99

Document-level score (median of sentences): 0.2080
        Working on problem 7 of 224: Iain99 vs Iain99

Document-level score (median of sentences): 0.0926
        Working on problem 8 of 224: Iain99 vs Icarus3

Document-level score (median of sentences): 0.1226
        Working on problem 9 of 224: Ic

Unnamed: 0,corpus,Cllr,Cllr_min,EER,AUC,Balanced_Accuracy,Precision,Recall,F1,TP,FP,FN,TN,Mean_TRUE_LLR,Mean_FALSE_LLR,TRUE_trials,FALSE_trials
0,Wiki,1.006522,1.006522,0.991071,0.009566,0.017857,0.034483,0.035714,0.035088,4,112,108,0,-0.003934,0.003896,112,112


In [8]:
# DISABLED CELL: runs lambdaG five times on the current split and stacks the
# results into `results`. It is kept commented out; the cell that reads the
# `*_raw.jsonl` files below relies on files saved from an earlier run.
#
# all_results = []

# for rep in range(1, 6):
#     print(f"Repetition {rep}")
#     df = lambdaG(unknown, known, known, agg_metadata)
#     # Add the repetition column at the start:
#     df.insert(0, 'repetition', rep)
#     df.insert(1, 'corpus', corpus)      # move corpus next
#     df.insert(2, 'data_type', data_type)
#     df.insert(2, 'token_type', token_type)
#     all_results.append(df)

# # Combine all repetitions into one DataFrame
# results = pd.concat(all_results, ignore_index=True)

In [9]:
# DISABLED CELL: writes `results` to the lambda_g_results folder. The file
# name pattern ({corpus}_{data_type}_{model_name}_raw.jsonl) matches the
# training/test files loaded in the next cell.
#
# save_loc = f"{base_loc}/lambda_g_results/{corpus}_{data_type}_{model_name}_raw.jsonl"
# write_jsonl(results, save_loc)

In [10]:
# Locations of the previously saved raw lambdaG results for each split.
training_loc = f"{base_loc}/lambda_g_results/{corpus}_training_{model_name}_raw.jsonl"
test_loc = f"{base_loc}/lambda_g_results/{corpus}_test_{model_name}_raw.jsonl"

# Read both splits into DataFrames.
training = read_jsonl(training_loc)
test = read_jsonl(test_loc)

In [11]:
def mean_score_per_problem(raw_results):
    """
    Average the 'score' column within each (problem, target) pair.

    Parameters
    ----------
    raw_results : pandas.DataFrame
        Frame with at least the columns 'problem', 'target' and 'score'.

    Returns
    -------
    pandas.DataFrame
        One row per (problem, target) with the mean 'score'; the grouping
        keys are kept as regular columns (``as_index=False``).
    """
    grouped = raw_results.groupby(['problem', 'target'], as_index=False)
    return grouped['score'].mean()


# Same aggregation applied to both splits.
training_results_agg = mean_score_per_problem(training)
test_results_agg = mean_score_per_problem(test)

In [12]:
# Names of the score and ground-truth columns passed to performance().
score_col, target_col = 'score', 'target'

In [13]:
# Evaluate with the aggregated training scores as the first frame and the
# aggregated test scores supplied via `df_test`.
# NOTE(review): presumably performance() calibrates on the first frame and
# reports metrics on `df_test` -- confirm against its implementation.
result_metrics = performance(
    training_results_agg,
    score_col,
    target_col,
    df_test=test_results_agg,
    additional_metadata={'corpus': corpus},
)

In [14]:
# Show the metrics table returned by performance() for the training/test run.
result_metrics

Unnamed: 0,corpus,Cllr,Cllr_min,EER,AUC,Balanced_Accuracy,Precision,Recall,F1,TP,FP,FN,TN,Mean_TRUE_LLR,Mean_FALSE_LLR,TRUE_trials,FALSE_trials
0,Wiki,0.476646,0.476489,0.151786,0.931043,0.857143,0.863636,0.848214,0.855856,95,15,17,97,1.242727,-0.905586,112,112
