In [1]:
import os
import sys

import numpy as np
import pandas as pd

In [2]:
# Make the project's source directory importable. The relative path '../src'
# is resolved against the current working directory by os.path.abspath, so
# this cell assumes the kernel was started from the notebook's own folder.
sys.path.append(os.path.abspath('../src'))

# Project-local helpers (found via the path added above).
from read_and_write_docs import read_jsonl, write_jsonl, read_rds
from utils import apply_temp_doc_id, build_metadata_df
from lambdaG import lambdaG, lambdaG_paraphrase
from performance import performance

In [3]:
def compute_perplexity(logprobs):
    """
    Return the perplexity of a sentence from its per-token log-probabilities.

    Perplexity is the exponential of the negated mean log-probability.
    Because np.exp is used, the inputs are expected to be natural logs
    (base e), which is what Qwen-2.5 is stated to provide.
    """
    mean_logprob = np.mean(logprobs)
    return np.exp(-mean_logprob)

In [4]:
# --- Run configuration ------------------------------------------------------
base_loc = "/Volumes/BCross/datasets/author_verification"

data_type = "training"
corpus = "Wiki"

model = "Qwen2.5-1.5B-Instruct"
model_name = model.lower().replace("-", "_")
token_type = model

# Previously each *_loc was computed from base_loc and then immediately
# overwritten by a hard-coded local path, leaving the first assignment dead.
# The override is now an explicit switch: True reads the local test copies
# (the same files the notebook was reading before), False reads from base_loc.
use_local_test_data = True
local_test_loc = "/Users/user/Documents/test_data"


def load_sentence_logprobs(path):
    """
    Load one sentence-level log-prob JSONL file and prepare it for scoring.

    Steps: read the file, rename the 'sentence' column to 'text', apply
    temporary doc ids, keep only rows with num_tokens > 0, and add a
    'perplexity' column computed from each row's 'log_probs'.

    Parameters
    ----------
    path : str
        Location of the JSONL file to read.

    Returns
    -------
    DataFrame
        The filtered frame with the extra 'perplexity' column.
    """
    sentences = read_jsonl(path)
    sentences = sentences.rename(columns={'sentence': 'text'})
    sentences = apply_temp_doc_id(sentences)
    # .copy() so the column assignment below writes to an independent frame
    # rather than to a filtered slice (avoids SettingWithCopyWarning).
    sentences = sentences[sentences['num_tokens'] > 0].copy()
    sentences['perplexity'] = sentences['log_probs'].apply(compute_perplexity)
    return sentences


# --- Resolve input locations ------------------------------------------------
if use_local_test_data:
    known_loc = f"{local_test_loc}/known_sentence_logprobs_{model_name}.jsonl"
    unknown_loc = f"{local_test_loc}/unknown_sentence_logprobs_{model_name}.jsonl"
    metadata_loc = f"{local_test_loc}/metadata.rds"
else:
    known_loc = f"{base_loc}/sentence_log_probs_datasets/{data_type}/{corpus}/known_sentence_logprobs_{model_name}.jsonl"
    unknown_loc = f"{base_loc}/sentence_log_probs_datasets/{data_type}/{corpus}/unknown_sentence_logprobs_{model_name}.jsonl"
    metadata_loc = f"{base_loc}/{data_type}/metadata.rds"

# --- Load data --------------------------------------------------------------
known = load_sentence_logprobs(known_loc)
unknown = load_sentence_logprobs(unknown_loc)

metadata = read_rds(metadata_loc)
# Keep only the metadata rows for the corpus under study.
filtered_metadata = metadata[metadata['corpus'] == corpus]
agg_metadata = build_metadata_df(filtered_metadata, known, unknown)

In [5]:
# Directory holding the tokenized top-impostor files; one JSONL file is read
# from it here, and the directory is also passed to lambdaG_paraphrase later.
impostor_loc = "/Users/user/Documents/test_data/top_impostors_tokenized"

# Read a single impostor file as a sample and preview its first rows.
sample_impostor_path = f"{impostor_loc}/alanyst_text_13.jsonl"
test_impostors = read_jsonl(sample_impostor_path)

test_impostors.head(2)

Unnamed: 0,corpus,doc_id,impostor_id,chunk_id,author,texttype,text,tokens,log_probs,med_log_prob,differences,abs_differences,num_tokens,sum_log_prob,avg_log_prob,mean_diff,mean_abs_diff
0,Wiki,alanyst_text_13,1,1,Alanyst,known,"It is a caricature, and you have not clearly s...","[It, is, a, caricature,, and, you, have, not, ...","[-15.906315803527832, -0.5864660143852234, -2....","[-21.922271728515625, -21.922271728515625, -20...","[6.015955924987793, 21.3358057141304, 18.59740...","[6.015955924987793, 21.3358057141304, 18.59740...",26,-138.721345,-5.335436,15.50869,15.50869
1,Wiki,alanyst_text_13,1,2,Alanyst,known,I hasten to add that I do not believe you are ...,"[I, hasten, to, add, that, I, do, not, believe...","[-13.71926498413086, -15.373607635498047, -0.0...","[-20.256166458129883, -20.256166458129883, -24...","[6.536901473999023, 4.882558822631836, 23.9908...","[6.536901473999023, 4.882558822631836, 23.9908...",17,-53.507081,-3.147475,18.359526,18.359526


In [13]:
# Restrict the known set to the two authors under test.
# A doc_id-based filter (['athenean_text_1', 'alanyst_text_13']) used to be
# assigned to known_filtered on the line before this one, but it was
# unconditionally overwritten by the author filter and never took effect,
# so that dead assignment has been removed.
known_filtered = known[known['author'].isin(['Alanyst', 'Athenean'])]
known_filtered.head(2)

Unnamed: 0,doc_id,orig_doc_id,corpus,chunk_id,author,texttype,text,tokens,log_probs,med_log_prob,differences,abs_differences,num_tokens,sum_log_prob,avg_log_prob,mean_diff,mean_abs_diff,perplexity
301,alanyst_text_1,known [Alanyst - Text-1].txt,Wiki,1,Alanyst,known,"Strictly speaking, we don't know whether the N...","[Strict, ly, speaking, ,, we, don, 't, know, w...","[-12.616437911987305, -3.1433467864990234, -1....","[-18.865985870361328, -18.865985870361328, -18...","[6.249547958374023, 15.722639083862305, 16.654...","[6.249547958374023, 15.722639083862305, 16.654...",45,-169.019963,-3.755999,17.431134,17.431134,42.77694
302,alanyst_text_1,known [Alanyst - Text-1].txt,Wiki,2,Alanyst,known,Because there is at least one reliable source ...,"[Because, there, is, at, least, one, reliable,...","[-19.322343826293945, -6.060644149780273, -1.1...","[-19.10921859741211, -19.10921859741211, -20.8...","[-0.21312522888183594, 13.048574447631836, 19....","[0.21312522888183594, 13.048574447631836, 19.6...",27,-115.161958,-4.265258,16.34956,16.365347,71.183262


In [14]:
# Score the unknown sentences against the filtered known set.
# Per the printed log, the call works problem by problem and skips known
# documents whose impostor file is not found under impostor_loc.
results = lambdaG_paraphrase(
    unknown,
    known_filtered,
    metadata=agg_metadata,
    impostor_loc=impostor_loc,
)

    There are 2 known author(s) and 4 problem(s) in the dataset.
        Working on problem 1 of 4: Alanyst vs Alanyst
File not found, skipping: /Users/user/Documents/test_data/top_impostors_tokenized/alanyst_text_1.jsonl
File not found, skipping: /Users/user/Documents/test_data/top_impostors_tokenized/alanyst_text_12.jsonl
2 known_docs are missing from refs_filtered and will be skipped: ['alanyst_text_1', 'alanyst_text_12']
        Working on problem 2 of 4: Alanyst vs AlasdairGreen27
File not found, skipping: /Users/user/Documents/test_data/top_impostors_tokenized/alanyst_text_1.jsonl
File not found, skipping: /Users/user/Documents/test_data/top_impostors_tokenized/alanyst_text_12.jsonl
2 known_docs are missing from refs_filtered and will be skipped: ['alanyst_text_1', 'alanyst_text_12']
        Working on problem 3 of 4: Athenean vs Athenean
File not found, skipping: /Users/user/Documents/test_data/top_impostors_tokenized/athenean_text_3.jsonl
File not found, skipping: /Users/user/D

In [10]:
# Display the raw per-problem scores.
# NOTE(review): the saved output for this cell carries execution count 10 and
# shows 2 rows, while the In [14] run above reports 4 problems -- the stored
# output is presumably from an earlier run; re-run top to bottom to confirm.
results

Unnamed: 0,problem,known_author,unknown_author,target,score
0,Alanyst vs Alanyst,Alanyst,Alanyst,True,25776.30054
1,Alanyst vs AlasdairGreen27,Alanyst,AlasdairGreen27,False,37328.820697


In [15]:
# Column names shared by the aggregation and the metrics call.
score_col = 'score'
target_col = 'target'

# Collapse to one mean score per (problem, target) pair.
grouped_scores = results.groupby(['problem', target_col], as_index=False)
results_agg = grouped_scores[score_col].mean()

# Compute the performance metrics on the aggregated scores, tagging the
# output with the corpus name.
performance(
    results_agg,
    score_col,
    target_col,
    additional_metadata={'corpus': corpus},
)

Unnamed: 0,corpus,Cllr,Cllr_min,EER,AUC,Balanced_Accuracy,Precision,Recall,F1,TP,FP,FN,TN,Mean_TRUE_LLR,Mean_FALSE_LLR,TRUE_trials,FALSE_trials
0,Wiki,13.206207,13.136721,0.5,0.25,0.5,0.5,0.5,0.5,1,1,1,1,-3.827564,2.714702,2,2


In [None]:
# all_results = []

# for rep in range(1, 6): 
#     print(f"Repetition {rep}")
#     df = lambdaG_paraphrase(unknown, known_filtered,
#                             metadata=agg_metadata, impostor_loc=impostor_loc)
#     # Add the repetition column at the start:
#     df.insert(0, 'repetition', rep)
#     df.insert(1, 'corpus', corpus)      # move corpus next
#     df.insert(2, 'data_type', data_type)
#     df.insert(2, 'token_type', token_type) 
#     all_results.append(df)

# # Combine all repetitions into one DataFrame
# results = pd.concat(all_results, ignore_index=True)

In [None]:
# results_agg = (
#     results
#     .groupby(['problem', 'target'], as_index=False)
#     ['score']
#     .mean()
# )
# score_col = 'score'
# target_col = 'target'
# performance(results_agg,
#             score_col,
#             target_col,
#             additional_metadata={
#                 'corpus': corpus
#             })

In [None]:
# save_loc = f"{base_loc}/lambda_g_results/{corpus}_{data_type}_{model_name}_raw.jsonl"
# write_jsonl(results, save_loc)

In [None]:
# training_loc = f"{base_loc}/lambda_g_results/{corpus}_training_{model_name}_raw.jsonl"
# training = read_jsonl(training_loc)

# test_loc = f"{base_loc}/lambda_g_results/{corpus}_test_{model_name}_raw.jsonl"
# test = read_jsonl(test_loc)

In [None]:
# training_results_agg = (
#     training
#     .groupby(['problem', 'target'], as_index=False)
#     ['score']
#     .mean()
# )

# test_results_agg = (
#     test
#     .groupby(['problem', 'target'], as_index=False)
#     ['score']
#     .mean()
# )

In [None]:
# score_col = 'score'
# target_col = 'target'

In [None]:
# result_metrics = performance(training_results_agg,
#                              score_col,
#                              target_col,
#                              df_test=test_results_agg,
#                              additional_metadata={
#                                  'corpus': corpus
#                              })

In [None]:
# result_metrics