In [1]:
import os
import sys

import numpy as np
import pandas as pd

In [2]:
# Make the project's ../src directory importable. The path is resolved
# relative to the current working directory. The membership check keeps the
# cell idempotent: re-running it no longer appends duplicate sys.path entries.
src_dir = os.path.abspath('../src')
if src_dir not in sys.path:
    sys.path.append(src_dir)

from read_and_write_docs import read_jsonl, write_jsonl, read_rds
from utils import apply_temp_doc_id, build_metadata_df
from lambdaG import lambdaG, lambdaG_perplexity
from performance import performance

In [3]:
def compute_perplexity(logprobs):
    """
    Compute sentence-level perplexity from token log-probabilities.

    Perplexity is ``exp(-mean(logprobs))``. Assumes log-probs are natural
    logs (base e), as provided by Qwen-2.5.

    Parameters
    ----------
    logprobs : array-like of float
        Per-token log-probabilities of a single sentence.

    Returns
    -------
    numpy.float64
        The perplexity. NaN when ``logprobs`` is empty, because the mean of
        zero tokens is undefined.
    """
    if np.size(logprobs) == 0:
        # np.mean([]) also evaluates to NaN, but only after emitting
        # "Mean of empty slice" / "invalid value" RuntimeWarnings. Return the
        # same value explicitly so empty sentences do not flood the output.
        return np.float64('nan')
    return np.exp(-np.mean(logprobs))

In [13]:
# --- Run configuration -------------------------------------------------------
base_loc = "/Volumes/BCross/datasets/author_verification"

data_type = "test"
corpus = "Wiki"

model = "Qwen2.5-1.5B-Instruct"
# File-name-friendly model identifier, e.g. "qwen2.5_1.5b_instruct".
model_name = model.lower().replace("-", "_")
token_type = model

# Local copy of the input files. Previously every path was first built under
# `base_loc` and then immediately overwritten with a hard-coded file in this
# directory, so these local files are what the cell actually read. The override
# is now explicit: set `local_data_loc = None` to read from `base_loc` instead.
# File names are derived from `model_name`, so the data read always matches the
# `token_type` recorded in the results.
local_data_loc = "/Users/user/Documents/test_data"

if local_data_loc is not None:
    logprobs_dir = local_data_loc
    metadata_loc = f"{local_data_loc}/metadata.rds"
else:
    logprobs_dir = f"{base_loc}/sentence_log_probs_datasets/{data_type}/{corpus}"
    metadata_loc = f"{base_loc}/{data_type}/metadata.rds"

known_loc = f"{logprobs_dir}/known_sentence_logprobs_{model_name}.jsonl"
unknown_loc = f"{logprobs_dir}/unknown_sentence_logprobs_{model_name}.jsonl"


def load_sentence_logprobs(path):
    """
    Load one sentence-level log-prob JSONL file and add a `perplexity` column.

    Steps, in order: read the file with `read_jsonl`, rename the `sentence`
    column to `text`, pass the frame through `apply_temp_doc_id`, drop rows
    whose `num_tokens` is not positive, then compute `perplexity` from each
    row's `log_probs` with `compute_perplexity`.

    Parameters
    ----------
    path : str
        Location of the JSONL file.

    Returns
    -------
    pandas.DataFrame
        The filtered frame with the extra `perplexity` column.
        NOTE(review): assumes `read_jsonl` and `apply_temp_doc_id` both return
        DataFrames, as the original cell's usage implies - confirm in ../src.
    """
    sentences = read_jsonl(path).rename(columns={'sentence': 'text'})
    sentences = apply_temp_doc_id(sentences)
    # .copy() makes the filtered rows an independent frame, so adding a column
    # below does not trigger pandas' SettingWithCopyWarning.
    sentences = sentences[sentences['num_tokens'] > 0].copy()
    sentences['perplexity'] = sentences['log_probs'].apply(compute_perplexity)
    return sentences


known = load_sentence_logprobs(known_loc)
unknown = load_sentence_logprobs(unknown_loc)

# Problem metadata, restricted to the corpus under study.
metadata = read_rds(metadata_loc)
filtered_metadata = metadata[metadata['corpus'] == corpus]
agg_metadata = build_metadata_df(filtered_metadata, known, unknown)

In [14]:
# Display the `known` frame (Jupyter renders a cell's bare last expression).
known

Unnamed: 0,doc_id,orig_doc_id,corpus,chunk_id,author,texttype,text,tokens,log_probs,med_log_prob,differences,abs_differences,num_tokens,sum_log_prob,avg_log_prob,mean_diff,mean_abs_diff,perplexity
0,142_196_88_228_text_1,known [142.196.88.228 - Text-1].txt,Wiki,1,142.196.88.228,known,The article that is being referred to via the ...,"[The, article, that, is, being, referred, to, ...","[-16.99749755859375, -9.235851287841797, -4.50...","[-20.143911361694336, -20.143911361694336, -19...","[3.146413803100586, 10.908060073852539, 15.058...","[3.146413803100586, 10.908060073852539, 15.058...",51,-232.244278,-4.553809,16.524808,16.524808,94.993586
1,142_196_88_228_text_1,known [142.196.88.228 - Text-1].txt,Wiki,2,142.196.88.228,known,"This study provides little to nothing, which j...","[This, study, provides, little, to, nothing, ,...","[-14.410652160644531, -6.185196876525879, -3.5...","[-20.189170837402344, -20.189170837402344, -20...","[5.7785186767578125, 14.003973960876465, 17.35...","[5.7785186767578125, 14.003973960876465, 17.35...",21,-99.708021,-4.748001,15.963466,15.963466,115.353463
2,142_196_88_228_text_1,known [142.196.88.228 - Text-1].txt,Wiki,3,142.196.88.228,known,"The Georgian idea is completely untraceable, a...","[The, Georgian, idea, is, completely, un, trac...","[-16.997499465942383, -14.329171180725098, -10...","[-20.14389991760254, -20.14389991760254, -18.5...","[3.1464004516601562, 5.814728736877441, 8.1168...","[3.1464004516601562, 5.814728736877441, 8.1168...",21,-109.509290,-5.214728,15.349450,15.349450,183.961796
3,142_196_88_228_text_1,known [142.196.88.228 - Text-1].txt,Wiki,4,142.196.88.228,known,The ongoing perception among scholars is that ...,"[The, ongoing, perception, among, scholars, is...","[-16.997499465942383, -11.254472732543945, -9....","[-20.14389991760254, -20.14389991760254, -18.9...","[3.1464004516601562, 8.889427185058594, 9.8861...","[3.1464004516601562, 8.889427185058594, 9.8861...",44,-193.688556,-4.402013,17.165639,17.165639,81.614965
4,142_196_88_228_text_1,known [142.196.88.228 - Text-1].txt,Wiki,5,142.196.88.228,known,It should therefore be considered part of Meso...,"[It, should, therefore, be, considered, part, ...","[-15.906315803527832, -5.273860931396484, -7.5...","[-21.922271728515625, -21.922271728515625, -21...","[6.015955924987793, 16.64841079711914, 14.1235...","[6.015955924987793, 16.64841079711914, 14.1235...",12,-53.890180,-4.490848,17.348327,17.348327,89.197086
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4169,hipocrite_text_5,known [Hipocrite - Text-5].txt,Wiki,9,Hipocrite,known,Perhaps the next time someone brings you a sec...,"[Perhaps, the, next, time, someone, brings, yo...","[-19.00861930847168, -0.6481980681419373, -6.4...","[-20.851003646850586, -20.851003646850586, -19...","[1.8423843383789062, 20.20280557870865, 13.257...","[1.8423843383789062, 20.20280557870865, 13.257...",42,-190.126232,-4.526815,16.409739,16.409739,92.463599
4170,hipocrite_text_5,known [Hipocrite - Text-5].txt,Wiki,10,Hipocrite,known,How many of you ran for arbcom with a transpar...,"[How, many, of, you, ran, for, arb, com, with,...","[-21.734783172607422, -2.3489601612091064, -4....","[-22.568588256835938, -22.568588256835938, -21...","[0.8338050842285156, 20.21962809562683, 16.913...","[0.8338050842285156, 20.21962809562683, 16.913...",16,-106.689189,-6.668074,13.303092,13.303092,786.878863
4171,hipocrite_text_5,known [Hipocrite - Text-5].txt,Wiki,11,Hipocrite,known,Release all the emails that do not have person...,"[Release, all, the, emails, that, do, not, hav...","[-8.069883346557617, -7.805580139160156, -2.14...","[-19.67418670654297, -19.67418670654297, -17.3...","[11.604303359985352, 11.868606567382812, 15.24...","[11.604303359985352, 11.868606567382812, 15.24...",20,-91.973492,-4.598675,16.050299,16.050299,99.352546
4172,hipocrite_text_5,known [Hipocrite - Text-5].txt,Wiki,12,Hipocrite,known,Stop accepting emails except for information t...,"[Stop, accepting, emails, except, for, informa...","[-9.637933731079102, -10.569330215454102, -6.2...","[-17.2180118560791, -17.2180118560791, -17.461...","[7.580078125, 6.648681640625, 11.2410035133361...","[7.580078125, 6.648681640625, 11.2410035133361...",10,-59.482925,-5.948293,13.358489,13.358489,383.098651


In [6]:
# known_filtered = known[known['author'].isin(['Hodja_Nasreddin', 'ZjarriRrethues'])]

In [7]:
# results = lambdaG_perplexity(unknown, known, known, agg_metadata)
# results_agg = (
#     results
#     .groupby(['problem', 'target'], as_index=False)
#     ['score']
#     .mean()
# )
# score_col = 'score'
# target_col = 'target'
# performance(results_agg,
#             score_col,
#             target_col,
#             additional_metadata={
#                 'corpus': corpus
#             })

In [8]:
# Run lambdaG several times and collect the tagged outputs.
# NOTE(review): `known` is passed as both the 2nd and 3rd argument -
# confirm against lambdaG's signature that this is intended.
n_repetitions = 5
all_results = []

for rep in range(1, n_repetitions + 1):
    print(f"Repetition {rep}")
    df = lambdaG(unknown, known, known, agg_metadata)

    # Run-level tags, listed in the order they appear as the leading columns
    # of each result frame (token_type comes before data_type).
    leading_columns = [
        ('repetition', rep),
        ('corpus', corpus),
        ('token_type', token_type),
        ('data_type', data_type),
    ]
    for position, (column, value) in enumerate(leading_columns):
        df.insert(position, column, value)

    all_results.append(df)

# Stack every repetition into a single frame with a fresh 0..n-1 index.
results = pd.concat(all_results, ignore_index=True)

Repetition 1
    There are 48 known author(s) and 96 problem(s) in the dataset.
        Working on problem 1 of 96: Kevin.hyatt vs Kevin.hyatt
        Working on problem 2 of 96: Kevin.hyatt vs Kimberly.watson
        Working on problem 3 of 96: Kimberly.watson vs Kimberly.watson
        Working on problem 4 of 96: Kimberly.watson vs Larry.campbell
        Working on problem 5 of 96: Larry.campbell vs Larry.campbell
        Working on problem 6 of 96: Larry.campbell vs Lindy.donoho
        Working on problem 7 of 96: Lindy.donoho vs Lindy.donoho
        Working on problem 8 of 96: Lindy.donoho vs Liz.taylor
        Working on problem 9 of 96: Liz.taylor vs Liz.taylor
        Working on problem 10 of 96: Liz.taylor vs Louise.kitchen
        Working on problem 11 of 96: Louise.kitchen vs Louise.kitchen
        Working on problem 12 of 96: Louise.kitchen vs Lynn.blair
        Working on problem 13 of 96: Lynn.blair vs Lynn.blair
        Working on problem 14 of 96: Lynn.blair vs Lysa.akin

In [9]:
# Write the combined per-repetition results under the lambda_g_results
# folder of `base_loc`; the file name encodes corpus, split and model.
results_dir = f"{base_loc}/lambda_g_results"
save_loc = f"{results_dir}/{corpus}_{data_type}_{model_name}_raw.jsonl"
write_jsonl(results, save_loc)

In [None]:
# Read back the raw lambdaG results for the training and test splits of the
# current corpus/model from the lambda_g_results folder of `base_loc`.
results_dir = f"{base_loc}/lambda_g_results"

training_loc = f"{results_dir}/{corpus}_training_{model_name}_raw.jsonl"
training = read_jsonl(training_loc)

test_loc = f"{results_dir}/{corpus}_test_{model_name}_raw.jsonl"
test = read_jsonl(test_loc)

In [None]:
def mean_score_per_problem(raw_results):
    """Average `score` over all rows sharing a (`problem`, `target`) pair.

    Returns a frame with one row per pair and the columns `problem`,
    `target` and `score`.
    """
    per_problem = raw_results.groupby(['problem', 'target'], as_index=False)
    return per_problem['score'].mean()


training_results_agg = mean_score_per_problem(training)

test_results_agg = mean_score_per_problem(test)

In [None]:
# Names of the score and ground-truth columns handed to performance().
score_col, target_col = 'score', 'target'

In [None]:
# Evaluate with the aggregated training scores as the first argument and the
# aggregated test scores as `df_test`; the corpus name is attached as
# additional metadata.
extra_metadata = {'corpus': corpus}

result_metrics = performance(
    training_results_agg,
    score_col,
    target_col,
    df_test=test_results_agg,
    additional_metadata=extra_metadata,
)

In [None]:
# Display the object returned by performance() (bare last expression).
result_metrics