In [1]:
import os
import sys

import numpy as np
import pandas as pd

In [2]:
# Make the project's ../src directory importable; the relative path is resolved
# against the current working directory at the time this cell runs.
sys.path.append(os.path.abspath('../src'))

from read_and_write_docs import read_jsonl, write_jsonl, read_rds
from utils import apply_temp_doc_id, build_metadata_df
from lambdaG import extract_ngrams, lambdaG_paraphrase, lambdaG, lambdaG_v2
from performance import performance

In [3]:
def compute_perplexity(logprobs):
    """
    Return the perplexity implied by a sequence of token log-probabilities.

    The value is ``exp(-mean(logprobs))``, so the inputs are treated as
    natural-log (base e) probabilities.
    """
    mean_logprob = np.mean(logprobs)
    return np.exp(-mean_logprob)

In [4]:
# --- Run configuration --------------------------------------------------------
base_loc = "/Volumes/BCross/datasets/author_verification"

data_type = "training"
corpus = "Wiki"

model = "Qwen2.5-1.5B-Instruct"
model_name = model.lower().replace("-", "_")  # -> "qwen2.5_1.5b_instruct"
token_type = model


def _load_sentence_logprobs(path):
    """
    Read a sentence-level log-prob JSONL file and prepare it for scoring.

    Renames the 'sentence' column to 'text', passes the frame through
    ``apply_temp_doc_id``, keeps only rows whose 'num_tokens' is positive and
    adds a 'perplexity' column computed from each row's 'log_probs'.
    """
    sentences = read_jsonl(path).rename(columns={'sentence': 'text'})
    sentences = apply_temp_doc_id(sentences)
    # .copy() yields an independent frame, so adding 'perplexity' below does not
    # write into a slice of the unfiltered frame (pandas SettingWithCopyWarning).
    sentences = sentences[sentences['num_tokens'] > 0].copy()
    sentences['perplexity'] = sentences['log_probs'].apply(compute_perplexity)
    return sentences


# NOTE(review): every *_loc below is first built from base_loc and then
# immediately overridden with a hard-coded local test path, so only the second
# assignment takes effect. Remove the override line to read from base_loc.
known_loc = f"{base_loc}/sentence_log_probs_datasets/{data_type}/{corpus}/known_sentence_logprobs_{model_name}.jsonl"
known_loc = "/Users/user/Documents/test_data/known_sentence_logprobs_qwen2.5_1.5b_instruct.jsonl"
known = _load_sentence_logprobs(known_loc)

unknown_loc = f"{base_loc}/sentence_log_probs_datasets/{data_type}/{corpus}/unknown_sentence_logprobs_{model_name}.jsonl"
unknown_loc = "/Users/user/Documents/test_data/unknown_sentence_logprobs_qwen2.5_1.5b_instruct.jsonl"
unknown = _load_sentence_logprobs(unknown_loc)

metadata_loc = f"{base_loc}/{data_type}/metadata.rds"
metadata_loc = "/Users/user/Documents/test_data/metadata.rds"
metadata = read_rds(metadata_loc)
# Restrict the metadata to the selected corpus before building the
# problem-level metadata frame from it and the two sentence frames.
filtered_metadata = metadata[metadata['corpus'] == corpus]
agg_metadata = build_metadata_df(filtered_metadata, known, unknown)

In [None]:
# Directory of impostor files (hard-coded local test path; the directory name
# suggests the files are tokenized — TODO confirm).
impostor_loc = "/Users/user/Documents/test_data/top_impostors_tokenized"
# Load the impostor file for a single document.
test_impostors = read_jsonl(f"{impostor_loc}/alanyst_text_13.jsonl")

# Preview the first two rows.
test_impostors.head(2)

In [None]:
# Known-author rows for the two authors inspected in this notebook, selected by
# author name; preview the first two rows.
known_filtered = known[known['author'].isin(['Alanyst', 'Athenean'])]
known_filtered.head(2)

In [None]:
# Problem metadata rows whose known author is Alanyst.
agg_metadata.loc[agg_metadata['known_author'] == 'Alanyst']

In [10]:
# Known-author sentences (two authors) and their token lists.
known_filtered = known.loc[known['author'].isin(['Alanyst', 'Athenean'])]
known_tokens = known_filtered.loc[:, 'tokens']

# Questioned sentences by Alanyst.
unknown_filtered = unknown.loc[unknown['author'].isin(['Alanyst'])]
unknown_tokens = unknown_filtered.loc[:, 'tokens']

# Questioned sentences by AlasdairGreen27.
unknown_filtered_2 = unknown.loc[unknown['author'].isin(['AlasdairGreen27'])]
unknown_tokens_2 = unknown_filtered_2.loc[:, 'tokens']

# Reference pool: every known sentence not written by Alanyst or AlasdairGreen27.
refs_filtered = known.loc[~known['author'].isin(['Alanyst', 'AlasdairGreen27'])]
refs_tokens = refs_filtered.loc[:, 'tokens']

In [None]:
# Preview the first five reference token lists.
refs_tokens.head(5)

In [None]:
import random
import math
import nltk
import pandas as pd
from nltk.lm import KneserNeyInterpolated
from nltk.lm.preprocessing import padded_everygram_pipeline
from collections import defaultdict

def build_kn_model(sentences, N):
    """
    Build an N-gram language model with Kneser-Ney smoothing.

    Parameters
    ----------
    sentences : iterable of token sequences
        e.g. [['This','is','.'], ['Another','one','.']]. Any iterable is
        accepted (list, pandas Series, generator); it is materialized once.
    N : int
        Highest n-gram order of the model.

    Returns
    -------
    The fitted ``KneserNeyInterpolated`` model.
    """
    # padded_everygram_pipeline walks its input twice (once for the n-gram
    # stream, once for the vocabulary stream). A one-shot iterable would be
    # exhausted by the first pass and leave the other stream empty, so take a
    # concrete list first.
    sentences = list(sentences)
    train_data, padded_vocab = padded_everygram_pipeline(N, sentences)
    lm = KneserNeyInterpolated(order=N)
    lm.fit(train_data, padded_vocab)
    return lm

In [None]:
# Order-10 Kneser-Ney model trained on the known-author token lists.
k_model = build_kn_model(sentences=known_tokens, N=10)

In [None]:
def sentence_log10_prob(model, sentence, N):
    """
    Compute log10 probability of a single tokenized sentence under the model.

    The sentence is padded on both ends for order ``N`` and split into
    N-grams, so every position is scored exactly once, conditioned on the
    N-1 tokens preceding it:

        sum_i log10 P(w_i | w_{i-N+1} ... w_{i-1})

    Scoring everygrams of all orders 1..N instead would count each token
    several times with contexts of different lengths, which is not a sentence
    probability.

    Parameters
    ----------
    model : object exposing ``score(word, context)`` that returns
        P(word | context), e.g. the model returned by ``build_kn_model``.
    sentence : sequence of tokens.
    N : int
        N-gram order; should match the order the model was trained with.

    Returns
    -------
    float
        Sum of log10 conditional probabilities over the padded sentence.
    """
    padded = nltk.lm.preprocessing.pad_both_ends(sentence, n=N)
    logp = 0.0
    for gram in nltk.util.ngrams(padded, N):
        context, word = tuple(gram[:-1]), gram[-1]
        # model.score returns P(word | context)
        p = model.score(word, context)
        # avoid log(0): clamp non-positive probabilities to a tiny value
        if p <= 0:
            p = 1e-300
        logp += math.log10(p)
    return logp

In [None]:
# Summed log10 score of the Alanyst questioned sentences under the known-author model.
k_logp_1 = sum([sentence_log10_prob(k_model, tokens, 10) for tokens in unknown_tokens])
k_logp_1

In [None]:
# Summed log10 score of the AlasdairGreen27 questioned sentences under the known-author model.
k_logp_2 = sum([sentence_log10_prob(k_model, tokens, 10) for tokens in unknown_tokens_2])
k_logp_2

In [None]:
# Pick a single cell of known_filtered by position: second row, column index 7.
# NOTE(review): presumably column 7 is 'tokens' — confirm; selecting the column
# by name would not depend on column order.
second_sentence = known_filtered.iloc[1, 7]

In [None]:
# Score the selected sentence under the known-author model. `test_model` is not
# defined anywhere in this notebook; `k_model` is the model built above.
sentence_log10_prob(k_model, second_sentence, 10)

In [None]:
# Number of known-author token lists; used below as the size of each random
# reference sample.
len(known_tokens)

In [None]:
# Average log10 likelihood ratio for the Alanyst questioned sentences
# (unknown_tokens): known-author model score minus the score under reference
# models trained on random reference samples of the same size as known_tokens.
n_reps = 30
# Local seeded generator: the reference samples are reproducible across runs
# and the global `random` state is left untouched.
rng = random.Random(42)
# sorted() turns the Series into a list (random.sample needs a sequence) in a
# deterministic order. It does not depend on the repetition, so it is built
# once instead of being re-sorted on every iteration.
ref_pool = sorted(refs_tokens)

lr_sum_1 = 0.0
for _ in range(n_reps):
    sampled = rng.sample(ref_pool, len(known_tokens))
    ref_model = build_kn_model(sampled, 10)
    ref_logp = sum(sentence_log10_prob(ref_model, s, 10) for s in unknown_tokens)
    lr_sum_1 += (k_logp_1 - ref_logp)
avg_lr_1 = lr_sum_1 / n_reps

In [None]:
# Show the averaged log10 likelihood ratio for the first questioned text.
avg_lr_1

In [None]:
# Average log10 likelihood ratio for the AlasdairGreen27 questioned sentences
# (unknown_tokens_2): known-author model score minus the score under reference
# models trained on random reference samples of the same size as known_tokens.
n_reps = 30
# Local seeded generator: the reference samples are reproducible across runs
# and the global `random` state is left untouched.
rng = random.Random(42)
# sorted() turns the Series into a list (random.sample needs a sequence) in a
# deterministic order. It does not depend on the repetition, so it is built
# once instead of being re-sorted on every iteration.
ref_pool = sorted(refs_tokens)

lr_sum_2 = 0.0
for _ in range(n_reps):
    sampled = rng.sample(ref_pool, len(known_tokens))
    ref_model = build_kn_model(sampled, 10)
    ref_logp = sum(sentence_log10_prob(ref_model, s, 10) for s in unknown_tokens_2)
    # Accumulate into lr_sum_2: the undefined name `lr_sum` raised a NameError
    # and would have left the average at zero.
    lr_sum_2 += (k_logp_2 - ref_logp)
avg_lr_2 = lr_sum_2 / n_reps
avg_lr_2

In [11]:
# Run lambdaG on the two-author subset.
# NOTE(review): the positional arguments look like (questioned, known, reference)
# data — confirm against lambdaG's signature.
results = lambdaG(unknown, known_filtered, known, metadata=agg_metadata)

    There are 2 known author(s) and 4 problem(s) in the dataset.
        Working on problem 1 of 4: Alanyst vs Alanyst
        Working on problem 2 of 4: Alanyst vs AlasdairGreen27
        Working on problem 3 of 4: Athenean vs Athenean
        Working on problem 4 of 4: Athenean vs Avraham


In [12]:
# Same inputs as the lambdaG call, scored with lambdaG_v2 so the two sets of
# scores can be compared.
# NOTE(review): argument roles presumably (questioned, known, reference) — confirm.
results_v2 = lambdaG_v2(unknown, known_filtered, known, metadata=agg_metadata)

    There are 2 known author(s) and 4 problem(s) in the dataset.
        Working on problem 1 of 4: Alanyst vs Alanyst
        Working on problem 2 of 4: Alanyst vs AlasdairGreen27
        Working on problem 3 of 4: Athenean vs Athenean
        Working on problem 4 of 4: Athenean vs Avraham


In [13]:
# Display the per-problem scores returned by lambdaG.
results

Unnamed: 0,problem,known_author,unknown_author,target,score
0,Alanyst vs Alanyst,Alanyst,Alanyst,True,8365.254806
1,Alanyst vs AlasdairGreen27,Alanyst,AlasdairGreen27,False,3450.470084
2,Athenean vs Athenean,Athenean,Athenean,True,-3658.48383
3,Athenean vs Avraham,Athenean,Avraham,False,-8486.122589


In [14]:
# Display the per-problem scores returned by lambdaG_v2.
results_v2

Unnamed: 0,problem,known_author,unknown_author,target,score
0,Alanyst vs Alanyst,Alanyst,Alanyst,True,81683.804182
1,Alanyst vs AlasdairGreen27,Alanyst,AlasdairGreen27,False,23076.959751
2,Athenean vs Athenean,Athenean,Athenean,True,-45137.383344
3,Athenean vs Avraham,Athenean,Avraham,False,-79036.280003


In [None]:
# Score the same subset with lambdaG_paraphrase, pointing it at the impostor
# directory. NOTE: this rebinds `results`, replacing the lambdaG output assigned
# in an earlier cell.
results = lambdaG_paraphrase(unknown, known_filtered,
                             metadata=agg_metadata, impostor_loc=impostor_loc)

In [None]:
# Display whatever `results` currently holds.
results

In [15]:
# Column names handed to performance().
score_col = 'score'
target_col = 'target'

# One row per (problem, target) pair carrying the mean score.
results_agg = results.groupby(['problem', 'target'], as_index=False)['score'].mean()

# Evaluate the aggregated scores, tagging the output with the corpus name.
performance(results_agg, score_col, target_col, additional_metadata={'corpus': corpus})

  pred_llrs = np.log10(pred_probs / (1 - pred_probs))


Unnamed: 0,corpus,Cllr,Cllr_min,EER,Mean_TRUE_LLR,Mean_FALSE_LLR,TRUE_trials,FALSE_trials,AUC,Balanced_Accuracy,Precision,Recall,F1,TP,FP,FN,TN
0,Wiki,32.731863,32.731863,1.0,-11.831542,inf,2,2,0.0,0.0,0.0,0.0,0.0,0,2,2,0


In [16]:
# Column names handed to performance().
score_col = 'score'
target_col = 'target'

# One row per (problem, target) pair carrying the mean lambdaG_v2 score.
results_agg = results_v2.groupby(['problem', 'target'], as_index=False)['score'].mean()

# Evaluate the aggregated scores, tagging the output with the corpus name.
performance(results_agg, score_col, target_col, additional_metadata={'corpus': corpus})

  pred_llrs = np.log10(pred_probs / (1 - pred_probs))


Unnamed: 0,corpus,Cllr,Cllr_min,EER,Mean_TRUE_LLR,Mean_FALSE_LLR,TRUE_trials,FALSE_trials,AUC,Balanced_Accuracy,Precision,Recall,F1,TP,FP,FN,TN
0,Wiki,34.38611,34.371296,1.0,-12.906524,inf,2,2,0.0,0.0,0.0,0.0,0.0,0,2,2,0


In [19]:
all_results = []

# Five repetitions of lambdaG_v2 over the full known/unknown data.
for rep in range(1, 6):
    print(f"Repetition {rep}")
    df = lambdaG_v2(unknown, known, known, metadata=agg_metadata)
    # Prepend the run identifiers. Inserting at position 0 in reverse order
    # leaves the leading columns as: repetition, corpus, token_type, data_type.
    df.insert(0, 'data_type', data_type)
    df.insert(0, 'token_type', token_type)
    df.insert(0, 'corpus', corpus)
    df.insert(0, 'repetition', rep)
    all_results.append(df)

# Stack every repetition into a single frame with a fresh integer index.
results = pd.concat(all_results, ignore_index=True)

Repetition 1
    There are 75 known author(s) and 150 problem(s) in the dataset.
        Working on problem 1 of 150: 142.196.88.228 vs 142.196.88.228
        Working on problem 2 of 150: 142.196.88.228 vs Aban1313
        Working on problem 3 of 150: A_Man_In_Black vs A_Man_In_Black
        Working on problem 4 of 150: A_Man_In_Black vs Bankhallbretherton
        Working on problem 5 of 150: Aban1313 vs Aban1313


KeyboardInterrupt: 

In [None]:
# results_agg = (
#     results
#     .groupby(['problem', 'target'], as_index=False)
#     ['score']
#     .mean()
# )
# score_col = 'score'
# target_col = 'target'
# performance(results_agg,
#             score_col,
#             target_col,
#             additional_metadata={
#                 'corpus': corpus
#             })

In [None]:
# save_loc = f"{base_loc}/lambda_g_results/{corpus}_{data_type}_{model_name}_raw.jsonl"
# write_jsonl(results, save_loc)

In [None]:
# training_loc = f"{base_loc}/lambda_g_results/{corpus}_training_{model_name}_raw.jsonl"
# training = read_jsonl(training_loc)

# test_loc = f"{base_loc}/lambda_g_results/{corpus}_test_{model_name}_raw.jsonl"
# test = read_jsonl(test_loc)

In [None]:
# training_results_agg = (
#     training
#     .groupby(['problem', 'target'], as_index=False)
#     ['score']
#     .mean()
# )

# test_results_agg = (
#     test
#     .groupby(['problem', 'target'], as_index=False)
#     ['score']
#     .mean()
# )

In [None]:
# score_col = 'score'
# target_col = 'target'

In [None]:
# result_metrics = performance(training_results_agg,
#                              score_col,
#                              target_col,
#                              df_test=test_results_agg,
#                              additional_metadata={
#                                  'corpus': corpus
#                              })

In [None]:
# result_metrics