In [1]:
import sys
import os

In [2]:
# Add the `src` directory four levels up to the import path so the local modules
# imported below can be found. os.path.abspath resolves the relative path against
# the kernel's current working directory, so this depends on where the notebook is started.
sys.path.append(os.path.abspath('../../../../src'))

from read_and_write_docs import read_jsonl, read_rds
from tokenize_and_score import load_model
from utils import apply_temp_doc_id, build_metadata_df
from n_gram_functions import (
    common_ngrams,
    pretty_print_common_ngrams,
    keep_before_phrase,
    score_phrases,
    add_pmf_column,
    score_phrases_no_context
)

In [3]:
# Corpus and split to analyse; every path below is derived from these two values.
corpus = "Wiki"
data_type = "training"

# Common prefix of all dataset paths, so relocating the data is a one-line change.
dataset_root = f"/Volumes/BCross/datasets/author_verification/{data_type}"

# Known-side documents (known_raw.jsonl), passed through apply_temp_doc_id.
known_loc = f"{dataset_root}/{corpus}/known_raw.jsonl"
known = read_jsonl(known_loc)
known = apply_temp_doc_id(known)

# Unknown-side documents, processed exactly like `known`.
unknown_loc = f"{dataset_root}/{corpus}/unknown_raw.jsonl"
unknown = read_jsonl(unknown_loc)
# Rebind `unknown` itself: the result used to be stored only in `unknown_df`, a name
# nothing else in the visible cells reads, while the metadata build and the doc_id
# lookup further down both use `unknown`.
unknown = apply_temp_doc_id(unknown)
unknown_df = unknown  # alias kept so code that still refers to `unknown_df` keeps working

# Metadata for the whole split, restricted to the selected corpus.
metadata_loc = f"{dataset_root}/metadata.rds"
metadata = read_rds(metadata_loc)
filtered_metadata = metadata[metadata['corpus'] == corpus]
agg_metadata = build_metadata_df(filtered_metadata, known, unknown)

In [4]:
# Load the n-gram tracing problem datasets (aggregate and profile variants).
# The file names are built from `corpus` and `data_type` instead of a hard-coded
# "Wiki_training", so changing either setting cannot silently leave this cell
# reading results for a different corpus/split than the documents loaded earlier.
problem_dir = "/Users/user/Documents/test_data/n-gram_tracing"
problem_dataset_agg = read_jsonl(f"{problem_dir}/{corpus}_{data_type}_agg.jsonl")
problem_dataset_profile = read_jsonl(f"{problem_dir}/{corpus}_{data_type}_profile.jsonl")

In [5]:
# Keep the rows where the known and unknown author match, sorted by
# highest_common_count in descending order.
is_same_author = problem_dataset_agg['known_author'] == problem_dataset_agg['unknown_author']
same_author_problems = (
    problem_dataset_agg[is_same_author]
    .copy()
    .sort_values(["highest_common_count"], ascending=[False])
)

# Display the rows whose highest_common_count lies in the inclusive range 3..10.
same_author_problems[same_author_problems['highest_common_count'].between(3, 10)]

Unnamed: 0,problem,known_author,unknown_author,known_doc_id,unknown_doc_id,highest_common_count,highest_common_ngram
379,Greg_L vs Greg_L,Greg_L,Greg_L,greg_l_text_11,greg_l_text_10,9,", Ġthey Ġshould Ġhave Ġparticipated Ġin Ġthe Ġ..."
432,Haymaker vs Haymaker,Haymaker,Haymaker,haymaker_text_3,haymaker_text_2,9,"Ġat Ġthe Ġend Ġof Ġthe Ġday , Ġwe 're"
354,Fragments_of_Jade vs Fragments_of_Jade,Fragments_of_Jade,Fragments_of_Jade,fragments_of_jade_text_2,fragments_of_jade_text_10,8,"Ġme , Ġand Ġit 's Ġgetting Ġold .Ċ"
337,Fixentries vs Fixentries,Fixentries,Fixentries,fixentries_text_2,fixentries_text_5,8,Ġthe Ġindividual Ġher it ability Ġof Ġintellig...
248,DonaNobisPacem vs DonaNobisPacem,DonaNobisPacem,DonaNobisPacem,donanobispacem_text_5,donanobispacem_text_2,8,Ġafter Ġ 1 8 - 2 0 Ġweeks
...,...,...,...,...,...,...,...
222,David_Shankbone vs David_Shankbone,David_Shankbone,David_Shankbone,david_shankbone_text_1,david_shankbone_text_4,3,", Ġwhich Ġis"
211,D7G1DX~0 vs D7G1DX~0,D7G1DX~0,D7G1DX~0,d7g1dx_0_text_2,d7g1dx_0_text_5,3,Ġdon 't Ġthink
187,Collect vs Collect,Collect,Collect,collect_text_12,collect_text_11,3,", Ġand Ġi"
198,Cptnono vs Cptnono,Cptnono,Cptnono,cptnono_text_1,cptnono_text_12,3,", Ġthough .Ċ"


In [6]:
# Document ids of the pair under inspection.
known_doc = "fragments_of_jade_text_2"
unknown_doc = "fragments_of_jade_text_10"

# For each side: filter on doc_id, renumber the matching rows from 0,
# then read the 'text' cell of the first match.
known_match = known.loc[known['doc_id'] == known_doc].reset_index()
known_text = known_match.loc[0, 'text']

unknown_match = unknown.loc[unknown['doc_id'] == unknown_doc].reset_index()
unknown_text = unknown_match.loc[0, 'text']

In [7]:
# Load the tokenizer/model pair that the n-gram and scoring calls below receive.
model_path = "/Volumes/BCross/models/Qwen 2.5/Qwen2.5-0.5B-Instruct"
tokenizer, model = load_model(model_path)

In [8]:
# Compute the n-grams shared by the known and unknown texts.
# NOTE(review): the positional `2` looks like the minimum n-gram length (the printed
# results start at 2-grams) — confirm against the signature of common_ngrams.
common = common_ngrams(known_text, unknown_text, 2, model, tokenizer)

In [9]:
# Print the shared n-grams in readable form.
# order='len_asc' presumably lists the shortest n-grams first — TODO confirm in n_gram_functions.
pretty_print_common_ngrams(common, tokenizer=tokenizer, order='len_asc')

2-grams (37): [" don't", ' even played', ' for a', ' for the', ' here is', ' in the', ' it clear', ' it,', ' me and', ' me.\n', ' of it', ' on the', ' pretty much', ' show up', ' that,', ' the games', ' the one', ' they just', ' to accept', ' to the', ' to you', ' trying to', ' way,', ' what you', ' wild arms', ' with me', ' with you', ' you.\n', "'re not", ', as', ', i', ', just', ', no', '.\nand', '.\nthe', 'the only', 'you have']
3-grams (7): [' his talk page', ' that you are', ' you, and', ', and you', ", you're", ".\nit's", '.\nyou only']
4-grams (1): [".\ni'm not"]
6-grams (1): [' admitted to barely even playing one']
8-grams (1): [" me, and it's getting old.\n"]


In [10]:
# Print the known document between <DOC> tags, followed by an opening <NGRAM> tag.
# The empty first and last pieces reproduce the leading and trailing newline.
doc_preview = "\n".join(["", "<DOC>", f"{known_text}", "</DOC>", "<NGRAM>", ""])
print(doc_preview)


<DOC>
You have no room to talk about it, since you've already admitted to barely even playing one.
Thus, you have no basis for essentially every thing that you are saying.
You can't say the games don't support that, nor can you say anything about WA2 or TV.
Fighting for the sake of fighting is immature and wrong, so just quit it already.
You're not here for a discussion-you're here to harass me and try to start a fight.
The burden of finding proof is not on me.
You're the one who is trying to challenge what is written in the articles.
The games themselves support that spelling.
Your claims citing WA2 and TV were shot down, as was everything you have presented, so just go.
More lies and insults from you, as expected.
You haven't proved a thing, you're just grasping at straws.
The 505 site is full of title errors, which makes it clear they just use basic capitalization for all titles.
And on the flipside, you have not done anything to prove their spelling wasn't an error, even though al

## Phrase 1 - " me, and it's getting old.\n"

In [19]:
# Reference phrase: the 8-gram found in both documents.
p_1 = " me, and it's getting old.\n"

# Paraphrase candidates for p_1, one per line and grouped by wording.
# Within a group the entries differ in apostrophe style (' vs ’) and contraction (it's vs it is).
para_1 = [
    # "getting old", including dropped-g spellings
    " me, and it’s getting old.\n",
    " me, and it is getting old.\n",
    " me, and it's gettin' old.\n",
    " me, and it’s gettin’ old.\n",
    " me, and it's gettin old.\n",
    " me, and it is gettin' old.\n",
    # "growing old"
    " me, and it's growing old.\n",
    " me, and it’s growing old.\n",
    " me, and it is growing old.\n",
    # "becoming old"
    " me, and it's becoming old.\n",
    " me, and it’s becoming old.\n",
    " me, and it is becoming old.\n",
    # "getting tiresome"
    " me, and it's getting tiresome.\n",
    " me, and it’s getting tiresome.\n",
    " me, and it is getting tiresome.\n",
    # "getting tiring"
    " me, and it's getting tiring.\n",
    " me, and it’s getting tiring.\n",
    " me, and it is getting tiring.\n",
    # "getting stale"
    " me, and it's getting stale.\n",
    " me, and it’s getting stale.\n",
    " me, and it is getting stale.\n",
    # "growing tiresome"
    " me, and it's growing tiresome.\n",
    " me, and it’s growing tiresome.\n",
    " me, and it is growing tiresome.\n",
    # "growing tiring"
    " me, and it's growing tiring.\n",
    " me, and it’s growing tiring.\n",
    " me, and it is growing tiring.\n",
    # "becoming tiresome"
    " me, and it's becoming tiresome.\n",
    " me, and it’s becoming tiresome.\n",
    " me, and it is becoming tiresome.\n",
    # "becoming tiring"
    " me, and it's becoming tiring.\n",
    " me, and it’s becoming tiring.\n",
    " me, and it is becoming tiring.\n",
    # "becoming stale"
    " me, and it's becoming stale.\n",
    " me, and it’s becoming stale.\n",
    " me, and it is becoming stale.\n",
]

# Build the context text for each document (known first, then unknown).
# Presumably keep_before_phrase keeps the text preceding p_1; the meaning of the
# positional `True` flag is defined in n_gram_functions — TODO confirm.
known_base_p1, unknown_base_p1 = [
    keep_before_phrase(doc_text, p_1, True)
    for doc_text in (known_text, unknown_text)
]

In [20]:
# Score p_1 and its paraphrases once per context: known document first, then unknown.
known_p1_scores, unknown_p1_scores = [
    score_phrases(base_text, p_1, para_1, tokenizer, model)
    for base_text in (known_base_p1, unknown_base_p1)
]

→ Scoring base_text alone…
   base_total = -944.3972

→ [1/37] Processing reference…
→ [2/37] Processing paraphrase…
→ [3/37] Processing paraphrase…
→ [4/37] Processing paraphrase…
→ [5/37] Processing paraphrase…
→ [6/37] Processing paraphrase…
→ [7/37] Processing paraphrase…
→ [8/37] Processing paraphrase…
→ [9/37] Processing paraphrase…
→ [10/37] Processing paraphrase…
→ [11/37] Processing paraphrase…
→ [12/37] Processing paraphrase…
→ [13/37] Processing paraphrase…
→ [14/37] Processing paraphrase…
→ [15/37] Processing paraphrase…
→ [16/37] Processing paraphrase…
→ [17/37] Processing paraphrase…
→ [18/37] Processing paraphrase…
→ [19/37] Processing paraphrase…
→ [20/37] Processing paraphrase…
→ [21/37] Processing paraphrase…
→ [22/37] Processing paraphrase…
→ [23/37] Processing paraphrase…
→ [24/37] Processing paraphrase…
→ [25/37] Processing paraphrase…
→ [26/37] Processing paraphrase…
→ [27/37] Processing paraphrase…
→ [28/37] Processing paraphrase…
→ [29/37] Processing paraphrase…

In [21]:
# Run add_pmf_column over the 'phrase_log_probs' column of both score tables.
known_p1_pmf, unknown_p1_pmf = [
    add_pmf_column(scores, 'phrase_log_probs')
    for scores in (known_p1_scores, unknown_p1_scores)
]

In [22]:
# Score p_1 and its paraphrases without document context, then run
# add_pmf_column over the resulting 'log_probs' column.
p1_pmf = add_pmf_column(
    score_phrases_no_context(p_1, para_1, tokenizer, model),
    'log_probs',
)

→ [1/37] Processing reference…
→ [2/37] Processing paraphrase…
→ [3/37] Processing paraphrase…
→ [4/37] Processing paraphrase…
→ [5/37] Processing paraphrase…
→ [6/37] Processing paraphrase…
→ [7/37] Processing paraphrase…
→ [8/37] Processing paraphrase…
→ [9/37] Processing paraphrase…
→ [10/37] Processing paraphrase…
→ [11/37] Processing paraphrase…
→ [12/37] Processing paraphrase…
→ [13/37] Processing paraphrase…
→ [14/37] Processing paraphrase…
→ [15/37] Processing paraphrase…
→ [16/37] Processing paraphrase…
→ [17/37] Processing paraphrase…
→ [18/37] Processing paraphrase…
→ [19/37] Processing paraphrase…
→ [20/37] Processing paraphrase…
→ [21/37] Processing paraphrase…
→ [22/37] Processing paraphrase…
→ [23/37] Processing paraphrase…
→ [24/37] Processing paraphrase…
→ [25/37] Processing paraphrase…
→ [26/37] Processing paraphrase…
→ [27/37] Processing paraphrase…
→ [28/37] Processing paraphrase…
→ [29/37] Processing paraphrase…
→ [30/37] Processing paraphrase…
→ [31/37] Processing

In [26]:
# unknown_p1_pmf.to_clipboard()

## Phrase 2 - " admitted to barely even playing one"

In [27]:
# Reference phrase: the 6-gram found in both documents.
p_2 = " admitted to barely even playing one"

# Paraphrase candidates for p_2, one per line and grouped by the verb that replaces "admitted".
para_2 = [
    # "admitted" with a different adverb
    " admitted to hardly even playing one",
    " admitted to scarcely even playing one",
    # "confessed"
    " confessed to barely even playing one",
    " confessed to hardly even playing one",
    " confessed to scarcely even playing one",
    # "conceded"
    " conceded to barely even playing one",
    " conceded to hardly even playing one",
    " conceded to scarcely even playing one",
    # "owned up"
    " owned up to barely even playing one",
    " owned up to hardly even playing one",
    " owned up to scarcely even playing one",
    # "fessed up"
    " fessed up to barely even playing one",
    " fessed up to hardly even playing one",
    " fessed up to scarcely even playing one",
    # "copped"
    " copped to barely even playing one",
    " copped to hardly even playing one",
    " copped to scarcely even playing one",
    # "made an admission"
    " made an admission to barely even playing one",
    " made an admission to hardly even playing one",
    " made an admission to scarcely even playing one",
    # "gave an admission"
    " gave an admission to barely even playing one",
    " gave an admission to hardly even playing one",
    " gave an admission to scarcely even playing one",
    # "made a confession"
    " made a confession to barely even playing one",
    " made a confession to hardly even playing one",
    " made a confession to scarcely even playing one",
    # "gave a confession"
    " gave a confession to barely even playing one",
    " gave a confession to hardly even playing one",
    " gave a confession to scarcely even playing one",
    # dropped-g spelling "playin'"
    " admitted to barely even playin' one",
    " admitted to hardly even playin' one",
    " admitted to scarcely even playin' one",
    " confessed to barely even playin' one",
    " conceded to barely even playin' one",
    " owned up to barely even playin' one",
    " fessed up to barely even playin' one",
    " copped to barely even playin' one",
]

# Build the context text for each document (known first, then unknown).
# Presumably keep_before_phrase keeps the text preceding p_2; the meaning of the
# positional `True` flag is defined in n_gram_functions — TODO confirm.
known_base_p2, unknown_base_p2 = [
    keep_before_phrase(doc_text, p_2, True)
    for doc_text in (known_text, unknown_text)
]

In [28]:
# Score p_2 and its paraphrases once per context: known document first, then unknown.
known_p2_scores, unknown_p2_scores = [
    score_phrases(base_text, p_2, para_2, tokenizer, model)
    for base_text in (known_base_p2, unknown_base_p2)
]

→ Scoring base_text alone…
   base_total = -62.8822

→ [1/38] Processing reference…
→ [2/38] Processing paraphrase…
→ [3/38] Processing paraphrase…
→ [4/38] Processing paraphrase…
→ [5/38] Processing paraphrase…
→ [6/38] Processing paraphrase…
→ [7/38] Processing paraphrase…
→ [8/38] Processing paraphrase…
→ [9/38] Processing paraphrase…
→ [10/38] Processing paraphrase…
→ [11/38] Processing paraphrase…
→ [12/38] Processing paraphrase…
→ [13/38] Processing paraphrase…
→ [14/38] Processing paraphrase…
→ [15/38] Processing paraphrase…
→ [16/38] Processing paraphrase…
→ [17/38] Processing paraphrase…
→ [18/38] Processing paraphrase…
→ [19/38] Processing paraphrase…
→ [20/38] Processing paraphrase…
→ [21/38] Processing paraphrase…
→ [22/38] Processing paraphrase…
→ [23/38] Processing paraphrase…
→ [24/38] Processing paraphrase…
→ [25/38] Processing paraphrase…
→ [26/38] Processing paraphrase…
→ [27/38] Processing paraphrase…
→ [28/38] Processing paraphrase…
→ [29/38] Processing paraphrase…


In [29]:
# Run add_pmf_column over the 'phrase_log_probs' column of both score tables.
known_p2_pmf, unknown_p2_pmf = [
    add_pmf_column(scores, 'phrase_log_probs')
    for scores in (known_p2_scores, unknown_p2_scores)
]

In [30]:
# Score p_2 and its paraphrases without document context, then run
# add_pmf_column over the resulting 'log_probs' column.
p2_pmf = add_pmf_column(
    score_phrases_no_context(p_2, para_2, tokenizer, model),
    'log_probs',
)

→ [1/38] Processing reference…
→ [2/38] Processing paraphrase…
→ [3/38] Processing paraphrase…
→ [4/38] Processing paraphrase…
→ [5/38] Processing paraphrase…
→ [6/38] Processing paraphrase…
→ [7/38] Processing paraphrase…
→ [8/38] Processing paraphrase…
→ [9/38] Processing paraphrase…
→ [10/38] Processing paraphrase…
→ [11/38] Processing paraphrase…
→ [12/38] Processing paraphrase…
→ [13/38] Processing paraphrase…
→ [14/38] Processing paraphrase…
→ [15/38] Processing paraphrase…
→ [16/38] Processing paraphrase…
→ [17/38] Processing paraphrase…
→ [18/38] Processing paraphrase…
→ [19/38] Processing paraphrase…
→ [20/38] Processing paraphrase…
→ [21/38] Processing paraphrase…
→ [22/38] Processing paraphrase…
→ [23/38] Processing paraphrase…
→ [24/38] Processing paraphrase…
→ [25/38] Processing paraphrase…
→ [26/38] Processing paraphrase…
→ [27/38] Processing paraphrase…
→ [28/38] Processing paraphrase…
→ [29/38] Processing paraphrase…
→ [30/38] Processing paraphrase…
→ [31/38] Processing

In [34]:
# unknown_p2_pmf.to_clipboard()

## Phrase 3 - " his talk page"

In [35]:
# Reference phrase: a 3-gram found in both documents.
p_3 = " his talk page"

# Paraphrase candidates for p_3, one per line and grouped by the kind of variation.
para_3 = [
    # capitalisation and joining of "talk page"
    " his Talk page",
    " his talk Page",
    " his Talk Page",
    " his talk-page",
    " his talkpage",
    # "discussion" in place of "talk"
    " his discussion page",
    " his Discussion page",
    " his discussion-page",
    " his discussionpage",
    # "conversation" in place of "talk"
    " his conversation page",
    " his conversation-page",
    " his conversationpage",
    # "chat" in place of "talk"
    " his chat page",
    " his chat-page",
    " his chatpage",
    # abbreviated "pg"
    " his talk pg",
    " his discussion pg",
    " his conversation pg",
    " his chat pg",
    # reordered "page for/of ..."
    " his page for talk",
    " his page of talk",
    " his page for discussion",
    " his page of discussion",
    " his page for conversation",
    " his page for chat",
    # quoted "talk"
    " his “talk” page",
    " his 'talk' page",
]

# Build the context text for each document (known first, then unknown).
# Presumably keep_before_phrase keeps the text preceding p_3; the meaning of the
# positional `True` flag is defined in n_gram_functions — TODO confirm.
known_base_p3, unknown_base_p3 = [
    keep_before_phrase(doc_text, p_3, True)
    for doc_text in (known_text, unknown_text)
]

In [36]:
# Score p_3 and its paraphrases once per context: known document first, then unknown.
known_p3_scores, unknown_p3_scores = [
    score_phrases(base_text, p_3, para_3, tokenizer, model)
    for base_text in (known_base_p3, unknown_base_p3)
]

→ Scoring base_text alone…
   base_total = -1981.7070

→ [1/28] Processing reference…
→ [2/28] Processing paraphrase…
→ [3/28] Processing paraphrase…
→ [4/28] Processing paraphrase…
→ [5/28] Processing paraphrase…
→ [6/28] Processing paraphrase…
→ [7/28] Processing paraphrase…
→ [8/28] Processing paraphrase…
→ [9/28] Processing paraphrase…
→ [10/28] Processing paraphrase…
→ [11/28] Processing paraphrase…
→ [12/28] Processing paraphrase…
→ [13/28] Processing paraphrase…
→ [14/28] Processing paraphrase…
→ [15/28] Processing paraphrase…
→ [16/28] Processing paraphrase…
→ [17/28] Processing paraphrase…
→ [18/28] Processing paraphrase…
→ [19/28] Processing paraphrase…
→ [20/28] Processing paraphrase…
→ [21/28] Processing paraphrase…
→ [22/28] Processing paraphrase…
→ [23/28] Processing paraphrase…
→ [24/28] Processing paraphrase…
→ [25/28] Processing paraphrase…
→ [26/28] Processing paraphrase…
→ [27/28] Processing paraphrase…
→ [28/28] Processing paraphrase…
→ Scoring base_text alone…
   b

In [37]:
# Run add_pmf_column over the 'phrase_log_probs' column of both score tables.
known_p3_pmf, unknown_p3_pmf = [
    add_pmf_column(scores, 'phrase_log_probs')
    for scores in (known_p3_scores, unknown_p3_scores)
]

In [38]:
# Score p_3 and its paraphrases without document context, then run
# add_pmf_column over the resulting 'log_probs' column.
p3_pmf = add_pmf_column(
    score_phrases_no_context(p_3, para_3, tokenizer, model),
    'log_probs',
)

→ [1/28] Processing reference…
→ [2/28] Processing paraphrase…
→ [3/28] Processing paraphrase…
→ [4/28] Processing paraphrase…
→ [5/28] Processing paraphrase…
→ [6/28] Processing paraphrase…
→ [7/28] Processing paraphrase…
→ [8/28] Processing paraphrase…
→ [9/28] Processing paraphrase…
→ [10/28] Processing paraphrase…
→ [11/28] Processing paraphrase…
→ [12/28] Processing paraphrase…
→ [13/28] Processing paraphrase…
→ [14/28] Processing paraphrase…
→ [15/28] Processing paraphrase…
→ [16/28] Processing paraphrase…
→ [17/28] Processing paraphrase…
→ [18/28] Processing paraphrase…
→ [19/28] Processing paraphrase…
→ [20/28] Processing paraphrase…
→ [21/28] Processing paraphrase…
→ [22/28] Processing paraphrase…
→ [23/28] Processing paraphrase…
→ [24/28] Processing paraphrase…
→ [25/28] Processing paraphrase…
→ [26/28] Processing paraphrase…
→ [27/28] Processing paraphrase…
→ [28/28] Processing paraphrase…


In [42]:
# unknown_p3_pmf.to_clipboard()