# N-gram Tracing

This notebook tests n-gram tracing for use in authorship verification methods. The goal is to confirm that the code finds the n-grams common to two texts and can return the text preceding each of those n-grams.

In [56]:
import os
import sys

import pandas as pd

from from_root import from_root

sys.path.insert(0, str(from_root("src")))

from model_loading import load_model
from read_and_write_docs import read_txt
from n_gram_tracing import (
    common_ngrams,
    tokens_to_text,
    texts_around_each_ngram
)
from n_gram_scoring import score_ngrams

In [57]:
# The checkpoint location is machine-specific, so it can be overridden with the
# GPT2_MODEL_PATH environment variable; the default is the original hard-coded path.
MODEL_PATH = os.environ.get("GPT2_MODEL_PATH", "/Volumes/BCross/models/gpt2")
tokenizer, model = load_model(MODEL_PATH)

In [58]:
known_text = read_txt("../../data/hodja_nasreddin_text_1.txt")
unknown_text = read_txt("../../data/hodja_nasreddin_text_10.txt")

## Get Common N-Grams

Here we get the n-grams in common between the two texts.

In [59]:
# Collect the n-grams shared by the known and unknown texts.
# NOTE(review): the outputs further down show both 2-token and 4-token phrases
# coming out of `common`, so n=2 presumably acts as a minimum length rather than
# an exact one — confirm against common_ngrams.
common = common_ngrams(
    text1=known_text,
    text2=unknown_text,
    n=2,
    tokenizer=tokenizer,
    include_subgrams=False,
    lowercase=True
)

In [60]:
sample_tokens = common[-1]

In [61]:
sample_text = tokens_to_text(sample_tokens, tokenizer)
sample_text

' right now, but'

## Find Starting Positions

There are two options here: find the starting position of each n-gram and return only the text before it, or return the text up to and including the n-gram.

In [62]:
example_texts = texts_around_each_ngram(known_text, sample_text)

In [63]:
example_texts

["If they actually censor anything is another question.\\nUnlike others, Medvedev is an internationally recognized historian.\\nHe tells that these people are governmental bureaucrats although some of them have degrees.\\nMain point this is a Can anyone clarify, please, where the 21 million number for three countries in version 3 comes from?\\nI hope you do not suggest to replace three large sections about Kirov assassination by his single paragraph?\\nOf course, if there is any sourced information in his text that currently missing, it might be 'added' to current version.\\nPerhaps this article should be merged, but this must be properly done.\\nYes, that was one of the reasons why she left big sport so early.\\nOf course he did not write about his abuse in reports to KGB superiors.\\nI like book by Radzinsky, but this source provides much more details with a lot of references Radzinsky works on a bigger 3-volume biography of Stalin right now.\\nSo, you are very welcome to improve thi

In [64]:
score_ngrams(sample_tokens, model, tokenizer, known_text)

{'phrase': ' right now, but',
 'ngram_tokens': ['Ġright', 'Ġnow', ',', 'Ġbut'],
 'ngram_len': 4,
 'tokens': ['if',
  'Ġthey',
  'Ġactually',
  'Ġcensor',
  'Ġanything',
  'Ġis',
  'Ġanother',
  'Ġquestion',
  '.',
  '\\',
  'n',
  'un',
  'like',
  'Ġothers',
  ',',
  'Ġmed',
  'ved',
  'ev',
  'Ġis',
  'Ġan',
  'Ġinternationally',
  'Ġrecognized',
  'Ġhistorian',
  '.',
  '\\',
  'n',
  'he',
  'Ġtells',
  'Ġthat',
  'Ġthese',
  'Ġpeople',
  'Ġare',
  'Ġgovernmental',
  'Ġbureaucrats',
  'Ġalthough',
  'Ġsome',
  'Ġof',
  'Ġthem',
  'Ġhave',
  'Ġdegrees',
  '.',
  '\\',
  'n',
  'main',
  'Ġpoint',
  'Ġthis',
  'Ġis',
  'Ġa',
  'Ġcan',
  'Ġanyone',
  'Ġclarify',
  ',',
  'Ġplease',
  ',',
  'Ġwhere',
  'Ġthe',
  'Ġ21',
  'Ġmillion',
  'Ġnumber',
  'Ġfor',
  'Ġthree',
  'Ġcountries',
  'Ġin',
  'Ġversion',
  'Ġ3',
  'Ġcomes',
  'Ġfrom',
  '?',
  '\\',
  'ni',
  'Ġhope',
  'Ġyou',
  'Ġdo',
  'Ġnot',
  'Ġsuggest',
  'Ġto',
  'Ġreplace',
  'Ġthree',
  'Ġlarge',
  'Ġsections',
  'Ġabout',


In [65]:
score_ngrams(sample_tokens, model, tokenizer)

{'phrase': ' right now, but',
 'ngram_tokens': ['Ġright', 'Ġnow', ',', 'Ġbut'],
 'ngram_len': 4,
 'tokens': ['Ġright', 'Ġnow', ',', 'Ġbut'],
 'text_len': 4,
 'log_probs': [None,
  -6.1002349853515625,
  -1.4600659608840942,
  -2.102572202682495],
 'ngram_log_probs': [-6.1002349853515625,
  -1.4600659608840942,
  -2.102572202682495],
 'ngram_sum_log_probs': -9.662873148918152}

In [66]:
score_ngrams(sample_text, model, tokenizer)

{'phrase': ' right now, but',
 'ngram_tokens': ['Ġright', 'Ġnow', ',', 'Ġbut'],
 'ngram_len': 4,
 'tokens': ['Ġright', 'Ġnow', ',', 'Ġbut'],
 'text_len': 4,
 'log_probs': [None,
  -6.1002349853515625,
  -1.4600659608840942,
  -2.102572202682495],
 'ngram_log_probs': [-6.1002349853515625,
  -1.4600659608840942,
  -2.102572202682495],
 'ngram_sum_log_probs': -9.662873148918152}

In [69]:
def score_ngrams_to_df(
    ngrams,
    model,
    tokenizer,
    full_text: str | None = None,
    *,
    lowercase: bool = True,
    use_bos: bool = False
) -> pd.DataFrame:
    """
    Build a test DataFrame with:
      - phrase_num (1-based index in ngrams list)
      - phrase_occurrence (1-based occurrence in text; 1 if no-context)
    and all outputs from score_ngram_end.

    If full_text is None:
      - scores each n-gram once with text=None (no context)
    If full_text is provided:
      - uses texts_around_each_ngram(full_text, phrase, ...) to get all occurrences
        (strings that end with the n-gram), then scores each.
    """
    rows = []

    for phrase_num, ng in enumerate(ngrams, start=1):
        phrase = ng if isinstance(ng, str) else tokens_to_text(list(ng), tokenizer)

        if full_text is None:
            # no-context: score once
            res = score_ngrams(
                ngram=ng,
                model=model,
                tokenizer=tokenizer,
                text=None,
                lowercase=lowercase,
                use_bos=use_bos
            )
            rows.append({"phrase_num": phrase_num, "phrase_occurrence": 1, **res})
            continue

        # with context: score every occurrence
        occ_texts = texts_around_each_ngram(full_text, phrase, lowercase=lowercase)

        for phrase_occurrence, occ_text in enumerate(occ_texts, start=1):
            res = score_ngrams(
                ngram=ng,
                model=model,
                tokenizer=tokenizer,
                text=occ_text,
                lowercase=lowercase,
                use_bos=use_bos
            )
            rows.append({"phrase_num": phrase_num, "phrase_occurrence": phrase_occurrence, **res})

    df = pd.DataFrame(rows)

    # Ensure first two cols are phrase_num + phrase_occurrence
    first = ["phrase_num", "phrase_occurrence"]
    df = df[first + [c for c in df.columns if c not in first]]

    return df

In [70]:
df_no = score_ngrams_to_df(common, model, tokenizer, full_text=None, use_bos=True)

In [71]:
df_no

Unnamed: 0,phrase_num,phrase_occurrence,phrase,ngram_tokens,ngram_len,tokens,text_len,log_probs,ngram_log_probs,ngram_sum_log_probs
0,1,1,", it","[,, Ġit]",2,"[,, Ġit]",2,"[-5.659754753112793, -5.349859714508057]","[-5.659754753112793, -5.349859714508057]",-11.009614
1,2,1,as was,"[Ġas, Ġwas]",2,"[Ġas, Ġwas]",2,"[-8.7579984664917, -6.396714210510254]","[-8.7579984664917, -6.396714210510254]",-15.154713
2,3,1,in the,"[Ġin, Ġthe]",2,"[Ġin, Ġthe]",2,"[-7.723654747009277, -2.6319215297698975]","[-7.723654747009277, -2.6319215297698975]",-10.355576
3,4,1,is a,"[Ġis, Ġa]",2,"[Ġis, Ġa]",2,"[-7.881564140319824, -1.3263921737670898]","[-7.881564140319824, -1.3263921737670898]",-9.207956
4,5,1,is not,"[Ġis, Ġnot]",2,"[Ġis, Ġnot]",2,"[-7.881564140319824, -3.0078954696655273]","[-7.881564140319824, -3.0078954696655273]",-10.88946
5,6,1,of the,"[Ġof, Ġthe]",2,"[Ġof, Ġthe]",2,"[-7.159384727478027, -1.758285403251648]","[-7.159384727478027, -1.758285403251648]",-8.91767
6,7,1,of them,"[Ġof, Ġthem]",2,"[Ġof, Ġthem]",2,"[-7.159384727478027, -6.839210510253906]","[-7.159384727478027, -6.839210510253906]",-13.998595
7,8,1,should be,"[Ġshould, Ġbe]",2,"[Ġshould, Ġbe]",2,"[-10.483878135681152, -1.6870325803756714]","[-10.483878135681152, -1.6870325803756714]",-12.170911
8,9,1,they actually,"[Ġthey, Ġactually]",2,"[Ġthey, Ġactually]",2,"[-10.044596672058105, -6.167309284210205]","[-10.044596672058105, -6.167309284210205]",-16.211906
9,10,1,to replace,"[Ġto, Ġreplace]",2,"[Ġto, Ġreplace]",2,"[-7.229643821716309, -5.293249130249023]","[-7.229643821716309, -5.293249130249023]",-12.522893


In [72]:
df_known = score_ngrams_to_df(common, model, tokenizer, full_text=known_text, use_bos=True)

In [73]:
df_known

Unnamed: 0,phrase_num,phrase_occurrence,phrase,ngram_tokens,ngram_len,tokens,text_len,log_probs,ngram_log_probs,ngram_sum_log_probs
0,1,1,", it","[,, Ġit]",2,"[if, Ġthey, Ġactually, Ġcensor, Ġanything, Ġis...",109,"[-7.736754417419434, -5.582278251647949, -6.57...","[-1.1568869352340698, -2.668604612350464]",-3.825492
1,2,1,as was,"[Ġas, Ġwas]",2,"[if, Ġthey, Ġactually, Ġcensor, Ġanything, Ġis...",256,"[-7.736754417419434, -5.582278251647949, -6.57...","[-6.783299446105957, -7.503755569458008]",-14.287055
2,3,1,in the,"[Ġin, Ġthe]",2,"[if, Ġthey, Ġactually, Ġcensor, Ġanything, Ġis...",347,"[-7.736754417419434, -5.582278251647949, -6.57...","[-1.7697020769119263, -2.5736238956451416]",-4.343326
3,3,2,in the,"[Ġin, Ġthe]",2,"[if, Ġthey, Ġactually, Ġcensor, Ġanything, Ġis...",435,"[-7.736754417419434, -5.582278251647949, -6.57...","[-3.7110161781311035, -1.9962078332901]",-5.707224
4,4,1,is a,"[Ġis, Ġa]",2,"[if, Ġthey, Ġactually, Ġcensor, Ġanything, Ġis...",7,"[-7.736754417419434, -5.582261085510254, -6.57...","[-6.983565330505371, -2.911646842956543]",-9.895212
5,4,2,is a,"[Ġis, Ġa]",2,"[if, Ġthey, Ġactually, Ġcensor, Ġanything, Ġis...",20,"[-7.736754417419434, -5.582278251647949, -6.57...","[-2.337247848510742, -2.453457832336426]",-4.790706
6,4,3,is a,"[Ġis, Ġa]",2,"[if, Ġthey, Ġactually, Ġcensor, Ġanything, Ġis...",48,"[-7.736754417419434, -5.582278251647949, -6.57...","[-0.8769075870513916, -2.3020081520080566]",-3.178916
7,4,4,is a,"[Ġis, Ġa]",2,"[if, Ġthey, Ġactually, Ġcensor, Ġanything, Ġis...",99,"[-7.736754417419434, -5.582278251647949, -6.57...","[-0.9258678555488586, -1.5793343782424927]",-2.505202
8,5,1,is not,"[Ġis, Ġnot]",2,"[if, Ġthey, Ġactually, Ġcensor, Ġanything, Ġis...",251,"[-7.736754417419434, -5.582278251647949, -6.57...","[-1.5428647994995117, -1.4554862976074219]",-2.998351
9,6,1,of the,"[Ġof, Ġthe]",2,"[if, Ġthey, Ġactually, Ġcensor, Ġanything, Ġis...",38,"[-7.736754417419434, -5.582278251647949, -6.57...","[-1.2634352445602417, -2.4145898818969727]",-3.678025


In [None]:
# Use the same BOS setting as the df_no / df_known cells so that the
# log-probabilities of the unknown text are comparable with theirs.
df_unknown = score_ngrams_to_df(common, model, tokenizer, full_text=unknown_text, use_bos=True)

In [None]:
tokenizer.bos_token

'<|endoftext|>'

In [None]:
# Report the tokenizer's special tokens with their ids, then the ids declared on
# the model config (None when the config lacks the attribute).
special_token_report = [
    ("bos_token:", tokenizer.bos_token, tokenizer.bos_token_id),
    ("eos_token:", tokenizer.eos_token, tokenizer.eos_token_id),
    ("pad_token:", tokenizer.pad_token, tokenizer.pad_token_id),
    ("model bos id:", getattr(model.config, "bos_token_id", None)),
    ("model eos id:", getattr(model.config, "eos_token_id", None)),
]
for report_line in special_token_report:
    print(*report_line)


bos_token: <|endoftext|> 50256
eos_token: <|endoftext|> 50256
pad_token: None None
model bos id: 50256
model eos id: 50256
