In [37]:
import os
import re
import pickle
import random
from tqdm import tqdm
import time

In [38]:
try:
    from nltk.tokenize.treebank import TreebankWordTokenizer
    tokenizer = TreebankWordTokenizer()
    def penn_tokenize(s): return tokenizer.tokenize(s)
except Exception:
    def penn_tokenize(s):
        s = s.replace("n't", " n't")
        s = re.sub(r'([.,:;()\[\]"\'])', r' \1 ', s)
        return [tok for tok in s.split() if tok.strip()]

In [23]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import SystemMessage, HumanMessage

model = ChatGoogleGenerativeAI(model="gemini-2.5-flash-lite", temperature=0)

PENN_TAGS = (
    "CC, CD, DT, EX, FW, IN, JJ, JJR, JJS, LS, MD, NN, NNS, NNP, NNPS, "
    "PDT, POS, PRP, PRP$, RB, RBR, RBS, RP, SYM, TO, UH, "
    "VB, VBD, VBG, VBN, VBP, VBZ, WDT, WP, WP$, WRB, ., ,, :, `` , '' , -NONE-"
)
TAGSET = {t.strip() for t in PENN_TAGS.split(",") if t.strip()}

SYSTEM_MSG = SystemMessage(
    content=(
        "You are a strict POS tagger using the Penn Treebank tagset.\n"
        "INPUT: each line is a tokenized sentence (tokens separated by single spaces).\n"
        "OUTPUT FORMAT REQUIRED: For each sentence, output one block consisting of one line per token.\n"
        "Each line must be: <token><TAB><TAG>\n"
        "Separate sentences by a single blank line. Use ONLY the provided Penn tags.\n"
        "If you are unsure about a token, tag it -NONE-. Output nothing else."
    )
)

In [24]:
EXAMPLE_BLOCKS = (
    r"The cat sat on the mat .\n"
    r"The\tDT\ncat\tNN\nsat\tVBD\non\tIN\nthe\tDT\nmat\tNN\n.\t.\n\n"
    r"He runs quickly .\n"
    r"He\tPRP\nruns\tVBZ\nquickly\tRB\n.\t.\n\n"
    r"Rohit and Neha visited Mumbai in 2020 .\n"
    r"Rohit\tNNP\nand\tCC\nNeha\tNNP\nvisited\tVBD\nMumbai\tNNP\nin\tIN\n2020\tCD\n.\t.\n\n"
)


In [25]:
def load_pickle(path):
    with open(path, "rb") as f:
        return pickle.load(f)

def save_pickle(obj, path):
    with open(path, "wb") as f:
        pickle.dump(obj, f)

In [26]:
TREEBANK_PLACEHOLDER = re.compile(r'(^\*.*\*?$)|(^\*$)|(^0$)')

In [None]:
def normalize_input(raw):
    if isinstance(raw, dict):
        for k in ("tagged_sents", "tagged_sentences", "data", "sentences"):
            if k in raw:
                raw = raw[k]
                break
    if not isinstance(raw, list):
        raise ValueError("Unrecognized dataset format. expected list of sentences.")
    if raw and isinstance(raw[0], tuple) and isinstance(raw[0][0], str):
        raw = [raw]
    return raw

In [None]:
def clean_pairlist(pairlist):
    tokens, tags = [], []
    for item in pairlist:
        if isinstance(item, (list, tuple)) and len(item) == 2:
            w, t = item[0], item[1]
        else:
            w = getattr(item, "word", None) or getattr(item, "token", None) or getattr(item, "text", None)
            t = getattr(item, "tag", None) or getattr(item, "pos", None) or getattr(item, "label", None)
        if w is None or t is None:
            continue
        if TREEBANK_PLACEHOLDER.match(w):
            continue
        if t == "-NONE-":
            continue
        tokens.append(w)
        tags.append(t)
    return tokens, tags

In [None]:
def load_and_clean(pkl_path):
    raw = load_pickle(pkl_path)
    normalized = normalize_input(raw)
    sentences = []
    token_lists = []
    gold_tags = []
    for pairlist in normalized:
        toks, tags = clean_pairlist(pairlist)
        sentence_str = " ".join(toks)
        tokenized = penn_tokenize(sentence_str)
        if len(tokenized) == len(tags):
            final_tokens = tokenized
            final_tags = tags
        else:
            final_tokens = toks
            final_tags = tags
        sentences.append(" ".join(final_tokens))
        token_lists.append(final_tokens)
        gold_tags.append(final_tags)
    return {"sentences": sentences, "tokens": token_lists, "gold_tags": gold_tags}

In [30]:
train = load_and_clean("train_pos_data.pkl")
test  = load_and_clean("test_pos_data.pkl")

In [31]:
save_pickle(train, "train_pos_data_tokenized_cleaned.pkl")
save_pickle(test,  "test_pos_data_tokenized_cleaned.pkl")
print("Saved cleaned tokenized datasets: train_pos_data_tokenized_cleaned.pkl, test_pos_data_tokenized_cleaned.pkl")
print(f"Example tokenized sentence (test[0]): {test['tokens'][0]}")

Saved cleaned tokenized datasets: train_pos_data_tokenized_cleaned.pkl, test_pos_data_tokenized_cleaned.pkl
Example tokenized sentence (test[0]): ['The', 'average', 'of', 'interbank', 'offered', 'rates', 'for', 'dollar', 'deposits', 'in', 'the', 'Chandrapur', 'market', 'based', 'on', 'quotations', 'at', 'five', 'major', 'banks', '.']


In [None]:
def build_batch_messages(batch_token_lists):
    human_lines = []
    human_lines.append("PENN TAGS:\n" + PENN_TAGS + "\n\n")
    human_lines.append("Few-shot examples (token<TAB>tag blocks):\n")
    human_lines.append(EXAMPLE_BLOCKS)
    human_lines.append("Now tag these tokenized sentences. Each sentence is a single line of tokens (tokens separated by single spaces).")
    human_lines.append("\n\nINPUT (one tokenized sentence per line):\n")
    for tokens in batch_token_lists:
        human_lines.append(" ".join(tokens))
    human_prompt = "\n".join(human_lines)
    return [SYSTEM_MSG, HumanMessage(content=human_prompt)]

In [None]:

REQUEST_LIMIT_PER_MINUTE = 15
MIN_INTERVAL = 60.0 / REQUEST_LIMIT_PER_MINUTE  
last_request_time = 0

In [None]:

def call_model(messages):
    global last_request_time
    current_time = time.time()
    time_since_last_request = current_time - last_request_time
    if time_since_last_request < MIN_INTERVAL:
        sleep_time = MIN_INTERVAL - time_since_last_request
        print(f"Throttling: waiting {sleep_time:.2f} seconds to respect rate limit")
        time.sleep(sleep_time)
    
    resp = model.invoke(messages)
    
    last_request_time = time.time()
    return resp.content.strip()

In [None]:
def parse_token_tag_output(text, expected_token_counts):
    """
    Expects blocks separated by blank lines. Each block: one line per token -> token<TAB>tag
    Returns parsed_tags_list (list[list[tag]]) and ok flag.
    """
    text = text.strip()
    blocks = re.split(r'\n\s*\n', text)
    parsed = []
    for blk in blocks:
        lines = [ln for ln in blk.splitlines() if ln.strip()]
        tags = []
        for line in lines:
            if "\t" in line:
                token, tag = line.split("\t", 1)
            else:
                parts = line.split()
                if len(parts) >= 2:
                    token, tag = parts[0], parts[-1]
                else:
                    continue
            tags.append(tag.strip())
        parsed.append(tags)
    if len(parsed) != len(expected_token_counts):
        flat_lines = [ln for ln in text.splitlines() if ln.strip()]
        alt = []
        pos = 0
        ok_alt = True
        for cnt in expected_token_counts:
            if pos + cnt > len(flat_lines):
                ok_alt = False
                break
            cur_tags = []
            for j in range(cnt):
                line = flat_lines[pos + j]
                if "\t" in line:
                    token, tag = line.split("\t", 1)
                else:
                    parts = line.split()
                    if len(parts) >= 2:
                        tag = parts[-1]
                    else:
                        ok_alt = False
                        break
                cur_tags.append(tag.strip())
            if not ok_alt:
                break
            alt.append(cur_tags)
            pos += cnt
        if ok_alt and len(alt) == len(expected_token_counts):
            return alt, True
        else:
            return parsed, False
    for tags, expected_cnt in zip(parsed, expected_token_counts):
        if len(tags) != expected_cnt:
            return parsed, False
    return parsed, True

In [None]:
import time
def batch_tag_tokenized(token_lists, batch_size=20):
    predicted = []
    for i in tqdm(range(0, len(token_lists), batch_size), desc="Tagging batches"):
        batch = token_lists[i:i+batch_size]
        messages = build_batch_messages(batch)
        try:
            resp_text = call_model(messages)
        except Exception as e:
            resp_text = ""
        parsed, ok = parse_token_tag_output(resp_text, [len(t) for t in batch])
        if not ok:
            for tokens in batch:
                messages = build_batch_messages([tokens])
                resp = call_model(messages)
                p, ok_single = parse_token_tag_output(resp, [len(tokens)])
                if ok_single:
                    predicted.append(p[0])
                else:
                    predicted.append(["-NONE-"] * len(tokens))
        else:
            predicted.extend(parsed)
    return predicted

In [43]:
token_lists = test["tokens"]
gold_tags = test["gold_tags"]
print(f"Starting tagging on {len(token_lists)} sentences (batches of 20).")
predicted_tags = batch_tag_tokenized(token_lists, batch_size=20)

Starting tagging on 783 sentences (batches of 20).


Tagging batches:   2%|▎         | 1/40 [00:08<05:25,  8.35s/it]

Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling

Tagging batches:   5%|▌         | 2/40 [02:03<45:11, 71.35s/it]

Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling

Tagging batches:   8%|▊         | 3/40 [03:57<55:50, 90.55s/it]

Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling

Tagging batches:  10%|█         | 4/40 [05:50<59:41, 99.48s/it]

Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling

Tagging batches:  12%|█▎        | 5/40 [07:44<1:01:12, 104.93s/it]

Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling

Tagging batches:  15%|█▌        | 6/40 [09:36<1:00:49, 107.33s/it]

Throttling: waiting 4.00 seconds to respect rate limit


Tagging batches:  18%|█▊        | 7/40 [09:48<41:45, 75.93s/it]   

Throttling: waiting 4.00 seconds to respect rate limit


Tagging batches:  20%|██        | 8/40 [10:00<29:40, 55.63s/it]

Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling

Tagging batches:  22%|██▎       | 9/40 [11:57<38:41, 74.89s/it]

Throttling: waiting 4.00 seconds to respect rate limit


Tagging batches:  25%|██▌       | 10/40 [12:09<27:43, 55.44s/it]

Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling

Tagging batches:  28%|██▊       | 11/40 [13:59<34:56, 72.29s/it]

Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling

Tagging batches:  30%|███       | 12/40 [15:54<39:45, 85.18s/it]

Throttling: waiting 4.00 seconds to respect rate limit


Tagging batches:  32%|███▎      | 13/40 [16:06<28:19, 62.95s/it]

Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling

Tagging batches:  35%|███▌      | 14/40 [18:08<34:58, 80.71s/it]

Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling

Tagging batches:  38%|███▊      | 15/40 [20:03<37:59, 91.20s/it]

Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling

Tagging batches:  40%|████      | 16/40 [22:00<39:32, 98.84s/it]

Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling

Tagging batches:  42%|████▎     | 17/40 [23:58<40:05, 104.59s/it]

Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling

Tagging batches:  45%|████▌     | 18/40 [25:56<39:49, 108.62s/it]

Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling

Tagging batches:  48%|████▊     | 19/40 [28:02<39:51, 113.86s/it]

Throttling: waiting 4.00 seconds to respect rate limit


Tagging batches:  50%|█████     | 20/40 [28:11<27:28, 82.41s/it] 

Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling

Tagging batches:  52%|█████▎    | 21/40 [30:13<29:50, 94.25s/it]

Throttling: waiting 4.00 seconds to respect rate limit


Tagging batches:  55%|█████▌    | 22/40 [30:22<20:38, 68.82s/it]

Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling

Tagging batches:  57%|█████▊    | 23/40 [32:22<23:51, 84.18s/it]

Throttling: waiting 4.00 seconds to respect rate limit


Tagging batches:  60%|██████    | 24/40 [32:32<16:28, 61.81s/it]

Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling

Tagging batches:  62%|██████▎   | 25/40 [34:33<19:54, 79.67s/it]

Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling

Tagging batches:  65%|██████▌   | 26/40 [36:40<21:54, 93.92s/it]

Throttling: waiting 4.00 seconds to respect rate limit


Tagging batches:  68%|██████▊   | 27/40 [36:51<14:57, 69.03s/it]

Throttling: waiting 4.00 seconds to respect rate limit


Tagging batches:  70%|███████   | 28/40 [37:05<10:28, 52.41s/it]

Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling

Tagging batches:  72%|███████▎  | 29/40 [39:08<13:30, 73.64s/it]

Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling

Tagging batches:  75%|███████▌  | 30/40 [41:13<14:48, 88.86s/it]

Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling

Tagging batches:  78%|███████▊  | 31/40 [43:15<14:51, 99.01s/it]

Throttling: waiting 4.00 seconds to respect rate limit


Tagging batches:  80%|████████  | 32/40 [43:26<09:40, 72.57s/it]

Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling

Tagging batches:  82%|████████▎ | 33/40 [45:24<10:03, 86.24s/it]

Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling

Tagging batches:  85%|████████▌ | 34/40 [47:22<09:34, 95.82s/it]

Throttling: waiting 4.00 seconds to respect rate limit


Tagging batches:  88%|████████▊ | 35/40 [47:35<05:54, 70.98s/it]

Throttling: waiting 4.00 seconds to respect rate limit


Tagging batches:  90%|█████████ | 36/40 [47:47<03:31, 53.00s/it]

Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling

Tagging batches:  92%|█████████▎| 37/40 [49:43<03:36, 72.15s/it]

Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling

Tagging batches:  95%|█████████▌| 38/40 [51:40<02:51, 85.65s/it]

Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling: waiting 4.00 seconds to respect rate limit
Throttling

Tagging batches:  98%|█████████▊| 39/40 [53:55<01:40, 100.44s/it]

Throttling: waiting 4.00 seconds to respect rate limit


Tagging batches: 100%|██████████| 40/40 [54:02<00:00, 81.05s/it] 


In [44]:
def compute_token_accuracy(predicted, gold):
    correct = 0
    total = 0
    for p_tags, g_tags in zip(predicted, gold):
        for p, g in zip(p_tags, g_tags):
            total += 1
            if p == g:
                correct += 1
    return round(100 * correct / total, 4) if total else 0.0

acc = compute_token_accuracy(predicted_tags, gold_tags)
print(f"\nToken-level accuracy: {acc}%")


Token-level accuracy: 64.1367%


In [45]:
results = {
    "tokens": token_lists,
    "predicted_tags": predicted_tags,
    "gold_tags": gold_tags,
    "accuracy": acc,
}
save_pickle(results, "llm_pos_predicted_tokenized.pkl")
print("\nSaved results to llm_pos_predicted_tokenized.pkl")


Saved results to llm_pos_predicted_tokenized.pkl


In [49]:
print("\nRandom samples (predicted vs gold):")
for idx in random.sample(range(len(token_lists)), min(5, len(token_lists))):
    print(f"\nIndex {idx}")
    print("Tokens:", token_lists[idx])
    print("Pred:", predicted_tags[idx])
    print("Gold:", gold_tags[idx])
    print("Len pred:", len(predicted_tags[idx]), "Len gold:", len(gold_tags[idx]))


Random samples (predicted vs gold):

Index 522
Tokens: ['``', 'Wa', '``', 'is', 'Japanese', 'for', '``', 'team', 'spirit', '``', 'and', 'Japanese', 'ballplayers', 'have', 'miles', 'and', 'miles', 'of', 'it', '.']
Pred: ['``', 'NNP', '``', 'VBZ', 'JJ', 'IN', '``', 'NN', 'NN', '``', 'CC', 'JJ', 'NNS', 'VBP', 'NNS', 'CC', 'NNS', 'IN', 'PRP', '.']
Gold: ['``', 'NNP', "''", 'VBZ', 'NNP', 'IN', '``', 'NN', 'NN', "''", 'CC', 'JJ', 'NNS', 'VBP', 'NNS', 'CC', 'NNS', 'IN', 'PRP', '.']
Len pred: 20 Len gold: 20

Index 209
Tokens: ['The', 'radio', 'show', '``', 'enraged', 'us', ',', '``', 'says', 'Mrs.', 'Ward', '.']
Pred: ['DT', 'NN', 'NN', '``', 'VBD', 'PRP', ',', "''", 'VBZ', 'NNP', 'NNP', '.']
Gold: ['DT', 'NN', 'NN', '``', 'VBD', 'PRP', ',', "''", 'VBZ', 'NNP', 'NNP', '.']
Len pred: 12 Len gold: 12

Index 552
Tokens: ['Diaper', 'shortages', 'this', 'summer', 'limited', 'growth', 'at', 'RIMA', 'BIO-NUTRI', 'PRIVATE', ',', 'Bhagalpur', ',', 'Shimla', ',', 'where', 'business', 'is', 'up', '25',