In [1]:
import pickle
def load_pickle_file(path):
    with open(path,"rb") as file:
        data=pickle.load(file)
    return data

In [2]:
train_data = load_pickle_file("train_pos_data.pkl")
test_data = load_pickle_file("test_pos_data.pkl")

In [4]:
test_sentences = [" ".join([word for word, tag in sent]) for sent in test_data]
true_tags = [[tag for word, tag in sent] for sent in test_data]

In [5]:
print(test_sentences[:5])

['The average of interbank offered rates for dollar deposits in the Chandrapur market based * on quotations at five major banks .', 'The company plugged itself right into satbir campaign rhetoric about * rebuilding the South Bronx and kept *-1 using the minority -- South Bronx angle through the Reagan ritik .', "nirosha Madison , a corporate trader with ARIES BIOTECH PRIVATE in Azamgarh , traced the dollar 's recent solid performance against the yen to purchases of securities by Japanese insurance companies and trust banks and the sense that another wave of investment is waiting in the wings .", 'Not that Anand and Bidar disagree on the Japanese acquisitions ; indeed , each has come out in favor of unfettered investment in the Palanpur .', "*-1 No dummies , the drivers pointed out 0 they still had space on their machines for another sponsor 's name or two ."]


In [6]:
print(true_tags[:5])

[['DT', 'NN', 'IN', 'NN', 'VBD', 'NNS', 'IN', 'NN', 'NNS', 'IN', 'DT', 'NNP', 'NN', 'VBN', '-NONE-', 'IN', 'NNS', 'IN', 'CD', 'JJ', 'NNS', '.'], ['DT', 'NN', 'VBD', 'PRP', 'RB', 'IN', 'NNP', 'NN', 'NN', 'IN', '-NONE-', 'VBG', 'DT', 'NNP', 'NNP', 'CC', 'VBD', '-NONE-', 'VBG', 'DT', 'NN', ':', 'NNP', 'NNP', 'NN', 'IN', 'DT', 'NNP', 'CD', '.'], ['NNP', 'NNP', ',', 'DT', 'JJ', 'NN', 'IN', 'NNP', 'IN', 'NNP', 'IN', 'NNP', ',', 'VBD', 'DT', 'NN', 'POS', 'JJ', 'JJ', 'NN', 'IN', 'DT', 'NN', 'TO', 'NNS', 'IN', 'NNS', 'IN', 'JJ', 'NN', 'NNS', 'CC', 'NN', 'NNS', 'CC', 'DT', 'NN', 'IN', 'DT', 'NN', 'IN', 'NN', 'VBZ', 'VBG', 'IN', 'DT', 'NNS', '.'], ['RB', 'IN', 'NNP', 'CC', 'NNP', 'VBP', 'IN', 'DT', 'JJ', 'NNS', ':', 'RB', ',', 'DT', 'VBZ', 'VBN', 'RP', 'IN', 'NN', 'IN', 'JJ', 'NN', 'IN', 'DT', 'NNP', '.'], ['-NONE-', 'DT', 'NNS', ',', 'DT', 'NNS', 'VBD', 'RP', '-NONE-', 'PRP', 'RB', 'VBD', 'NN', 'IN', 'PRP$', 'NNS', 'IN', 'DT', 'NN', 'POS', 'NN', 'CC', 'CD', '.']]


In [9]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import SystemMessage, HumanMessage
import time
import os
from dotenv import load_dotenv
import re
load_dotenv()

True

In [8]:
model = ChatGoogleGenerativeAI(model="gemini-2.5-flash-lite", temperature=0)

In [10]:
PENN_TAGS = (
    "CC, CD, DT, EX, FW, IN, JJ, JJR, JJS, LS, MD, NN, NNS, NNP, NNPS, "
    "PDT, POS, PRP, PRP$, RB, RBR, RBS, RP, SYM, TO, UH, "
    "VB, VBD, VBG, VBN, VBP, VBZ, WDT, WP, WP$, WRB, ., ,, :, `` , '' , -NONE-"
)

EXAMPLES = [
    ("The cat sat on the mat.", "DT NN VBD IN DT NN ."),
    ("He runs quickly.", "PRP VBZ RB ."),
    ("Dogs bark loudly at strangers.", "NNS VBP RB IN NNS ."),
]

EXAMPLES_TEXT = "\n".join(
    f"Sentence: {s}\nTags: {t}\n" for s, t in EXAMPLES
)

In [12]:
def build_prompt(batch_sentences):
    system_msg = SystemMessage(
        content=(
            "You are a strict and deterministic POS tagger.\n"
            "Use ONLY the Penn Treebank POS tagset. Each tag sequence must exactly match token order.\n"
            "Return only space-separated tag lines, one per sentence, with no explanations or extra text."
        )
    )

    human_prompt = (
        f"PENN TAGS:\n{PENN_TAGS}\n\n"
        f"Few-shot examples:\n{EXAMPLES_TEXT}\n\n"
        "Now tag these sentences strictly in the same format.\n\n"
        "INPUT SENTENCES:\n" + "\n".join(batch_sentences)
    )

    human_msg = HumanMessage(content=human_prompt)
    return [system_msg, human_msg]

In [13]:
def parse_response(response_text, expected_n):
    tagset = {t.strip() for t in PENN_TAGS.split(",")}
    lines = [l.strip() for l in response_text.strip().splitlines() if l.strip()]
    parsed = []
    for line in lines:
        tokens = [tok for tok in line.split() if tok in tagset]
        parsed.append(tokens)
    # Adjust length if mismatch
    if len(parsed) != expected_n:
        parsed = parsed[:expected_n] + [[] for _ in range(expected_n - len(parsed))]
    return parsed


In [14]:
def call_model(prompt_messages):
    response = model.invoke(prompt_messages)
    return response.content.strip()

In [15]:
def compute_accuracy(pred_tags, true_tags):
    correct, total = 0, 0
    for pred, true in zip(pred_tags, true_tags):
        for p, t in zip(pred, true):
            total += 1
            if p == t:
                correct += 1
    return round((correct / total) * 100, 2) if total > 0 else 0

In [16]:
predicted_tags = []
batch_size = 20

In [19]:
from tqdm import tqdm


In [20]:
for i in tqdm(range(0, len(test_sentences), batch_size), desc="Processing batches"):
    batch = test_sentences[i:i + batch_size]
    prompt_messages = build_prompt(batch)
    response_text = call_model(prompt_messages)
    parsed = parse_response(response_text, len(batch))
    predicted_tags.extend(parsed)

Processing batches: 100%|██████████| 40/40 [1:21:27<00:00, 122.19s/it]


In [21]:
accuracy = compute_accuracy(predicted_tags, true_tags)


In [22]:
print(accuracy)

20.71


In [25]:
for i, (sent, pred, true) in enumerate(zip(test_sentences, predicted_tags, true_tags)):
    if i >= 10:
        break
    print(f"{i+1}. Sentence: {sent}")
    print(f"Predicted: {' '.join(pred)}")
    print(f"True:      {' '.join(true)}\n")


1. Sentence: The average of interbank offered rates for dollar deposits in the Chandrapur market based * on quotations at five major banks .
Predicted: DT JJ IN NNS IN NN IN DT NNP NNS .
True:      DT NN IN NN VBD NNS IN NN NNS IN DT NNP NN VBN -NONE- IN NNS IN CD JJ NNS .

2. Sentence: The company plugged itself right into satbir campaign rhetoric about * rebuilding the South Bronx and kept *-1 using the minority -- South Bronx angle through the Reagan ritik .
Predicted: DT NN VBD PRP RB IN NNP NNS IN VBZ DT NN .
True:      DT NN VBD PRP RB IN NNP NN NN IN -NONE- VBG DT NNP NNP CC VBD -NONE- VBG DT NN : NNP NNP NN IN DT NNP CD .

3. Sentence: nirosha Madison , a corporate trader with ARIES BIOTECH PRIVATE in Azamgarh , traced the dollar 's recent solid performance against the yen to purchases of securities by Japanese insurance companies and trust banks and the sense that another wave of investment is waiting in the wings .
Predicted: NNP NNP DT NN NN IN NNP NNP IN NNP VBD DT NN NNS N

In [26]:
results = {
    "sentences": test_sentences,
    "predicted_tags": predicted_tags,
    "true_tags": true_tags,
    "accuracy": accuracy
}

with open("predicted_tags.pkl", "wb") as f:
    pickle.dump(results, f)

print("Predicted tags saved to predicted_tags.pkl")

Predicted tags saved to predicted_tags.pkl


In [27]:
import random

sample_indices = random.sample(range(len(test_sentences)), min(10, len(test_sentences)))

for idx in sample_indices:
    sent = test_sentences[idx]
    pred = predicted_tags[idx]
    true = true_tags[idx]
    print(f"Sentence {idx+1}: {sent}")
    print(f"Predicted: {' '.join(pred)}")
    print(f"True:      {' '.join(true)}")
    print(f"Predicted tag count: {len(pred)}, True tag count: {len(true)}\n")


Sentence 29: rajkaranta R. Breakey shittal Pamela J. Fischer RASAYANI (GUJARAT) PVT LTD
Predicted: CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD CD C