In [None]:

import pickle
import random
import spacy
from spacy.training import Example
from spacy.tokens import Doc
from sklearn.metrics import accuracy_score, f1_score


def load_pkl(path):
    """Read the file at ``path`` and return the object unpickled from it.

    NOTE: unpickling can execute arbitrary code, so this must only be used on
    files from a trusted source.

    Args:
        path: Location of the pickle file; opened in binary mode.

    Returns:
        Whatever object ``pickle.load`` reconstructs from the file.
    """
    with open(path, mode="rb") as handle:
        payload = pickle.load(handle)
    return payload

# Load the pickled train/test splits; the paths are relative to the current
# working directory. normalize() below unpacks every sample as a sequence of
# (word, tag) pairs.
train_raw = load_pkl("train_pos_data.pkl")
test_raw  = load_pkl("test_pos_data.pkl")

print("✅ PKL files loaded successfully.")



def normalize(data):
    """Split tagged samples into parallel word and tag lists.

    Each sample is consumed in a single pass, so samples may be any iterable
    of ``(word, tag)`` pairs, including one-shot iterators and generators.

    Args:
        data: Iterable of samples; each sample is an iterable of
            ``(word, tag)`` pairs.

    Returns:
        A ``(sentences, tags)`` tuple of two lists with one entry per sample:
        ``sentences[i]`` is the list of words of sample ``i`` and ``tags[i]``
        is the list of its tags, in the same order.

    Raises:
        ValueError: If an item of a sample does not unpack into exactly two
            values (TypeError if the item is not iterable at all).
    """
    sentences = []
    tags = []

    for sample in data:
        words = []
        pos = []
        # One pass per sample: iterating it twice (once for words, once for
        # tags) would silently produce an empty tag list for an iterator.
        for word, tag in sample:
            words.append(word)
            pos.append(tag)

        sentences.append(words)
        tags.append(pos)

    return sentences, tags


# Split both datasets into parallel lists: one list of words and one list of
# tags per sentence.
train_sentences, train_tags = normalize(train_raw)
test_sentences,  test_tags  = normalize(test_raw)

# Show the first training sentence as a quick sanity check.
print("✅ Example normalized:", train_sentences[0])



# Start from a blank English pipeline; the tokenizer is replaced and a tagger
# component is added further down.
nlp = spacy.blank("en")

def custom_tokenizer(nlp, text):
    """Build a ``Doc`` by splitting ``text`` on single space characters.

    No other tokenization rule is applied, so text produced with
    ``" ".join(words)`` splits back into exactly those words as long as no
    word is empty or contains a space.

    Args:
        nlp: Pipeline object whose ``vocab`` is attached to the new ``Doc``.
        text: Space-delimited string to tokenize.

    Returns:
        A ``Doc`` with one token per piece of ``text.split(" ")``; every token
        except the last is flagged as being followed by a space.
    """
    tokens = text.split(" ")
    # str.split(" ") always yields at least one piece, so there is always a
    # final token, and it is the only one without a trailing space.
    trailing_space = [True] * (len(tokens) - 1) + [False]
    return Doc(nlp.vocab, words=tokens, spaces=trailing_space)

# Route all tokenization through custom_tokenizer so tokens line up one-to-one
# with the space-joined words used for training and evaluation.
nlp.tokenizer = lambda text: custom_tokenizer(nlp, text)

tagger = nlp.add_pipe("tagger")


# Register every tag seen in the training data as a tagger label. Tags repeat
# across sentences, so add_label is called once per occurrence, not per tag.
for seq in train_tags:
    for tag in seq:
        tagger.add_label(tag)

# Initialize the pipeline before training; no example callback is supplied.
nlp.initialize()



# Pair each space-joined sentence with its gold tags, in the dict form that
# Example.from_dict accepts.
train_data = [(" ".join(words), {"tags": t}) for words, t in zip(train_sentences, train_tags)]
test_data  = [(" ".join(words), {"tags": t}) for words, t in zip(test_sentences,  test_tags)]

# Single source of truth for the epoch count, shared by the loop and the log
# line so the two cannot drift apart.
N_EPOCHS = 10

for epoch in range(N_EPOCHS):
    # Reshuffle in place each epoch so updates are not applied in a fixed order.
    random.shuffle(train_data)
    # Fresh dict per epoch: the printed loss is accumulated over this epoch only.
    losses = {}

    for text, ann in train_data:
        # make_doc only tokenizes (via the custom tokenizer); the gold tags
        # come from the annotation dict.
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, ann)
        # One example per update step, i.e. an effective batch size of 1.
        nlp.update([example], losses=losses)

    # .get(): with empty training data no update runs and the "tagger" key is
    # never created, which previously raised KeyError here.
    print(f"Epoch {epoch+1}/{N_EPOCHS} - Loss:", losses.get("tagger", 0.0))



# Flat gold / predicted tag sequences accumulated over the whole test set.
true_labels = []
pred_labels = []

for sent_words, gold_tags in zip(test_sentences, test_tags):
    # Re-join with single spaces so the space-splitting tokenizer reproduces
    # the original word boundaries.
    text = " ".join(sent_words)
    doc = nlp(text)

    preds = [tok.tag_ for tok in doc]

    # Explicit raise instead of assert: asserts are stripped under `python -O`,
    # and a word that is empty or contains a space makes the join/split round
    # trip yield a different token count, which would misalign the labels.
    if len(preds) != len(gold_tags):
        raise ValueError(
            "❌ Token mismatch detected! "
            f"Got {len(preds)} tokens for {len(gold_tags)} gold tags in: {text!r}"
        )

    true_labels.extend(gold_tags)
    pred_labels.extend(preds)

# Token-level accuracy over all test tokens.
acc = accuracy_score(true_labels, pred_labels)
# Macro averaging gives every tag equal weight regardless of its frequency.
f1 = f1_score(true_labels, pred_labels, average='macro')

print("\n✅ FINAL METRICS")
print("Accuracy:", acc)
print("Macro F1:", f1)



def predict_spacy(text):
    """Tag ``text`` with the module-level ``nlp`` pipeline.

    Args:
        text: Sentence to tag, passed to ``nlp`` unchanged.

    Returns:
        A list of ``(token_text, tag)`` tuples, one per token, in document
        order.
    """
    tagged = []
    for token in nlp(text):
        tagged.append((token.text, token.tag_))
    return tagged

# Smoke-test the trained tagger on a hand-written sentence.
print("\n✅ Test Inference:")
print(predict_spacy("Aditya visited Mumbai yesterday"))


✅ PKL files loaded successfully.
✅ Example normalized: ['In', 'talks', 'with', 'Mr.', 'yoshoda', ',', 'Chinese', 'leaders', 'expressed', 'no', 'regret', 'for', 'the', 'killings', ',', 'and', 'even', 'suggested', 'that', 'the', 'Hindupur', 'was', 'prominently', 'involved', '*-1', 'in', 'the', 'demonstrations', 'this', 'spring', '.']
Epoch 1/10 - Loss: 13219.18774368207
Epoch 2/10 - Loss: 6901.332701245044
Epoch 3/10 - Loss: 5161.281156843302
Epoch 4/10 - Loss: 4015.003908773549
Epoch 5/10 - Loss: 3147.5030995552684
Epoch 6/10 - Loss: 2500.6087099696147
Epoch 7/10 - Loss: 2098.481982845162
Epoch 8/10 - Loss: 1803.6427189096707
Epoch 9/10 - Loss: 1604.2614158968493
Epoch 10/10 - Loss: 1363.4789129328992

✅ FINAL METRICS
Accuracy: 0.9288397015373829
Macro F1: 0.8557783423772984

✅ Test Inference:
[('Aditya', 'NNP'), ('visited', 'VBD'), ('Mumbai', 'NNP'), ('yesterday', 'NN')]
