In [3]:
import spacy
from spacy import displacy
import pandas as pd
from collections import defaultdict

In [4]:
def load_conll_data(file_path, skip_docstart=False):
    """Read a 4-column CoNLL-style file into sentences of (word, NER tag) pairs.

    Each non-blank line is expected to hold four whitespace-separated
    fields: word, POS tag, chunk tag, NER tag. Blank lines separate
    sentences.

    Args:
        file_path: Path of the UTF-8 encoded file to read.
        skip_docstart: When True, drop ``-DOCSTART-`` document-boundary
            marker lines so they do not show up as one-token "sentences".
            Defaults to False, which keeps them like any other token.

    Returns:
        A list of sentences, each a non-empty list of ``(word, ner)`` tuples.
        Lines that do not have exactly four fields are ignored.
    """
    sentences = []
    sentence = []

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank line closes the current sentence; consecutive blank
                # lines never produce empty sentences.
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                continue

            # Split line: word, POS, chunk, NER
            parts = line.split()
            if len(parts) != 4:
                # Not a well-formed token line; skip it.
                continue

            word, _pos, _chunk, ner = parts
            if skip_docstart and word == "-DOCSTART-":
                # Document marker, not a real token.
                continue
            sentence.append((word, ner))

    # The file may not end with a blank line; flush the last sentence.
    if sentence:
        sentences.append(sentence)
    return sentences

In [5]:
# Read the train / validation / test splits; the paths are relative to the
# notebook's working directory.
train_data, valid_data, test_data = (
    load_conll_data(split_path)
    for split_path in ("train.txt", "valid.txt", "test.txt")
)

# Quick look at what was loaded.
print(f"Number of training sentences: {len(train_data)}")
print(f"First training sentence: {train_data[0]}")

Number of training sentences: 14987
First training sentence: [('-DOCSTART-', 'O')]


In [6]:
def convert_to_text(sentences):
    """Join the words of every sentence into one space-separated string.

    Args:
        sentences: Iterable of sentences, each an iterable of
            ``(word, tag)`` pairs.

    Returns:
        A list with one string per sentence, in the same order; the tags
        are discarded.
    """
    return [" ".join(token for token, _ in pairs) for pairs in sentences]

# Flatten every split into plain strings, one per sentence.
train_texts, valid_texts, test_texts = map(
    convert_to_text, (train_data, valid_data, test_data)
)

In [7]:
# Load the small and medium English spaCy pipelines that are compared below.
# NOTE(review): the recorded outputs further down label the second model
# "Transformer" / `num_ents_trf`, which does not match the medium model loaded
# here -- presumably stale output from an earlier run; re-run to refresh.
nlp_sm, nlp_md = (
    spacy.load(model_name)
    for model_name in ("en_core_web_sm", "en_core_web_md")
)

def extract_entities(sent):
    """Collect the gold entities of one sentence from its B-/I-/O tags.

    Args:
        sent: Sequence of ``(word, tag)`` pairs for a single sentence.

    Returns:
        A list of ``(entity_text, label)`` tuples in order of appearance,
        where ``entity_text`` is the entity's words joined by single spaces
        and ``label`` is the tag with its two-character prefix removed.

    An ``I-`` tag extends the open entity only when its label matches.
    An ``I-`` tag with no open entity, or with a different label, starts a
    new entity instead of being dropped or merged into the previous one, so
    data whose entities begin with ``I-`` is handled too. Any tag that does
    not start with ``B-`` or ``I-`` closes the open entity.
    """
    entities = []
    start = None   # index of the first token of the open entity, if any
    label = None   # label of the open entity, if any
    text_tokens = [word for word, _ in sent]

    for i, (_, tag) in enumerate(sent):
        if tag.startswith("I-") and start is not None and tag[2:] == label:
            # Same entity keeps going.
            continue

        # Anything else ends the open entity, which spans tokens [start, i).
        if start is not None:
            entities.append((" ".join(text_tokens[start:i]), label))
            start = None
            label = None

        # "B-" always opens an entity; an "I-" reaching this point has no
        # matching open entity, so it opens one as well.
        if tag.startswith(("B-", "I-")):
            start = i
            label = tag[2:]

    # Flush an entity that runs to the end of the sentence.
    if start is not None:
        entities.append((" ".join(text_tokens[start:]), label))
    return entities

# Sanity check on the first training sentence. Per the recorded output above,
# that sentence is the lone ('-DOCSTART-', 'O') marker, so an empty list is
# expected here.
print("Entities in first training sentence:", extract_entities(train_data[0]))

Entities in first training sentence: []


In [8]:
# Compare both pipelines on the first three test texts: print the predicted
# entities side by side, then render each parsed document with displaCy.
for i, text in enumerate(test_texts[:3]):
    print(f"\n=== Test Article {i+1} ===")

    # Parse the same text with each model.
    doc_sm, doc_md = nlp_sm(text), nlp_md(text)

    ents_sm = [(span.text, span.label_) for span in doc_sm.ents]
    ents_md = [(span.text, span.label_) for span in doc_md.ents]

    print("Small model entities:", ents_sm)
    print("Medium model entities:", ents_md)

    # Small model first, then medium, each under its own heading.
    for heading, parsed_doc in (
        ("\n--- Small model visualization ---", doc_sm),
        ("\n--- Medium model visualization ---", doc_md),
    ):
        print(heading)
        displacy.render(parsed_doc, style="ent", jupyter=True)


=== Test Article 1 ===
Small model entities: []
Transformer model entities: [('-DOCSTART-', 'ORG')]

--- Small model visualization ---





--- Transformer model visualization ---



=== Test Article 2 ===
Small model entities: [('DEFEAT', 'ORG')]
Transformer model entities: [('SOCCER - JAPAN', 'ORG'), ('SURPRISE', 'GPE')]

--- Small model visualization ---



--- Transformer model visualization ---



=== Test Article 3 ===
Small model entities: [('Nadim Ladki', 'PERSON')]
Transformer model entities: [('Nadim Ladki', 'PERSON')]

--- Small model visualization ---



--- Transformer model visualization ---


In [9]:
def _entities_per_text(nlp, texts):
    """Return, for each text, the list of (entity text, label) pairs found by `nlp`."""
    # nlp.pipe streams the texts through the pipeline in batches, which avoids
    # the per-call overhead of invoking nlp(text) once per row.
    return [[(ent.text, ent.label_) for ent in doc.ents] for doc in nlp.pipe(texts)]


# One row per test text, with the entities each model predicts and their counts.
df_test = pd.DataFrame({"text": test_texts})
# Wrap in an object Series so each cell holds that row's own list of pairs.
df_test['ents_sm'] = pd.Series(
    _entities_per_text(nlp_sm, df_test['text']), index=df_test.index, dtype=object
)
df_test['ents_md'] = pd.Series(
    _entities_per_text(nlp_md, df_test['text']), index=df_test.index, dtype=object
)
df_test['num_ents_sm'] = df_test['ents_sm'].apply(len)
df_test['num_ents_md'] = df_test['ents_md'].apply(len)

print(df_test[['num_ents_sm','num_ents_md']].head())

   num_ents_sm  num_ents_trf
0            0             1
1            1             2
2            1             1
3            3             3
4            6             6
