In [1]:
import time
import numpy as np
import pandas as pd
import os

import spacy
import en_core_web_sm

spacy.prefer_gpu()

from sklearn.model_selection import train_test_split
import ast

import spacy
from spacy.tokens import DocBin
from spacy.util import filter_spans

from spacy.training import Example
import random
from tqdm import tqdm



In [2]:
df_final = pd.read_csv('dataset/df_final.csv')

In [3]:
df_final.head()

Unnamed: 0,Sentence,Tag
0,Thousands of demonstrators have marched throug...,"['O', 'O', 'O', 'O', 'O', 'O', 'B-GEO', 'O', '..."
1,Families of soldiers killed in the conflict jo...,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
2,They marched from the Houses of Parliament to ...,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
3,"Police put the number of marchers at 10,000 wh...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
4,The protest comes on the eve of the annual con...,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."


In [4]:
nlp = spacy.load("en_core_web_lg", disable=["tagger", "parser", "ner", "lemmatizer"])



In [5]:
# Modifying the spacy tokenizer to not split on hyphens

from spacy.lang.char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER
from spacy.lang.char_classes import CONCAT_QUOTES, LIST_ELLIPSES, LIST_ICONS
from spacy.util import compile_infix_regex

# Modify tokenizer infix patterns
infixes = (
        LIST_ELLIPSES
        + LIST_ICONS
        + [
            r"(?<=[0-9])[+\\-\\*^](?=[0-9-])",
            r"(?<=[{al}{q}])\\.(?=[{au}{q}])".format(
                al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
            ),
            r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
            # ✅ Commented out regex that splits on hyphens between letters:
            # r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
            r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
        ]
)

infix_re = compile_infix_regex(infixes)
nlp.tokenizer.infix_finditer = infix_re.finditer

In [6]:
final_data = []
for i in range(len(df_final)):
    temp_dict = {}
    temp_dict['text'] = df_final['Sentence'][i]
    temp_dict['entities'] = []
    doc = nlp(df_final['Sentence'][i])
    tag = df_final['Tag'][i]
    for token in doc:
        m = 0
        start = token.idx
        end = start + len(token.text)
        label = tag[m]
        temp_dict['entities'].append((start, end, label))
        m += 1
    final_data.append(temp_dict)

In [8]:
# Split data into train and test
print("Number of records: ", len(final_data))
train = final_data[:int(0.85 * len(final_data))]
test = final_data[int(0.85 * len(final_data)):]

print("\nTrain data length: ", len(train))
print("Test data length: ", len(test))

Number of records:  47959

Train data length:  40765
Test data length:  7194


In [None]:
nlp = spacy.blank("en")


def convert_data(data, output_path):
    # create a docbin object
    db = DocBin()
    for example in tqdm(data):
        text = example['text']
        labels = example['entities']
        # create a doc object from text
        doc = nlp.make_doc(text)
        ents = []
        for start, end, label in labels:
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is not None:
                ents.append(span)
        filtered_ents = filter_spans(ents)
        doc.ents = filtered_ents
        db.add(doc)
    db.to_disk(output_path)

In [None]:
convert_data(train, 'output/train.spacy')
convert_data(test, 'output/dev.spacy')