In [191]:
import pandas as pd

df = pd.read_csv('training_set_ner.csv', encoding='latin1')
df.head()

Unnamed: 0,text,annotation
0,Starbucks violated federal labor law when it i...,"[('Starbucks', 'ORG'), ('National Labor Relati..."
1,The decision is the latest in a series of NLRB...,"[('NLRB', 'ORG'), ('Starbucks', 'ORG')]"
2,"""The issue at the heart of this case is whethe...","[('Board', 'ORG'), ('Starbucks', 'ORG'), ('Mar..."
3,"""It was not.""",[]
4,The first suspect to plead guilty in Singapore...,"[('first', 'ORDINAL'), ('Singapore', 'GPE'), (..."


In [210]:
from tqdm import tqdm
from difflib import SequenceMatcher
import re
import pickle
import ast

def convert_string_to_list(s):
    """
    Converts a string representation of a list of tuples into an actual list of tuples.

    Args:
        s (str): The string representation of the list of tuples.

    Returns:
        list: The converted list of tuples.
    """
    try:
        return ast.literal_eval(s)
    except (SyntaxError, ValueError) as e:
        print(f"Error: Invalid input string - {e}")
        return None

def matcher(string, pattern):
    '''
    Return the start and end index of any pattern present in the text.
    '''
    match_list = []
    pattern = pattern.strip()
    seqMatch = SequenceMatcher(None, string, pattern, autojunk=False)
    match = seqMatch.find_longest_match(0, len(string), 0, len(pattern))
    if (match.size == len(pattern)):
        start = match.a
        end = match.a + match.size
        match_tup = (start, end)
        string = string.replace(pattern, "X" * len(pattern), 1)
        match_list.append(match_tup)
        
    return match_list, string

def mark_sentence(s, match_list):
    '''
    Marks all the entities in the sentence as per the BIO scheme. 
    '''
    word_dict = {}
    for word in s.split():
        word_dict[word] = 'O'
        
    for start, end, e_type in match_list:
        temp_str = s[start:end]
        tmp_list = temp_str.split()
        if len(tmp_list) > 1:
            word_dict[tmp_list[0]] = 'B-' + e_type
            for w in tmp_list[1:]:
                word_dict[w] = 'I-' + e_type
        else:
            word_dict[temp_str] = 'B-' + e_type
    return word_dict

def clean(text):
    '''
    Just a helper fuction to add a space before the punctuations for better tokenization
    '''
    filters = ["!", "#", "$", "%", "&", "(", ")", "/", "*", ".", ":", ";", "<", "=", ">", "?", "@", "[",
               "\\", "]", "_", "`", "{", "}", "~", "'"]
    for i in text:
        if i in filters:
            text = text.replace(i, " " + i)
            
    return text

def clean_entity(entity):
    filters = ["!", "#", "(", ")", "/", "*", ":", ";", "<", "=", ">", "?", "@", "[",
            "\\", "]", "_", "`", "{", "}", "~"]
    
    for i in entity:
        if i in filters:
            entity = entity.replace(i, " " + i)

    if len(entity) > 4:
        if entity[:4] == "the ":
            entity = entity[4:]
    
    if len(entity) > 9:
        if entity[:9] == "more than":
            entity = entity[9:]
            
    return entity

def create_data(df, filepath):
    '''
    The function responsible for the creation of data in the said format.
    '''
    with open(filepath , 'w') as f:
        for text, annotation in zip(df.text, df.annotation):
            print(annotation)
            annotation = convert_string_to_list(annotation)
            text = clean(text)
            text_ = text        
            match_list = []
            print(type(annotation))
            for i in annotation:
                a, text_ = matcher(text, clean_entity(i[0]))
                if len(a) >= 1:
                  match_list.append((a[0][0], a[0][1], i[1]))
                

            d = mark_sentence(text, match_list)

            for i in d.keys():
                print(i)
                f.writelines(i + ' ' + d[i] +'\n')
            f.writelines('\n')
    
## path to save the txt file.
filepath = 'original.txt'

## creating the file.
create_data(df, filepath)

[('Starbucks', 'ORG'), ('National Labor Relations Board', 'ORG'), ('Thursday', 'DATE')]
<class 'list'>
Starbucks
violated
federal
labor
law
when
it
increased
wages
and
offered
new
perks
benefits
only
to
non-union
employees,
a
National
Labor
Relations
Board
judge
found
Thursday
.
[('NLRB', 'ORG'), ('Starbucks', 'ORG')]
<class 'list'>
The
decision
is
the
latest
in
a
series
of
NLRB
rulings
finding
that
Starbucks
has
violated
labor
law
its
efforts
to
stop
unions
from
forming
coffee
shops
.
[('Board', 'ORG'), ('Starbucks', 'ORG'), ('Mara-Louise Anzalone', 'PERSON')]
<class 'list'>
"The
issue
at
the
heart
of
this
case
is
whether,
under
current
Board
law,
[Starbucks
]
was
entitled
to
explicitly
reward
employees,"
for
not
participating
in
union
activity,
"while
falsely
telling
its
workers
that
federal
labor
law
forced
it
take
action,
wrote
administrative
judge
Mara-Louise
Anzalone
."
Starbucks
[]
<class 'list'>
"It
was
not
."
[('first', 'ORDINAL'), ('Singapore', 'GPE'), ("13 months'", 'DATE'),

In [193]:
test_df = pd.read_csv('training_set_ner.csv', encoding='latin1')

test_df.head()

Unnamed: 0,text,annotation
0,Starbucks violated federal labor law when it i...,"[('Starbucks', 'ORG'), ('National Labor Relati..."
1,The decision is the latest in a series of NLRB...,"[('NLRB', 'ORG'), ('Starbucks', 'ORG')]"
2,"""The issue at the heart of this case is whethe...","[('Board', 'ORG'), ('Starbucks', 'ORG'), ('Mar..."
3,"""It was not.""",[]
4,The first suspect to plead guilty in Singapore...,"[('first', 'ORDINAL'), ('Singapore', 'GPE'), (..."


In [211]:
## path to save the txt file.
filepath = 'test.txt'

## creating the file.
create_data(test_df, filepath)

[('Starbucks', 'ORG'), ('National Labor Relations Board', 'ORG'), ('Thursday', 'DATE')]
<class 'list'>
Starbucks
violated
federal
labor
law
when
it
increased
wages
and
offered
new
perks
benefits
only
to
non-union
employees,
a
National
Labor
Relations
Board
judge
found
Thursday
.
[('NLRB', 'ORG'), ('Starbucks', 'ORG')]
<class 'list'>
The
decision
is
the
latest
in
a
series
of
NLRB
rulings
finding
that
Starbucks
has
violated
labor
law
its
efforts
to
stop
unions
from
forming
coffee
shops
.
[('Board', 'ORG'), ('Starbucks', 'ORG'), ('Mara-Louise Anzalone', 'PERSON')]
<class 'list'>
"The
issue
at
the
heart
of
this
case
is
whether,
under
current
Board
law,
[Starbucks
]
was
entitled
to
explicitly
reward
employees,"
for
not
participating
in
union
activity,
"while
falsely
telling
its
workers
that
federal
labor
law
forced
it
take
action,
wrote
administrative
judge
Mara-Louise
Anzalone
."
Starbucks
[]
<class 'list'>
"It
was
not
."
[('first', 'ORDINAL'), ('Singapore', 'GPE'), ("13 months'", 'DATE'),

In [216]:
import random 

fin = open('original.txt', 'rb') 
train_fin = open("train.txt", 'wb') 
dev_fin = open("dev.txt", 'wb')

for line in fin: 
  r = random.random() 
  if (0.0 <=  r <= 0.9): 
    train_fin.write(line) 
  else:
    dev_fin.write(line)

fin.close() 
train_fin.close() 
dev_fin.close()

In [207]:
!pip install flair

^C


Collecting flair
  Using cached flair-0.15.0-py3-none-any.whl.metadata (12 kB)
Collecting boto3>=1.20.27 (from flair)
  Using cached boto3-1.36.7-py3-none-any.whl.metadata (6.6 kB)
Collecting conllu<5.0.0,>=4.0 (from flair)
  Using cached conllu-4.5.3-py2.py3-none-any.whl.metadata (19 kB)
Collecting deprecated>=1.2.13 (from flair)
  Using cached Deprecated-1.2.18-py2.py3-none-any.whl.metadata (5.7 kB)
Collecting ftfy>=6.1.0 (from flair)
  Using cached ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting gdown>=4.4.0 (from flair)
  Using cached gdown-5.2.0-py3-none-any.whl.metadata (5.8 kB)
Collecting huggingface-hub>=0.10.0 (from flair)
  Using cached huggingface_hub-0.27.1-py3-none-any.whl.metadata (13 kB)
Collecting langdetect>=1.0.9 (from flair)
  Using cached langdetect-1.0.9-py3-none-any.whl
Collecting mpld3>=0.3 (from flair)
  Using cached mpld3-0.5.10-py3-none-any.whl.metadata (5.1 kB)
Collecting pytorch-revgrad>=0.2.0 (from flair)
  Using cached pytorch_revgrad-0.2.0-py3-no


[notice] A new release of pip is available: 24.3.1 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [228]:
import flair

from flair.data import Corpus
from flair.datasets import ColumnCorpus

# define columns
columns = {0 : 'text', 1 : 'ner'}
# directory where the data resides
data_folder = r"C:\wamp64\www\datathon-2025\flair_training"
# initializing the corpus

corpus: Corpus = ColumnCorpus(data_folder, columns,
                              train_file = 'train.txt',
                              test_file = 'test.txt',
                              dev_file = 'dev.txt',
                              encoding="latin1")

2025-01-28 22:19:06,847 Reading data from C:\wamp64\www\datathon-2025\flair_training
2025-01-28 22:19:06,848 Train: C:\wamp64\www\datathon-2025\flair_training\train.txt
2025-01-28 22:19:06,849 Dev: C:\wamp64\www\datathon-2025\flair_training\dev.txt
2025-01-28 22:19:06,850 Test: C:\wamp64\www\datathon-2025\flair_training\test.txt


In [235]:
print(len(corpus.train))
print(corpus.train[0].to_tagged_string('ner'))

6536
Sentence[25]: "Starbucks violated federal labor law when it increased wages and offered new perks benefits only to employees, a National Labor Relations Board found Thursday ." → ["Starbucks"/ORG, "National Labor Relations Board"/ORG, "Thursday"/DATE]


In [236]:
# tag to predict
tag_type = 'ner'
# make tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

  tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)


In [238]:
from flair.models import SequenceTagger
tagger = SequenceTagger.load("flair/ner-english-ontonotes-large")
print(tagger)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


2025-01-28 22:26:12,277 SequenceTagger predicts: Dictionary with 76 tags: <unk>, O, B-CARDINAL, E-CARDINAL, S-PERSON, S-CARDINAL, S-PRODUCT, B-PRODUCT, I-PRODUCT, E-PRODUCT, B-WORK_OF_ART, I-WORK_OF_ART, E-WORK_OF_ART, B-PERSON, E-PERSON, S-GPE, B-DATE, I-DATE, E-DATE, S-ORDINAL, S-LANGUAGE, I-PERSON, S-EVENT, S-DATE, B-QUANTITY, E-QUANTITY, S-TIME, B-TIME, I-TIME, E-TIME, B-GPE, E-GPE, S-ORG, I-GPE, S-NORP, B-FAC, I-FAC, E-FAC, B-NORP, E-NORP, S-PERCENT, B-ORG, E-ORG, B-LANGUAGE, E-LANGUAGE, I-CARDINAL, I-ORG, S-WORK_OF_ART, I-QUANTITY, B-MONEY
SequenceTagger(
  (embeddings): TransformerWordEmbeddings(
    (model): XLMRobertaModel(
      (embeddings): XLMRobertaEmbeddings(
        (word_embeddings): Embedding(250002, 1024, padding_idx=1)
        (position_embeddings): Embedding(514, 1024, padding_idx=1)
        (token_type_embeddings): Embedding(1, 1024)
        (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )

In [None]:
from flair.trainers import ModelTrainer
trainer : ModelTrainer = ModelTrainer(tagger, corpus)
trainer.train('resources/taggers/example-ner',
              learning_rate=0.1,
              mini_batch_size=32,
              max_epochs=150)