In [1]:
def load_sentences(filepath):

    final = []
    sentences = []

    with open(filepath, 'r') as f:
        
        for line in f.readlines():
            
            if (line == ('-DOCSTART- -X- -X- O\n') or line == '\n'):
                if len(sentences) > 0:
                    final.append(sentences)
                    sentences = []
            else:
                l = line.split(' ')
                sentences.append((l[0], l[3].strip('\n')))
    
    return final

In [2]:
base_path = '/kaggle/input/conll003-englishversion/'

train_samples = load_sentences(base_path + 'train.txt')
test_samples = load_sentences(base_path + 'test.txt')
valid_samples = load_sentences(base_path + 'valid.txt')

In [3]:
train_samples[:5]

[[('EU', 'B-ORG'),
  ('rejects', 'O'),
  ('German', 'B-MISC'),
  ('call', 'O'),
  ('to', 'O'),
  ('boycott', 'O'),
  ('British', 'B-MISC'),
  ('lamb', 'O'),
  ('.', 'O')],
 [('Peter', 'B-PER'), ('Blackburn', 'I-PER')],
 [('BRUSSELS', 'B-LOC'), ('1996-08-22', 'O')],
 [('The', 'O'),
  ('European', 'B-ORG'),
  ('Commission', 'I-ORG'),
  ('said', 'O'),
  ('on', 'O'),
  ('Thursday', 'O'),
  ('it', 'O'),
  ('disagreed', 'O'),
  ('with', 'O'),
  ('German', 'B-MISC'),
  ('advice', 'O'),
  ('to', 'O'),
  ('consumers', 'O'),
  ('to', 'O'),
  ('shun', 'O'),
  ('British', 'B-MISC'),
  ('lamb', 'O'),
  ('until', 'O'),
  ('scientists', 'O'),
  ('determine', 'O'),
  ('whether', 'O'),
  ('mad', 'O'),
  ('cow', 'O'),
  ('disease', 'O'),
  ('can', 'O'),
  ('be', 'O'),
  ('transmitted', 'O'),
  ('to', 'O'),
  ('sheep', 'O'),
  ('.', 'O')],
 [('Germany', 'B-LOC'),
  ("'s", 'O'),
  ('representative', 'O'),
  ('to', 'O'),
  ('the', 'O'),
  ('European', 'B-ORG'),
  ('Union', 'I-ORG'),
  ("'s", 'O'),
  ('vete

In [4]:
import nltk


In [5]:
def preprocess_sentence(sentence):
    tokens = []
    labels = []
    
    for token, ner in sentence:
        # Tokenize each word
        tokenized_words = nltk.word_tokenize(token)
        
        # Assign BIO tags to each word
        if len(tokenized_words) == 1:
            # Single-word entity
            tokens.append(tokenized_words[0])
            labels.append("B-" + ner)
        else:
            # Multi-word entity
            tokens.extend(tokenized_words)
            labels.append("B-" + ner)
            labels.extend(["I-" + ner] * (len(tokenized_words) - 1))
    
    return list(zip(tokens, labels))

In [6]:
preprocessed_sentences = [preprocess_sentence(sentence) for sentence in train_samples]

for i, sample in enumerate(preprocessed_sentences):
    if i == 5:
        break
    for token, label in sample:
        print(token, label)
    print()

EU B-B-ORG
rejects B-O
German B-B-MISC
call B-O
to B-O
boycott B-O
British B-B-MISC
lamb B-O
. B-O

Peter B-B-PER
Blackburn B-I-PER

BRUSSELS B-B-LOC
1996-08-22 B-O

The B-O
European B-B-ORG
Commission B-I-ORG
said B-O
on B-O
Thursday B-O
it B-O
disagreed B-O
with B-O
German B-B-MISC
advice B-O
to B-O
consumers B-O
to B-O
shun B-O
British B-B-MISC
lamb B-O
until B-O
scientists B-O
determine B-O
whether B-O
mad B-O
cow B-O
disease B-O
can B-O
be B-O
transmitted B-O
to B-O
sheep B-O
. B-O

Germany B-B-LOC
's B-O
representative B-O
to B-O
the B-O
European B-B-ORG
Union B-I-ORG
's B-O
veterinary B-O
committee B-O
Werner B-B-PER
Zwingmann B-I-PER
said B-O
on B-O
Wednesday B-O
consumers B-O
should B-O
buy B-O
sheepmeat B-O
from B-O
countries B-O
other B-O
than B-O
Britain B-B-LOC
until B-O
the B-O
scientific B-O
advice B-O
was B-O
clearer B-O
. B-O



In [7]:
#another  way to read
def read_conll_file(file_path):
    sentences = []
    current_sentence = []
    
    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()
            
            if line == '':
                if current_sentence:
                    sentences.append(current_sentence)
                    current_sentence = []
            else:
                token, pos, chunk, ner = line.split(' ')
                current_sentence.append((token, ner))
    
    return sentences

# Example usage
file_path = '/kaggle/input/conll003-englishversion/train.txt' 
sentences = read_conll_file(file_path)

# Access the sentences and annotations
for sentence in sentences:
    for token, ner in sentence:
        # Access token and NER tag
        print(token, ner)
    print()

-DOCSTART- O

EU B-ORG
rejects O
German B-MISC
call O
to O
boycott O
British B-MISC
lamb O
. O

Peter B-PER
Blackburn I-PER

BRUSSELS B-LOC
1996-08-22 O

The O
European B-ORG
Commission I-ORG
said O
on O
Thursday O
it O
disagreed O
with O
German B-MISC
advice O
to O
consumers O
to O
shun O
British B-MISC
lamb O
until O
scientists O
determine O
whether O
mad O
cow O
disease O
can O
be O
transmitted O
to O
sheep O
. O

Germany B-LOC
's O
representative O
to O
the O
European B-ORG
Union I-ORG
's O
veterinary O
committee O
Werner B-PER
Zwingmann I-PER
said O
on O
Wednesday O
consumers O
should O
buy O
sheepmeat O
from O
countries O
other O
than O
Britain B-LOC
until O
the O
scientific O
advice O
was O
clearer O
. O

" O
We O
do O
n't O
support O
any O
such O
recommendation O
because O
we O
do O
n't O
see O
any O
grounds O
for O
it O
, O
" O
the O
Commission B-ORG
's O
chief O
spokesman O
Nikolaus B-PER
van I-PER
der I-PER
Pas I-PER
told O
a O
news O
briefing O
. O

He O
said O
further O
sc