https://huggingface.co/docs/transformers/main_classes/pipelines

In [56]:
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
import csv

In [57]:
model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
ner_model = pipeline("ner", model=model, tokenizer=tokenizer)

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [58]:
def open_file(set: str):
    with open(f'../en-ner-conll-2003/{set}', 'r', encoding='utf-8') as file:
        reader = csv.reader(file, delimiter='\t')
        text_data = [row for row in reader]
    return text_data

In [59]:
def prepare_tokens_lists(set: str):
    processed_texts = []

    for line in open_file(set):
        if '</S>' in line:
            tokens = [token for token in line if token != '</S>']
            if tokens:
                processed_texts.append(tokens)
        else:
            processed_texts.append(line)

    return processed_texts

In [60]:
def tokens_to_labels(tokens, preds):
    labels = ['O'] * len(tokens)
    current_entity = None
    
    for pred in preds:
        entity_type = pred['entity'][2:]
        start, end = pred['start'], pred['end']
        
        for i, token in enumerate(tokens):
            if start <= i < end:
                if i == start or current_entity != entity_type:
                    labels[i] = 'B-' + entity_type
                    current_entity = entity_type
                else:
                    labels[i] = 'I-' + entity_type
                    
    return labels

In [61]:
def save_preds(file_path, labels):
    with open(f'../en-ner-conll-2003/{file_path}', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f, delimiter='\t')
        for label_sequence in labels:
            writer.writerow([' '.join(label_sequence)])

In [62]:
input_sentences = prepare_tokens_lists('dev-0/in.tsv')
print(input_sentences[:1])

all_preds = [ner_model(' '.join(sentence)) for sentence in input_sentences]

tokenized_sentences = [tokenizer.tokenize(' '.join(sentence)) for sentence in input_sentences]
all_labels = [tokens_to_labels(tokens, preds) for tokens, preds in zip(tokenized_sentences, all_preds)]

save_preds('dev-0/out.tsv', all_labels)

'''
./geval -t dev-0
0.00934


./geval -t dev-0 --metric GLEU --metric WER --metric Accuracy
BIO-F1  0.00934
GLEU    0.54233
WER     0.60932
Accuracy        0.00000
'''

[["CRICKET - LEICESTERSHIRE TAKE OVER AT TOP AFTER INNINGS VICTORY . </S> LONDON 1996-08-30 </S> West Indian all-rounder Phil Simmons took four for 38 on Friday as Leicestershire beat Somerset by an innings and 39 runs in two days to take over at the head of the county championship . </S> Their stay on top , though , may be short-lived as title rivals Essex , Derbyshire and Surrey all closed in on victory while Kent made up for lost time in their rain-affected match against Nottinghamshire . </S> After bowling Somerset out for 83 on the opening morning at Grace Road , Leicestershire extended their first innings by 94 runs before being bowled out for 296 with England discard Andy Caddick taking three for 83 . </S> Trailing by 213 , Somerset got a solid start to their second innings before Simmons stepped in to bundle them out for 174 . </S> Essex , however , look certain to regain their top spot after Nasser Hussain and Peter Such gave them a firm grip on their match against Yorkshire a

Token indices sequence length is longer than the specified maximum sequence length for this model (575 > 512). Running this sequence through the model will result in indexing errors


In [63]:
input_sentences = prepare_tokens_lists('test-A/in.tsv')

all_preds = [ner_model(' '.join(sentence)) for sentence in input_sentences]

tokenized_sentences = [tokenizer.tokenize(' '.join(sentence)) for sentence in input_sentences]
all_labels = [tokens_to_labels(tokens, preds) for tokens, preds in zip(tokenized_sentences, all_preds)]

save_preds('test-A/out.tsv', all_labels)