In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [3]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [10]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import itertools
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from datasets import Dataset
import evaluate
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import torch

In [12]:
# Fetch NLTK's 'punkt' tokenizer data into the local nltk_data directory.
# The call returns True once the package is available.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [13]:
# Raw passage used as the source text for the tagging exercise.
corpus = "The Bituminous coal strike of 1977-1978 was a 110-day national coal strike in the United States led by the United Mine Workers of America. It began December 6, 1977, and ended on March 19, 1978. It is generally considered a successful union strike, although the contract was not beneficial to union members. Since the 1940s, the United Mine Workers of America (UMWA) had negotiated a nationwide National Coal Wage Agreement with the Bituminous Coal Operators Association (BCOA), a group of large coal mine operators. The three-year agreements covered national bargaining issues such as wages, health and pension benefits, workplace health and safety, and work rules. Local agreements, far more limited in scope, were negotiated by each individual local affiliate of UMWA."

# First cut the passage into sentences, then each sentence into word tokens.
sentences = sent_tokenize(corpus)
tokenized_sentences = list(map(word_tokenize, sentences))

In [14]:
# One record per token: 1-based sentence number, the token text, and the
# placeholder tag 'O' that is meant to be overwritten during annotation.
filename = "tokenized_event_data"
tokenized_data = [
    {'sentence': sentence_number, 'token': token, 'tag': 'O'}
    for sentence_number, sentence_tokens in enumerate(tokenized_sentences, start=1)
    for token in sentence_tokens
]

# Persist the flat token table without the DataFrame index column.
df = pd.DataFrame(tokenized_data)
df.to_csv(f"{filename}.csv", index=False)
print(f"Article data saved to {filename}.csv")

Article data saved to tokenized_event_data.csv


In [20]:
# Load the tagged token table (presumably an annotated copy of the exported
# tokenized_event_data.csv -- TODO confirm) and normalise tag casing.
datafilename = 'tagged_event_data.csv'
data = pd.read_csv(datafilename, encoding='unicode_escape')
# Use the vectorised .str accessor instead of apply(str.upper): str.upper
# raises TypeError on a missing (NaN) tag, whereas .str.upper() passes NaN
# through untouched so missing tags can still be filled afterwards.
data['tag'] = data['tag'].str.upper()
data.head()

Unnamed: 0,sentence,token,tag
0,1,The,B
1,1,Bituminous,I
2,1,coal,I
3,1,strike,I
4,1,of,I


['B', 'I', 'O']

In [25]:
# Summarise the label set: the distinct tags, how many there are, and how
# often each one occurs, alongside the per-column non-null counts.
label_list = list(data.tag.unique())
num_tags = len(label_list)
print(data.count(), "\n")
print(f"Number of tags: {num_tags} \n")
print(f"Label List: {label_list}\n")
print(data.tag.value_counts())

sentence    142
token       142
tag         142
dtype: int64 

Number of tags: 3 

Label List: ['B', 'I', 'O']

tag
O    132
I      8
B      2
Name: count, dtype: int64


In [27]:
# Lookup tables between tag strings and integer ids. Ids are assigned by
# enumerating the distinct tags in the order data.tag.unique() yields them.
labels_to_ids = {tag: tag_id for tag_id, tag in enumerate(data.tag.unique())}
ids_to_labels = dict(enumerate(data.tag.unique()))
print(labels_to_ids)
print(ids_to_labels)
# Alias under the name the label-alignment function reads.
label_encoding_dict = labels_to_ids

{'B': 0, 'I': 1, 'O': 2}
{0: 'B', 1: 'I', 2: 'O'}


In [32]:
# Forward-fill gaps so every row has a value before grouping.
data = data.ffill()
# Group rows by sentence id once; transform() broadcasts each joined string
# back onto every row belonging to that sentence.
per_sentence = data[['sentence', 'token', 'tag']].groupby('sentence')
# "sequence": the sentence's tokens joined by single spaces
data['sequence'] = per_sentence['token'].transform(lambda tokens: ' '.join(tokens))
# "word_labels": the sentence's tags joined by single spaces
data['word_labels'] = per_sentence['tag'].transform(lambda tags: ' '.join(tags))
# Keep just the two joined columns and collapse the repeated rows
data = data[["sequence", "word_labels"]].drop_duplicates().reset_index(drop=True)
data.head()

Unnamed: 0,sequence,word_labels
0,The Bituminous coal strike of 1977-1978 was a ...,B I I I I I O O O O O O O O O O O O O O O O O O O
1,"It began December 6 , 1977 , and ended on Marc...",O O O O O O O O O O O O O O O
2,It is generally considered a successful union ...,O O O O O O O O O O O O O O O O O O O
3,"Since the 1940s , the United Mine Workers of A...",O O O O O O O O O O O O O O O O O B I I I O O ...
4,The three-year agreements covered national bar...,O O O O O O O O O O O O O O O O O O O O O O O O O


In [33]:
# Turn the space-joined sequences and tag strings back into lists.
# Both columns were built with ' '.join(...), so splitting on that same
# separator restores exactly one token per tag. Re-running word_tokenize on the
# joined text is not guaranteed to reproduce the original token boundaries,
# which would silently shift tags onto the wrong tokens.
data['tokens'] = data['sequence'].apply(lambda sequence: sequence.split(' '))
data['ner_tags'] = data['word_labels'].apply(lambda word_labels: word_labels.split(' '))
# Fail loudly if any sentence ends up with a different number of tokens and tags.
assert (data['tokens'].str.len() == data['ner_tags'].str.len()).all(), \
    "token/tag count mismatch in at least one sentence"
# Keep only the tokens and the ner_tags
data = data[["tokens", "ner_tags"]]
data.head()

Unnamed: 0,tokens,ner_tags
0,"[The, Bituminous, coal, strike, of, 1977-1978,...","[B, I, I, I, I, I, O, O, O, O, O, O, O, O, O, ..."
1,"[It, began, December, 6, ,, 1977, ,, and, ende...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
2,"[It, is, generally, considered, a, successful,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,"[Since, the, 1940s, ,, the, United, Mine, Work...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,"[The, three-year, agreements, covered, nationa...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [38]:
# Train/test split

train_size = 0.75
# Sample the training rows with a fixed seed, then use every remaining row as
# the test set. The drop relies on the original index labels, so both frames
# are re-indexed only after the split. All references use train_df / test_df:
# train_dataset / test_dataset do not exist until the end of this cell, so
# using them earlier raises NameError on a fresh kernel.
train_df = data.sample(frac=train_size, random_state=200)
test_df = data.drop(train_df.index).reset_index(drop=True)
train_df = train_df.reset_index(drop=True)

print("FULL Dataset: {}".format(data.shape))
print("TRAIN Dataset: {}".format(train_df.shape))
print("TEST Dataset: {}".format(test_df.shape))

# Wrap the pandas frames as Hugging Face Dataset objects for .map() later on.
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

FULL Dataset: (6, 2)
TRAIN Dataset: (4, 2)
TEST Dataset: (2, 2)


In [39]:
# Task name; the label-alignment function builds its label column name from it
# as f"{task}_tags".
task = "ner"
# Checkpoint identifier passed to AutoTokenizer.from_pretrained below.
model_checkpoint = "bert-base-cased"
# NOTE(review): not referenced in the cells visible here; presumably used by a
# later training setup -- confirm.
batch_size = 16

# Load the tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)



In [40]:
def tokenize_and_align_labels(examples, label_all_tokens=True):
    """Tokenize pre-split sentences and align word-level tags to sub-word tokens.

    Reads the module-level ``tokenizer``, ``task`` and ``label_encoding_dict``.

    Args:
        examples: Batch mapping with a ``"tokens"`` entry (one list of words
            per sentence) and a ``f"{task}_tags"`` entry (one list of tag
            strings per sentence, parallel to the words).
        label_all_tokens: When True (the default, matching the previously
            hard-coded setting) every sub-word piece receives its word's label
            id. When False only the first piece of each word is labelled and
            the remaining pieces get -100.

    Returns:
        The tokenizer output with an added ``"labels"`` entry: one list of
        label ids per sentence, with -100 at every position that has no source
        word (``word_ids`` reports ``None`` there).

    Raises:
        KeyError: If a tag is not present in ``label_encoding_dict``.
    """
    tokenized_inputs = tokenizer(list(examples["tokens"]), truncation=True, is_split_into_words=True)

    labels = []
    for batch_index, word_tags in enumerate(examples[f"{task}_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=batch_index)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # Position with no source word (e.g. special tokens).
                label_ids.append(-100)
            elif word_idx != previous_word_idx or label_all_tokens:
                # Every tag is looked up in the shared mapping. The former
                # special case compared against the digit '0' (never the
                # letter-O tag) and hard-coded id 0, which belongs to a
                # different tag in the mapping.
                label_ids.append(label_encoding_dict[word_tags[word_idx]])
            else:
                # Continuation piece of a word that was already labelled.
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


# Apply the tokenize-and-align step to both splits. batched=True hands the
# function a batch of examples at a time, which is what its per-batch
# word_ids(batch_index=...) lookups expect.
train_tokenized_datasets = train_dataset.map(tokenize_and_align_labels, batched=True)
test_tokenized_datasets = test_dataset.map(tokenize_and_align_labels, batched=True)


Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

In [47]:
# Inspect the tokenized training split (feature names and row count).
train_tokenized_datasets

Dataset({
    features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 4
})

In [48]:
# Inspect the tokenized test split (feature names and row count).
test_tokenized_datasets

Dataset({
    features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 2
})