In [1]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import itertools
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from datasets import Dataset
from datasets import load_metric
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import torch

  from .autonotebook import tqdm as notebook_tqdm


ImportError: cannot import name 'load_metric' from 'datasets' (/opt/miniconda3/lib/python3.12/site-packages/datasets/__init__.py)

In [3]:
# Download the NLTK 'punkt_tab' resource (presumably the tokenizer data that
# sent_tokenize / word_tokenize need -- confirm against the installed NLTK version).
# The call's return value is the cell output.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/siruspalsson1/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [23]:
# Raw article text that serves as the corpus to be tagged.
corpus = "The Bituminous coal strike of 1977-1978 was a 110-day national coal strike in the United States led by the United Mine Workers of America. It began December 6, 1977, and ended on March 19, 1978. It is generally considered a successful union strike, although the contract was not beneficial to union members. Since the 1940s, the United Mine Workers of America (UMWA) had negotiated a nationwide National Coal Wage Agreement with the Bituminous Coal Operators Association (BCOA), a group of large coal mine operators. The three-year agreements covered national bargaining issues such as wages, health and pension benefits, workplace health and safety, and work rules. Local agreements, far more limited in scope, were negotiated by each individual local affiliate of UMWA."

# First cut the corpus into sentences, then cut every sentence into word tokens
# (one list of tokens per sentence).
sentences = sent_tokenize(corpus)
tokenized_sentences = list(map(word_tokenize, sentences))

In [25]:
# Base name (without extension) of the CSV that will hold the untagged tokens.
filename = "tokenized_event_data"

# One record per token: the 1-based number of the sentence it came from, the
# token text, and the placeholder tag 'O' to be replaced during manual tagging.
tokenized_data = [
    {'sentence': sentence_number, 'token': word, 'tag': 'O'}
    for sentence_number, words in enumerate(tokenized_sentences, start=1)
    for word in words
]

# Persist the records as CSV (index column omitted) and report where they went.
df = pd.DataFrame(tokenized_data)
df.to_csv(f"{filename}.csv", index=False)
print(f"Article data saved to {filename}.csv")

Article data saved to tokenized_event_data.csv


In [76]:
# Load the tagged token table (columns used below: 'sentence', 'token', 'tag').
# NOTE(review): 'unicode_escape' decodes the bytes as Latin-1 and interprets
# backslash escape sequences, so non-ASCII tokens or literal backslashes in the
# file would be altered on read -- confirm the file's real encoding (e.g. utf-8).
datafilename = 'tagged_event_data.csv'
data = pd.read_csv(datafilename, encoding='unicode_escape')
# Upper-case the tags with the vectorised .str accessor. Unlike
# apply(str.upper), it leaves a missing tag as NaN instead of raising
# TypeError on the float NaN, so rows with blank tags survive loading.
data['tag'] = data['tag'].str.upper()
data.head()

Unnamed: 0,sentence,token,tag
0,1,The,B
1,1,Bituminous,I
2,1,coal,I
3,1,strike,I
4,1,of,I


In [77]:
# Quick summary of the tagged table: non-null counts per column, how many
# distinct tag values there are (a missing value counts as one value), and the
# frequency of each tag.
num_tags = data['tag'].nunique(dropna=False)
print(data.count(), "\n")
print(f"Number of tags: {num_tags} \n")
print(data['tag'].value_counts())

sentence    142
token       142
tag         142
dtype: int64 

Number of tags: 3 

tag
O    132
I      8
B      2
Name: count, dtype: int64


In [78]:
# Assign each distinct tag an integer id in order of first appearance, then
# build the reverse lookup by inverting that mapping.
labels_to_ids = {label: idx for idx, label in enumerate(data['tag'].unique())}
ids_to_labels = {idx: label for label, idx in labels_to_ids.items()}
print(labels_to_ids)
print(ids_to_labels)

{'B': 0, 'I': 1, 'O': 2}
{0: 'B', 1: 'I', 2: 'O'}


In [80]:
# Forward-fill gaps: a missing value takes the value from the row above.
# Note this applies to every column of the frame, not only 'tag'.
data = data.ffill()
# Collapse the per-token rows into exactly one row per sentence id:
#   "sequence"    -> the sentence's tokens joined by single spaces
#   "word_labels" -> the matching tags joined by single spaces
# Aggregating per group (rather than broadcasting the joined strings to every
# token row and de-duplicating afterwards) keeps two different sentences that
# happen to have identical text and tags as separate rows. sort=False keeps the
# sentences in order of first appearance.
data = (
    data.groupby('sentence', sort=False)
        .agg(sequence=('token', ' '.join), word_labels=('tag', ' '.join))
        .reset_index(drop=True)
)
data.head()

Unnamed: 0,sequence,word_labels
0,The Bituminous coal strike of 1977-1978 was a ...,B I I I I I O O O O O O O O O O O O O O O O O O O
1,"It began December 6 , 1977 , and ended on Marc...",O O O O O O O O O O O O O O O
2,It is generally considered a successful union ...,O O O O O O O O O O O O O O O O O O O
3,"Since the 1940s , the United Mine Workers of A...",O O O O O O O O O O O O O O O O O B I I I O O ...
4,The three-year agreements covered national bar...,O O O O O O O O O O O O O O O O O O O O O O O O O


In [81]:
# Turn the space-joined "sequence" and "word_labels" strings back into lists.
# Splitting on the single-space separator is the exact inverse of the ' '.join
# that built these strings. Re-running word_tokenize on already-tokenized text
# (or on a string of labels) is not guaranteed to reproduce the same boundaries,
# which would leave tokens and tags misaligned.
data['tokens'] = data['sequence'].str.split(' ')
data['ner_tags'] = data['word_labels'].str.split(' ')
# Every token must carry exactly one tag; fail loudly if a row is misaligned.
assert (data['tokens'].str.len() == data['ner_tags'].str.len()).all(), \
    "tokens and ner_tags differ in length for at least one sentence"
# Keep only the tokens and the ner_tags
data = data[["tokens", "ner_tags"]]
data.head()

Unnamed: 0,tokens,ner_tags
0,"[The, Bituminous, coal, strike, of, 1977-1978,...","[B, I, I, I, I, I, O, O, O, O, O, O, O, O, O, ..."
1,"[It, began, December, 6, ,, 1977, ,, and, ende...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
2,"[It, is, generally, considered, a, successful,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,"[Since, the, 1940s, ,, the, United, Mine, Work...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,"[The, three-year, agreements, covered, nationa...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
