In [60]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertConfig, BertForTokenClassification, BertTokenizer

In [3]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/siruspalsson1/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [4]:
corpus = "The Bituminous coal strike of 1977-1978 was a 110-day national coal strike in the United States led by the United Mine Workers of America. It began December 6, 1977, and ended on March 19, 1978. It is generally considered a successful union strike, although the contract was not beneficial to union members. Since the 1940s, the United Mine Workers of America (UMWA) had negotiated a nationwide National Coal Wage Agreement with the Bituminous Coal Operators Association (BCOA), a group of large coal mine operators. The three-year agreements covered national bargaining issues such as wages, health and pension benefits, workplace health and safety, and work rules. Local agreements, far more limited in scope, were negotiated by each individual local affiliate of UMWA."

sentences = sent_tokenize(corpus)

In [5]:
for sent in sentences: print(sent)

The Bituminous coal strike of 1977-1978 was a 110-day national coal strike in the United States led by the United Mine Workers of America.
It began December 6, 1977, and ended on March 19, 1978.
It is generally considered a successful union strike, although the contract was not beneficial to union members.
Since the 1940s, the United Mine Workers of America (UMWA) had negotiated a nationwide National Coal Wage Agreement with the Bituminous Coal Operators Association (BCOA), a group of large coal mine operators.
The three-year agreements covered national bargaining issues such as wages, health and pension benefits, workplace health and safety, and work rules.
Local agreements, far more limited in scope, were negotiated by each individual local affiliate of UMWA.


In [6]:
type(sentences)

list

In [7]:
tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]

In [8]:
print(tokenized_sentences)

[['The', 'Bituminous', 'coal', 'strike', 'of', '1977-1978', 'was', 'a', '110-day', 'national', 'coal', 'strike', 'in', 'the', 'United', 'States', 'led', 'by', 'the', 'United', 'Mine', 'Workers', 'of', 'America', '.'], ['It', 'began', 'December', '6', ',', '1977', ',', 'and', 'ended', 'on', 'March', '19', ',', '1978', '.'], ['It', 'is', 'generally', 'considered', 'a', 'successful', 'union', 'strike', ',', 'although', 'the', 'contract', 'was', 'not', 'beneficial', 'to', 'union', 'members', '.'], ['Since', 'the', '1940s', ',', 'the', 'United', 'Mine', 'Workers', 'of', 'America', '(', 'UMWA', ')', 'had', 'negotiated', 'a', 'nationwide', 'National', 'Coal', 'Wage', 'Agreement', 'with', 'the', 'Bituminous', 'Coal', 'Operators', 'Association', '(', 'BCOA', ')', ',', 'a', 'group', 'of', 'large', 'coal', 'mine', 'operators', '.'], ['The', 'three-year', 'agreements', 'covered', 'national', 'bargaining', 'issues', 'such', 'as', 'wages', ',', 'health', 'and', 'pension', 'benefits', ',', 'workplace

In [9]:
for token_sent in tokenized_sentences: print(token_sent)

['The', 'Bituminous', 'coal', 'strike', 'of', '1977-1978', 'was', 'a', '110-day', 'national', 'coal', 'strike', 'in', 'the', 'United', 'States', 'led', 'by', 'the', 'United', 'Mine', 'Workers', 'of', 'America', '.']
['It', 'began', 'December', '6', ',', '1977', ',', 'and', 'ended', 'on', 'March', '19', ',', '1978', '.']
['It', 'is', 'generally', 'considered', 'a', 'successful', 'union', 'strike', ',', 'although', 'the', 'contract', 'was', 'not', 'beneficial', 'to', 'union', 'members', '.']
['Since', 'the', '1940s', ',', 'the', 'United', 'Mine', 'Workers', 'of', 'America', '(', 'UMWA', ')', 'had', 'negotiated', 'a', 'nationwide', 'National', 'Coal', 'Wage', 'Agreement', 'with', 'the', 'Bituminous', 'Coal', 'Operators', 'Association', '(', 'BCOA', ')', ',', 'a', 'group', 'of', 'large', 'coal', 'mine', 'operators', '.']
['The', 'three-year', 'agreements', 'covered', 'national', 'bargaining', 'issues', 'such', 'as', 'wages', ',', 'health', 'and', 'pension', 'benefits', ',', 'workplace', 'h

In [30]:
tokenized_data = []

for index, sent in enumerate(tokenized_sentences):
    for token in sent:
        tokenized_data.append({
                        'Sentence #': f"Sentence: {index+1}",
                        'Word': token,
                        'Tag': 'O'
                    })


In [31]:
print(tokenized_data)

[{'Sentence #': 'Sentence: 1', 'Word': 'The', 'Tag': 'O'}, {'Sentence #': 'Sentence: 1', 'Word': 'Bituminous', 'Tag': 'O'}, {'Sentence #': 'Sentence: 1', 'Word': 'coal', 'Tag': 'O'}, {'Sentence #': 'Sentence: 1', 'Word': 'strike', 'Tag': 'O'}, {'Sentence #': 'Sentence: 1', 'Word': 'of', 'Tag': 'O'}, {'Sentence #': 'Sentence: 1', 'Word': '1977-1978', 'Tag': 'O'}, {'Sentence #': 'Sentence: 1', 'Word': 'was', 'Tag': 'O'}, {'Sentence #': 'Sentence: 1', 'Word': 'a', 'Tag': 'O'}, {'Sentence #': 'Sentence: 1', 'Word': '110-day', 'Tag': 'O'}, {'Sentence #': 'Sentence: 1', 'Word': 'national', 'Tag': 'O'}, {'Sentence #': 'Sentence: 1', 'Word': 'coal', 'Tag': 'O'}, {'Sentence #': 'Sentence: 1', 'Word': 'strike', 'Tag': 'O'}, {'Sentence #': 'Sentence: 1', 'Word': 'in', 'Tag': 'O'}, {'Sentence #': 'Sentence: 1', 'Word': 'the', 'Tag': 'O'}, {'Sentence #': 'Sentence: 1', 'Word': 'United', 'Tag': 'O'}, {'Sentence #': 'Sentence: 1', 'Word': 'States', 'Tag': 'O'}, {'Sentence #': 'Sentence: 1', 'Word': '

In [32]:
filename = "tokenized_event_data"
df = pd.DataFrame(tokenized_data)
df.to_csv(f"{filename}.csv", index=False)
print(f"Article data saved to {filename}.csv")

Article data saved to tokenized_event_data.csv


In [33]:
datafilename = 'tagged_event_data.csv'
data = pd.read_csv(datafilename, encoding='unicode_escape')
data['Tag'] = data['Tag'].apply(str.upper)
data.head(20)

Unnamed: 0,Sentence #,Word,Tag
0,Sentence: 1,The,B
1,Sentence: 1,Bituminous,I
2,Sentence: 1,coal,I
3,Sentence: 1,strike,I
4,Sentence: 1,of,I
5,Sentence: 1,1977-1978,I
6,Sentence: 1,was,O
7,Sentence: 1,a,O
8,Sentence: 1,110-day,O
9,Sentence: 1,national,O


In [34]:
data.count()

Sentence #    142
Word          142
Tag           142
dtype: int64

In [35]:
print("Number of tags: {}".format(len(data.Tag.unique())))
frequencies = data.Tag.value_counts()
frequencies

Number of tags: 3


Tag
O    132
I      8
B      2
Name: count, dtype: int64

In [38]:
labels_to_ids = {k: v for v, k in enumerate(data.Tag.unique())}
ids_to_labels = {v: k for v, k in enumerate(data.Tag.unique())}
labels_to_ids

{'B': 0, 'I': 1, 'O': 2}

In [39]:
ids_to_labels

{0: 'B', 1: 'I', 2: 'O'}

In [42]:
# fill any missing tags
data = data.fillna(method='ffill')
data.head(10)

  data = data.fillna(method='ffill')


Unnamed: 0,Sentence #,Word,Tag
0,Sentence: 1,The,B
1,Sentence: 1,Bituminous,I
2,Sentence: 1,coal,I
3,Sentence: 1,strike,I
4,Sentence: 1,of,I
5,Sentence: 1,1977-1978,I
6,Sentence: 1,was,O
7,Sentence: 1,a,O
8,Sentence: 1,110-day,O
9,Sentence: 1,national,O


In [43]:
# create a new column called "sentence", grouping words by sentence 
data['sentence'] = data[['Sentence #','Word','Tag']].groupby(['Sentence #'])['Word'].transform(lambda x: ' '.join(x))
# create a new column called "word_labels", grouping tags by sentence 
data['word_labels'] = data[['Sentence #','Word','Tag']].groupby(['Sentence #'])['Tag'].transform(lambda x: ','.join(x))
data.head()

Unnamed: 0,Sentence #,Word,Tag,sentence,word_labels
0,Sentence: 1,The,B,The Bituminous coal strike of 1977-1978 was a ...,"B,I,I,I,I,I,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"
1,Sentence: 1,Bituminous,I,The Bituminous coal strike of 1977-1978 was a ...,"B,I,I,I,I,I,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"
2,Sentence: 1,coal,I,The Bituminous coal strike of 1977-1978 was a ...,"B,I,I,I,I,I,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"
3,Sentence: 1,strike,I,The Bituminous coal strike of 1977-1978 was a ...,"B,I,I,I,I,I,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"
4,Sentence: 1,of,I,The Bituminous coal strike of 1977-1978 was a ...,"B,I,I,I,I,I,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"


In [44]:
# Only keep "sentence" and "word_labels" columns, and drop duplicates
data = data[["sentence", "word_labels"]].drop_duplicates().reset_index(drop=True)
data.head()

Unnamed: 0,sentence,word_labels
0,The Bituminous coal strike of 1977-1978 was a ...,"B,I,I,I,I,I,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"
1,"It began December 6 , 1977 , and ended on Marc...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"
2,It is generally considered a successful union ...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"
3,"Since the 1940s , the United Mine Workers of A...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,B,I,I,I,O,O,..."
4,The three-year agreements covered national bar...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"


In [45]:
len(data)

6

In [46]:
data.iloc[3].sentence

'Since the 1940s , the United Mine Workers of America ( UMWA ) had negotiated a nationwide National Coal Wage Agreement with the Bituminous Coal Operators Association ( BCOA ) , a group of large coal mine operators .'

In [47]:
data.iloc[3].word_labels

'O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,B,I,I,I,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O'

In [72]:
data.sentence[0]

'The Bituminous coal strike of 1977-1978 was a 110-day national coal strike in the United States led by the United Mine Workers of America .'

In [73]:
data.word_labels[0].split(",") 

['B',
 'I',
 'I',
 'I',
 'I',
 'I',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [74]:
data.word_labels[0]

'B,I,I,I,I,I,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O'

In [61]:
# CHANGE TOKENIZER TO bert-base-cased

# Model HyperParameters and Tokenizer

MAX_LEN = 128
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 1
LEARNING_RATE = 1e-05
MAX_GRAD_NORM = 10
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [65]:
# PyTorch Dataset Class
#     pandas dataframe to pytorch tensors
#     each sentence tokenized
#     special BERT tokens added
#     tokens padded or truncated based on MAX_LENGTH
#     attention mask created
#     labels created from word_labels column

class dataset(Dataset):
  def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

  def __getitem__(self, index):
        # step 1: get the sentence and word labels 
        sentence = self.data.sentence[index].strip().split()  
        word_labels = self.data.word_labels[index].split(",") 

        # step 2: use tokenizer to encode sentence (includes padding/truncation up to max length)
        # BertTokenizerFast provides a handy "return_offsets_mapping" functionality for individual tokens
        encoding = self.tokenizer(sentence,
                             is_pretokenized=True, 
                             return_offsets_mapping=True, 
                             padding='max_length', 
                             truncation=True, 
                             max_length=self.max_len)
        
        # step 3: create token labels only for first word pieces of each tokenized word
        labels = [labels_to_ids[label] for label in word_labels] 
        # code based on https://huggingface.co/transformers/custom_datasets.html#tok-ner
        # create an empty array of -100 of length max_length
        encoded_labels = np.ones(len(encoding["offset_mapping"]), dtype=int) * -100
        
        # set only labels whose first offset position is 0 and the second is not 0
        i = 0
        for idx, mapping in enumerate(encoding["offset_mapping"]):
          if mapping[0] == 0 and mapping[1] != 0:
            # overwrite label
            encoded_labels[idx] = labels[i]
            i += 1

        # step 4: turn everything into PyTorch tensors
        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        item['labels'] = torch.as_tensor(encoded_labels)
        
        return item

  def __len__(self):
        return self.len

In [66]:
# Train/test split

train_size = 0.75
train_dataset = data.sample(frac=train_size,random_state=200)
test_dataset = data.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(data.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = dataset(train_dataset, tokenizer, MAX_LEN)
testing_set = dataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (6, 2)
TRAIN Dataset: (4, 2)
TEST Dataset: (2, 2)


In [67]:
for item in training_set:
    print(item)

Keyword arguments {'is_pretokenized': True} not recognized.
Keyword arguments {'is_pretokenized': True} not recognized.
Keyword arguments {'is_pretokenized': True} not recognized.
Keyword arguments {'is_pretokenized': True} not recognized.
Keyword arguments {'is_pretokenized': True} not recognized.
Keyword arguments {'is_pretokenized': True} not recognized.
Keyword arguments {'is_pretokenized': True} not recognized.
Keyword arguments {'is_pretokenized': True} not recognized.
Keyword arguments {'is_pretokenized': True} not recognized.
Keyword arguments {'is_pretokenized': True} not recognized.
Keyword arguments {'is_pretokenized': True} not recognized.
Keyword arguments {'is_pretokenized': True} not recognized.
Keyword arguments {'is_pretokenized': True} not recognized.
Keyword arguments {'is_pretokenized': True} not recognized.
Keyword arguments {'is_pretokenized': True} not recognized.
Keyword arguments {'is_pretokenized': True} not recognized.
Keyword arguments {'is_pretokenized': Tr

KeyError: 'offset_mapping'

In [68]:
tokenizers.__version__

NameError: name 'tokenizers' is not defined