In [60]:
!pip install datasets



In [61]:
!pip install evaluate



In [62]:
!pip install seqeval



In [63]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import itertools
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from datasets import Dataset
import evaluate
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import torch

In [64]:
# Fetch the NLTK 'punkt' package before sent_tokenize / word_tokenize are called below.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead — confirm against the installed version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [67]:
# corpus = "The Bituminous coal strike of 1977-1978 was a 110-day national coal strike in the United States led by the United Mine Workers of America. It began December 6, 1977, and ended on March 19, 1978. It is generally considered a successful union strike, although the contract was not beneficial to union members. Since the 1940s, the United Mine Workers of America (UMWA) had negotiated a nationwide National Coal Wage Agreement with the Bituminous Coal Operators Association (BCOA), a group of large coal mine operators. The three-year agreements covered national bargaining issues such as wages, health and pension benefits, workplace health and safety, and work rules. Local agreements, far more limited in scope, were negotiated by each individual local affiliate of UMWA."

# Read the raw corpus with an explicit encoding so decoding does not depend on
# the platform's default locale.
with open('corpus.txt', 'r', encoding='utf-8') as file:
    corpus_raw = file.read()

# Drop leading/trailing whitespace so it cannot produce empty sentences.
corpus = corpus_raw.strip()

# Split into sentences first, then into word tokens per sentence.
sentences = sent_tokenize(corpus)
tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]

In [68]:
filename = "tokenized_event_data"

# One record per token: 1-based sentence number, the token text, and a default
# 'O' tag that is meant to be overwritten during manual tagging.
tokenized_data = [
    {'sentence': sentence_number, 'token': token, 'tag': 'O'}
    for sentence_number, sentence_tokens in enumerate(tokenized_sentences, start=1)
    for token in sentence_tokens
]

# Write the untagged table to disk (without the DataFrame index).
df = pd.DataFrame(tokenized_data)
df.to_csv(f"{filename}.csv", index=False)
print(f"Article data saved to {filename}.csv")

Article data saved to tokenized_event_data.csv


In [139]:
# Load the manually tagged version of the token table exported above
# (one row per token with columns sentence / token / tag).

datafilename = 'tagged_corpus.csv'
# NOTE(review): 'unicode_escape' also interprets backslash sequences found in
# the file — confirm this encoding is really what the CSV needs.
data = pd.read_csv(datafilename, encoding='unicode_escape')
# Fill blank tags from the previous row *before* normalising case: a missing
# tag is read as NaN, on which str.upper would raise, and an unfilled NaN would
# otherwise be counted as an extra label when the tag vocabulary is derived.
data['tag'] = data['tag'].ffill().str.upper()
data.head()

Unnamed: 0,sentence,token,tag
0,1,The,B
1,1,1874,I
2,1,Nova,I
3,1,Scotia,I
4,1,general,I


In [140]:
# Tag vocabulary in order of first appearance; this order defines the label ids
# used by the rest of the notebook.
label_list = data['tag'].unique().tolist()
num_tags = len(label_list)

# Row counts per column, vocabulary size, and per-tag frequencies.
print(data.count(), "\n")
print(f"Number of tags: {num_tags} \n")
print(f"Label List: {label_list}\n")
print(data['tag'].value_counts())

sentence    8022
token       8022
tag         8022
dtype: int64 

Number of tags: 3 

Label List: ['B', 'I', 'O']

tag
O    7179
I     666
B     177
Name: count, dtype: int64


In [141]:
# Build both directions of the tag <-> id mapping from label_list, the same
# list that sizes the classifier head and decodes predictions, so the three
# can never disagree. This also keeps the cell runnable after `data` has been
# reshaped and no longer has a `tag` column.
labels_to_ids = {label: idx for idx, label in enumerate(label_list)}
ids_to_labels = dict(enumerate(label_list))
print(labels_to_ids)
print(ids_to_labels)
# Alias used by tokenize_and_align_labels.
label_encoding_dict = labels_to_ids

{'B': 0, 'I': 1, 'O': 2}
{0: 'B', 1: 'I', 2: 'O'}


In [142]:
# Forward-fill any missing cells from the previous row.
filled = data.ffill()


def _join_with_spaces(values):
    """Concatenate the values of one sentence group, separated by single spaces."""
    return ' '.join(values)


# Broadcast each sentence's joined tokens / joined tags back onto every row of
# that sentence...
by_sentence = filled[['sentence', 'token', 'tag']].groupby(['sentence'])
sequence = by_sentence['token'].transform(_join_with_spaces)
word_labels = by_sentence['tag'].transform(_join_with_spaces)

# ...then collapse the repeated rows so that one row per distinct
# (sequence, word_labels) pair remains.
data = (
    pd.DataFrame({"sequence": sequence, "word_labels": word_labels})
    .drop_duplicates()
    .reset_index(drop=True)
)
data.head()

Unnamed: 0,sequence,word_labels
0,The 1874 Nova Scotia general election was held...,B I I I I I O O O O O O O O O O O B I I I I I ...
1,It was won by the Liberal party .,O O O O O O O O
2,The December 1981 windstorm was a severe storm...,B I I I O O O O O O O O O O O O O O O O O O O O
3,The storm formed as a secondary low .,O O O O O O O O
4,"In England , the storm started with violent wi...",O O O O O O O O O O O O O O O O O O O


In [143]:
# Rebuild the per-sentence token and tag lists.
# The strings were produced with ' '.join(...), so splitting on that single
# space is the exact inverse. Running word_tokenize on the joined text instead
# is not guaranteed to reproduce the original token boundaries, which would
# leave the tokens and tags of a sentence with different lengths.
data['tokens'] = data['sequence'].str.split(' ')
data['ner_tags'] = data['word_labels'].str.split(' ')
# Keep only the tokens and the ner_tags
data = data[["tokens", "ner_tags"]]
data.head()

Unnamed: 0,tokens,ner_tags
0,"[The, 1874, Nova, Scotia, general, election, w...","[B, I, I, I, I, I, O, O, O, O, O, O, O, O, O, ..."
1,"[It, was, won, by, the, Liberal, party, .]","[O, O, O, O, O, O, O, O]"
2,"[The, December, 1981, windstorm, was, a, sever...","[B, I, I, I, O, O, O, O, O, O, O, O, O, O, O, ..."
3,"[The, storm, formed, as, a, secondary, low, .]","[O, O, O, O, O, O, O, O]"
4,"[In, England, ,, the, storm, started, with, vi...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [144]:
# Train/test split: draw a fixed-seed random sample for training and use the
# remaining rows for testing.

train_size = 0.8
sampled_rows = data.sample(frac=train_size, random_state=200)
test_df = data.drop(index=sampled_rows.index).reset_index(drop=True)
train_df = sampled_rows.reset_index(drop=True)

print("FULL Dataset: {}".format(data.shape))
print("TRAIN Dataset: {}".format(train_df.shape))
print("TEST Dataset: {}".format(test_df.shape))

# Wrap both frames as `datasets.Dataset` objects.
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, test_df)
)

FULL Dataset: (323, 2)
TRAIN Dataset: (258, 2)
TEST Dataset: (65, 2)


In [145]:
# Check for mismatched lengths

def report_length_mismatches(dataset):
    """Print every example whose token list and tag list differ in length.

    Args:
        dataset: Iterable of examples, each exposing 'tokens' and 'ner_tags'.

    Returns:
        The number of mismatching examples found.
    """
    mismatches = 0
    # The loop variable is deliberately not called `data`: that name is the
    # notebook's DataFrame and must not be clobbered by this check.
    for example in dataset:
        if len(example['tokens']) != len(example['ner_tags']):
            mismatches += 1
            print("!")
            print(len(example['tokens']))
            print(example['tokens'])
            print(len(example['ner_tags']))
            print(example['ner_tags'])
    return mismatches


for split_dataset in (train_dataset, test_dataset):
    report_length_mismatches(split_dataset)


In [146]:
# Run configuration shared by the cells below.
# `task` selects the label column (f"{task}_tags") and names the output directory (f"test-{task}").
task = "ner"
# Checkpoint name used for both the tokenizer here and the model later on.
model_checkpoint = "bert-base-cased"
# Per-device batch size passed to TrainingArguments for training and evaluation.
batch_size = 16

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)




In [147]:
def tokenize_and_align_labels(examples, label_all_tokens=True):
    """Tokenize pre-split sentences and align word-level tags to sub-word tokens.

    Args:
        examples: Batch mapping with a "tokens" column (one list of words per
            example) and an f"{task}_tags" column (one list of tag strings per
            example, parallel to the words).
        label_all_tokens: When True (the default) every sub-word piece of a
            word receives that word's label id. When False only the first
            piece is labelled and the remaining pieces get -100.

    Returns:
        The tokenizer output with an added "labels" entry holding one list of
        label ids per example.

    Raises:
        KeyError: If a tag is not a key of ``label_encoding_dict``. Unknown
            tags are no longer silently mapped to id 0, which is not
            necessarily the 'O' label.
    """
    tokenized_inputs = tokenizer(list(examples["tokens"]), truncation=True, is_split_into_words=True)

    labels = []
    for i, word_tags in enumerate(examples[f"{task}_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # No source word (special token): -100 marks positions that
                # compute_metrics filters out.
                label_ids.append(-100)
            elif word_idx != previous_word_idx or label_all_tokens:
                # First piece of a word, or a continuation piece that should
                # inherit the word's label.
                label_ids.append(label_encoding_dict[word_tags[word_idx]])
            else:
                # Continuation piece that is left unlabelled.
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


# Apply the tokenize-and-align step to both splits in batched mode.
train_tokenized_datasets, test_tokenized_datasets = (
    split.map(tokenize_and_align_labels, batched=True)
    for split in (train_dataset, test_dataset)
)


Map:   0%|          | 0/258 [00:00<?, ? examples/s]

Map:   0%|          | 0/65 [00:00<?, ? examples/s]

In [148]:
# Inspect the tokenized training split (feature names and row count).
train_tokenized_datasets

Dataset({
    features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 258
})

In [149]:
# Inspect the tokenized test split (feature names and row count).
test_tokenized_datasets

Dataset({
    features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 65
})

In [153]:
# Token-classification head sized to the tag vocabulary. Passing the id <-> tag
# maps stores the real tag names in the model config, so the saved checkpoint
# carries them instead of generic LABEL_n names.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=ids_to_labels,
    label2id=labels_to_ids,
)

# Training configuration: output goes to f"test-{task}", evaluation runs once
# per epoch, and no external experiment tracker is used.
args = TrainingArguments(
    f"test-{task}",
    report_to = "none",
    evaluation_strategy = "epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=1e-5,
)

# Collator for token-classification batches, built from the same tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer)

# seqeval metric used by compute_metrics.
metric = evaluate.load("seqeval")


def compute_metrics(p):
    """Score a batch of predictions with the seqeval metric.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds per-token
            class scores (argmax is taken over axis 2) and ``labels`` holds the
            gold label ids, with -100 on positions to skip.

    Returns:
        Dict with the overall "precision", "recall", "f1" and "accuracy".
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions that carry a real label (gold id != -100).
        kept = [(pred, gold) for pred, gold in zip(predicted_row, gold_row) if gold != -100]
        true_predictions.append([label_list[pred] for pred, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# Wire the model, training arguments, tokenized splits, collator and metric
# function into a single Trainer.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=train_tokenized_datasets,
    eval_dataset=test_tokenized_datasets,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [151]:
# Train the model, run a final evaluation pass, then save the result.

trainer.train()
# The return value is not kept here; per-epoch evaluation results are already reported during training.
trainer.evaluate()
# The prediction cell loads both its tokenizer and its model from this directory.
trainer.save_model('historical-event-ner.model')


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.289746,0.404762,0.354167,0.377778,0.930608
2,No log,0.278077,0.5,0.416667,0.454545,0.935987
3,No log,0.270615,0.478261,0.458333,0.468085,0.94029


In [152]:
# Predict on new text: load the tokenizer from the directory written by trainer.save_model.

predictTokenizer = AutoTokenizer.from_pretrained('./historical-event-ner.model/')

# Alternative sample input, kept for reference:
# paragraph = '''The Battle of Khirbet Al-Joz was fought between forces of the Syrian Army and the FSA for control of the town. On 6 October 2012, the FSA launched an attack on the government occupied village of Kherbet Eljoz, near the Turkish border. The FSA took control of the village after a 12-hour-long battle with government forces.'''
# Sample input text the model is run on below (four paragraphs).
paragraph = '''On March 22, 1622, Powhatan Indians attacked and killed colonists in eastern Virginia. Known as the Jamestown Massacre, the bloodbath gave the English government an excuse to justify their efforts to attack Native Americans and confiscate their land.

In 1636, the Pequot War over trade expansion broke out between Pequot Indians and English settlers of the Massachusetts Bay Colony and Connecticut. The colonists’ Indian allies joined them in battle and helped defeat the Pequot.

A series of battles took place from 1636 to 1659 between New Netherlands settlers in New York and several Indian tribes (Lenape, Susquehannocks, Algonquians, Esopus). Some battles were especially violent and gruesome, sending many settlers fleeing back to the Netherlands.

The Beaver Wars of 1640-1701 occurred between the French and their Indian allies (Algonquian, Huron) and the powerful Iroquois Confederacy. The fierce fighting started over territory and fur trade dominance around the Great Lakes and ended with the signing of the Great Peace Treaty.'''

# Encode the paragraph; truncate so the input can never exceed the maximum
# sequence length the model accepts.
tokens = predictTokenizer(paragraph, truncation=True)
input_ids = torch.tensor(tokens['input_ids']).unsqueeze(0)            # add batch dimension
attention_mask = torch.tensor(tokens['attention_mask']).unsqueeze(0)  # add batch dimension

# Reload the fine-tuned weights and run one forward pass in inference mode
# (no dropout, no gradient bookkeeping).
model = AutoModelForTokenClassification.from_pretrained('./historical-event-ner.model/', num_labels=len(label_list))
model.eval()
with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)

# Highest-scoring class per token; squeeze only the batch dimension.
predictions = torch.argmax(outputs.logits.squeeze(0), dim=1)
value_predictions = [label_list[i] for i in predictions.tolist()]

# One word-piece string per input id, parallel to the predictions.
words = predictTokenizer.convert_ids_to_tokens(tokens['input_ids'])
# Write the decoded tag names (previously computed but unused) rather than the raw class ids.
pd.DataFrame({'ner': value_predictions, 'words': words}).to_csv('historical-event-ner.csv')