In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [2]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [3]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=6eaf49540d3d2937458fc6615bd2946d74fef82c2ad289c1194943f088278a17
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [4]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import itertools
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from datasets import Dataset
import evaluate
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import torch

In [5]:
# Recent NLTK releases load the Punkt sentence models from the 'punkt_tab'
# package, while older releases read the pickled 'punkt' package. Fetch both so
# sent_tokenize/word_tokenize work whichever NLTK version the runtime ships.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Read the whole raw corpus into memory.
with open('corpus.txt', 'r') as file:
    corpus_raw = file.read()

# Drop leading/trailing whitespace before sentence splitting.
corpus = corpus_raw.strip()

# Split the text into sentences, then each sentence into word tokens.
sentences = sent_tokenize(corpus)
tokenized_sentences = list(map(word_tokenize, sentences))

In [None]:
filename = "tokenized_event_data"

# One record per token, numbered by its 1-based sentence position. Every token
# starts with the tag 'O'; the tags are edited by hand in the exported CSV.
tokenized_data = [
    {'sentence': sentence_number, 'token': token, 'tag': 'O'}
    for sentence_number, sent in enumerate(tokenized_sentences, start=1)
    for token in sent
]

df = pd.DataFrame(tokenized_data)
df.to_csv(f"{filename}.csv", index=False)
print(f"Article data saved to {filename}.csv")

Article data saved to tokenized_event_data.csv


In [6]:
# Manually tag the event data exported above, then load the hand-tagged copy.

datafilename = 'tagged_corpus.csv'
# NOTE(review): 'unicode_escape' also interprets backslash sequences inside the
# text; confirm the file's real encoding (e.g. utf-8 or cp1252) and use that.
# NOTE(review): read_csv turns tokens such as "NA" or "null" into NaN by
# default; consider keep_default_na=False if the corpus can contain them.
data = pd.read_csv(datafilename, encoding='unicode_escape')
# Normalise the hand-typed tags. The .str accessor skips missing cells, whereas
# apply(str.upper) raises TypeError on them. Blank tags are forward-filled here
# so the label set derived from this column never contains NaN, and stray
# whitespace is stripped so 'o ' and 'O' are treated as the same tag.
data['tag'] = data['tag'].ffill().str.strip().str.upper()
data.head()

Unnamed: 0,sentence,token,tag
0,1,The,B
1,1,1874,I
2,1,Nova,I
3,1,Scotia,I
4,1,general,I


In [7]:
# Distinct tags in order of first appearance; their positions double as class ids.
label_list = list(data['tag'].unique())
num_tags = len(label_list)

# Non-null counts per column, then the tag inventory and its frequency table.
print(data.count(), "\n")
print(f"Number of tags: {num_tags} \n")
print(f"Label List: {label_list}\n")
print(data['tag'].value_counts())

sentence    8022
token       8022
tag         8022
dtype: int64 

Number of tags: 3 

Label List: ['B', 'I', 'O']

tag
O    7179
I     666
B     177
Name: count, dtype: int64


In [8]:
# id -> tag in order of first appearance, and the inverse tag -> id lookup.
ids_to_labels = dict(enumerate(data.tag.unique()))
labels_to_ids = {tag: tag_id for tag_id, tag in ids_to_labels.items()}
print(labels_to_ids)
print(ids_to_labels)
# Alias used by the label-alignment function further down.
label_encoding_dict = labels_to_ids

{'B': 0, 'I': 1, 'O': 2}
{0: 'B', 1: 'I', 2: 'O'}


In [9]:
# Forward-fill any empty cells from the row above.
data = data.ffill()
# "sequence": every row of a sentence gets that sentence's tokens joined by single spaces.
data['sequence'] = data.groupby('sentence')['token'].transform(lambda words: ' '.join(words))
# "word_labels": the matching tags, joined the same way.
data['word_labels'] = data.groupby('sentence')['tag'].transform(lambda tags: ' '.join(tags))
# All rows of a sentence now carry identical strings; keep one row per distinct pair.
data = data.loc[:, ["sequence", "word_labels"]].drop_duplicates().reset_index(drop=True)
data.head()

Unnamed: 0,sequence,word_labels
0,The 1874 Nova Scotia general election was held...,B I I I I I O O O O O O O O O O O B I I I I I ...
1,It was won by the Liberal party .,O O O O O O O O
2,The December 1981 windstorm was a severe storm...,B I I I O O O O O O O O O O O O O O O O O O O O
3,The storm formed as a secondary low .,O O O O O O O O
4,"In England , the storm started with violent wi...",O O O O O O O O O O O O O O O O O O O


In [10]:
# Turn the space-joined strings back into lists.
# Both columns were built with ' '.join(...), so splitting on that same single
# space is the exact inverse and yields one entry per original row. Running
# word_tokenize again is not guaranteed to reproduce the original tokens and is
# not meaningful for tag strings, which can leave tokens and tags misaligned.
data['tokens'] = data['sequence'].apply(lambda text: text.split(' '))
data['ner_tags'] = data['word_labels'].apply(lambda text: text.split(' '))
# Keep only the tokens and the ner_tags
data = data[["tokens", "ner_tags"]]
data.head()

Unnamed: 0,tokens,ner_tags
0,"[The, 1874, Nova, Scotia, general, election, w...","[B, I, I, I, I, I, O, O, O, O, O, O, O, O, O, ..."
1,"[It, was, won, by, the, Liberal, party, .]","[O, O, O, O, O, O, O, O]"
2,"[The, December, 1981, windstorm, was, a, sever...","[B, I, I, I, O, O, O, O, O, O, O, O, O, O, O, ..."
3,"[The, storm, formed, as, a, secondary, low, .]","[O, O, O, O, O, O, O, O]"
4,"[In, England, ,, the, storm, started, with, vi...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [11]:
# Train/test split

train_size = 0.8
# Sample the training rows with a fixed seed; every row not sampled becomes test data.
train_df = data.sample(frac=train_size, random_state=200)
test_df = data[~data.index.isin(train_df.index)].reset_index(drop=True)
train_df = train_df.reset_index(drop=True)

print("FULL Dataset: {}".format(data.shape))
print("TRAIN Dataset: {}".format(train_df.shape))
print("TEST Dataset: {}".format(test_df.shape))

# Wrap both frames as Hugging Face datasets for the tokenisation step.
train_dataset, test_dataset = (Dataset.from_pandas(frame) for frame in (train_df, test_df))

FULL Dataset: (323, 2)
TRAIN Dataset: (258, 2)
TEST Dataset: (65, 2)


In [12]:
# Check for mismatched lengths

def report_length_mismatches(dataset):
    """Print every example whose number of tokens differs from its number of tags.

    For each offending example this prints a "!" marker followed by the token
    count, the tokens, the tag count and the tags. Nothing is printed when all
    examples are aligned.
    """
    # The loop variable is deliberately not called `data`: that name is the
    # notebook's DataFrame, and rebinding it here would break re-running
    # earlier cells.
    for example in dataset:
        if len(example['tokens']) != len(example['ner_tags']):
            print("!")
            print(len(example['tokens']))
            print(example['tokens'])
            print(len(example['ner_tags']))
            print(example['ner_tags'])


report_length_mismatches(train_dataset)
report_length_mismatches(test_dataset)


In [13]:
# Run configuration.
# `task` selects the "<task>_tags" label column and names the output directory
# ("test-<task>"); `batch_size` is used for both training and evaluation.
task = "ner"
model_checkpoint = "bert-base-cased"
batch_size = 16

# Tokenizer belonging to the pretrained checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]



In [14]:
def tokenize_and_align_labels(examples, label_all_tokens=True):
    """Tokenize a batch of pre-split sentences and align word tags to sub-tokens.

    Args:
        examples: Batch mapping with a "tokens" entry (one list of words per
            sentence) and a f"{task}_tags" entry (one list of tag strings per
            sentence, parallel to the words).
        label_all_tokens: When True (default), every sub-token of a word gets
            that word's label id. When False, only the first sub-token is
            labelled and the continuation pieces get -100.

    Returns:
        The tokenizer output with an added "labels" entry holding one list of
        label ids per sentence. Positions whose word id is None get -100.

    Raises:
        KeyError: If a tag is not present in ``label_encoding_dict``.
    """
    tokenized_inputs = tokenizer(list(examples["tokens"]), truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"{task}_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # No source word for this position.
                label_ids.append(-100)
            elif word_idx != previous_word_idx or label_all_tokens:
                # Always resolve ids through label_encoding_dict. A hard-coded
                # id for one tag spelling can disagree with the encoding
                # derived from the data and silently train the wrong class.
                label_ids.append(label_encoding_dict[label[word_idx]])
            else:
                # Continuation piece of a word that was already labelled.
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


# Apply tokenisation and label alignment to both splits. With batched=True the
# function receives a batch of examples per call rather than a single example.
train_tokenized_datasets = train_dataset.map(tokenize_and_align_labels, batched=True)
test_tokenized_datasets = test_dataset.map(tokenize_and_align_labels, batched=True)


Map:   0%|          | 0/258 [00:00<?, ? examples/s]

Map:   0%|          | 0/65 [00:00<?, ? examples/s]

In [15]:
# Display the tokenised training split (feature names and row count).
train_tokenized_datasets

Dataset({
    features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 258
})

In [16]:
# Display the tokenised test split (feature names and row count).
test_tokenized_datasets

Dataset({
    features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 65
})

In [17]:
# Store the tag names in the model config so the saved checkpoint carries its
# own id <-> tag mapping. Without it, anything that loads the checkpoint later
# has to rely on `label_list` still being alive in the notebook kernel.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=ids_to_labels,
    label2id=labels_to_ids,
)

# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy`; switch if the installed version warns about or rejects it.
args = TrainingArguments(
    f"test-{task}",                 # output directory for checkpoints
    report_to = "none",             # no external experiment tracker
    evaluation_strategy = "epoch",  # evaluate on the eval set after every epoch
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=1e-5,
)

# Collator that batches the tokenised examples for token classification.
data_collator = DataCollatorForTokenClassification(tokenizer)

# Sequence-labelling metric used by compute_metrics.
metric = evaluate.load("seqeval")


def compute_metrics(p):
    """Turn raw model outputs into the overall sequence-labelling scores.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` holds per-position
            class scores with the class axis last (axis 2); ``labels`` holds
            the reference label ids, with -100 marking positions to skip.

    Returns:
        Dict with "precision", "recall", "f1" and "accuracy", taken from the
        metric's overall_* results.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept_predictions = []
        kept_labels = []
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            # Positions labelled -100 are excluded from scoring.
            if gold_id == -100:
                continue
            kept_predictions.append(label_list[predicted_id])
            kept_labels.append(label_list[gold_id])
        true_predictions.append(kept_predictions)
        true_labels.append(kept_labels)

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# Bundle the model, training arguments, both tokenised splits, the collator and
# the metric callback into one Trainer. The test split is used as the
# evaluation set.
trainer = Trainer(
    model,
    args,
    train_dataset=train_tokenized_datasets,
    eval_dataset=test_tokenized_datasets,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [18]:
# Train the model, run a final evaluation pass, then save it to disk.
# The value returned by trainer.evaluate() is not stored or displayed here,
# because it is not the last expression in the cell.

trainer.train()
trainer.evaluate()
trainer.save_model('historical-event-ner.model')


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.305686,0.368421,0.291667,0.325581,0.92738
2,No log,0.331854,0.393939,0.270833,0.320988,0.920925
3,No log,0.343911,0.512821,0.416667,0.45977,0.939215
4,No log,0.37088,0.552632,0.4375,0.488372,0.938139
5,No log,0.382662,0.525,0.4375,0.477273,0.937063


In [27]:
# Predict on new text

# Load the tokenizer from the directory written by trainer.save_model above.
predictTokenizer = AutoTokenizer.from_pretrained('./historical-event-ner.model/')

# paragraph = '''The Battle of Khirbet Al-Joz was fought between forces of the Syrian Army and the FSA for control of the town. On 6 October 2012, the FSA launched an attack on the government occupied village of Kherbet Eljoz, near the Turkish border. The FSA took control of the village after a 12-hour-long battle with government forces.'''
# paragraph = '''On March 22, 1622, Powhatan Indians attacked and killed colonists in eastern Virginia. Known as the Jamestown Massacre, the bloodbath gave the English government an excuse to justify their efforts to attack Native Americans and confiscate their land. In 1636, the Pequot War over trade expansion broke out between Pequot Indians and English settlers of the Massachusetts Bay Colony and Connecticut. The colonists’ Indian allies joined them in battle and helped defeat the Pequot. A series of battles took place from 1636 to 1659 between New Netherlands settlers in New York and several Indian tribes (Lenape, Susquehannocks, Algonquians, Esopus). Some battles were especially violent and gruesome, sending many settlers fleeing back to the Netherlands. The Beaver Wars of 1640-1701 occurred between the French and their Indian allies (Algonquian, Huron) and the powerful Iroquois Confederacy. The fierce fighting started over territory and fur trade dominance around the Great Lakes and ended with the signing of the Great Peace Treaty.'''

# Only the first 1500 characters of the file are used as the sample.
with open('prediction_corpus.txt', 'r') as file:
    paragraph_raw = file.read(1500)

paragraph = paragraph_raw.strip()

# Truncate to the tokenizer's maximum length so an unusually dense sample
# cannot exceed the sequence length the model supports.
tokens = predictTokenizer(paragraph, truncation=True)

model = AutoModelForTokenClassification.from_pretrained('./historical-event-ner.model/', num_labels=len(label_list))
model.eval()  # inference mode (disables dropout)

# Add a leading batch dimension of size 1.
input_ids = torch.tensor(tokens['input_ids']).unsqueeze(0)
attention_mask = torch.tensor(tokens['attention_mask']).unsqueeze(0)

# Call the module itself rather than .forward() so hooks run, and skip gradient
# tracking since nothing is being trained here.
with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)

# Index the single batch element explicitly; squeeze() would also drop the
# sequence axis for a length-1 input.
predictions = torch.argmax(outputs.logits[0], dim=-1)
value_predictions = [label_list[i] for i in predictions.tolist()]

# One decoded string per input id, parallel to the predictions.
words = predictTokenizer.batch_decode(tokens['input_ids'])
# 'ner' keeps the numeric class ids as before. 'tag' adds the readable tag
# names, which were previously computed but never written out.
pd.DataFrame({'ner': predictions.tolist(), 'words': words, 'tag': value_predictions}).to_csv('historical-event-ner.csv')

In [20]:
# !zip -r /content/folder.zip /content/historical-event-ner.model

  adding: content/historical-event-ner.model/ (stored 0%)
  adding: content/historical-event-ner.model/vocab.txt (deflated 49%)
  adding: content/historical-event-ner.model/config.json (deflated 51%)
  adding: content/historical-event-ner.model/tokenizer.json (deflated 70%)
  adding: content/historical-event-ner.model/special_tokens_map.json (deflated 42%)
  adding: content/historical-event-ner.model/training_args.bin (deflated 51%)
  adding: content/historical-event-ner.model/tokenizer_config.json (deflated 76%)
  adding: content/historical-event-ner.model/model.safetensors (deflated 7%)


In [22]:
# from google.colab import files
# files.download('folder.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>