In [1]:
from google.colab import drive
drive.mount('/content/drive')

import os
path = '/content/drive/My Drive/NLPProject'
os.chdir(path)

import sys
sys.path.append('/content/drive/My Drive/NLPProject')


Mounted at /content/drive


In [2]:
!pip install datasets
!pip install transformers

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6


In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import DataLoader, Dataset
import collections
from transformers import BertTokenizer, BertForTokenClassification
from torch.utils.data.sampler import SubsetRandomSampler
from datasets import load_dataset
from data_loader import EventSentenceLoader
from tqdm import tqdm
import re
import numpy as np
import argparse

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [4]:
use_freeze_model = False
validation_run = True
learning_rate = 0.00001

In [5]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')  # Changed to BERT tokenizer

def tokenize(batched_text):
    return tokenizer(batched_text['sentence'], padding=True, truncation=True, max_length=256)


In [6]:
print(device)

cuda:0


In [7]:
class TextClassificationDataset(Dataset):
    def __init__(self, data):
            self.data_points = data

    def __len__(self):
        return len(self.data_points)

    def __getitem__(self, idx):
        data = self.data_points[idx]
        return {'tokens': data['tokens'].flatten(), 'attention': data['attention'].flatten(), 'labels': torch.tensor(data['labels'])}

In [8]:
def load_bert_data():
    filepath = "events.txt"
    tokenizer_name = "bert-base-cased"
    loader = EventSentenceLoader(filepath, tokenizer_name)
    train_data = TextClassificationDataset(loader.load_data())

    train_size = int(0.9 * len(train_data))
    test_size = len(train_data) - train_size
    train_dataset, test_dataset = torch.utils.data.random_split(train_data, [train_size, test_size])
    train_data = list(map(lambda d: (torch.tensor(d['tokens']), d['labels'], d['attention']), train_dataset))
    test_data = list(map(lambda d: (torch.tensor(d['tokens']), d['labels'], d['attention']), test_dataset))

    return train_data, test_data

In [9]:
def test(model, data):
    confusion_matrix_size = 2
    confusion_matrix = []

    total = 0
    correct = 0
    total_real_words = 0

    for i in range(confusion_matrix_size):
        row = []
        for j in range(confusion_matrix_size):
            row.append(0)
        confusion_matrix.append(row)

    with torch.no_grad():
        for i, batch in tqdm(enumerate(data, 0), total=len(data), leave=False):
            inputs, labels, attention = batch
            inputs, labels, attention = inputs.to(device), labels.to(device), attention.to(device)

            outputs = model(inputs).logits.to(device)

            predicted = torch.argmax(outputs, 2).flatten()
            labels = labels.flatten()
            attention = attention.flatten()

            for j in range(len(predicted)):
                if attention[j]:
                    confusion_matrix[labels[j].item()][predicted[j].item()] += 1

            total += len(predicted)
            correct += predicted.eq(labels).sum().item()
    print(f"Total Real Words: {total_real_words}")
    print("Test Accuracy: {:.3f}".format(correct/total))
    print(confusion_matrix)
    return confusion_matrix

In [10]:
def train(model, data, epochs):
    n = len(data)

    # Define metrics to monitor change in performance during execution
    accuracy_history_epoch = []
    accuracy_history_step = []

    optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
    loss_func = nn.CrossEntropyLoss(weight=torch.tensor([0.1, 0.9])).to(device)

    for epoch in range(1, epochs + 1):
        correct = 0
        total = 0
        for i, batch in tqdm(enumerate(data, 0), total=len(data), leave=False):
            inputs, labels, attention = batch
            inputs, labels, attention = inputs.to(device), labels.to(device), attention.to(device)

            outputs = model(inputs).logits.to(device)
            predicted = torch.argmax(outputs, 2)

            optimizer.zero_grad()
            loss = loss_func(outputs.flatten(start_dim=0, end_dim=1), F.one_hot(labels).float().flatten(start_dim=0, end_dim=1))
            loss.backward()
            optimizer.step()

            total += len(predicted)
            correct += predicted.eq(labels).sum().item()
            accuracy_history_step.append((i+1, correct/total))

        accuracy_history_epoch.append(correct / total)
        print("Epoch: {:>3d} Accuracy: {:.3f}".format(epoch, accuracy_history_epoch[-1]))

    return accuracy_history_epoch, accuracy_history_step

In [11]:
filepath = "events.txt"
tokenizer_name = 'bert-base-cased'
loader = EventSentenceLoader(filepath, tokenizer_name)
train_data = TextClassificationDataset(loader.load_data())

train_data, test_data = load_bert_data()

  return {'tokens': data['tokens'].flatten(), 'attention': data['attention'].flatten(), 'labels': torch.tensor(data['labels'])}
  test_data = list(map(lambda d: (torch.tensor(d['tokens']), d['labels'], d['attention']), test_dataset))


In [12]:
model = BertForTokenClassification.from_pretrained("bert-base-cased").to(device)  # Changed to BERT model

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# file_path = "model_state.pth"
# model.load_state_dict(torch.load(file_path))

In [14]:
indices = list(range(len(train_data)))

train_dataloader = DataLoader(train_data, batch_size=32, num_workers=0, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=32, num_workers=0, shuffle=0)

In [15]:
train(model, train_dataloader, 50)

  0%|          | 0/94 [00:00<?, ?it/s]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Epoch:   1 Accuracy: 238.060




Epoch:   2 Accuracy: 237.538




Epoch:   3 Accuracy: 238.173




Epoch:   4 Accuracy: 239.486




Epoch:   5 Accuracy: 240.295




Epoch:   6 Accuracy: 241.973




Epoch:   7 Accuracy: 243.035




Epoch:   8 Accuracy: 244.436




Epoch:   9 Accuracy: 245.062




Epoch:  10 Accuracy: 245.889




Epoch:  11 Accuracy: 246.630




Epoch:  12 Accuracy: 247.351




Epoch:  13 Accuracy: 247.745




Epoch:  14 Accuracy: 248.136




Epoch:  15 Accuracy: 248.434




Epoch:  16 Accuracy: 248.782




Epoch:  17 Accuracy: 249.186




Epoch:  18 Accuracy: 249.671




Epoch:  19 Accuracy: 249.869




Epoch:  20 Accuracy: 250.099




Epoch:  21 Accuracy: 250.368




Epoch:  22 Accuracy: 250.573




Epoch:  23 Accuracy: 250.874




Epoch:  24 Accuracy: 251.030




Epoch:  25 Accuracy: 251.384




Epoch:  26 Accuracy: 251.311




Epoch:  27 Accuracy: 251.703




Epoch:  28 Accuracy: 251.962




Epoch:  29 Accuracy: 252.148




Epoch:  30 Accuracy: 252.229




Epoch:  31 Accuracy: 252.342




Epoch:  32 Accuracy: 252.508




Epoch:  33 Accuracy: 252.624




Epoch:  34 Accuracy: 252.712




Epoch:  35 Accuracy: 252.580




Epoch:  36 Accuracy: 252.900




Epoch:  37 Accuracy: 253.170




Epoch:  38 Accuracy: 253.230




Epoch:  39 Accuracy: 253.326




Epoch:  40 Accuracy: 253.399




Epoch:  41 Accuracy: 253.490




Epoch:  42 Accuracy: 253.487




Epoch:  43 Accuracy: 253.658




Epoch:  44 Accuracy: 253.687




Epoch:  45 Accuracy: 253.751




Epoch:  46 Accuracy: 253.834




Epoch:  47 Accuracy: 253.979




Epoch:  48 Accuracy: 254.188




Epoch:  49 Accuracy: 254.284


                                               

Epoch:  50 Accuracy: 254.277




([238.0602129075183,
  237.53792415169661,
  238.1729873586161,
  239.4863606121091,
  240.29540918163673,
  241.9733865602129,
  243.03493013972056,
  244.43646041250832,
  245.06220891550234,
  245.88888888888889,
  246.62974051896208,
  247.35096473719227,
  247.7451763140386,
  248.13639387890885,
  248.4337990685296,
  248.7821024617432,
  249.1859614105123,
  249.6713240186294,
  249.86892880904858,
  250.09946773120427,
  250.36759813705922,
  250.5731869594145,
  250.874251497006,
  251.0302727877578,
  251.3838988689288,
  251.31104457751164,
  251.7032601463739,
  251.96174318030606,
  252.14836992681305,
  252.228875582169,
  252.34231536926148,
  252.50798403193613,
  252.62375249500997,
  252.71190951430472,
  252.57950765136394,
  252.9001996007984,
  253.16999334664004,
  253.22954091816368,
  253.32568196939454,
  253.3992015968064,
  253.49001996007985,
  253.48669328010646,
  253.65768463073852,
  253.686626746507,
  253.75083166999335,
  253.83366600133067,
  253.978

In [16]:
test(model, test_dataloader)

                                               

Total Real Words: 0
Test Accuracy: 0.967
[[961, 604], [319, 3107]]




[[961, 604], [319, 3107]]

In [18]:
torch.save(model.state_dict(), '/content/drive/My Drive/NLPProject/model_state_bert.pth')