In [1]:
def read_sent(path):
    """
    Read the word column of a tab-separated IOB2-style file, grouped by sentence.

    A blank line ends the current sentence. A line that starts with '#' and
    contains no tab is treated as a comment and skipped. For every other
    line the second tab-separated field (index 1) is collected.

    Parameters
    ----------
    path : str
        Path of the file to read.

    Returns
    -------
    list of list of str
        One inner list per sentence, holding the words of that sentence.
    """
    ents = []
    curEnts = []
    # The context manager closes the handle; the explicit encoding keeps the
    # result independent of the platform's default encoding.
    with open(path, encoding='utf-8') as infile:
        for line in infile:
            line = line.strip()
            if line == '':
                ents.append(curEnts)
                curEnts = []
                continue
            fields = line.split('\t')
            if line[0] == '#' and len(fields) == 1:
                continue
            curEnts.append(fields[1])
    # Keep the last sentence when the file does not end with a blank line
    if curEnts:
        ents.append(curEnts)
    return ents

def read_labels(path):
    """
    Read the label column of a tab-separated IOB2-style file, grouped by sentence.

    A blank line ends the current sentence. A line that starts with '#' and
    contains no tab is treated as a comment and skipped. For every other
    line the third tab-separated field (index 2) is collected.

    Parameters
    ----------
    path : str
        Path of the file to read.

    Returns
    -------
    list of list of str
        One inner list per sentence, holding the labels of that sentence.
    """
    ents = []
    curEnts = []
    # The context manager closes the handle; the explicit encoding keeps the
    # result independent of the platform's default encoding.
    with open(path, encoding='utf-8') as infile:
        for line in infile:
            line = line.strip()
            if line == '':
                ents.append(curEnts)
                curEnts = []
                continue
            fields = line.split('\t')
            if line[0] == '#' and len(fields) == 1:
                continue
            curEnts.append(fields[2])
    # Keep the last sentence when the file does not end with a blank line
    if curEnts:
        ents.append(curEnts)
    return ents

In [3]:
# Load the training split twice, once per column. Each call returns a list of
# lists: one inner list per sentence (labels first, then words).
training_labels = read_labels("en_ewt-ud-train.iob2")
training_sent = read_sent("en_ewt-ud-train.iob2")

In [15]:
training_labels

[['O', 'O', 'O', 'O', 'O', 'B-LOC', 'O'],
 ['B-LOC', 'I-LOC'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-LOC',
  'I-LOC',
  'O',
  'O',
  'O',
  'O',
  'B-LOC',
  'O',
  'B-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-LOC',
  'I-LOC',
  'O',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',


In [4]:
# Flatten the per-sentence lists into one token-level list each.
# A nested comprehension is linear in the number of tokens, whereas
# sum(list_of_lists, []) copies the growing result once per sentence.
train_labels = [label for sent_labels in training_labels for label in sent_labels]
train_sent = [word for sent_words in training_sent for word in sent_words]

In [16]:
train_labels

[1,
 1,
 1,
 1,
 1,
 2,
 1,
 2,
 3,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 3,
 1,
 1,
 1,
 1,
 2,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1]

In [6]:
# Load the dev split: per-sentence lists of labels and of words
dev_labels = read_labels("en_ewt-ud-dev.iob2")
dev_sent = read_sent("en_ewt-ud-dev.iob2")

In [7]:
# Flatten the per-sentence lists into token-level lists, rebinding the same names.
# NOTE(review): not re-runnable on its own. Once the names hold flat lists of
# strings, sum(..., []) raises TypeError because a str cannot be added to a list.
dev_labels = sum(dev_labels, [])
dev_sent = sum(dev_sent, [])

In [11]:
# Load the masked test split: per-sentence lists of labels and of words
test_labels = read_labels("en_ewt-ud-test-masked.iob2")
test_sent = read_sent("en_ewt-ud-test-masked.iob2")

In [12]:
# Flatten the per-sentence lists into token-level lists, rebinding the same names.
# NOTE(review): not re-runnable on its own. Once the names hold flat lists of
# strings, sum(..., []) raises TypeError because a str cannot be added to a list.
test_labels = sum(test_labels, [])
test_sent = sum(test_sent, [])

In [13]:
"""
A basic classifier based on the transformers (https://github.com/huggingface/transformers) 
library. It loads a masked language model (by default distilbert), and adds a linear layer for
prediction. Example usage:

python3 bert-topic.py topic-data/train.txt topic-data/dev.txt
"""
from typing import List, Dict
import codecs
import torch
# import sys # I don't need you
import bert.myutils as myutils # I changed this to import from bert dir
from transformers import AutoModel, AutoTokenizer

# Seed torch's random number generator so runs are repeatable
torch.manual_seed(8446)
# Name of the pretrained checkpoint, used for both the tokenizer and the model
MLM = 'bert-base-cased'
# Batch size handed to myutils.to_batch
BATCH_SIZE = 8
# Learning rate of the Adam optimizer
LEARNING_RATE = 0.00001
# Number of passes over the training batches
EPOCHS = 1
# We have an UNK label for robustness purposes, it makes it easier to run on
# data with other labels, or without labels.
UNK = "[UNK]"
# Upper bound on the number of training items kept; the training text and
# label lists are sliced to this length before training.
MAX_TRAIN_SENTS=64
# Use the first CUDA device when one is available, otherwise the CPU
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"


class ClassModel(torch.nn.Module):
    def __init__(self, nlabels: int, mlm: str):
        """
        Model for classification with transformers.

        The architecture of this model is simple: a transformer based
        language model followed by one linear layer that converts its output
        to label scores.

        Parameters
        ----------
        nlabels : int
            Vocabulary size of output space (i.e. number of labels)
        mlm : str
            Name of the transformers language model to use, can be found on:
            https://huggingface.co/models
        """
        super().__init__()

        # The transformer model to use
        self.mlm = AutoModel.from_pretrained(mlm)

        # Find the size of the output of the language model; different
        # configs expose it under different attribute names.
        config = self.mlm.config
        if hasattr(config, 'hidden_size'):
            self.mlm_out_size = config.hidden_size
        elif hasattr(config, 'dim'):
            self.mlm_out_size = config.dim
        else:  # if not found, guess
            self.mlm_out_size = 768

        # Create prediction layer
        self.hidden_to_label = torch.nn.Linear(self.mlm_out_size, nlabels)

    def forward(self, input: torch.Tensor, attention_mask=None):
        """
        Forward pass

        Parameters
        ----------
        input : torch.Tensor
            Tensor with wordpiece indices. shape=(batch_size, max_sent_len).
        attention_mask : torch.Tensor, optional
            Mask forwarded unchanged to the language model. When omitted the
            language model is called without an explicit mask.

        Returns
        -------
        output_scores : torch.Tensor
            Unnormalised score per label. shape=(batch_size, nlabels)
        """
        # Run transformer model on input
        mlm_out = self.mlm(input, attention_mask=attention_mask)

        # Keep only the last layer: shape=(batch_size, max_len, DIM_EMBEDDING)
        last_layer = mlm_out.last_hidden_state

        # Keep only the output for the first ([CLS]) token:
        # shape=(batch_size, DIM_EMBEDDING). Indexing position 0 drops exactly
        # the token axis; an argument-less squeeze() would also drop the batch
        # axis for a batch of one.
        cls_out = last_layer[:, 0, :]

        # Matrix multiply to get scores for each label
        output_scores = self.hidden_to_label(cls_out)

        return output_scores

    def run_eval(self, text_batched: List[torch.Tensor], labels_batched: List[torch.Tensor]):
        """
        Run evaluation: predict and score

        Puts the model in eval mode (and leaves it there).

        Parameters
        ----------
        text_batched : List[torch.Tensor]
            list with batches of text, containing wordpiece indices.
        labels_batched : List[torch.Tensor]
            list with batches of labels (converted to ints).

        Returns
        -------
        score : float
            accuracy of the model on labels_batched given text_batched;
            0.0 when there is nothing to score
        pred_labels_list : List[torch.Tensor]
            predicted label indices, one tensor per batch
        """
        self.eval()
        match = 0
        total = 0
        pred_labels_list = []
        # No gradients are needed for scoring; skipping them saves memory
        with torch.no_grad():
            for sents, labels in zip(text_batched, labels_batched):
                output_scores = self(sents)
                pred_labels = torch.argmax(output_scores, 1)
                pred_labels_list.append(pred_labels)
                for gold_label, pred_label in zip(labels, pred_labels):
                    total += 1
                    if gold_label.item() == pred_label.item():
                        match += 1
        # Guard against empty input instead of dividing by zero
        score = match / total if total else 0.0
        return score, pred_labels_list

In [14]:
# Prepare the data, build the model and fine-tune it.
# The data comes from the lists loaded earlier in the notebook instead of
# from command-line arguments.
# NOTE(review): this cell rebinds train_labels and dev_labels from tag strings
# to integer ids, so re-running it without first re-running the loading cells
# would look those ids up in a lookup keyed by tag strings.
train_text = train_sent
train_text = train_text[:MAX_TRAIN_SENTS]
train_labels = train_labels[:MAX_TRAIN_SENTS]

# The label lookup is built from the (truncated) training labels only
id2label, label2id = myutils.labels2lookup(train_labels, UNK)
NLABELS = len(id2label)
print(train_labels)
print(label2id)
# Labels missing from the lookup fall back to the UNK id
train_labels = [label2id.get(label, label2id[UNK]) for label in train_labels]

dev_text = dev_sent
dev_labels = [label2id.get(label, label2id[UNK]) for label in dev_labels]

print('tokenizing...')
tokzr = AutoTokenizer.from_pretrained(MLM)
train_tokked = myutils.tok(train_text, tokzr)
dev_tokked = myutils.tok(dev_text, tokzr)
PAD = tokzr.pad_token_id

print('converting to batches...')
# Note, some data is thrown away if len(text_tokked)%BATCH_SIZE != 0
train_text_batched, train_labels_batched = myutils.to_batch(train_tokked, train_labels, BATCH_SIZE, PAD, DEVICE)
dev_text_batched, dev_labels_batched = myutils.to_batch(dev_tokked, dev_labels, BATCH_SIZE, PAD, DEVICE)

print('initializing model...')
model = ClassModel(NLABELS, MLM)
model.to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
# Targets equal to 0 are left out of the loss; in the lookup printed above
# id 0 is the UNK label.
loss_function = torch.nn.CrossEntropyLoss(ignore_index=0, reduction='sum')

print('training...')
for epoch in range(EPOCHS):
    print('=====================')
    print('starting epoch ' + str(epoch))
    model.train()

    # Loop over batches, accumulating the summed loss of the epoch
    loss = 0
    for batch_idx in range(len(train_text_batched)):
        optimizer.zero_grad()

        # Call the module itself rather than .forward() so that any hooks
        # registered on it are run.
        output_scores = model(train_text_batched[batch_idx])
        batch_loss = loss_function(output_scores, train_labels_batched[batch_idx])
        loss += batch_loss.item()

        batch_loss.backward()

        optimizer.step()

    # run_eval returns (accuracy, per-batch predictions)
    dev_score = model.run_eval(dev_text_batched, dev_labels_batched)
    print('Loss: {:.2f}'.format(loss))
    print('Acc(dev): {:.2f}'.format(100*dev_score[0]))
    print()

['O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'B-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
{'[UNK]': 0, 'O': 1, 'B-LOC': 2, 'I-LOC': 3}
tokenizing...


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

converting to batches...
initializing model...


model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


training...
starting epoch 0


KeyboardInterrupt: 

In [12]:
# Score the trained model on the test split.
test_text = test_sent

# Map tag strings to ids; tags missing from the lookup become the UNK id
test_labels = [label2id.get(tag, label2id[UNK]) for tag in test_labels]

# Wordpiece-tokenize the test words
test_tokked = myutils.tok(test_text, tokzr)

# Group tokenized text and labels into batches on the target device
test_text_batched, test_labels_batched = myutils.to_batch(
    test_tokked, test_labels, BATCH_SIZE, PAD, DEVICE
)

# run_eval returns (accuracy, per-batch predictions)
print('evaluating on testing data...')
test_score = model.run_eval(test_text_batched, test_labels_batched)
print(f'Accuracy on test data: {100 * test_score[0]:.2f}%')

evaluating on testing data...
Accuracy on test data: 93.31%


In [41]:
# Invert the tag -> id lookup so that ids can be mapped back to tag strings
id2label = dict(zip(label2id.values(), label2id.keys()))

# Replace the integer test labels with their tag strings
test_labels = list(map(id2label.__getitem__, test_labels))

In [50]:
id2label

{0: '[UNK]', 1: 'O', 2: 'B-LOC', 3: 'I-LOC'}

In [13]:
# Flatten the per-batch prediction tensors (second element of test_score)
# into a single list of plain Python numbers.
flatten_pred_labels = [
    value
    for batch_pred_labels in test_score[1]
    for value in batch_pred_labels.tolist()
]


In [44]:
# Turn the predicted ids into tag strings (rebinds the same name)
flatten_pred_labels = list(map(id2label.__getitem__, flatten_pred_labels))

In [14]:
# Running indexes 0 .. len(test_labels) - 1, one per test label.
# Built with range() so that no loop variable named `id` is left behind
# shadowing the builtin.
indexes = list(range(len(test_labels)))

In [25]:
def save_to_iob2_file(index_list, word_list, tag_list, file_path):
    """
    Write parallel index/word/tag sequences as tab-separated lines.

    One line "index<TAB>word<TAB>tag" is written per item, followed by a
    single blank line at the very end of the file. No blank lines are
    inserted between sentences. If the sequences differ in length, output
    stops at the shortest one.

    Parameters
    ----------
    index_list : iterable
        Value for the first column of each line.
    word_list : iterable
        Value for the second column of each line.
    tag_list : iterable
        Value for the third column of each line.
    file_path : str
        Path of the file to create or overwrite.
    """
    # Explicit encoding so that non-ASCII words are written the same way on
    # every platform.
    with open(file_path, 'w', encoding='utf-8') as f:
        f.writelines(
            f"{index}\t{word}\t{tag}\n"
            for index, word, tag in zip(index_list, word_list, tag_list)
        )
        f.write("\n")  # Terminate the file with one blank line

In [26]:
# Dump the predictions as one flat token-per-line file, using the running index as the first column
save_to_iob2_file(indexes, test_sent , flatten_pred_labels, "output.iob2")

In [46]:
# Write the predictions as an IOB2 file that mirrors the masked test file.
# test_sent and flatten_pred_labels are flat, token-level lists, so zipping
# one word with one tag would pair up their characters. Instead the masked
# test file is walked line by line: comment and blank lines are copied, and
# each token line gets the next prediction in its tag column.
source_name = "en_ewt-ud-test-masked.iob2"
file_name = "output_bert.iob2"

pred_iter = iter(flatten_pred_labels)
with open(source_name, encoding="utf-8") as source, open(file_name, "w", encoding="utf-8") as file:
    for line in source:
        line = line.strip()
        columns = line.split("\t")
        # Same rule as read_sent/read_labels: blank lines separate sentences,
        # '#' lines without a tab are comments.
        if line == "" or (line[0] == "#" and len(columns) == 1):
            file.write(line + "\n")
            continue
        # Batching may have dropped trailing tokens, leaving fewer
        # predictions than tokens; those tokens fall back to 'O'.
        # NOTE(review): confirm 'O' is the desired fallback.
        lbl = next(pred_iter, "O")
        # The UNK placeholder is not a tag of the output format
        if lbl == UNK:
            lbl = "O"
        file.write(f"{columns[0]}\t{columns[1]}\t{lbl}\t-\t-\n")