In [1]:
from pprint import pprint
from transformers import AutoTokenizer
import datasets
from datasets import load_metric
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def read_conll(file):
    examples = []
    # example = {col: [] for col in INPUT_COLUMNS}
    idx = 0
    example = {"id":idx, "tokens": [], "ner_tags":[]}
    
    with open(file) as f:
        for line in f:
            if line.startswith("-DOCSTART-") or line == "\n" or not line:
                assert len(example["tokens"]) == len(example["ner_tags"])
                examples.append(example)
                idx+=1
                example = {"id":idx, "tokens": [], "ner_tags":[]}
            else:
                row_cols = line.split()
                assert len(row_cols) == 3
                example["tokens"].append(row_cols[0])
                example["ner_tags"].append(row_cols[-1])

    return examples

In [3]:
def get_dataset(dataset_path):
    test_data = read_conll( dataset_path+"/test_500_v2.conll")
    train_data = read_conll( dataset_path+"/train_1500_v2.conll")
    ner_feature = datasets.Sequence(
                        datasets.features.ClassLabel(
                            names=[
                                "O",
                                "B-TASK",
                                "I-TASK",
                                "B-METRIC",
                                "I-METRIC",
                                "B-DATASET",
                                "I-DATASET"
                            ]
                        )
                    )

    token_feature = datasets.Sequence(datasets.Value("string"))
    id_feature = datasets.Value("string")
    train_dataset = datasets.Dataset.from_pandas(pd.DataFrame(data=train_data), features=datasets.Features({
        "id":id_feature,
        "ner_tags":ner_feature,
        "tokens" : token_feature
    }))
    test_dataset = datasets.Dataset.from_pandas(pd.DataFrame(data=test_data), features=datasets.Features({
        "id":id_feature,
        "ner_tags":ner_feature,
        "tokens" : token_feature
    }))
    return train_dataset, test_dataset

In [4]:
dataset_path = "/Users/afreenshaikh/Library/CloudStorage/GoogleDrive-afreens@andrew.cmu.edu/.shortcut-targets-by-id/1tZMZ1hVZ12FuHdPiu88Shf3zG1ZgrK47/A2/ibm_data"
train_dataset, test_dataset = get_dataset(dataset_path)
task = "ner"
label_list = train_dataset.features[f"{task}_tags"].feature.names

In [7]:
# model_checkpoint = "distilbert-base-uncased"
# model_checkpoint = "xlm-roberta-large-finetuned-conll03-english"
model_checkpoint = "tner/roberta-large-conll2003"
batch_size = 16
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)
import transformers
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

In [8]:
example = train_dataset[4]
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
print(tokens)

['<s>', 'Ġshows', 'Ġthe', 'Ġaccuracy', 'Ġof', 'Ġthe', 'Ġdifferent', 'Ġmodels', 'Ġin', 'Ġthe', 'Ġprediction', 'Ġtask', 'Ġfor', 'Ġthe', 'Ġthree', 'Ġdifferent', 'Ġdomains', 'Ġ.', '</s>']


In [9]:
print(len(example[f"{task}_tags"]), len(tokenized_input["input_ids"]))
print(tokenized_input.word_ids())

17 19
[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, None]


In [10]:
word_ids = tokenized_input.word_ids()
aligned_labels = [-100 if i is None else example[f"{task}_tags"][i] for i in word_ids]
print(len(aligned_labels), len(tokenized_input["input_ids"]))

19 19


In [12]:
label_all_tokens = True
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"{task}_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [13]:
tokenize_and_align_labels(train_dataset[:5])

{'input_ids': [[0, 166, 4830, 10, 2743, 542, 16101, 25376, 1548, 7, 2239, 18107, 868, 2156, 11170, 111, 2167, 10014, 3505, 31, 35237, 14286, 196, 2788, 479, 2], [0, 166, 10516, 5, 2435, 3505, 30, 14978, 49, 16782, 8611, 13, 33760, 7576, 11, 484, 30700, 479, 2], [0, 166, 2450, 16782, 8611, 77, 634, 5, 2435, 3505, 11, 10, 4230, 337, 20627, 3685, 13, 7690, 47041, 479, 2], [0, 166, 172, 8933, 209, 12535, 7, 5, 1907, 11, 5, 7091, 1637, 414, 8, 37357, 5, 8611, 13, 349, 4795, 737, 479, 2], [0, 924, 5, 8611, 9, 5, 430, 3092, 11, 5, 16782, 3685, 13, 5, 130, 430, 30700, 479, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [14]:
tokenized_train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_and_align_labels, batched=True)

Map: 100%|██████████| 1522/1522 [00:00<00:00, 6040.21 examples/s]
Map: 100%|██████████| 486/486 [00:00<00:00, 8117.59 examples/s]


In [15]:
len(tokenized_test_dataset), len(tokenized_train_dataset)


(486, 1522)

In [18]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list), ignore_mismatched_sizes=True)

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at tner/roberta-large-conll2003 and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([9, 1024]) in the checkpoint and torch.Size([7, 1024]) in the model instantiated
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([7]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
model_name = model_checkpoint.split("/")[-1]
args = TrainingArguments(
    f"{model_name}-finetuned-{task}",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    weight_decay=0.01
)


In [20]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)
metric = load_metric("seqeval")

  metric = load_metric("seqeval")


In [21]:
labels = [label_list[i] for i in example[f"{task}_tags"]]
metric.compute(predictions=[labels], references=[labels])

{'METRIC': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

In [22]:
import numpy as np

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [23]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [24]:
trainer.train()

  0%|          | 0/960 [00:00<?, ?it/s]You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
  0%|          | 1/960 [00:07<1:56:17,  7.28s/it]

RuntimeError: MPS backend out of memory (MPS allocated: 17.38 GB, other allocations: 692.99 MB, max allowed: 18.13 GB). Tried to allocate 88.48 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [None]:
trainer.evaluate()

In [None]:
predictions, labels, _ = trainer.predict(tokenized_test_dataset)
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results