In [None]:
import torch
# Ask PyTorch to release cached GPU memory it is no longer using, so that memory
# left over from earlier work in this kernel does not count against the run below.
torch.cuda.empty_cache()

In [None]:
from pprint import pprint
from transformers import AutoTokenizer
import datasets
from datasets import load_metric
import pandas as pd

In [None]:
def read_conll(file):
    """Parse a CoNLL-style file into a list of sentence-level examples.

    Every data line must consist of exactly three whitespace-separated
    columns; the first column is taken as the token and the last one as its
    NER tag. A blank (or whitespace-only) line or a line starting with
    ``-DOCSTART-`` ends the current sentence.

    Args:
        file: Path of the file to read. The file is decoded as UTF-8.

    Returns:
        A list of dicts of the form
        ``{"id": int, "tokens": [str, ...], "ner_tags": [str, ...]}`` in file
        order. Ids are consecutive, starting at 0. Sentences without any
        token (e.g. produced by repeated separators) are not emitted, and the
        final sentence is returned even when the file does not end with a
        blank line.

    Raises:
        AssertionError: If a data line does not have exactly three columns.
    """
    examples = []
    tokens, ner_tags = [], []

    def flush():
        # Store the sentence collected so far (if any) and start a new one.
        nonlocal tokens, ner_tags
        if tokens:
            examples.append({"id": len(examples), "tokens": tokens, "ner_tags": ner_tags})
            tokens, ner_tags = [], []

    with open(file, encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            # Sentence / document boundary: nothing to parse on this line.
            if line.startswith("-DOCSTART-") or not line.strip():
                flush()
                continue

            row_cols = line.split()
            assert len(row_cols) == 3, (
                f"{file}:{line_no}: expected 3 columns, got {len(row_cols)}: {line!r}"
            )
            tokens.append(row_cols[0])
            ner_tags.append(row_cols[-1])

    # The last sentence has no trailing separator when the file does not end
    # with a blank line; without this it would be dropped.
    flush()
    return examples

In [None]:
def get_dataset(dataset_path, train_file="train_1500_v2.conll", test_file="test_500_v2.conll"):
    """Load the train and test CoNLL files as Hugging Face ``datasets.Dataset`` objects.

    Args:
        dataset_path: Directory that contains the CoNLL files.
        train_file: File name of the training split inside ``dataset_path``.
        test_file: File name of the test split inside ``dataset_path``.

    Returns:
        A ``(train_dataset, test_dataset)`` tuple. Both datasets have the
        columns ``id`` (string), ``tokens`` (sequence of strings) and
        ``ner_tags`` (sequence of ``ClassLabel`` over the O/TASK/METRIC/DATASET
        BIO tag set listed below).
    """
    import os  # local import: only needed for joining the split paths

    label_names = [
        "O",
        "B-TASK",
        "I-TASK",
        "B-METRIC",
        "I-METRIC",
        "B-DATASET",
        "I-DATASET",
    ]

    def build_features():
        # A fresh Features object per split, so the two datasets share no state.
        return datasets.Features({
            "id": datasets.Value("string"),
            "ner_tags": datasets.Sequence(datasets.features.ClassLabel(names=label_names)),
            "tokens": datasets.Sequence(datasets.Value("string")),
        })

    def load_split(file_name):
        # Parse one CoNLL file and convert its examples to a typed Dataset.
        records = read_conll(os.path.join(dataset_path, file_name))
        return datasets.Dataset.from_pandas(pd.DataFrame(data=records), features=build_features())

    train_dataset = load_split(train_file)
    test_dataset = load_split(test_file)
    return train_dataset, test_dataset

In [None]:
# Task name; the label column of the datasets is "<task>_tags".
task = "ner"

# NOTE(review): machine-specific absolute path - adjust when running elsewhere.
dataset_path = "/home/afreens/projects/nlp_from_scratch_assignment"

# Build both splits, then read the tag names off the ClassLabel feature of the
# training split.
train_dataset, test_dataset = get_dataset(dataset_path)
label_list = train_dataset.features[f"{task}_tags"].feature.names

In [None]:
import transformers

# Checkpoint to fine-tune (alternative that was tried: "distilbert-base-uncased").
model_checkpoint = "tner/roberta-large-conll2003"
batch_size = 2

# The inputs are passed to the tokenizer already split into words (see the
# is_split_into_words=True calls further down), hence add_prefix_space=True.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

# The label alignment below relies on word_ids(), so insist on a fast tokenizer.
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

In [None]:
# Inspect a single training example: tokenize its pre-split words and print the
# tokens that the resulting input ids correspond to.
example = train_dataset[4]
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
print(tokens)

In [None]:
# Compare the number of word-level tags with the number of input ids produced by
# the tokenizer, then print the word index attached to each input id.
print(len(example[f"{task}_tags"]), len(tokenized_input["input_ids"]))
print(tokenized_input.word_ids())

In [None]:
# Expand the word-level tags to one label per input id: positions whose word id
# is None get -100, every other position takes the tag of its word.
word_ids = tokenized_input.word_ids()
aligned_labels = [example[f"{task}_tags"][i] if i is not None else -100 for i in word_ids]

# Both lengths should now agree.
print(len(aligned_labels), len(tokenized_input["input_ids"]))

In [None]:
label_all_tokens = True


def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align the word-level tags.

    Reads the module-level ``tokenizer``, ``task``, ``label_list`` and
    ``label_all_tokens``.

    Args:
        examples: Batch mapping with a ``"tokens"`` entry (list of word lists)
            and a ``f"{task}_tags"`` entry (list of per-word label-id lists).

    Returns:
        The tokenizer output with an added ``"labels"`` entry holding one label
        list per sentence, aligned with the tokenizer's tokens:

        * tokens without a word id (special tokens) get ``-100`` so the loss
          ignores them;
        * the first token of a word gets the word's label;
        * further tokens of the same word get ``-100`` when
          ``label_all_tokens`` is false; otherwise they get the word's label,
          with ``B-X`` replaced by ``I-X``. Repeating ``B-X`` on continuation
          tokens would make a single entity look like several entities to an
          entity-level metric such as seqeval.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    # For each label id, the id to use on continuation tokens: "B-X" -> "I-X"
    # when such a label exists, otherwise the label itself.
    continuation_id = list(range(len(label_list)))
    for idx, name in enumerate(label_list):
        inside_name = "I-" + name[2:]
        if name.startswith("B-") and inside_name in label_list:
            continuation_id[idx] = label_list.index(inside_name)

    labels = []
    for i, label in enumerate(examples[f"{task}_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # Special tokens have a word id that is None; -100 is ignored by the loss.
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # First token of a word carries the word's own label.
                label_ids.append(label[word_idx])
            elif label_all_tokens:
                # Continuation token of the same word.
                label_ids.append(continuation_id[label[word_idx]])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [None]:
# Smoke-test the alignment function on the first five training examples.
tokenize_and_align_labels(train_dataset[:5])

In [None]:
# Tokenize and align labels for both splits; batched=True hands the function
# batches of examples instead of single examples.
tokenized_train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_and_align_labels, batched=True)

In [None]:
# Number of examples in the test and train splits (in that order).
len(tokenized_test_dataset), len(tokenized_train_dataset)


In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

# Load the checkpoint with a classification head sized for this task's label
# set; ignore_mismatched_sizes=True lets loading proceed when the checkpoint's
# head has a different number of labels.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    ignore_mismatched_sizes=True,
)

# Use the second GPU when there is one, otherwise the first GPU, otherwise the
# CPU. Requesting 'cuda:1' unconditionally fails on single-GPU machines.
# (The former `from zmq import device` was unrelated to this and was shadowed by
# the assignment below, so it has been dropped.)
if torch.cuda.is_available():
    device = 'cuda:1' if torch.cuda.device_count() > 1 else 'cuda:0'
else:
    device = 'cpu'
model = model.to(device)


In [None]:
# The output directory is named after the last path component of the checkpoint.
model_name = model_checkpoint.rsplit("/", 1)[-1]

args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-{task}1",
    # Run evaluation at the end of every epoch.
    evaluation_strategy="epoch",
    num_train_epochs=15,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Keep at most one checkpoint on disk.
    save_total_limit=1,
)


In [None]:
from transformers import DataCollatorForTokenClassification

# Metric used for evaluation, and the collator handed to the Trainer to build batches.
metric = load_metric("seqeval")
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
# Sanity check of the metric: score the example's own tag names against themselves.
labels = [label_list[tag_id] for tag_id in example[f"{task}_tags"]]
metric.compute(references=[labels], predictions=[labels])

In [None]:
import numpy as np

def compute_metrics(p):
    """Turn raw model outputs into overall precision / recall / F1 / accuracy.

    Reads the module-level ``label_list`` and ``metric``.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds per-token
            class scores (arg-maxed over axis 2); ``labels`` holds the
            reference label ids, with ``-100`` marking positions to ignore.

    Returns:
        Dict with the keys ``precision``, ``recall``, ``f1`` and ``accuracy``,
        taken from the ``overall_*`` entries of ``metric.compute``.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    decoded_predictions = []
    decoded_references = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only the positions whose reference label is not the ignore index.
        kept = [(pred, gold) for pred, gold in zip(predicted_row, gold_row) if gold != -100]
        decoded_predictions.append([label_list[pred] for pred, _ in kept])
        decoded_references.append([label_list[gold] for _, gold in kept])

    results = metric.compute(predictions=decoded_predictions, references=decoded_references)
    return {
        name: results[f"overall_{name}"]
        for name in ("precision", "recall", "f1", "accuracy")
    }

In [None]:
# Wire model, training configuration, data, batching and metrics together.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the Trainer configured in the previous cell.
trainer.train()

In [None]:
# Evaluate on the dataset that was passed to the Trainer as eval_dataset.
trainer.evaluate()

In [None]:
# Predict on the test split and reduce the per-token class scores to label ids.
predictions, labels, _ = trainer.predict(tokenized_test_dataset)
predictions = np.argmax(predictions, axis=2)

# Convert ids to tag names, dropping every position whose reference label is the
# ignore index (-100).
true_predictions = [
    [label_list[pred_id] for (pred_id, gold_id) in zip(pred_row, gold_row) if gold_id != -100]
    for pred_row, gold_row in zip(predictions, labels)
]
true_labels = [
    [label_list[gold_id] for (pred_id, gold_id) in zip(pred_row, gold_row) if gold_id != -100]
    for pred_row, gold_row in zip(predictions, labels)
]

# Full metric report (not only the overall numbers used during training).
results = metric.compute(predictions=true_predictions, references=true_labels)
results

In [None]:
import torch
# Ask PyTorch to release cached GPU memory it is no longer using now that
# training and evaluation are done.
torch.cuda.empty_cache()