In [1]:
import torch
torch.cuda.empty_cache()

In [2]:
from pprint import pprint
from transformers import AutoTokenizer
import datasets
from datasets import load_metric
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def read_conll(file):
    examples = []
    # example = {col: [] for col in INPUT_COLUMNS}
    idx = 0
    example = {"id":idx, "tokens": [], "ner_tags":[]}
    
    with open(file) as f:
        for line in f:
            if line.startswith("-DOCSTART-") or line == "\n" or not line:
                assert len(example["tokens"]) == len(example["ner_tags"])
                examples.append(example)
                idx+=1
                example = {"id":idx, "tokens": [], "ner_tags":[]}
            else:
                row_cols = line.split()
                assert len(row_cols) == 4
                example["tokens"].append(row_cols[0])
                example["ner_tags"].append(row_cols[-1])

    return examples

In [4]:
from sklearn.model_selection import train_test_split

def get_dataset(dataset_paths):
    all_data  =sum([read_conll(x) for x in dataset_paths], [])
    train_data, test_data = train_test_split(all_data)
    ner_feature = datasets.Sequence(
                        datasets.features.ClassLabel(
                            names=[
                                "O",
                                "B-MethodName",
                                "I-MethodName",
                                "B-HyperparameterName",
                                "I-HyperparameterName",
                                "B-HyperparameterValue",
                                "I-HyperparameterValue",
                                "B-MetricName",
                                "I-MetricName",
                                "B-MetricValue",
                                "I-MetricValue",
                                "B-TaskName",
                                "I-TaskName",
                                "B-DatasetName",
                                "I-DatasetName"
                            ]
                        )
                    )
    



    token_feature = datasets.Sequence(datasets.Value("string"))
    id_feature = datasets.Value("string")
    train_dataset = datasets.Dataset.from_pandas(pd.DataFrame(data=train_data), features=datasets.Features({
        "id":id_feature,
        "ner_tags":ner_feature,
        "tokens" : token_feature
    }))
    test_dataset = datasets.Dataset.from_pandas(pd.DataFrame(data=test_data), features=datasets.Features({
        "id":id_feature,
        "ner_tags":ner_feature,
        "tokens" : token_feature
    }))
    return train_dataset, test_dataset

In [5]:
import glob
dataset_paths = "/home/afreens/projects/nlp_from_scratch_assignment/labelled_data/*"
train_dataset, test_dataset = get_dataset(glob.glob(dataset_paths))
task = "ner"
label_list = train_dataset.features[f"{task}_tags"].feature.names

In [6]:
# model_checkpoint = "distilbert-base-uncased"

# model_checkpoint = "/data/user_data/afreens/roberta-large-conll2003-finetuned-ner1/checkpoint-1500"
model_checkpoint = "/data/user_data/afreens/checkpoint-1500-finetuned-ner_v2/checkpoint-1500/"
batch_size = 2
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)
import transformers
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

In [7]:
example = train_dataset[4]
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
print(tokens)

['<s>', 'ĠWe', 'Ġevaluate', 'Ġthe', 'Ġstatistical', 'Ġsignificance', 'Ġof', 'Ġthe', 'Ġpair', 'wise', 'Ġdifferences', 'Ġin', 'Ġthe', 'Ġproportions', 'Ġof', 'Ġcorrect', 'Ġand', 'Ġhalluc', 'inated', 'Ġtranslations', 'Ġusing', 'Ġtwo', 'Ġ-', 'Ġsided', 'ĠStudent', 'Ġtest', 'Ġfor', 'Ġtwo', 'Ġrelated', 'Ġsamples', 'Ġwith', 'Ġ5', 'Ġ%', 'Ġconfidence', 'Ġlevel', 'Ġ.', 'ĠWe', 'Ġprovide', 'Ġmore', 'Ġdetails', 'Ġon', 'Ġthe', 'Ġannotation', 'Ġguidelines', 'Ġand', 'Ġinter', 'Ġ-', 'Ġannotation', 'Ġagreement', 'Ġin', 'ĠAppendix', '</s>']


In [8]:
print(len(example[f"{task}_tags"]), len(tokenized_input["input_ids"]))
print(tokenized_input.word_ids())

48 52
[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, None]


In [9]:
word_ids = tokenized_input.word_ids()
aligned_labels = [-100 if i is None else example[f"{task}_tags"][i] for i in word_ids]
print(len(aligned_labels), len(tokenized_input["input_ids"]))

58 58


In [9]:
label_all_tokens = True
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"{task}_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [10]:
tokenize_and_align_labels(train_dataset[:5])

{'input_ids': [[0, 39848, 9, 42806, 4052, 8738, 449, 20057, 111, 14077, 7753, 11282, 5, 4472, 227, 10, 25860, 37681, 8, 762, 44493, 634, 42806, 25, 1602, 11, 7162, 155, 4, 176, 479, 20, 5838, 9, 42806, 11, 305, 11674, 108, 1646, 926, 111, 2271, 16, 7646, 11, 9513, 361, 479, 20, 775, 311, 14, 22, 885, 1589, 42806, 22, 16, 3667, 204, 7, 195, 498, 3845, 87, 22, 885, 1589, 1021, 42806, 22, 479, 2], [0, 96, 42, 173, 2156, 52, 5393, 442, 20181, 3693, 11857, 26492, 7891, 268, 55, 5693, 1241, 5, 24934, 1938, 9, 49, 23341, 8, 30264, 1635, 479, 28256, 2787, 5, 23341, 9, 10, 26739, 1546, 16, 5616, 13, 26640, 8, 2386, 5, 1421, 7, 28, 10696, 55, 14146, 479, 635, 2156, 26640, 1937, 473, 45, 1888, 44042, 1042, 187, 5, 1546, 30264, 1635, 240, 7, 28, 43547, 11, 455, 15339, 479, 28256, 2787, 258, 23341, 8, 30264, 1635, 2386, 44042, 7, 28, 3744, 19, 795, 15339, 2156, 2905, 981, 7, 1233, 5838, 3077, 6122, 15, 5, 24934, 1938, 672, 8, 6554, 5574, 479, 28256, 2787, 26739, 4836, 33, 10, 251, 750, 2156, 8, 153

In [11]:
tokenized_train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/2460 [00:00<?, ? examples/s]

Map: 100%|██████████| 2460/2460 [00:00<00:00, 2735.31 examples/s]
Map: 100%|██████████| 821/821 [00:00<00:00, 3689.93 examples/s]


In [12]:
len(tokenized_test_dataset), len(tokenized_train_dataset)


(821, 2460)

In [13]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from zmq import device
from torch import nn

model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list), ignore_mismatched_sizes=True)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# model= nn.DataParallel(model,device_ids = [1, 3])
model = model.to(device)

# tokenized_train_dataset.to(device)
# tokenized_test_dataset.to(device)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [14]:
model_name = model_checkpoint.split("/")[-1]
args = TrainingArguments(
    f"/data/user_data/afreens/{model_name}-finetuned-{task}_v2",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    weight_decay=0.01,
    save_total_limit = 1
    # per_device_train_batch_size=4,
    # per_device_eval_batch_size=4
)

# args.device = device


In [15]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)
metric = load_metric("seqeval")

  metric = load_metric("seqeval")


In [16]:
labels = [label_list[i] for i in example[f"{task}_tags"]]
metric.compute(predictions=[labels], references=[labels])

{'MetricName': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'MetricValue': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

In [17]:
import numpy as np

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [18]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [None]:
# trainer.train()

In [None]:
trainer.evaluate()

In [None]:
predictions, labels, _ = trainer.predict(tokenized_test_dataset)
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

In [None]:
predictions, labels, _ = trainer.predict(tokenized_test_dataset[:5])
