<a href="https://colab.research.google.com/github/AbeHandler/AbeHandler.github.io/blob/master/HW3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Preliminaries

In order to complete the assignment, you will need to take three preliminary steps.

1. Uncomment the next line and install the required dependencies using pip. You don't have to use a GPU runtime to complete this assignment.

**TODO:** Do we need these steps?
1. Create a free account on [Hugging Face](https://huggingface.co/join). You don't need to provide credit card information, and you do not need to pay for anything.
2. Create an access token by following the steps outlined [here](https://huggingface.co/docs/hub/security-tokens).
3. Log in to Hugging Face with your access token by running the cell below. You will be asked to provide your access token.

In [4]:
# ! pip install transformers datasets evaluate seqeval transformers[torch]

In [3]:
#from datasets import load_dataset
#dataset = load_dataset("json", data_files="data.jsonl")

In [1]:
# load the necessary dependencies

In [5]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
from datasets import load_dataset, load_metric, Dataset, DatasetDict
import numpy as np
from seqeval.metrics import f1_score, precision_score, recall_score, classification_report


In [33]:
def read_conll_file(file_path):
    """Read a CoNLL-style file into a list of sentences.

    Each non-blank line is split on whitespace into a list of column values
    (a "row"). Consecutive rows form one sentence; any run of blank or
    whitespace-only lines ends the current sentence.

    Args:
        file_path: Path of the text file to read (decoded as UTF-8).

    Returns:
        A list of sentences, each a non-empty list of rows, each row a
        non-empty list of strings. An empty file yields an empty list.
    """
    data = []
    current_sentence = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            columns = line.split()
            if columns:
                current_sentence.append(columns)
            elif current_sentence:
                # Blank line: close the sentence. Repeated blank lines are
                # ignored so they can never create empty rows or sentences.
                data.append(current_sentence)
                current_sentence = []
    # The last sentence is usually not followed by a blank line.
    if current_sentence:
        data.append(current_sentence)
    return data


# Parse the training and validation files with read_conll_file, in that order.
train_data, validation_data = (
    read_conll_file(path) for path in ("train.conll.txt", "validation.conll.txt")
)
# The test split is not loaded:
# test_data = read_conll_file("test.txt")


def convert_to_dataset(data, label_map):
    """Build a `Dataset` with "tokens" and "ner_tags" columns from parsed sentences.

    Args:
        data: Iterable of sentences; each sentence is a list of rows and each
            row is a list whose first item is the token and whose second item
            is the label string.
        label_map: Mapping from label string to integer id.

    Returns:
        The result of `Dataset.from_dict` on the collected columns. A sentence
        is included only if every one of its rows could be converted.

    Sentences that are empty, or whose first row is empty, are ignored.
    Sentences containing a row with fewer than two columns are skipped and
    counted; the count is printed once at the end. Sentences with a label
    missing from `label_map` are printed and skipped.
    """
    formatted_data = {"tokens": [], "ner_tags": []}
    malformed = 0
    for sentence in data:
        # `not sentence` guards the `sentence[0]` lookup against empty sentences.
        if not sentence or sentence[0] == []:
            continue
        try:
            tokens = [token_data[0] for token_data in sentence]
            ner_tags = [label_map[token_data[1]] for token_data in sentence]
        except IndexError:
            # A row lacks the token or the label column; drop the whole
            # sentence so tokens and tags stay aligned.
            malformed += 1
            continue
        except KeyError:
            print(sentence)
            continue
        formatted_data["tokens"].append(tokens)
        formatted_data["ner_tags"].append(ner_tags)
    if malformed:
        print(f"Skipped {malformed} sentence(s) with rows missing a token or label column")
    return Dataset.from_dict(formatted_data)


# Collect every label string that occurs in the training data; rows without
# a second column contribute nothing.
label_set = set()
for sentence in train_data:
    for token_data in sentence:
        if len(token_data) > 1:
            label_set.add(token_data[1])

# Sorting makes the label -> id assignment independent of set ordering.
label_list = sorted(label_set)
label_map = dict(zip(label_list, range(len(label_list))))


# Convert both parsed splits with the same label mapping.
train_dataset, validation_dataset = (
    convert_to_dataset(split, label_map) for split in (train_data, validation_data)
)
# test_dataset = convert_to_dataset(test_data, label_map)


# Bundle the splits under the names used by the training cells
# (add `"test": test_dataset` here once a test split is loaded).
datasets = DatasetDict({"train": train_dataset, "validation": validation_dataset})


In [6]:
#def read_conll_file(file_path):
#    lines = 0
#    with open(file_path, "r") as f:
#        content = f.read().strip()
#        sentences = content.split("\n\n")
#        data = []
#        for sentence in sentences:
#            tokens = sentence.split("\n")
#            token_data = []
#            for token in tokens:
#                token_data.append(token.split())
#            data.append(token_data)
#            lines += 1
#    return data
# train_data = read_conll_file("train.txt")

In [8]:
#from huggingface_hub import notebook_login
#notebook_login()

In [None]:
#

If everything worked, you should see something like this.

<img src="https://i.ibb.co/68MdmdT/meta.png" alt="Alternative text" />

### Inspect the data

In [7]:
#from datasets import load_dataset
#wnut = load_dataset("wnut_17")
#wnut

In [34]:
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Store the label vocabulary in the model config (as the WNUT model further
# down does) so that class ids can be decoded via `model.config.id2label`.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=dict(enumerate(label_list)),
    label2id={label: i for i, label in enumerate(label_list)},
)


Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [40]:
def compute_metrics(eval_prediction):
    """Compute seqeval precision, recall and F1 for a batch of predictions.

    `eval_prediction` unpacks into `(predictions, labels)`. Predictions are
    reduced with an argmax over axis 2; positions whose label is -100 are
    left out of both sequences before scoring. Ids are turned into label
    strings through the module-level `label_list`.

    Returns:
        A dict with the keys "precision", "recall" and "f1".
    """
    scores, gold_ids = eval_prediction
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept_predictions = []
        kept_labels = []
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            # -100 marks positions that must not be scored.
            if gold_id == -100:
                continue
            kept_predictions.append(label_list[predicted_id])
            kept_labels.append(label_list[gold_id])
        true_predictions.append(kept_predictions)
        true_labels.append(kept_labels)

    return {
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
        # "classification_report": classification_report(true_labels, true_predictions),
    }


def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and align word-level tags to the tokens.

    The module-level `tokenizer` is called on `examples["tokens"]` with
    truncation, padding and `is_split_into_words=True`. For every sentence a
    label list as long as its token sequence is built: the first token of
    each word receives that word's tag, every other position (tokens with no
    word id, and continuation tokens of a word) receives -100.

    Returns:
        The tokenizer output with an added "labels" entry.
    """
    encoded = tokenizer(
        examples["tokens"], is_split_into_words=True, truncation=True, padding=True
    )
    aligned_labels = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        word_ids = encoded.word_ids(batch_index=row)
        # Pair each position's word id with the word id of the position before it.
        preceding_ids = [None] + word_ids[:-1]
        aligned_labels.append(
            [
                word_tags[current] if current is not None and current != preceding else -100
                for current, preceding in zip(word_ids, preceding_ids)
            ]
        )
    encoded["labels"] = aligned_labels
    return encoded


In [42]:
# Tokenize every split and attach the aligned "labels" column.
tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)

training_args = TrainingArguments(
    output_dir="./results",
    # Optimisation
    learning_rate=5e-5,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Logging, evaluation and checkpointing
    logging_steps=100,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # NOTE(review): with both strategies set to "epoch" the two step counts
    # below presumably have no effect — confirm against the installed
    # transformers version.
    eval_steps=500,
    save_steps=500,
    # Keep the checkpoint with the best "f1" reported by compute_metrics.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)


Map:   0%|          | 0/5120 [00:00<?, ? examples/s]

Map:   0%|          | 0/474 [00:00<?, ? examples/s]

In [43]:
def data_collator(data):
    """Collate a list of tokenized examples into padded batch tensors.

    Args:
        data: Sequence of examples, each providing "input_ids",
            "attention_mask" and "labels" as lists of integers.

    Returns:
        A dict with the same three keys; each value is a tensor padded
        (batch first) to the longest example. Padding uses the tokenizer's
        pad token id for "input_ids", 0 for "attention_mask" and -100 for
        "labels".
    """

    def padded(field, fill_value):
        # One tensor per example, then right-pad to a common length.
        rows = [torch.tensor(example[field]) for example in data]
        return torch.nn.utils.rnn.pad_sequence(rows, batch_first=True, padding_value=fill_value)

    return {
        "input_ids": padded("input_ids", tokenizer.pad_token_id),
        "attention_mask": padded("attention_mask", 0),
        "labels": padded("labels", -100),
    }


# Wire the model, the tokenized splits, the collator and the metric function
# into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

In [None]:
# Run training with the Trainer configured in the previous cell.
trainer.train()

Epoch,Training Loss,Validation Loss


In [28]:
sentence = "I have lupus"

# Tokenize the sentence and move the tensors to the device the model lives on.
tokenized_input = tokenizer(sentence, return_tensors="pt").to(model.device)

# Inference only: switch off dropout and gradient tracking.
model.eval()
with torch.no_grad():
    outputs = model(**tokenized_input)

# Highest-scoring class id for each token of the (single) input sentence.
predicted_labels = outputs.logits.argmax(-1)[0]

# Keep every token whose predicted class is not 'O'. The id of 'O' is looked
# up in label_map rather than assumed to be 0: in this label set id 0 is a
# real entity class. Special tokens such as [CLS]/[SEP] are not part of the
# sentence and are left out.
special_token_ids = set(tokenizer.all_special_ids)
named_entities = [
    (tokenizer.decode([token]), label)
    for token, label in zip(tokenized_input["input_ids"][0], predicted_labels)
    if label != label_map['O'] and token.item() not in special_token_ids
]

print("Named Entities - Example 1:", named_entities)


Named Entities - Example 1: [('lu', tensor(2, device='cuda:0')), ('##pus', tensor(2, device='cuda:0'))]


In [32]:
# Show the label-string -> integer-id mapping built from the training data.
label_map

{'B-I': 0, 'B-O': 1, 'B-P': 2, 'I-I': 3, 'I-O': 4, 'I-P': 5, 'O': 6}

In [19]:
# dataset["train"]

Dataset({
    features: ['ner_tags', 'tokens', 'id'],
    num_rows: 2265
})

### BIO tags question

### Load the tokenizer

In [5]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the "distilbert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Tokenizer question here

- [word piece](https://huggingface.co/learn/nlp-course/chapter6/6?fw=pt)
- CLS and SEP

In [6]:
# Example text

text = "He told me that the Nissan funduplication surgery was a success for 80% of cases after 15 years."

# Encode the sentence (returning PyTorch tensors), then display the token
# string behind each resulting id.
inputs = tokenizer(text, return_tensors="pt")
tokenizer.convert_ids_to_tokens(inputs["input_ids"].squeeze())

['[CLS]',
 'he',
 'told',
 'me',
 'that',
 'the',
 'nissan',
 'fund',
 '##up',
 '##lica',
 '##tion',
 'surgery',
 'was',
 'a',
 'success',
 'for',
 '80',
 '%',
 'of',
 'cases',
 'after',
 '15',
 'years',
 '.',
 '[SEP]']

In [9]:
# Keep a handle on the tokenizer's vocabulary and test whether the string
# "he" is one of its entries.
V = tokenizer.vocab
"he" in V

True

In [12]:
# Number of entries in the vocabulary.
len(V)

30522

In [13]:
"funduplication" in V # sparsity of text

False

In [13]:
# NOTE(review): `wnut` is not assigned by any uncommented cell visible above
# (the cell containing `load_dataset("wnut_17")` is commented out) — confirm
# the dataset is loaded before this cell runs.
# Look at the first training example.
example = wnut["train"][0]
example

{'id': '0',
 'tokens': ['@paulwalk',
  'It',
  "'s",
  'the',
  'view',
  'from',
  'where',
  'I',
  "'m",
  'living',
  'for',
  'two',
  'weeks',
  '.',
  'Empire',
  'State',
  'Building',
  '=',
  'ESB',
  '.',
  'Pretty',
  'bad',
  'storm',
  'here',
  'last',
  'evening',
  '.'],
 'ner_tags': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  7,
  8,
  8,
  0,
  7,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0]}

In [6]:
example = wnut["train"][0]
# The example is stored as a list of words, so the tokenizer is told the
# input is already split.
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
# Map the ids back to token strings to inspect how the words were split.
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
tokens

['[CLS]',
 '@',
 'paul',
 '##walk',
 'it',
 "'",
 's',
 'the',
 'view',
 'from',
 'where',
 'i',
 "'",
 'm',
 'living',
 'for',
 'two',
 'weeks',
 '.',
 'empire',
 'state',
 'building',
 '=',
 'es',
 '##b',
 '.',
 'pretty',
 'bad',
 'storm',
 'here',
 'last',
 'evening',
 '.',
 '[SEP]']

In [7]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and spread word-level tags over the tokens.

    The module-level `tokenizer` is called on `examples["tokens"]` with
    truncation and `is_split_into_words=True`. Within each sentence the first
    token of every word keeps that word's tag from `examples["ner_tags"]`;
    tokens without a word id and the remaining tokens of a word get -100.

    Returns:
        The tokenizer output with an added "labels" entry.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def align(word_ids, word_tags):
        # Walk the tokens once, remembering the word id of the previous token.
        aligned, last_seen = [], None
        for word_id in word_ids:
            starts_new_word = word_id is not None and word_id != last_seen
            aligned.append(word_tags[word_id] if starts_new_word else -100)
            last_seen = word_id
        return aligned

    encoded["labels"] = [
        align(encoded.word_ids(batch_index=row), word_tags)
        for row, word_tags in enumerate(examples["ner_tags"])
    ]
    return encoded

# Run the alignment function over `wnut` in batched mode.
tokenized_wnut = wnut.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/3394 [00:00<?, ? examples/s]

Map:   0%|          | 0/1009 [00:00<?, ? examples/s]

Map:   0%|          | 0/1287 [00:00<?, ? examples/s]

In [8]:
import evaluate

# Load the "seqeval" metric through the `evaluate` library.
seqeval = evaluate.load("seqeval")


Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [9]:
import numpy as np

# Label names for the WNUT tag ids, read from the dataset's own feature
# schema. A dedicated name is used so these labels cannot be confused with
# the `label_list` built earlier from the custom CoNLL data, which has a
# different label set.
wnut_label_list = wnut["train"].features["ner_tags"].feature.names

# Label names of the example sentence inspected above.
labels = [wnut_label_list[i] for i in example["ner_tags"]]

def compute_metrics(p):
    """Compute overall seqeval scores for WNUT predictions.

    Args:
        p: A `(predictions, labels)` pair. Predictions are reduced with an
            argmax over axis 2; positions labelled -100 are excluded from
            both sequences before scoring.

    Returns:
        A dict with the keys "precision", "recall", "f1" and "accuracy",
        taken from the "overall_*" entries of the seqeval result.
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Decode ids to label names, skipping positions that carry no label.
    true_predictions = [
        [wnut_label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [wnut_label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [10]:
# Tag names in id order: position in the list is the integer id.
id2label = dict(
    enumerate(
        [
            "O",
            "B-corporation",
            "I-corporation",
            "B-creative-work",
            "I-creative-work",
            "B-group",
            "I-group",
            "B-location",
            "I-location",
            "B-person",
            "I-person",
            "B-product",
            "I-product",
        ]
    )
)
# Inverse mapping, derived so the two dicts cannot drift apart.
label2id = {tag: tag_id for tag_id, tag in id2label.items()}

from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

# Token-classification head with 13 classes on top of DistilBERT; the label
# mappings are stored in the model config.
model = AutoModelForTokenClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=13,
    id2label=id2label,
    label2id=label2id,
)

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
from transformers import DataCollatorForTokenClassification

# NOTE(review): this rebinds `data_collator`, which an earlier cell defines
# as a hand-written padding function.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [19]:
training_args = TrainingArguments(
    output_dir="my_awesome_wnut_model",
    # Optimisation
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch, then restore the best checkpoint.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

# NOTE(review): evaluation (and therefore best-checkpoint selection) runs on
# the "test" split — confirm this is intended rather than a validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_wnut["train"],
    eval_dataset=tokenized_wnut["test"],
)

trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.267852,0.547718,0.367006,0.439512,0.945022
2,No log,0.280791,0.59816,0.361446,0.450607,0.946518
3,0.072800,0.320172,0.595016,0.354032,0.443928,0.945919
4,0.072800,0.314644,0.55291,0.387396,0.455586,0.947031
5,0.033100,0.30468,0.535055,0.403151,0.459831,0.946261


TrainOutput(global_step=1065, training_loss=0.05105608259568192, metrics={'train_runtime': 132.4988, 'train_samples_per_second': 128.077, 'train_steps_per_second': 8.038, 'total_flos': 230447974115460.0, 'train_loss': 0.05105608259568192, 'epoch': 5.0})

In [13]:
# Write the model and the tokenizer to the same directory; later cells load
# both back from this path.
model.save_pretrained('./my_awesome_wnut_model')
tokenizer.save_pretrained('./my_awesome_wnut_model')


('./my_awesome_wnut_model/tokenizer_config.json',
 './my_awesome_wnut_model/special_tokens_map.json',
 './my_awesome_wnut_model/vocab.txt',
 './my_awesome_wnut_model/added_tokens.json',
 './my_awesome_wnut_model/tokenizer.json')

In [14]:
from transformers import pipeline

text = "The Golden State Warriors are an American professional basketball team based in San Francisco."

# Load the model and tokenizer saved to ./my_awesome_wnut_model into a
# token-classification pipeline and tag the example sentence. Without this
# call the cell imports `pipeline` but produces no output.
classifier = pipeline("ner", model="./my_awesome_wnut_model")
classifier(text)

[{'entity': 'B-location',
  'score': 0.49131355,
  'index': 2,
  'word': 'golden',
  'start': 4,
  'end': 10},
 {'entity': 'B-location',
  'score': 0.234397,
  'index': 3,
  'word': 'state',
  'start': 11,
  'end': 16},
 {'entity': 'B-location',
  'score': 0.55225897,
  'index': 13,
  'word': 'san',
  'start': 80,
  'end': 83},
 {'entity': 'B-location',
  'score': 0.48205167,
  'index': 14,
  'word': 'francisco',
  'start': 84,
  'end': 93}]

In [24]:
import torch
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification


text = "I love New York"

# Reload the tokenizer from the saved directory and encode the sentence as
# PyTorch tensors.
tokenizer = AutoTokenizer.from_pretrained("./my_awesome_wnut_model")
inputs = tokenizer(text, return_tensors="pt")

# Reload the model from the same directory and run a forward pass without
# gradient tracking.
model = AutoModelForTokenClassification.from_pretrained("./my_awesome_wnut_model")
with torch.no_grad():
    logits = model(**inputs).logits

In [25]:
# Highest-scoring class id per token, translated to label names through the
# mapping stored in the model config.
predictions = torch.argmax(logits, dim=2)
predicted_token_class = [model.config.id2label[t.item()] for t in predictions[0]]
predicted_token_class

['O', 'O', 'O', 'B-location', 'B-location', 'O']

In [29]:
# Token ids the tokenizer produced for `text`.
inputs["input_ids"]

tensor([[ 101, 1045, 2293, 2047, 2259,  102]])

In [30]:
# Calling the tokenizer on a plain string shows the fields it returns.
tokenizer("this is a test")

{'input_ids': [101, 2023, 2003, 1037, 3231, 102], 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [32]:
# Get predictions
with torch.no_grad():
    logits = model(**inputs).logits

# Get the predicted class id for each token (argmax over the last axis)
predicted_token_ids = torch.argmax(logits, dim=-1)

# Token strings for the same positions
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"].squeeze())

# Map tokens to their predictions
for token, prediction in zip(tokens, predicted_token_ids.squeeze().tolist()):
    print(f"{token}: {prediction}")

[CLS]: 0
i: 0
love: 0
new: 7
york: 7
[SEP]: 0


In [33]:
# Token strings for the encoded sentence, including the special tokens the
# tokenizer added.
tokenizer.convert_ids_to_tokens(inputs["input_ids"].squeeze())

['[CLS]', 'i', 'love', 'new', 'york', '[SEP]']