In [8]:
from transformers import RobertaTokenizer, RobertaForTokenClassification
import torch


In [6]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

[0, 20920, 232, 2]

In [7]:
model = RobertaForTokenClassification.from_pretrained("roberta-base")

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
print(inputs)
labels = torch.tensor([1] * inputs["input_ids"].size(1)).unsqueeze(0)
print(labels)

{'input_ids': tensor([[    0, 31414,     6,   127,  2335,    16, 11962,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}
tensor([[1, 1, 1, 1, 1, 1, 1, 1]])


In [12]:
outputs = model(**inputs, labels=labels)
print(outputs)
loss, scores = outputs[:2]
print(loss)
print(scores)

TokenClassifierOutput(loss=tensor(0.9804, grad_fn=<NllLossBackward0>), logits=tensor([[[ 0.4220, -0.0332],
         [ 0.5415,  0.0371],
         [ 0.4046,  0.0037],
         [ 0.4498, -0.0976],
         [ 0.5704,  0.1460],
         [ 0.6600, -0.0101],
         [ 0.5079, -0.0814],
         [ 0.4309, -0.0475]]], grad_fn=<ViewBackward0>), hidden_states=None, attentions=None)
tensor(0.9804, grad_fn=<NllLossBackward0>)
tensor([[[ 0.4220, -0.0332],
         [ 0.5415,  0.0371],
         [ 0.4046,  0.0037],
         [ 0.4498, -0.0976],
         [ 0.5704,  0.1460],
         [ 0.6600, -0.0101],
         [ 0.5079, -0.0814],
         [ 0.4309, -0.0475]]], grad_fn=<ViewBackward0>)


In [14]:
from transformers import AutoTokenizer, RobertaForQuestionAnswering
import torch

tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")
model = RobertaForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")

question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"

inputs = tokenizer(question, text, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

answer_start_index = outputs.start_logits.argmax()
answer_end_index = outputs.end_logits.argmax()

predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
tokenizer.decode(predict_answer_tokens, skip_special_tokens=True)

# target is "nice puppet"
target_start_index = torch.tensor([14])
target_end_index = torch.tensor([15])

outputs = model(**inputs, start_positions=target_start_index, end_positions=target_end_index)
loss = outputs.loss
round(loss.item(), 2)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


0.86

In [19]:
answer_start_index = outputs.start_logits.argmax()
answer_end_index = outputs.end_logits.argmax()
predict_answer_tokens = inputs.input_ids[0, target_start_index : target_end_index + 1]
tokenizer.decode(predict_answer_tokens, skip_special_tokens=True)

' nice puppet'

In [20]:
from transformers import AutoTokenizer, RobertaForTokenClassification
import torch

tokenizer = AutoTokenizer.from_pretrained("Jean-Baptiste/roberta-large-ner-english")
model = RobertaForTokenClassification.from_pretrained("Jean-Baptiste/roberta-large-ner-english")

inputs = tokenizer(
    "HuggingFace is a company based in Paris and New York", add_special_tokens=False, return_tensors="pt"
)

with torch.no_grad():
    logits = model(**inputs).logits

predicted_token_class_ids = logits.argmax(-1)

# Note that tokens are classified rather then input words which means that
# there might be more predicted token classes than words.
# Multiple token classes might account for the same word
predicted_tokens_classes = [model.config.id2label[t.item()] for t in predicted_token_class_ids[0]]
predicted_tokens_classes

labels = predicted_token_class_ids
loss = model(**inputs, labels=labels).loss
round(loss.item(), 2)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


0.01

In [41]:
import torch
from transformers import RobertaTokenizerFast, RobertaForTokenClassification, Trainer, TrainingArguments


# Load the dataset (replace 'your_dataset' with the actual dataset)
# The dataset should be in a format compatible with the Hugging Face datasets library
dataset = {
    "sentence": [
        "My skills include project management and communication.",
        "I have expertise in Python and data analysis.",
        "Proficient in Java, C++, and machine learning.",
        "Experienced in financial analysis and strategic planning.",
        "Skilled in cloud computing, databases, and network security."
    ],
    "labels": [
        ["O", "O", "O", "O", "B-SKILL", "O", "B-SKILL", "I-SKILL"],
        ["O", "O", "O", "B-SKILL", "I-SKILL", "O", "B-SKILL"],
        ["O", "O", "B-SKILL", "B-SKILL", "O", "B-SKILL", "I-SKILL"],
        ["O", "O", "B-SKILL", "I-SKILL", "O", "B-SKILL", "I-SKILL"],
        ["O", "O", "B-SKILL", "I-SKILL", "O", "B-SKILL", "O", "B-SKILL", "I-SKILL"]
    ]
}

# Load tokenizer and model
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base", add_prefix_space=True)
model = RobertaForTokenClassification.from_pretrained("roberta-base", num_labels=3)

# Define label mappings
label_list = ["O", "B-SKILL", "I-SKILL"]
label_map = {label: i for i, label in enumerate(label_list)}

# Tokenize the dataset
def tokenize_and_align_labels(examples):
    text = examples["text"].split(" ")
    print(text)
    tokenized_inputs = tokenizer(text, truncation=True, is_split_into_words=True)
    labels = []
    print("tokenized_inputs", tokenized_inputs)

    word_ids = tokenized_inputs.word_ids()
    word_ids = word_ids[1:-1]
    for label in examples[f"ner_tags"]:
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label_map[label])
            else:
                label_ids.append(label_map[label] if label != "O" else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_datasets = []
for i in range(len(dataset["sentence"])-1):
    tokenized_datasets.append(tokenize_and_align_labels({"text": dataset["sentence"][i], "ner_tags": dataset["labels"][i]}))
print(tokenized_datasets)

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


['My', 'skills', 'include', 'project', 'management', 'and', 'communication.']
tokenized_inputs {'input_ids': [0, 1308, 2417, 680, 695, 1052, 8, 4358, 4, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['I', 'have', 'expertise', 'in', 'Python', 'and', 'data', 'analysis.']
tokenized_inputs {'input_ids': [0, 38, 33, 6424, 11, 31886, 8, 414, 1966, 4, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['Proficient', 'in', 'Java,', 'C++,', 'and', 'machine', 'learning.']
tokenized_inputs {'input_ids': [0, 6853, 35056, 11, 24549, 6, 230, 42964, 6, 8, 3563, 2239, 4, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['Experienced', 'in', 'financial', 'analysis', 'and', 'strategic', 'planning.']
tokenized_inputs {'input_ids': [0, 26403, 33582, 11, 613, 1966, 8, 3461, 1884, 4, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
[{'input_ids': [0, 1308, 2417, 680, 695, 1052, 8, 4358, 4, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [[0, 0, 0, 0, 0,

In [47]:
from datasets import load_metric
import numpy as np

# Define metrics for evaluation
metric = load_metric("seqeval")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.21.0`: Please run `pip install transformers[torch]` or `pip install accelerate -U`

In [None]:
# Example text
text = "I have expertise in Python and data analysis."

# Tokenize the input text
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, is_split_into_words=True)

# Predict
with torch.no_grad():
    logits = model(**inputs).logits

# Get the predicted class indices
predicted_token_class_ids = logits.argmax(-1)

# Map indices to labels
predicted_tokens_classes = [model.config.id2label[t.item()] for t in predicted_token_class_ids[0]]

# Print the result
tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
result = list(zip(tokens, predicted_tokens_classes))
for token, label in result:
    print(f"{token}: {label}")