In [1]:
import pandas as pd
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer

In [2]:
# Create a small dataset
data = {
    "sentence": [
        "Braulio Jose Cespedes Acosta",
        "Toronto, ON",
        "(437) 733-7952",
        "brauliojose.cespedesacosta@georgebrown.ca",
        "Java",
        "Python",
        "Oracle",
        "MS SQL server",
        "MySQL",
        "Git",
        "Microsoft Office",
        "Java Mockito",
        "Python Pytest",
        "Tableau",
        "Hadoop",
        "Spark",
        "Hive",
        "Apache Pig"
    ],
    "label": [
        "O O O B-NAME",
        "O O",
        "O O O O O O",
        "O O O O O",
        "B-SKILL",
        "B-SKILL",
        "B-SKILL",
        "B-SKILL I-SKILL",
        "B-SKILL",
        "B-SKILL",
        "B-SKILL I-SKILL",
        "B-SKILL I-SKILL",
        "B-SKILL I-SKILL",
        "B-SKILL",
        "B-SKILL",
        "B-SKILL",
        "B-SKILL",
        "B-SKILL I-SKILL"
    ]
}

# Convert to DataFrame
df = pd.DataFrame(data)

In [3]:
# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

In [4]:
for i, label in enumerate(dataset["label"]):
    print(f"i:{i}, label:{label}")

i:0, label:O O O B-NAME
i:1, label:O O
i:2, label:O O O O O O
i:3, label:O O O O O
i:4, label:B-SKILL
i:5, label:B-SKILL
i:6, label:B-SKILL
i:7, label:B-SKILL I-SKILL
i:8, label:B-SKILL
i:9, label:B-SKILL
i:10, label:B-SKILL I-SKILL
i:11, label:B-SKILL I-SKILL
i:12, label:B-SKILL I-SKILL
i:13, label:B-SKILL
i:14, label:B-SKILL
i:15, label:B-SKILL
i:16, label:B-SKILL
i:17, label:B-SKILL I-SKILL


In [5]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
tokenized_inputs = None
tokenized_inputs = tokenizer(dataset["label"], truncation=True, padding=True, is_split_into_words=True)

In [12]:
tokenized_inputs.word_ids(batch_index=0)

[None,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 3,
 3,
 3,
 3,
 3,
 4,
 4,
 4,
 5,
 5,
 5,
 6,
 6,
 6,
 7,
 7,
 7,
 7,
 7,
 7,
 8,
 8,
 8,
 9,
 9,
 9,
 10,
 10,
 10,
 10,
 10,
 10,
 11,
 11,
 11,
 11,
 11,
 11,
 12,
 12,
 12,
 12,
 12,
 12,
 13,
 13,
 13,
 14,
 14,
 14,
 15,
 15,
 15,
 16,
 16,
 16,
 17,
 17,
 17,
 17,
 17,
 17,
 None]

In [5]:
# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the sentences
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["sentence"], truncation=True, padding=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["label"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label.split()[word_idx])
            else:
                label_ids.append(label.split()[word_idx] if label.split()[word_idx].startswith("I-") else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Tokenize dataset
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/18 [00:00<?, ? examples/s]

IndexError: list index out of range

In [None]:
# Initialize the model
model = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Set training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
)

# Train the model
trainer.train()