In [48]:
from datasets import *

file_path = "datasets/ner-large-dataset-train.json"

# Load the dataset from the file
dataset = load_dataset('json', data_files=file_path)

train_valid = dataset['train'].train_test_split(test_size=0.2)
train = train_valid['train']
valid = train_valid['test']

ds = DatasetDict({"train": train, "valid": valid})

ds

DatasetDict({
    train: Dataset({
        features: ['labels', 'tokens'],
        num_rows: 245
    })
    valid: Dataset({
        features: ['labels', 'tokens'],
        num_rows: 62
    })
})

In [49]:
unique_labels = set(label for example in train for label in example["labels"])
label2id = {label: id for id, label in enumerate(unique_labels)}
id2label = {id: label for label, id in label2id.items()}

In [50]:
id2label

{0: 'O',
 1: 'B-Experience_Level',
 2: 'B-Job_Role',
 3: 'B-Domain',
 4: 'B-Job_role',
 5: 'B-Skill'}

In [51]:
label2id

{'O': 0,
 'B-Experience_Level': 1,
 'B-Job_Role': 2,
 'B-Domain': 3,
 'B-Job_role': 4,
 'B-Skill': 5}

In [56]:
labels_train = ds['train']['labels']
labels_valid = ds['valid']['labels']
encoded_labels_train = []
for label_sequence in labels_train:
    encoded_sequence = [label2id[label] for label in label_sequence]
    encoded_labels_train.append(encoded_sequence)
    
encoded_labels_valid = []
for label_sequence in labels_valid:
    encoded_sequence = [label2id[label] for label in label_sequence]
    encoded_labels_valid.append(encoded_sequence)

encoded_dataset = ds['train'].add_column('encoded_labels', encoded_labels_train)
encoded_valid = ds['valid'].add_column('encoded_labels', encoded_labels_valid)

In [44]:
for i in range(3):
    print('tokens: ', encoded_dataset[i]['tokens'])
    print('labels: ', encoded_dataset[i]['labels'])
    print('encoded_labels: ', encoded_dataset[i]['encoded_labels'])

tokens:  ['We', 'are', 'seeking', 'an', 'experienced', 'Senior', 'Software', 'Engineer', 'with', 'expertise', 'in', 'Python', ',', 'Django', ',', 'and', 'RESTful', 'API', 'development', 'to', 'join', 'our', 'backend', 'team', '.', 'The', 'ideal', 'candidate', 'should', 'have', '5+', 'years', 'of', 'experience', 'in', 'building', 'scalable', 'and', 'high-performance', 'web', 'applications', 'in', 'the', 'tech', 'industry', '.']
labels:  ['O', 'O', 'O', 'O', 'O', 'B-Experience_Level', 'B-Experience_Level', 'B-Job_Role', 'O', 'O', 'O', 'O', 'B-Skill', 'O', 'B-Skill', 'O', 'B-Skill', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Domain', 'O']
encoded_labels:  [0, 0, 0, 0, 0, 1, 1, 2, 0, 0, 0, 0, 5, 0, 5, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0]
tokens:  ['A', 'leading', 'healthcare', 'company', 'is', 'looking', 'for', 'a', 'Mid-level', 'Data', 'Scientist', 'proficient', 'in', 'machine', 'learning', 'algorithms,', 'data', 'mining,', 'and', 'Python.', 'Experience', 'with', 'T

In [57]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")

In [58]:

def tokenize_and_align_tags(records):
    # Tokenize the input words. This will break words into subtokens if necessary.
    # For instance, "ChatGPT" might become ["Chat", "##G", "##PT"].
    tokenized_results = tokenizer(records["tokens"], truncation=True, is_split_into_words=True)

    input_tags_list = []

    # Iterate through each set of tags in the records.
    for i, given_tags in enumerate(records["encoded_labels"]):
        # Get the word IDs corresponding to each token. This tells us to which original word each token corresponds.
        word_ids = tokenized_results.word_ids(batch_index=i)

        previous_word_id = None
        input_tags = []

        # For each token, determine which tag it should get.
        for wid in word_ids:
            # If the token does not correspond to any word (e.g., it's a special token), set its tag to -100.
            if wid is None:
                input_tags.append(-100)
            # If the token corresponds to a new word, use the tag for that word.
            elif wid != previous_word_id:
                if wid < len(given_tags):
                    input_tags.append(given_tags[wid])
                else:
                    input_tags.append(-100)
            # If the token is a subtoken (i.e., part of a word we've already tagged), set its tag to -100.
            else:
                input_tags.append(-100)
            previous_word_id = wid

        input_tags_list.append(input_tags)

    # Add the assigned tags to the tokenized results.
    # In the Hugging Face Transformers library, a model recognizes the labels parameter
    # for computing losses along with logits (predictions)
    tokenized_results["labels"] = input_tags_list

    return tokenized_results

tokenized_encoded_dataset = encoded_dataset.map(tokenize_and_align_tags, batched=True)
tokenized_encoded_valid = encoded_valid.map(tokenize_and_align_tags, batched=True)


Map:   0%|          | 0/245 [00:00<?, ? examples/s]

Map:   0%|          | 0/62 [00:00<?, ? examples/s]

In [59]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [60]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    "dslim/bert-base-NER", num_labels=len(id2label), id2label=id2label, label2id=label2id, ignore_mismatched_sizes=True
)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at dslim/bert-base-NER and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([6]) in the model instantiated
- classifier.weight: found shape torch.Size([9, 768]) 

### Metric for evaluation

In [70]:
label_list = [label for id, label in sorted(id2label.items())]
label_list

['O', 'B-Experience_Level', 'B-Job_Role', 'B-Domain', 'B-Job_role', 'B-Skill']

In [72]:
from datasets import load_metric
import numpy as np

metric = load_metric("seqeval")


def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [63]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="my_finetuned_ner_model",
    evaluation_strategy="epoch",
)

In [73]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_encoded_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    eval_dataset=tokenized_encoded_valid,  # Pass the validation dataset
    compute_metrics= compute_metrics
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


  0%|          | 0/93 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

{'eval_loss': 0.5960123538970947, 'eval_precision': 0.8107255520504731, 'eval_recall': 0.7648809523809523, 'eval_f1': 0.7871362940275649, 'eval_accuracy': 0.8094534711964549, 'eval_runtime': 2.4673, 'eval_samples_per_second': 25.128, 'eval_steps_per_second': 3.242, 'epoch': 1.0}


  0%|          | 0/8 [00:00<?, ?it/s]

{'eval_loss': 0.6159459352493286, 'eval_precision': 0.782991202346041, 'eval_recall': 0.7946428571428571, 'eval_f1': 0.7887740029542099, 'eval_accuracy': 0.8079763663220089, 'eval_runtime': 2.5714, 'eval_samples_per_second': 24.111, 'eval_steps_per_second': 3.111, 'epoch': 2.0}


  0%|          | 0/8 [00:00<?, ?it/s]

{'eval_loss': 0.6356953382492065, 'eval_precision': 0.7905604719764012, 'eval_recall': 0.7976190476190477, 'eval_f1': 0.7940740740740742, 'eval_accuracy': 0.8094534711964549, 'eval_runtime': 2.5935, 'eval_samples_per_second': 23.905, 'eval_steps_per_second': 3.085, 'epoch': 3.0}
{'train_runtime': 167.8537, 'train_samples_per_second': 4.379, 'train_steps_per_second': 0.554, 'train_loss': 0.20442570922195272, 'epoch': 3.0}


TrainOutput(global_step=93, training_loss=0.20442570922195272, metrics={'train_runtime': 167.8537, 'train_samples_per_second': 4.379, 'train_steps_per_second': 0.554, 'train_loss': 0.20442570922195272, 'epoch': 3.0})

In [65]:
model_path="/home/abdallah/Documents/new AI/Transformers/NER-fine-tuning/my_finetuned_ner_model"


model.save_pretrained(model_path)

tokenizer.save_pretrained(model_path)

('/home/abdallah/Documents/new AI/Transformers/NER-fine-tuning/my_finetuned_ner_model/tokenizer_config.json',
 '/home/abdallah/Documents/new AI/Transformers/NER-fine-tuning/my_finetuned_ner_model/special_tokens_map.json',
 '/home/abdallah/Documents/new AI/Transformers/NER-fine-tuning/my_finetuned_ner_model/vocab.txt',
 '/home/abdallah/Documents/new AI/Transformers/NER-fine-tuning/my_finetuned_ner_model/added_tokens.json',
 '/home/abdallah/Documents/new AI/Transformers/NER-fine-tuning/my_finetuned_ner_model/tokenizer.json')

In [66]:
from transformers import BertConfig

# Load pre-trained BERT model configuration
pretrained_config = BertConfig.from_pretrained("dslim/bert-base-NER")

# Modify configuration parameters for fine-tuning
pretrained_config.num_labels = len(id2label)  # Set the number of labels
pretrained_config.id2label = id2label  # Set the id to label mapping
pretrained_config.label2id = label2id  # Set the label to id mapping
pretrained_config.ignore_mismatched_sizes = True  # Ignore mismatched sizes

# Serialize modified configuration object to JSON string
config_json = pretrained_config.to_json_string()

# Save JSON string to config.json file
with open("config.json", "w") as config_file:
    config_file.write(config_json)


In [67]:
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer


# Load the fine-tuned model and tokenizer
model = AutoModelForTokenClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Create a pipeline for named entity recognition (NER)
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)



In [68]:
# Example input text
tokens = "Junior Data Scientist and machine learning developer"
results = ner_pipeline(tokens)

# Display the results
for result in results:
    print(f"Entity: {result['word']}, Type: {result['entity']}")


Entity: Junior, Type: B-Experience_Level
Entity: Data, Type: B-Job_Role
Entity: Scientist, Type: B-Job_Role
Entity: machine, Type: B-Skill


In [None]:
# Entity: Junior, Type: B-Experience_Level
# Entity: Data, Type: B-Job_Role
# Entity: Scientist, Type: B-Job_Role
# Entity: machine, Type: B-Skill
# Entity: learning, Type: B-Skill
# Entity: Python, Type: B-Skill
# Entity: rub, Type: B-Skill