In [13]:
from datasets import load_dataset

file_path = "datasets/ner-large-dataset-train.json"

# Load the dataset from the file
dataset = load_dataset('json', data_files=file_path)

# Print the dataset to verify
print(dataset)
data = dataset["train"]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 226
    })
})


In [14]:
unique_labels = set(label for example in data for label in example["labels"])
label2id = {label: id for id, label in enumerate(unique_labels)}
id2label = {id: label for label, id in label2id.items()}

In [15]:
id2label

{0: 'B-Experience_Level',
 1: 'B-Job_role',
 2: 'B-Skill',
 3: 'O',
 4: 'B-Job_Role',
 5: 'B-Domain'}

In [16]:
label2id

{'B-Experience_Level': 0,
 'B-Job_role': 1,
 'B-Skill': 2,
 'O': 3,
 'B-Job_Role': 4,
 'B-Domain': 5}

In [17]:
labels = dataset['train']['labels']

encoded_labels = []
for label_sequence in labels:
    encoded_sequence = [label2id[label] for label in label_sequence]
    encoded_labels.append(encoded_sequence)

encoded_dataset = dataset['train'].add_column('encoded_labels', encoded_labels)

In [18]:
for i in range(3):
    print('tokens: ', encoded_dataset[i]['tokens'])
    print('labels: ', encoded_dataset[i]['labels'])
    print('encoded_labels: ', encoded_dataset[i]['encoded_labels'])

tokens:  ['We', 'are', 'seeking', 'an', 'experienced', 'Senior', 'Software', 'Engineer', 'with', 'expertise', 'in', 'Python', ',', 'Django', ',', 'and', 'RESTful', 'API', 'development', 'to', 'join', 'our', 'backend', 'team', '.', 'The', 'ideal', 'candidate', 'should', 'have', '5+', 'years', 'of', 'experience', 'in', 'building', 'scalable', 'and', 'high-performance', 'web', 'applications', 'in', 'the', 'tech', 'industry', '.']
labels:  ['O', 'O', 'O', 'O', 'O', 'B-Experience_Level', 'B-Experience_Level', 'B-Job_Role', 'O', 'O', 'O', 'O', 'B-Skill', 'O', 'B-Skill', 'O', 'B-Skill', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Domain', 'O']
encoded_labels:  [3, 3, 3, 3, 3, 0, 0, 4, 3, 3, 3, 3, 2, 3, 2, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 5, 3]
tokens:  ['A', 'leading', 'healthcare', 'company', 'is', 'looking', 'for', 'a', 'Mid-level', 'Data', 'Scientist', 'proficient', 'in', 'machine', 'learning', 'algorithms,', 'data', 'mining,', 'and', 'Python.', 'Experience', 'with', 'T

In [19]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")

In [20]:

def tokenize_and_align_tags(records):
    # Tokenize the input words. This will break words into subtokens if necessary.
    # For instance, "ChatGPT" might become ["Chat", "##G", "##PT"].
    tokenized_results = tokenizer(records["tokens"], truncation=True, is_split_into_words=True)

    input_tags_list = []

    # Iterate through each set of tags in the records.
    for i, given_tags in enumerate(records["encoded_labels"]):
        # Get the word IDs corresponding to each token. This tells us to which original word each token corresponds.
        word_ids = tokenized_results.word_ids(batch_index=i)

        previous_word_id = None
        input_tags = []

        # For each token, determine which tag it should get.
        for wid in word_ids:
            # If the token does not correspond to any word (e.g., it's a special token), set its tag to -100.
            if wid is None:
                input_tags.append(-100)
            # If the token corresponds to a new word, use the tag for that word.
            elif wid != previous_word_id:
                if wid < len(given_tags):
                    input_tags.append(given_tags[wid])
                else:
                    input_tags.append(-100)
            # If the token is a subtoken (i.e., part of a word we've already tagged), set its tag to -100.
            else:
                input_tags.append(-100)
            previous_word_id = wid

        input_tags_list.append(input_tags)

    # Add the assigned tags to the tokenized results.
    # In the Hugging Face Transformers library, a model recognizes the labels parameter
    # for computing losses along with logits (predictions)
    tokenized_results["labels"] = input_tags_list

    return tokenized_results

tokenized_encoded_dataset = encoded_dataset.map(tokenize_and_align_tags, batched=True)


Map:   0%|          | 0/226 [00:00<?, ? examples/s]

In [21]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2024-04-20 15:04:29.219223: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-20 15:04:29.501490: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-20 15:04:29.501624: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
20

In [22]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    "dslim/bert-base-NER", num_labels=len(id2label), id2label=id2label, label2id=label2id, ignore_mismatched_sizes=True
)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at dslim/bert-base-NER and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([6]) in the model instantiated
- classifier.weight: found shape torch.Size([9, 768]) 

In [23]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="my_finetuned_ner_model",
)

In [24]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_encoded_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


  0%|          | 0/87 [00:00<?, ?it/s]

{'train_runtime': 212.9294, 'train_samples_per_second': 3.184, 'train_steps_per_second': 0.409, 'train_loss': 0.6573095650508486, 'epoch': 3.0}


TrainOutput(global_step=87, training_loss=0.6573095650508486, metrics={'train_runtime': 212.9294, 'train_samples_per_second': 3.184, 'train_steps_per_second': 0.409, 'train_loss': 0.6573095650508486, 'epoch': 3.0})

In [25]:
model_path="/home/abdallah/Documents/new AI/Transformers/NER-fine-tuning/my_finetuned_ner_model"


model.save_pretrained(model_path)

tokenizer.save_pretrained(model_path)

('/home/abdallah/Documents/new AI/Transformers/NER-fine-tuning/my_finetuned_ner_model/tokenizer_config.json',
 '/home/abdallah/Documents/new AI/Transformers/NER-fine-tuning/my_finetuned_ner_model/special_tokens_map.json',
 '/home/abdallah/Documents/new AI/Transformers/NER-fine-tuning/my_finetuned_ner_model/vocab.txt',
 '/home/abdallah/Documents/new AI/Transformers/NER-fine-tuning/my_finetuned_ner_model/added_tokens.json',
 '/home/abdallah/Documents/new AI/Transformers/NER-fine-tuning/my_finetuned_ner_model/tokenizer.json')

In [26]:
from transformers import BertConfig

# Load pre-trained BERT model configuration
pretrained_config = BertConfig.from_pretrained("dslim/bert-base-NER")

# Modify configuration parameters for fine-tuning
pretrained_config.num_labels = len(id2label)  # Set the number of labels
pretrained_config.id2label = id2label  # Set the id to label mapping
pretrained_config.label2id = label2id  # Set the label to id mapping
pretrained_config.ignore_mismatched_sizes = True  # Ignore mismatched sizes

# Serialize modified configuration object to JSON string
config_json = pretrained_config.to_json_string()

# Save JSON string to config.json file
with open("config.json", "w") as config_file:
    config_file.write(config_json)


In [27]:
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer


# Load the fine-tuned model and tokenizer
model = AutoModelForTokenClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Create a pipeline for named entity recognition (NER)
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)



In [33]:
# Example input text
tokens = "Junior Data Scientist with expertise in machine learning and Python and ruby."
results = ner_pipeline(tokens)

# Display the results
for result in results:
    print(f"Entity: {result['word']}, Type: {result['entity']}")


Entity: Junior, Type: B-Experience_Level
Entity: Data, Type: B-Job_Role
Entity: Scientist, Type: B-Job_Role
Entity: machine, Type: B-Skill
Entity: learning, Type: B-Skill
Entity: Python, Type: B-Skill
Entity: rub, Type: B-Skill
