# Importing Libs' & DataSet

In [15]:
import numpy as np
import pandas as pd
from datasets import Dataset

df = pd.read_csv('/content/ner_dataset.csv')
df.head()

Unnamed: 0,Text,Entities
0,Emma 2022 New York.,"[{'entity': 'PERSON', 'start': 0, 'end': 4}, {..."
1,Emma.,"[{'entity': 'PERSON', 'start': 0, 'end': 4}]"
2,January 2023 Amazon Boston.,"[{'entity': 'DATE', 'start': 0, 'end': 12}, {'..."
3,Google Google.,"[{'entity': 'ORG', 'start': 0, 'end': 6}, {'en..."
4,New York.,"[{'entity': 'LOC', 'start': 0, 'end': 8}]"


# EDA

In [16]:
import ast

def safe_literal_eval(x):
    try:
        return ast.literal_eval(x)
    except (ValueError, SyntaxError):
        return []

df['Entities'] = df['Entities'].apply(safe_literal_eval)
dataset = Dataset.from_pandas(df)

In [17]:
# Extracts entity's like ['O', 'PERSON']
unique_lbels = ['O'] + [ent['entity'] for record in df['Entities'] for ent in record]
label_to_id = {label : idx for idx, label in enumerate(set(unique_lbels))}
id_to_label = {idx : label for label, idx in label_to_id.items()}

# Importing Model & Tokenizer

In [18]:
from transformers import AutoTokenizer
from datasets import ClassLabel

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-cased')

# Tokenization & Label Align (NER Data Prep)

In [19]:
def tokenize_and_align_labels(demo_text):
    tokenized_inps = tokenizer(
        demo_text['Text'],
        truncation= True,
        is_split_into_words = False, # tell inputs should be cont string.
        padding= 'max_length',
        max_length= 150,
        return_offsets_mapping= True # retruns start & end index of every token.
    )
# For Labels
    labels = []
    for batch_idx in range(len(demo_text['Text'])):
        word_ids = tokenized_inps.word_ids(batch_index= batch_idx)
        examp_labels = [label_to_id['O']] * len(word_ids)
        offset_mapping = tokenized_inps['offset_mapping'][batch_idx]
# For Entity
        for entity in demo_text['Entities'][batch_idx]:
            start = entity['start']
            end = entity['end']
            label = label_to_id[entity['entity']] # assign num value acc to entity.

            for idx, word_id in enumerate(word_ids):
                if word_id is None:
                    continue
                token_start, token_end = offset_mapping[idx]
                # Check if the token is within the entity span
                if token_start >= start and token_end <= end:
                    examp_labels[idx] = label

        labels.append(examp_labels)

    tokenized_inps["labels"] = labels
    tokenized_inps.pop("offset_mapping") # removing temp offset mapping
    return tokenized_inps

tokenized_dataset = dataset.map(tokenize_and_align_labels, batched= True)

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

# Data Spliting

In [20]:
from torch.utils.data import DataLoader

train_dataset = tokenized_dataset.train_test_split(test_size= 0.2)['train']
val_dataset = tokenized_dataset.train_test_split(test_size= 0.2)['test']

train_dataloader = DataLoader(train_dataset, shuffle= True, batch_size= 8)
val_dataloader = DataLoader(val_dataset, batch_size= 8)

# Model Loading

In [21]:
from transformers import AutoModelForTokenClassification
from transformers import Trainer, TrainingArguments

model = AutoModelForTokenClassification.from_pretrained(
    'distilbert-base-cased',
    num_labels= len(label_to_id)
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Getting Metrics Performance

In [22]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis= 2)

    true_pred = [
        [id_to_label[p] for (p, l) in zip(predictions, labels) if l != -100]
        for predictions, labels in zip(predictions, labels)
    ]

    true_labels = [
        [id_to_label[l] for (p, l) in zip(predictions, labels) if l != -100]
        for predictions, labels in zip(predictions, labels)
    ]

    flat_pred = [item for sublist in true_pred for item in sublist]
    flat_labels = [item for sublist in true_labels for item in sublist]

    true_positive = sum(1 for p, l in zip(flat_pred, flat_labels) if p == 1)
    precision = true_positive / len(flat_pred) if flat_pred else 0.0
    accuracy = true_positive / len(flat_labels) if flat_labels else 0.0

    return {
        'precision': precision,
        'accuracy': accuracy
    }

# Model Traning

In [23]:
traning_args = TrainingArguments(
    output_dir= './results',
    eval_strategy= 'epoch',
    save_strategy= 'epoch',
    num_train_epochs= 5,
    per_device_train_batch_size= 8,
    per_device_eval_batch_size= 8,
    warmup_steps= 500,
    weight_decay= 0.01,
    logging_dir= './logs',
    logging_steps= 10,
    load_best_model_at_end= True,
    learning_rate= 2e-5
)

trainer = Trainer(
    model= model,
    args= traning_args,
    train_dataset= train_dataset,
    eval_dataset= val_dataset,
    compute_metrics= compute_metrics,
    tokenizer= tokenizer
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Accuracy
1,0.0038,0.00179,0.0,0.0
2,0.0002,7.8e-05,0.0,0.0
3,0.0001,4.3e-05,0.0,0.0
4,0.0001,3.4e-05,0.0,0.0
5,0.0001,3.2e-05,0.0,0.0


TrainOutput(global_step=1500, training_loss=0.08091273341401635, metrics={'train_runtime': 277.0154, 'train_samples_per_second': 43.319, 'train_steps_per_second': 5.415, 'total_flos': 459352220400000.0, 'train_loss': 0.08091273341401635, 'epoch': 5.0})

# Model & Tokenizer Saving

In [24]:
output_dir = './ner_finetuned_model'

model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f'Model & Tokenizer is saved to {output_dir} directory.')

Model & Tokenizer is saved to ./ner_finetuned_model directory.


In [25]:
from transformers import AutoModelForTokenClassification, AutoTokenizer

model = AutoModelForTokenClassification.from_pretrained(output_dir)
tokenizer = AutoTokenizer.from_pretrained(output_dir)

# NER(Name-Entity-Relation) Function

In [26]:
import torch

def ner_detection(sentences):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    results = []
    for text in sentences:
        tokens = tokenizer(text, return_tensors= 'pt', truncation= True, is_split_into_words= False).to(device)
        with torch.no_grad(): # disable gradient cal.
            outputs = model(**tokens)

# Predicting, Converting tokens & labels into sub-word.

        pred = np.argmax(outputs.logits.detach().cpu().numpy(),axis= 2)
        tokens = tokenizer.convert_ids_to_tokens(tokens['input_ids'].squeeze().tolist())
        labels = [id_to_label[label] for label in pred[0]]

        sentence_results = []
        for tokens, label in zip(tokens, labels):
            if tokens not in ['[CLS]', '[SEP]', '[PAD]']:
              sentence_results.append((tokens, label))
        results.append(sentence_results)

    return results

In [27]:
# Test Example.
example_sentences = [
    "Barack Obama was born in Hawaii.",
    "Microsoft was founded by Bill Gates and Paul Allen.",
    "Apple Inc. is headquartered in Cupertino, California.",
    "Elon Musk leads SpaceX and Tesla.",
    "The Eiffel Tower is located in Paris, France."
] * 20

detect_entites = ner_detection(example_sentences)

for i, entities in enumerate(detect_entites[:5]):
    print(f'Sentence {i + 1}: {entities}')

Sentence 1: [('Barack', 'PERSON'), ('Obama', 'LOC'), ('was', 'O'), ('born', 'O'), ('in', 'O'), ('Hawaii', 'LOC'), ('.', 'O')]
Sentence 2: [('Microsoft', 'ORG'), ('was', 'O'), ('founded', 'ORG'), ('by', 'O'), ('Bill', 'PERSON'), ('Gates', 'ORG'), ('and', 'O'), ('Paul', 'PERSON'), ('Allen', 'ORG'), ('.', 'O')]
Sentence 3: [('Apple', 'ORG'), ('Inc', 'ORG'), ('.', 'O'), ('is', 'O'), ('headquartered', 'O'), ('in', 'O'), ('Cup', 'ORG'), ('##ert', 'O'), ('##ino', 'O'), (',', 'O'), ('California', 'LOC'), ('.', 'O')]
Sentence 4: [('El', 'ORG'), ('##on', 'ORG'), ('Mu', 'ORG'), ('##sk', 'ORG'), ('leads', 'ORG'), ('Space', 'ORG'), ('##X', 'ORG'), ('and', 'O'), ('Te', 'ORG'), ('##sla', 'ORG'), ('.', 'O')]
Sentence 5: [('The', 'ORG'), ('E', 'ORG'), ('##iff', 'O'), ('##el', 'DATE'), ('Tower', 'O'), ('is', 'O'), ('located', 'O'), ('in', 'O'), ('Paris', 'LOC'), (',', 'O'), ('France', 'LOC'), ('.', 'O')]


In [28]:
import shutil
from google.colab import files

model_dir = './ner_finetuned_model'
shutil.make_archive(model_dir, 'zip', model_dir)
files.download(f'{model_dir}.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>