# NER Practice

This notebook uses DistilBERT to perform Named Entity Recognition (NER) on a product catalog.
It solves the problem of a single catalog entry containing multiple model numbers. It assumes the input data (Excel files, cached as a CSV) has the model numbers in a "ModelNo" column.

Example input: Iphone X, X 64GB, X 256GB, X 512GB

The model detects that the entry contains multiple model numbers and splits them into separate entries.


In [None]:
# https://huggingface.co/transformers/v3.2.0/custom_datasets.html

In [None]:
# pip install datasets transformers scikit-learn scipy seqeval

In [None]:
# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
# os.environ['CUDA_VISIBLE_DEVICES'] = ''

In [None]:
import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict
import numpy as np
import os
import glob
import random
import openpyxl  # this is just to cause an error
import os
import utils
import torch

# Number of synthetic examples generated for training/validation, and the
# value passed as `max_items_per_row` to utils.get_mixed_example. Both values
# are also embedded in the checkpoint directory name.
dataset_size = 100000
max_items_per_row = 5

# Quick smoke test of utils.join_models on a few dummy model numbers.
joint_string, segs, sep = utils.join_models(["a", "fds", "f3", "fdsadafs"])
print(joint_string)
print(len(joint_string))

In [None]:
# Load the de-duplicated "ModelNo" column: use the cached CSV when it exists,
# otherwise build it from every Excel workbook under data/ and cache it.
data_path = "data/ModelNo_prepped.csv"
if os.path.isfile(data_path):
    print("found data in", data_path)
    # dtype=str keeps model numbers as text (no int/float inference, leading
    # zeros preserved), matching what the Excel branch below produces.
    df = pd.read_csv(data_path, dtype=str)
else:
    import glob

    paths = glob.glob(r"data/*.xlsx")
    print(paths)
    if not paths:
        # pd.concat([]) would otherwise fail with "No objects to concatenate",
        # which hides the real cause (no input files).
        raise FileNotFoundError(
            f"no cached {data_path} and no Excel files matching data/*.xlsx"
        )

    dfs = [pd.read_excel(p, dtype=str)[["ModelNo"]] for p in paths]
    df_joint = pd.concat(dfs)
    # De-duplicate first and renumber afterwards so the index is contiguous,
    # exactly like the frame read back from the cached CSV.
    df = df_joint.drop_duplicates().reset_index(drop=True)
    df.to_csv(data_path, index=False)
    print("df", len(df))


# test
text, tag = utils.get_mixed_example(df)

In [None]:
from transformers import (
    AutoTokenizer,
    DistilBertForSequenceClassification,
    AutoModelForTokenClassification,
)
from transformers import TrainingArguments, Trainer
from transformers import DistilBertTokenizerFast

# Metric loading. `load_metric` is imported inside the try block so that an
# environment whose `datasets` package does not provide it (ImportError) also
# falls back to the local metric modules instead of failing on the import.
try:
    from datasets import load_metric

    acc = load_metric("accuracy")
    f1 = load_metric("f1")
    seqeval = load_metric("seqeval")
except Exception as e:
    # Best-effort fallback, but say why it was taken rather than hiding it.
    print("load_metric unavailable, using local metric modules:", repr(e))
    from accuracy import Accuracy
    from f1 import F1
    from seqeval import Seqeval

    seqeval = Seqeval()
    acc = Accuracy()
    f1 = F1()

# Base checkpoint to fine-tune. The output directory name embeds the base
# model name together with dataset_size and max_items_per_row.
MODEL_NAME = "distilbert-base-uncased"
model_output_dir = f"checkpoints/{MODEL_NAME}-mixed-models-nerf-datasize={dataset_size}-maxitems={max_items_per_row}"

## load tokenizer (the model itself is instantiated in a later cell)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print("loaded tokenizer")

In [None]:
from pathlib import Path
import re


# Generate `dataset_size` synthetic (text, tag) pairs, then transpose the list
# of pairs into two parallel tuples.
examples = [
    utils.get_mixed_example(df, max_items_per_row=max_items_per_row)
    for _ in range(dataset_size)
]
texts, tags = zip(*examples)

# texts, tags = read_wnut('wnut17train.conll')
# texts, tags = read_wnut(f'data/train_1000.conll')


# Show the first three generated examples as a sanity check.
print("printing examples")
for sample_idx in (0, 1, 2):
    print("text:", texts[sample_idx])
    print("tag:", tags[sample_idx])

from sklearn.model_selection import train_test_split

# Hold out 20% of the generated examples for validation.
train_texts, val_texts, train_tags, val_tags = train_test_split(
    texts, tags, test_size=0.2
)

# from transformers import DistilBertTokenizerFast
# tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')

# Both splits are tokenized with the same settings. The offset mapping is
# requested because utils.encode_tags receives these encodings later on.
tokenizer_kwargs = dict(
    return_offsets_mapping=True,
    padding=True,
    truncation=True,
    is_split_into_words=True,
)
train_encodings = tokenizer(train_texts, **tokenizer_kwargs)
val_encodings = tokenizer(val_texts, **tokenizer_kwargs)


# Build the tag <-> id vocabulary.
# Sort AFTER de-duplicating: iterating a set of strings has no stable order
# across interpreter runs (string hashing is randomized), so enumerating the
# set directly would give the same tag a different id on every run and make
# the label ids of saved checkpoints inconsistent.
unique_tags = sorted({tag for doc in tags for tag in doc})
tag2id = {tag: idx for idx, tag in enumerate(unique_tags)}
id2tag = {idx: tag for tag, idx in tag2id.items()}

unique_tags

In [None]:
def _tags_to_ids(docs):
    """Replace every tag string in every document by its id from tag2id."""
    return [[tag2id[tag] for tag in doc] for doc in docs]


# Convert the per-document tag ids into label sequences via utils.encode_tags.
train_labels = utils.encode_tags(_tags_to_ids(train_tags), train_encodings)
val_labels = utils.encode_tags(_tags_to_ids(val_tags), val_encodings)

for encodings in (train_encodings, val_encodings):
    encodings.pop("offset_mapping")  # we don't want to pass this to the model

train_dataset = utils.WNUTDataset(train_encodings, train_labels)
val_dataset = utils.WNUTDataset(val_encodings, val_labels)

In [None]:
##### https://huggingface.co/transformers/v3.1.0/custom_datasets.html

In [None]:
# Token-classification model on top of the base checkpoint, with one output
# class per entry in id2tag.
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME, num_labels=len(id2tag)
)

# args that will be logged to wandb
# NOTE(review): with evaluation_strategy="epoch", eval_steps=100 presumably has
# no effect — confirm against the installed transformers version.
logged_training_args = dict(
    output_dir=model_output_dir,
    evaluation_strategy="epoch",
    eval_steps=100,
    num_train_epochs=10,
    per_device_train_batch_size=256,
)
# Remaining arguments (reporting target, save/log cadence) are not part of the
# dict that is logged to wandb.
training_args = TrainingArguments(
    report_to="wandb",
    save_steps=100,
    logging_steps=10,
    **logged_training_args,
)


def compute_metrics(eval_pred):
    """Compute seqeval metrics for one token-classification evaluation pass.

    Args:
        eval_pred: pair ``(logits, labels)``. The class scores are on the last
            axis of ``logits``; ``labels`` holds tag ids, with ``-100`` marking
            positions that must be ignored.

    Returns:
        A flat ``dict`` of metrics. Scalar seqeval entries are keyed
        ``seqeval_<name>``; entries whose value is itself a dict (per entity
        type) are expanded to ``seqeval.<TYPE>.<name>``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Drop ignored positions and map ids back to tag strings. Only the label
    # can be -100: predictions come from argmax and are always valid ids.
    true_tags = []
    pred_tags = []
    for label_row, pred_row in zip(labels, predictions):
        kept = [(l, p) for l, p in zip(label_row, pred_row) if l != -100]
        true_tags.append([id2tag[l] for l, _ in kept])
        pred_tags.append([id2tag[p] for _, p in kept])

    # seqeval.f1_score(true_tags, pred_tags)
    raw_result = seqeval.compute(predictions=pred_tags, references=true_tags)

    # Flatten while prefixing, deciding per entry by its value. Nested results
    # are recognised for every entity type (not just a fixed list of names),
    # so no dict-valued metric is handed back to the Trainer.
    flat_result = {}
    for key, value in raw_result.items():
        if isinstance(value, dict):
            for sub_key, sub_value in value.items():
                flat_result[f"seqeval.{key}.{sub_key}"] = sub_value
        else:
            flat_result[f"seqeval_{key}"] = value

    # **acc.compute(predictions=predictions.reshape(-1), references=labels.reshape(-1)),
    # **f1.compute(predictions=predictions.reshape(-1), references=labels.reshape(-1)),
    return flat_result


# Wire model, arguments, datasets and the seqeval-based metric function into
# a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)
# Evaluate once on the validation set before any training has been run in this
# cell sequence (presumably as a baseline — confirm intent).
trainer.evaluate()

In [None]:
# pip install wandb tensorboard mlflow

import wandb

wandb.login()

# The run is named after the last path component of the checkpoint directory.
run_name = os.path.basename(model_output_dir.rstrip("/"))

# Data-generation settings first, then the training arguments chosen for
# logging.
run_config = {
    "dataset_size": dataset_size,
    "max_items_per_row": max_items_per_row,
    "SEPS": utils.SEPS,
}
run_config.update(logged_training_args)

wandb.init(
    project="product-catalog-ner",
    name=run_name,
    config=run_config,
    # resume=True,
)

In [None]:
# %load_ext tensorboard
# %tensorboard --logdir '{model_output_dir}'/runs

In [None]:

# Start a fresh training run (resume_from_checkpoint=False), then write the
# resulting weights to "<model_output_dir>/latest".
train_output = trainer.train(resume_from_checkpoint=False)
model.save_pretrained(model_output_dir + "/latest")

In [None]:
# Same save call as at the end of the training cell, kept in its own cell —
# presumably so the weights can be written without re-running training.
model.save_pretrained(model_output_dir + "/latest")

In [None]:
# Evaluate the current model on the Trainer's eval_dataset (val_dataset).
trainer.evaluate()

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Uncomment to run inference from the saved "latest" checkpoint instead of the
# in-memory model/tokenizer:
# tokenizer = AutoTokenizer.from_pretrained(model_output_dir+'/latest')
# model = AutoModelForTokenClassification.from_pretrained(model_output_dir+'/latest')

# NER pipeline on the same device the model currently lives on.
nlp = pipeline("ner", model=model, tokenizer=tokenizer, device=model.device)
# The raw string is passed through utils.preprocess_string before inference.
example = utils.preprocess_string("81|h3 & 2342v,feds & 32X")
print("preprocessed example", example)

ner_results = nlp(example)
print(ner_results)

In [None]:
# join subtokens of the same label
ner_results_joint = utils.join_subtokens(ner_results)

print("example", example)
print()
# Print the slice of the input covered by each joined result with its label.
for result in ner_results_joint:
    # if result["entity"] == "LABEL_0":
    # NOTE(review): the slice treats result["end"] as inclusive (+1). If
    # utils.join_subtokens keeps exclusive end offsets, this prints one extra
    # character — confirm against join_subtokens.
    print(example[result["start"] : result["end"] + 1], result["entity"])

In [None]:
# TODO: actually implement some function to decode this and cleanup the "##"
# TODO: add special tokens to the tokenizer instead of the separators
# TODO: make more realistic data using "groupby" ProductNameEn

In [None]:
# Print word, score and entity label of every joined result.
for r in utils.join_subtokens(ner_results):
    print(r["word"], r["score"], r["entity"])