# Using pretrained models (PyTorch)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [3]:
!pip install datasets evaluate transformers



In [4]:
from datasets import load_dataset

address_ner_dataset = load_dataset("TrevorJS/synth-addresses-ner-mk3")

#from transformers import AutoTokenizer, AutoModelForTokenClassification
#from transformers import pipeline

#tokenizer = AutoTokenizer.from_pretrained("dslim/distilbert-NER")

#def tokenize_function(examples):
#    return tokenizer(examples["ocr_text"], padding="max_length", truncation=True)

#tokenized_datasets = address_ner_dataset.map(tokenize_function, batched=True)

#small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(100))
#small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(100))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
print(address_ner_dataset['train'].features)

{'name': Value(dtype='string', id=None), 'state_name': Value(dtype='string', id=None), 'state_abbreviation': Value(dtype='string', id=None), 'postal_code': Value(dtype='string', id=None), 'city': Value(dtype='string', id=None), 'street_number': Value(dtype='string', id=None), 'street_name': Value(dtype='string', id=None), 'unit_number': Value(dtype='string', id=None), 'unit_designator': Value(dtype='string', id=None), 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'ner_tags': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'ocr_text': Value(dtype='string', id=None), 'type': Value(dtype='string', id=None), 'tracking_number': Value(dtype='string', id=None), 'plus_4': Value(dtype='string', id=None)}


In [6]:
# Accessing the label names from the 'ner_tags' feature.
label_names = ['O','B-NAME','I-NAME','B-STREET_NUMBER','I-STREET_NUMBER','B-STREET_NAME','I-STREET_NAME','B-UNIT_NUMBER','I-UNIT_NUMBER','B-UNIT_DESIGNATOR','I-UNIT_DESIGNATOR','B-CITY','I-CITY','B-STATE_ABBREVIATION','I-STATE_ABBREVIATION','B-PLUS_4','I-PLUS_4','B-STATE_NAME','I-STATE_NAME','B-POSTAL_CODE','I-POSTAL_CODE','B-TRACKING_NUMBER','I-TRACKING_NUMBER'] #address_ner_dataset['train'][0]['ner_tags'] #tokenized_datasets['train'].features['ner_tags']

label_encoding_dict = {'O':0,'B-NAME':1,'I-NAME':2,'B-STREET_NUMBER':3,'I-STREET_NUMBER':4,'B-STREET_NAME':5,'I-STREET_NAME':6,'B-UNIT_NUMBER':7,'I-UNIT_NUMBER':8,'B-UNIT_DESIGNATOR':9,'I-UNIT_DESIGNATOR':10,'B-CITY':11,'I-CITY':12,'B-STATE_ABBREVIATION':13,'I-STATE_ABBREVIATION':14,'B-PLUS_4':15,'I-PLUS_4':16,'B-STATE_NAME':17,'I-STATE_NAME':18,'B-POSTAL_CODE':19,'I-POSTAL_CODE':20,'B-TRACKING_NUMBER':21,'I-TRACKING_NUMBER':22}

label_names

['O',
 'B-NAME',
 'I-NAME',
 'B-STREET_NUMBER',
 'I-STREET_NUMBER',
 'B-STREET_NAME',
 'I-STREET_NAME',
 'B-UNIT_NUMBER',
 'I-UNIT_NUMBER',
 'B-UNIT_DESIGNATOR',
 'I-UNIT_DESIGNATOR',
 'B-CITY',
 'I-CITY',
 'B-STATE_ABBREVIATION',
 'I-STATE_ABBREVIATION',
 'B-PLUS_4',
 'I-PLUS_4',
 'B-STATE_NAME',
 'I-STATE_NAME',
 'B-POSTAL_CODE',
 'I-POSTAL_CODE',
 'B-TRACKING_NUMBER',
 'I-TRACKING_NUMBER']

In [7]:
address_ner_dataset['train'][0]

{'name': 'Gemmi Batteen',
 'state_name': 'Nebraska',
 'state_abbreviation': 'NE',
 'postal_code': '69351',
 'city': 'Lakeside',
 'street_number': '39',
 'street_name': 'Tara Court',
 'unit_number': None,
 'unit_designator': None,
 'tokens': ['Gemmi',
  'Batteen',
  '39',
  'Tara',
  'Court',
  'Adult',
  'Signature',
  'Required',
  'Lakeside',
  ',',
  'NE',
  '69351',
  '-',
  '1146'],
 'ner_tags': ['B-NAME',
  'I-NAME',
  'B-STREET_NUMBER',
  'B-STREET_NAME',
  'I-STREET_NAME',
  'O',
  'O',
  'O',
  'B-CITY',
  'O',
  'B-STATE_ABBREVIATION',
  'B-POSTAL_CODE',
  'O',
  'B-PLUS_4'],
 'ocr_text': 'Gemmi Batteen 39 Tara Court Adult Signature Required Lakeside , NE 69351-1146',
 'type': 'individual',
 'tracking_number': None,
 'plus_4': '1146'}

In [8]:
def tokenize_and_align_labels(examples):
    label_all_tokens = True
    tokenized_inputs = tokenizer(list(examples["tokens"]), truncation=True, is_split_into_words=True, padding='max_length')

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif label[word_idx] == '0':
                label_ids.append(0)
            elif word_idx != previous_word_idx:
                label_ids.append(label_encoding_dict[label[word_idx]])
            else:
                label_ids.append(label_encoding_dict[label[word_idx]] if label_all_tokens else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs



In [9]:
from transformers import AutoTokenizer

# Define the checkpoint you want to use for the tokenizer.
checkpoint = 'issifuamajeed/distilbert-base-uncased-finetuned-ner'     ## Intially used :  dslim/distilbert-NER  next trained on : dslim/bert-base-NER-uncased   - The RAM is less for this model

# Create a tokenizer instance by loading the pre-trained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint,num_labels=len(label_names))

In [10]:
# Tokenize the first training example from the dataset
token = tokenizer(address_ner_dataset['train'][0]['tokens'], is_split_into_words = True)

# Print the tokenizer object, the tokenized tokens, and the word IDs
print(token, '\n--------------------------------------------------------------------------------------\n',
      token.tokens(),'\n--------------------------------------------------------------------------------------\n',
      token.word_ids())

{'input_ids': [101, 17070, 4328, 7151, 17389, 2078, 4464, 10225, 2457, 4639, 8085, 3223, 28701, 1010, 11265, 6353, 19481, 2487, 1011, 12457, 2575, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} 
--------------------------------------------------------------------------------------
 ['[CLS]', 'gem', '##mi', 'bat', '##tee', '##n', '39', 'tara', 'court', 'adult', 'signature', 'required', 'lakeside', ',', 'ne', '69', '##35', '##1', '-', '114', '##6', '[SEP]'] 
--------------------------------------------------------------------------------------
 [None, 0, 0, 1, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 11, 11, 12, 13, 13, None]


In [None]:
## DO NOT USE

def align_target(labels, word_ids):
    # Define a mapping from beginning (B-) labels to inside (I-) labels
    begin2inside = {
        "B-NAME": "I-NAME",  # B-LOC -> I-LOC
        "B-STREET_NUMBER": "I-STREET_NUMBER",  # B-MISC -> I-MISC
        "B-STREET_NAME": "I-STREET_NAME",  # B-ORG -> I-ORG
        "B-UNIT_NUMBER": "I-UNIT_NUMBER",    # B-PER -> I-PER
        "B-UNIT_DESIGNATOR" : "I-UNIT_DESIGNATOR",
        "B-CITY" : " I-CITY",
        "B-STATE_ABBREVIATION" : "I-STATE_ABBREVIATION",
        "B-PLUS_4" : "I-PLUS_4",
        "B-STATE_NAME" : "I-STATE_NAME",
        "B-POSTAL_CODE" : "I-POSTAL_CODE",
        "B-TRACKING_NUMBER":"I-TRACKING_NUMBER"
    }

    # Initialize an empty list to store aligned labels and a variable to track the last word
    align_labels = []
    last_word = None

    # Iterate through the word_ids
    for word in word_ids:
        if word is None:
            label = -100  # Set label to -100 for None word_ids
        elif word != last_word:
            label = labels[word]  # Use the label corresponding to the current word_id
        else:
            label = labels[word]
            # Change B- to I- if the previous word is the same
            if label in begin2inside:
                label = begin2inside[label]  # Map B- to I-

        # Append the label to the align_labels list and update last_word
        align_labels.append(label)
        last_word = word

    return align_labels

In [None]:
## DO NOT USE

# Extract labels and word_ids
labels = address_ner_dataset['train'][0]['ner_tags']
word_ids = token.word_ids()

# Use the align_target function to align labels
aligned_target = align_target(labels, word_ids)

# Print tokenized tokens, original labels, and aligned labels
print(token.tokens(), '\n--------------------------------------------------------------------------------------\n',
      labels, '\n--------------------------------------------------------------------------------------\n',
      aligned_target)

NameError: name 'align_target' is not defined

In [None]:
## DO NOT USE

# Create a list of aligned labels using label names
aligned_labels = aligned_target

# Loop through tokens and aligned labels and print them
for x, y in zip(token.tokens(), aligned_labels):
    print(f"{x}\t{y}")

[CLS]	-100
G	B-NAME
##em	I-NAME
##mi	I-NAME
Bat	I-NAME
##teen	I-NAME
39	B-STREET_NUMBER
Tara	B-STREET_NAME
Court	I-STREET_NAME
Adult	O
Sign	O
##ature	O
Re	O
##quire	O
##d	O
Lakes	B-CITY
##ide	 I-CITY
,	O
NE	B-STATE_ABBREVIATION
69	B-POSTAL_CODE
##35	I-POSTAL_CODE
##1	I-POSTAL_CODE
-	O
114	B-PLUS_4
##6	I-PLUS_4
[SEP]	-100


In [None]:
## DO NOT USE

def tokenize_fn(batch):
    # Tokenize the input batch
    tokenized_inputs = tokenizer(batch['tokens'], truncation=True, is_split_into_words=True, padding="max_length")

    # Extract the labels batch from the input batch
    labels_batch = batch['ner_tags']

    """   # Initialize a list to store aligned targets for each example in the batch
    aligned_targets_batch = []

    # Iterate through each example and align the labels
    for i, labels in enumerate(labels_batch):
        # Extract the word_ids for the current example
        word_ids = tokenized_inputs.word_ids(i)

        # Use the align_target function to align the labels
        aligned_targets_batch.append(align_target(labels, word_ids)) """

    # Add the aligned labels to the tokenized inputs under the key "labels"
    tokenized_inputs["labels"] = labels_batch

    # Return the tokenized inputs, including aligned labels
    return tokenized_inputs

In [11]:
tokenized_dataset = address_ner_dataset.map(tokenize_and_align_labels, batched=True, remove_columns=address_ner_dataset['train'].column_names)

Map:   0%|          | 0/150000 [00:00<?, ? examples/s]

In [12]:
from transformers import DataCollatorForTokenClassification

# Create a DataCollatorForTokenClassification object
data_collator = DataCollatorForTokenClassification(tokenizer)
print(tokenized_dataset['train'])
# Testing data using the data collator
batch = data_collator([tokenized_dataset['train'][i] for i in range(1)])

# Display the resulting batch
batch

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1350000
})


{'input_ids': tensor([[  101, 17070,  4328,  7151, 17389,  2078,  4464, 10225,  2457,  4639,
          8085,  3223, 28701,  1010, 11265,  6353, 19481,  2487,  1011, 12457,
          2575,   102,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  

In [13]:
# Install the seqeval library for evaluating sequence tasks
!pip install seqeval ;
!pip install evaluate ;



In [14]:
# Import the seqeval metric from Hugging Face's datasets library
import evaluate

# Load the seqeval metric which can evaluate NER and other sequence tasks
metric = evaluate.load("seqeval")

In [15]:
import numpy as np
# Function to compute evaluation metrics from model logits and true labels
def compute_metrics(logits_and_labels):

  # Unpack the logits and labels
  logits, labels = logits_and_labels

  # Get predictions from the logits
  predictions = np.argmax(logits, axis=-1)

  # Remove ignored index (special tokens)
  str_labels = [
    [label_names[t] for t in label if t!=-100] for label in labels
  ]

  str_preds = [
    [label_names[p] for (p, t) in zip(prediction, label) if t != -100]
    for prediction, label in zip(predictions, labels)
  ]

  # Compute metrics
  results = metric.compute(predictions=str_preds, references=str_labels)

  # Extract key metrics
  return {
    "precision": results["overall_precision"],
    "recall": results["overall_recall"],
    "f1": results["overall_f1"],
    "accuracy": results["overall_accuracy"]
  }

In [16]:
# Create mapping from label ID to label string name
id2label = {k: v for k, v in enumerate(label_names)}

# Create reverse mapping from label name to label ID
label2id = {v: k for k, v in enumerate(label_names)}

print(id2label , '\n--------------------\n' , label2id)

{0: 'O', 1: 'B-NAME', 2: 'I-NAME', 3: 'B-STREET_NUMBER', 4: 'I-STREET_NUMBER', 5: 'B-STREET_NAME', 6: 'I-STREET_NAME', 7: 'B-UNIT_NUMBER', 8: 'I-UNIT_NUMBER', 9: 'B-UNIT_DESIGNATOR', 10: 'I-UNIT_DESIGNATOR', 11: 'B-CITY', 12: 'I-CITY', 13: 'B-STATE_ABBREVIATION', 14: 'I-STATE_ABBREVIATION', 15: 'B-PLUS_4', 16: 'I-PLUS_4', 17: 'B-STATE_NAME', 18: 'I-STATE_NAME', 19: 'B-POSTAL_CODE', 20: 'I-POSTAL_CODE', 21: 'B-TRACKING_NUMBER', 22: 'I-TRACKING_NUMBER'} 
--------------------
 {'O': 0, 'B-NAME': 1, 'I-NAME': 2, 'B-STREET_NUMBER': 3, 'I-STREET_NUMBER': 4, 'B-STREET_NAME': 5, 'I-STREET_NAME': 6, 'B-UNIT_NUMBER': 7, 'I-UNIT_NUMBER': 8, 'B-UNIT_DESIGNATOR': 9, 'I-UNIT_DESIGNATOR': 10, 'B-CITY': 11, 'I-CITY': 12, 'B-STATE_ABBREVIATION': 13, 'I-STATE_ABBREVIATION': 14, 'B-PLUS_4': 15, 'I-PLUS_4': 16, 'B-STATE_NAME': 17, 'I-STATE_NAME': 18, 'B-POSTAL_CODE': 19, 'I-POSTAL_CODE': 20, 'B-TRACKING_NUMBER': 21, 'I-TRACKING_NUMBER': 22}


In [17]:
# Load pretrained token classification model from Transformers
from transformers import AutoModelForTokenClassification

# Initialize model object with pretrained weights
model = AutoModelForTokenClassification.from_pretrained(
  checkpoint,
  num_labels=len(label_names),
  # Pass in label mappings
  id2label=id2label,
  label2id=label2id,
  ignore_mismatched_sizes=True
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at issifuamajeed/distilbert-base-uncased-finetuned-ner and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([23]) in the model instantiated
- classifier.weight: found shape torch.Size([9, 768]) in the checkpoint and torch.Size([23, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# Configure training arguments using TrainigArguments class
from transformers import TrainingArguments

training_args = TrainingArguments(
  # Location to save fine-tuned model
  output_dir = "address_parser_fine_tuned_model",

  # Evaluate each epoch
  evaluation_strategy = "epoch",

  # Learning rate for Adam optimizer
  learning_rate =  1e-4 , #2e-05, #1e-4 - very good result.

  # Batch sizes for training and evaluation
  per_device_train_batch_size = 16,
  per_device_eval_batch_size = 16,

  # Number of training epochs
  num_train_epochs = 4,

  # L2 weight decay regularization
  weight_decay = 0.01 # 0.01  1e-5
)



In [19]:
# Initialize Trainer object for model training
from transformers import Trainer

trainer = Trainer(
  # Model to train
  model=model,

  # Training arguments
  args=training_args,

  # Training and validation datasets
  train_dataset=tokenized_dataset["train"].shuffle(seed=42).select(range(300)),
  eval_dataset=tokenized_dataset["test"].shuffle(seed=42).select(range(300)),

  # Tokenizer
  tokenizer=tokenizer,

  # Custom metric function
  compute_metrics=compute_metrics,

  # Data collator
  #data_collator=data_collator
)

  trainer = Trainer(


In [20]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mjohnbarret25[0m ([33mjohnbarret25-eriss[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.239635,0.938225,0.943425,0.940818,0.951156
2,No log,0.085451,0.971418,0.980887,0.976129,0.981224
3,No log,0.050362,0.987226,0.989679,0.988451,0.98966
4,No log,0.046179,0.988948,0.991972,0.990458,0.99102


TrainOutput(global_step=76, training_loss=0.3598196882950632, metrics={'train_runtime': 5070.683, 'train_samples_per_second': 0.237, 'train_steps_per_second': 0.015, 'total_flos': 156843251712000.0, 'train_loss': 0.3598196882950632, 'epoch': 4.0})

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
model_save_name = 'address_parser_fine_tuned_model.pt'
path = F"/content/gdrive/My Drive/{model_save_name}"
#torch.save(model.state_dict(), path)
trainer.save_model(path)

In [21]:
from google.colab import drive
trainer.save_model('https://drive.google.com/drive/u/0/folders/1Tv0NPBLHA4rWoIKL71KtNlhc8NsQfCOu/address_parser_fine_tuned_model')

In [22]:
trainer.save_model('address_parser_fine_tuned_model')

In [23]:
from transformers import pipeline

ner = pipeline(
    'token-classification',
    model = 'address_parser_fine_tuned_model',
    aggregation_strategy = 'simple' ,
    device = 0
)

Device set to use cpu


In [63]:
ner('1001 BRECKENRIDGE LN, ST MATTHEWS, KY 40207\-0000')

[{'entity_group': 'STREET_NUMBER',
  'score': 0.9704549,
  'word': '100',
  'start': 0,
  'end': 3},
 {'entity_group': 'STREET_NUMBER',
  'score': 0.9789489,
  'word': '##1',
  'start': 3,
  'end': 4},
 {'entity_group': 'STREET_NAME',
  'score': 0.98494256,
  'word': 'br',
  'start': 5,
  'end': 7},
 {'entity_group': 'STREET_NAME',
  'score': 0.98243976,
  'word': '##eck',
  'start': 7,
  'end': 10},
 {'entity_group': 'STREET_NAME',
  'score': 0.9822243,
  'word': '##en',
  'start': 10,
  'end': 12},
 {'entity_group': 'STREET_NAME',
  'score': 0.9251567,
  'word': '##ridge l',
  'start': 12,
  'end': 19},
 {'entity_group': 'UNIT_DESIGNATOR',
  'score': 0.48929447,
  'word': '##n',
  'start': 19,
  'end': 20},
 {'entity_group': 'CITY',
  'score': 0.9702918,
  'word': 'st matthews',
  'start': 22,
  'end': 33},
 {'entity_group': 'STATE_ABBREVIATION',
  'score': 0.9750769,
  'word': 'ky',
  'start': 35,
  'end': 37},
 {'entity_group': 'POSTAL_CODE',
  'score': 0.9952872,
  'word': '402',
