# Using pretrained models (PyTorch)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [11]:
!pip install datasets evaluate transformers



In [13]:
from datasets import load_dataset

address_ner_dataset = load_dataset("TrevorJS/synth-addresses-ner-mk3")

#from transformers import AutoTokenizer, AutoModelForTokenClassification
#from transformers import pipeline

#tokenizer = AutoTokenizer.from_pretrained("dslim/distilbert-NER")

#def tokenize_function(examples):
#    return tokenizer(examples["ocr_text"], padding="max_length", truncation=True)

#tokenized_datasets = address_ner_dataset.map(tokenize_function, batched=True)

#small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(100))
#small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(100))

In [14]:
print(address_ner_dataset['train'].features)

{'name': Value(dtype='string', id=None), 'state_name': Value(dtype='string', id=None), 'state_abbreviation': Value(dtype='string', id=None), 'postal_code': Value(dtype='string', id=None), 'city': Value(dtype='string', id=None), 'street_number': Value(dtype='string', id=None), 'street_name': Value(dtype='string', id=None), 'unit_number': Value(dtype='string', id=None), 'unit_designator': Value(dtype='string', id=None), 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'ner_tags': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'ocr_text': Value(dtype='string', id=None), 'type': Value(dtype='string', id=None), 'tracking_number': Value(dtype='string', id=None), 'plus_4': Value(dtype='string', id=None)}


In [15]:
# Accessing the label names from the 'ner_tags' feature.
label_names = ['O','B-NAME','I-NAME','B-STREET_NUMBER','I-STREET_NUMBER','B-STREET_NAME','I-STREET_NAME','B-UNIT_NUMBER','I-UNIT_NUMBER','B-UNIT_DESIGNATOR','I-UNIT_DESIGNATOR','B-CITY','I-CITY','B-STATE_ABBREVIATION','I-STATE_ABBREVIATION','B-PLUS_4','I-PLUS_4','B-STATE_NAME','I-STATE_NAME','B-POSTAL_CODE','I-POSTAL_CODE','B-TRACKING_NUMBER','I-TRACKING_NUMBER'] #address_ner_dataset['train'][0]['ner_tags'] #tokenized_datasets['train'].features['ner_tags']

label_encoding_dict = {'O':0,'B-NAME':1,'I-NAME':2,'B-STREET_NUMBER':3,'I-STREET_NUMBER':4,'B-STREET_NAME':5,'I-STREET_NAME':6,'B-UNIT_NUMBER':7,'I-UNIT_NUMBER':8,'B-UNIT_DESIGNATOR':9,'I-UNIT_DESIGNATOR':10,'B-CITY':11,'I-CITY':12,'B-STATE_ABBREVIATION':13,'I-STATE_ABBREVIATION':14,'B-PLUS_4':15,'I-PLUS_4':16,'B-STATE_NAME':17,'I-STATE_NAME':18,'B-POSTAL_CODE':19,'I-POSTAL_CODE':20,'B-TRACKING_NUMBER':21,'I-TRACKING_NUMBER':22}

label_names

['O',
 'B-NAME',
 'I-NAME',
 'B-STREET_NUMBER',
 'I-STREET_NUMBER',
 'B-STREET_NAME',
 'I-STREET_NAME',
 'B-UNIT_NUMBER',
 'I-UNIT_NUMBER',
 'B-UNIT_DESIGNATOR',
 'I-UNIT_DESIGNATOR',
 'B-CITY',
 'I-CITY',
 'B-STATE_ABBREVIATION',
 'I-STATE_ABBREVIATION',
 'B-PLUS_4',
 'I-PLUS_4',
 'B-STATE_NAME',
 'I-STATE_NAME',
 'B-POSTAL_CODE',
 'I-POSTAL_CODE',
 'B-TRACKING_NUMBER',
 'I-TRACKING_NUMBER']

In [16]:
address_ner_dataset['train'][0]

{'name': 'Gemmi Batteen',
 'state_name': 'Nebraska',
 'state_abbreviation': 'NE',
 'postal_code': '69351',
 'city': 'Lakeside',
 'street_number': '39',
 'street_name': 'Tara Court',
 'unit_number': None,
 'unit_designator': None,
 'tokens': ['Gemmi',
  'Batteen',
  '39',
  'Tara',
  'Court',
  'Adult',
  'Signature',
  'Required',
  'Lakeside',
  ',',
  'NE',
  '69351',
  '-',
  '1146'],
 'ner_tags': ['B-NAME',
  'I-NAME',
  'B-STREET_NUMBER',
  'B-STREET_NAME',
  'I-STREET_NAME',
  'O',
  'O',
  'O',
  'B-CITY',
  'O',
  'B-STATE_ABBREVIATION',
  'B-POSTAL_CODE',
  'O',
  'B-PLUS_4'],
 'ocr_text': 'Gemmi Batteen 39 Tara Court Adult Signature Required Lakeside , NE 69351-1146',
 'type': 'individual',
 'tracking_number': None,
 'plus_4': '1146'}

In [17]:
def tokenize_and_align_labels(examples):
    label_all_tokens = True
    tokenized_inputs = tokenizer(list(examples["tokens"]), truncation=True, is_split_into_words=True, padding='max_length')

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif label[word_idx] == '0':
                label_ids.append(0)
            elif word_idx != previous_word_idx:
                label_ids.append(label_encoding_dict[label[word_idx]])
            else:
                label_ids.append(label_encoding_dict[label[word_idx]] if label_all_tokens else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs



In [18]:
from transformers import AutoTokenizer

# Define the checkpoint you want to use for the tokenizer.
checkpoint = 'issifuamajeed/distilbert-base-uncased-finetuned-ner'     ## Intially used :  dslim/distilbert-NER  next trained on : dslim/bert-base-NER-uncased   - The RAM is less for this model

# Create a tokenizer instance by loading the pre-trained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint,num_labels=len(label_names))

In [19]:
# Tokenize the first training example from the dataset
token = tokenizer(address_ner_dataset['train'][0]['tokens'], is_split_into_words = True)

# Print the tokenizer object, the tokenized tokens, and the word IDs
print(token, '\n--------------------------------------------------------------------------------------\n',
      token.tokens(),'\n--------------------------------------------------------------------------------------\n',
      token.word_ids())

{'input_ids': [101, 17070, 4328, 7151, 17389, 2078, 4464, 10225, 2457, 4639, 8085, 3223, 28701, 1010, 11265, 6353, 19481, 2487, 1011, 12457, 2575, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} 
--------------------------------------------------------------------------------------
 ['[CLS]', 'gem', '##mi', 'bat', '##tee', '##n', '39', 'tara', 'court', 'adult', 'signature', 'required', 'lakeside', ',', 'ne', '69', '##35', '##1', '-', '114', '##6', '[SEP]'] 
--------------------------------------------------------------------------------------
 [None, 0, 0, 1, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 11, 11, 12, 13, 13, None]


In [None]:
## DO NOT USE

def align_target(labels, word_ids):
    # Define a mapping from beginning (B-) labels to inside (I-) labels
    begin2inside = {
        "B-NAME": "I-NAME",  # B-LOC -> I-LOC
        "B-STREET_NUMBER": "I-STREET_NUMBER",  # B-MISC -> I-MISC
        "B-STREET_NAME": "I-STREET_NAME",  # B-ORG -> I-ORG
        "B-UNIT_NUMBER": "I-UNIT_NUMBER",    # B-PER -> I-PER
        "B-UNIT_DESIGNATOR" : "I-UNIT_DESIGNATOR",
        "B-CITY" : " I-CITY",
        "B-STATE_ABBREVIATION" : "I-STATE_ABBREVIATION",
        "B-PLUS_4" : "I-PLUS_4",
        "B-STATE_NAME" : "I-STATE_NAME",
        "B-POSTAL_CODE" : "I-POSTAL_CODE",
        "B-TRACKING_NUMBER":"I-TRACKING_NUMBER"
    }

    # Initialize an empty list to store aligned labels and a variable to track the last word
    align_labels = []
    last_word = None

    # Iterate through the word_ids
    for word in word_ids:
        if word is None:
            label = -100  # Set label to -100 for None word_ids
        elif word != last_word:
            label = labels[word]  # Use the label corresponding to the current word_id
        else:
            label = labels[word]
            # Change B- to I- if the previous word is the same
            if label in begin2inside:
                label = begin2inside[label]  # Map B- to I-

        # Append the label to the align_labels list and update last_word
        align_labels.append(label)
        last_word = word

    return align_labels

In [None]:
## DO NOT USE

# Extract labels and word_ids
labels = address_ner_dataset['train'][0]['ner_tags']
word_ids = token.word_ids()

# Use the align_target function to align labels
aligned_target = align_target(labels, word_ids)

# Print tokenized tokens, original labels, and aligned labels
print(token.tokens(), '\n--------------------------------------------------------------------------------------\n',
      labels, '\n--------------------------------------------------------------------------------------\n',
      aligned_target)

NameError: name 'align_target' is not defined

In [None]:
## DO NOT USE

# Create a list of aligned labels using label names
aligned_labels = aligned_target

# Loop through tokens and aligned labels and print them
for x, y in zip(token.tokens(), aligned_labels):
    print(f"{x}\t{y}")

[CLS]	-100
G	B-NAME
##em	I-NAME
##mi	I-NAME
Bat	I-NAME
##teen	I-NAME
39	B-STREET_NUMBER
Tara	B-STREET_NAME
Court	I-STREET_NAME
Adult	O
Sign	O
##ature	O
Re	O
##quire	O
##d	O
Lakes	B-CITY
##ide	 I-CITY
,	O
NE	B-STATE_ABBREVIATION
69	B-POSTAL_CODE
##35	I-POSTAL_CODE
##1	I-POSTAL_CODE
-	O
114	B-PLUS_4
##6	I-PLUS_4
[SEP]	-100


In [None]:
## DO NOT USE

def tokenize_fn(batch):
    # Tokenize the input batch
    tokenized_inputs = tokenizer(batch['tokens'], truncation=True, is_split_into_words=True, padding="max_length")

    # Extract the labels batch from the input batch
    labels_batch = batch['ner_tags']

    """   # Initialize a list to store aligned targets for each example in the batch
    aligned_targets_batch = []

    # Iterate through each example and align the labels
    for i, labels in enumerate(labels_batch):
        # Extract the word_ids for the current example
        word_ids = tokenized_inputs.word_ids(i)

        # Use the align_target function to align the labels
        aligned_targets_batch.append(align_target(labels, word_ids)) """

    # Add the aligned labels to the tokenized inputs under the key "labels"
    tokenized_inputs["labels"] = labels_batch

    # Return the tokenized inputs, including aligned labels
    return tokenized_inputs

In [20]:
tokenized_dataset = address_ner_dataset.map(tokenize_and_align_labels, batched=True, remove_columns=address_ner_dataset['train'].column_names)

Map:   0%|          | 0/150000 [00:00<?, ? examples/s]

In [21]:
from transformers import DataCollatorForTokenClassification

# Create a DataCollatorForTokenClassification object
data_collator = DataCollatorForTokenClassification(tokenizer)
print(tokenized_dataset['train'])
# Testing data using the data collator
batch = data_collator([tokenized_dataset['train'][i] for i in range(1)])

# Display the resulting batch
batch

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1350000
})


{'input_ids': tensor([[  101, 17070,  4328,  7151, 17389,  2078,  4464, 10225,  2457,  4639,
          8085,  3223, 28701,  1010, 11265,  6353, 19481,  2487,  1011, 12457,
          2575,   102,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  

In [22]:
# Install the seqeval library for evaluating sequence tasks
!pip install seqeval ;
!pip install evaluate ;

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=10287729c866ae108a386cb0180f0e11358dfbce7024570e9d2dede66ba65e68
  Stored in directory: /root/.cache/pip/wheels/bc/92/f0/243288f899c2eacdfa8c5f9aede4c71a9bad0ee26a01dc5ead
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [23]:
# Import the seqeval metric from Hugging Face's datasets library
import evaluate

# Load the seqeval metric which can evaluate NER and other sequence tasks
metric = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [24]:
import numpy as np
# Function to compute evaluation metrics from model logits and true labels
def compute_metrics(logits_and_labels):

  # Unpack the logits and labels
  logits, labels = logits_and_labels

  # Get predictions from the logits
  predictions = np.argmax(logits, axis=-1)

  # Remove ignored index (special tokens)
  str_labels = [
    [label_names[t] for t in label if t!=-100] for label in labels
  ]

  str_preds = [
    [label_names[p] for (p, t) in zip(prediction, label) if t != -100]
    for prediction, label in zip(predictions, labels)
  ]

  # Compute metrics
  results = metric.compute(predictions=str_preds, references=str_labels)

  # Extract key metrics
  return {
    "precision": results["overall_precision"],
    "recall": results["overall_recall"],
    "f1": results["overall_f1"],
    "accuracy": results["overall_accuracy"]
  }

In [25]:
# Create mapping from label ID to label string name
id2label = {k: v for k, v in enumerate(label_names)}

# Create reverse mapping from label name to label ID
label2id = {v: k for k, v in enumerate(label_names)}

print(id2label , '\n--------------------\n' , label2id)

{0: 'O', 1: 'B-NAME', 2: 'I-NAME', 3: 'B-STREET_NUMBER', 4: 'I-STREET_NUMBER', 5: 'B-STREET_NAME', 6: 'I-STREET_NAME', 7: 'B-UNIT_NUMBER', 8: 'I-UNIT_NUMBER', 9: 'B-UNIT_DESIGNATOR', 10: 'I-UNIT_DESIGNATOR', 11: 'B-CITY', 12: 'I-CITY', 13: 'B-STATE_ABBREVIATION', 14: 'I-STATE_ABBREVIATION', 15: 'B-PLUS_4', 16: 'I-PLUS_4', 17: 'B-STATE_NAME', 18: 'I-STATE_NAME', 19: 'B-POSTAL_CODE', 20: 'I-POSTAL_CODE', 21: 'B-TRACKING_NUMBER', 22: 'I-TRACKING_NUMBER'} 
--------------------
 {'O': 0, 'B-NAME': 1, 'I-NAME': 2, 'B-STREET_NUMBER': 3, 'I-STREET_NUMBER': 4, 'B-STREET_NAME': 5, 'I-STREET_NAME': 6, 'B-UNIT_NUMBER': 7, 'I-UNIT_NUMBER': 8, 'B-UNIT_DESIGNATOR': 9, 'I-UNIT_DESIGNATOR': 10, 'B-CITY': 11, 'I-CITY': 12, 'B-STATE_ABBREVIATION': 13, 'I-STATE_ABBREVIATION': 14, 'B-PLUS_4': 15, 'I-PLUS_4': 16, 'B-STATE_NAME': 17, 'I-STATE_NAME': 18, 'B-POSTAL_CODE': 19, 'I-POSTAL_CODE': 20, 'B-TRACKING_NUMBER': 21, 'I-TRACKING_NUMBER': 22}


In [26]:
# Load pretrained token classification model from Transformers
from transformers import AutoModelForTokenClassification

# Initialize model object with pretrained weights
model = AutoModelForTokenClassification.from_pretrained(
  checkpoint,
  num_labels=len(label_names),
  # Pass in label mappings
  id2label=id2label,
  label2id=label2id,
  ignore_mismatched_sizes=True
)

config.json:   0%|          | 0.00/904 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at issifuamajeed/distilbert-base-uncased-finetuned-ner and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([23]) in the model instantiated
- classifier.weight: found shape torch.Size([9, 768]) in the checkpoint and torch.Size([23, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
# Configure training arguments using TrainigArguments class
from transformers import TrainingArguments

training_args = TrainingArguments(
  # Location to save fine-tuned model
  output_dir = "address_parser_fine_tuned_model",

  # Evaluate each epoch
  evaluation_strategy = "epoch",

  # Learning rate for Adam optimizer
  learning_rate =  1e-4 , #2e-05, #1e-4 - very good result.

  # Batch sizes for training and evaluation
  per_device_train_batch_size = 16,
  per_device_eval_batch_size = 16,

  # Number of training epochs
  num_train_epochs = 4,

  # L2 weight decay regularization
  weight_decay = 0.01 # 0.01  1e-5
)



In [28]:
# Initialize Trainer object for model training
from transformers import Trainer

trainer = Trainer(
  # Model to train
  model=model,

  # Training arguments
  args=training_args,

  # Training and validation datasets
  train_dataset=tokenized_dataset["train"].shuffle(seed=42).select(range(500)),
  eval_dataset=tokenized_dataset["test"].shuffle(seed=42).select(range(500)),

  # Tokenizer
  tokenizer=tokenizer,

  # Custom metric function
  compute_metrics=compute_metrics,

  # Data collator
  #data_collator=data_collator
)

  trainer = Trainer(


In [29]:
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mjohnbarret25[0m ([33mjohnbarret25-eriss[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.092445,0.970441,0.976571,0.973496,0.979657
2,No log,0.045392,0.987098,0.992879,0.98998,0.990157
3,No log,0.035596,0.9889,0.992535,0.990714,0.991305


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.092445,0.970441,0.976571,0.973496,0.979657
2,No log,0.045392,0.987098,0.992879,0.98998,0.990157
3,No log,0.035596,0.9889,0.992535,0.990714,0.991305
4,No log,0.036731,0.989469,0.992764,0.991114,0.991305


TrainOutput(global_step=128, training_loss=0.20983244478702545, metrics={'train_runtime': 8310.5016, 'train_samples_per_second': 0.241, 'train_steps_per_second': 0.015, 'total_flos': 261405419520000.0, 'train_loss': 0.20983244478702545, 'epoch': 4.0})

In [30]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [36]:
# Verify by listing the files in the drive
!ls /content/drive/My\ Drive/

'10-09-2024_HIGH_COURT_OF_HON_BLE (1).mp4'
 10-09-2024_HIGH_COURT_OF_HON_BLE.mp4
'18-10-2023_HIGH_COURT_COURT_OF_HON_BLE (1).mp4'
 18-10-2023_HIGH_COURT_COURT_OF_HON_BLE.mp4
'20-08-2024_HIGH_COURT_OF_HON_BLE (1).mp4'
 20-08-2024_HIGH_COURT_OF_HON_BLE.mp4
 20240321105511167.pdf
 AddressParser
 address_parser_fine_tuned_model.pt
 Binaz_new_passport.pdf
'Binaz Pardiwala_resume_draft.gdoc'
'Binaz Pardiwala Resume.pdf'
 C#
'Colab Notebooks'
'Copy of Google Maps Formulas.gsheet'
 Covid_Vaccination_Proof_Binaz.pdf
 Cracking_Coding_Interview.pdf
 ERIS
'ERIS Important Links and Contacts.gdoc'
'Getting started.pdf'
'Google Reference Letter.gdoc'
'https:  leetcode.gdoc'
'Hybrid Cars List - Sunnyvale.gsheet'
'ibm oa prep.gdoc'
'Invitation Letter.gdoc'
'Kinematics & Dynamics of Machine.rar'
'Letter (1).gdoc'
'Letter (2).gdoc'
 Letter.gdoc
'Letter of Invitation.gdoc'
'Mobile Development '
 Mock1.docx
 Mock_2_3_4.docx
'Mock_2_3_4 -FP.docx'
 Photos
'Post Graduation Transcript.pdf'
'PrintOut documents.

In [56]:
model_save_name = 'address_parser_fine_tuned_model.pt'
path = F"/content/drive/My Drive/AddressParser/address_parser_fine_tuned_model.h5"
#torch.save(model.state_dict(), path)
trainer.save_model(path)

In [None]:
from google.colab import drive
trainer.save_model('')

In [42]:
trainer.save_model('address_parser_fine_tuned_model')

In [43]:
from transformers import pipeline

ner = pipeline(
    'token-classification',
    model = 'address_parser_fine_tuned_model',
    aggregation_strategy = 'simple' ,
    device = 0
)

Device set to use cpu


In [55]:
ner('1000 feet northeast of the Cedar Avenue and 225th Street West intersection in Lakeville, MN.')


[{'entity_group': 'STREET_NAME',
  'score': 0.81636846,
  'word': 'the cedar avenue and',
  'start': 23,
  'end': 43},
 {'entity_group': 'STREET_NAME',
  'score': 0.9829249,
  'word': '225',
  'start': 44,
  'end': 47},
 {'entity_group': 'STREET_NAME',
  'score': 0.9892483,
  'word': '##th street west',
  'start': 47,
  'end': 61},
 {'entity_group': 'UNIT_DESIGNATOR',
  'score': 0.6933325,
  'word': 'intersection',
  'start': 62,
  'end': 74},
 {'entity_group': 'CITY',
  'score': 0.9959259,
  'word': 'lake',
  'start': 78,
  'end': 82},
 {'entity_group': 'CITY',
  'score': 0.995494,
  'word': '##ville',
  'start': 82,
  'end': 87},
 {'entity_group': 'STATE_ABBREVIATION',
  'score': 0.7663,
  'word': 'mn',
  'start': 89,
  'end': 91}]

In [53]:
model2 = AutoModelForTokenClassification.from_pretrained('/content/drive/My Drive/AddressParser/address_parser_fine_tuned_model.pt')

In [59]:
from transformers import pipeline

ner = pipeline(
    'token-classification',
    '/content/drive/My Drive/AddressParser/address_parser_fine_tuned_model.pt',
    aggregation_strategy = 'simple' ,
    device = 0
)

Device set to use cpu


In [60]:
ner("190 main st mn 898988")

[{'entity_group': 'STREET_NUMBER',
  'score': 0.979321,
  'word': '190',
  'start': 0,
  'end': 3},
 {'entity_group': 'STREET_NAME',
  'score': 0.7149197,
  'word': 'main',
  'start': 4,
  'end': 8},
 {'entity_group': 'CITY',
  'score': 0.5453021,
  'word': 'st',
  'start': 9,
  'end': 11},
 {'entity_group': 'STATE_ABBREVIATION',
  'score': 0.95226175,
  'word': 'mn',
  'start': 12,
  'end': 14},
 {'entity_group': 'POSTAL_CODE',
  'score': 0.99469274,
  'word': '89',
  'start': 15,
  'end': 17},
 {'entity_group': 'POSTAL_CODE',
  'score': 0.9960939,
  'word': '##8',
  'start': 17,
  'end': 18},
 {'entity_group': 'POSTAL_CODE',
  'score': 0.99483806,
  'word': '##9',
  'start': 18,
  'end': 19},
 {'entity_group': 'POSTAL_CODE',
  'score': 0.9914961,
  'word': '##8',
  'start': 19,
  'end': 20},
 {'entity_group': 'POSTAL_CODE',
  'score': 0.9927336,
  'word': '##8',
  'start': 20,
  'end': 21}]