# Text Generation

In [1]:
# ! pip install evaluate
# !pip install transformers datasets tokenizers seqeval -q

In [2]:
# Import libraries
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import re
import datasets
# from datasets import Dataset
# from datasets import load_dataset
import evaluate

import transformers
from transformers import BertTokenizerFast, DataCollatorForTokenClassification, AutoModelForTokenClassification, TrainingArguments, Trainer, pipeline

from sklearn.model_selection import train_test_split
import os

  from .autonotebook import tqdm as notebook_tqdm





In [3]:
df_nlp = pd.read_csv('../data/phrase_results.csv')
df_nlp = df_nlp.rename(columns = {"review":"transcript",
               "Convenient":"convenient",
               "Speed":"speed",
               "Informative":'informative'})
df_nlp

Unnamed: 0,transcript,convenient,speed,informative
0,I have my salary bank account in HDFC bank for...,"""This bank is available in most of the areas s...","""I got my bank statement on time.""",Not mentioned.
1,"Close to around 10 years, I am holding this Co...","""I can easily transact and I can withdraw mone...",Not mentioned,Not mentioned
2,"I have my salary account in SBI, when I applie...","""Net banking is also functioning smooth and co...","""when I applied for the card I got my statemen...",Not mentioned.
3,I am using Axis bank saving account for the p...,"""Each transaction will be safe and always secu...",Not mentioned,Not mentioned
4,State Bank Of India is located nearby in our a...,"""State Bank Of India is located nearby in our ...","""I got my bank statement on time""","""the bank agent informed me every details requ..."
...,...,...,...,...
995,For the past 6 months I am holding a salary ac...,"""While I am doing transactions some time I fac...","""the amount will be refund on immediate basis ...",Not mentioned
996,There is no mandatory balance to keep in my Ax...,"""The bank transactions are smooth"", ""In Mobile...","""The bank transactions are smooth""","""There is no mandatory balance to keep in my A..."
997,Opened the savings account with Union bank of ...,Not mentioned,Not mentioned,Not mentioned
998,I have a salary account with AXIS bank and I h...,"""I use mobile app which is convenient to use.""",Not mentioned,"""I'm not sure about the additional charges ded..."


In [4]:
df_nlp.head()

Unnamed: 0,transcript,convenient,speed,informative
0,I have my salary bank account in HDFC bank for...,"""This bank is available in most of the areas s...","""I got my bank statement on time.""",Not mentioned.
1,"Close to around 10 years, I am holding this Co...","""I can easily transact and I can withdraw mone...",Not mentioned,Not mentioned
2,"I have my salary account in SBI, when I applie...","""Net banking is also functioning smooth and co...","""when I applied for the card I got my statemen...",Not mentioned.
3,I am using Axis bank saving account for the p...,"""Each transaction will be safe and always secu...",Not mentioned,Not mentioned
4,State Bank Of India is located nearby in our a...,"""State Bank Of India is located nearby in our ...","""I got my bank statement on time""","""the bank agent informed me every details requ..."


# Preprocessing

In [5]:
# Define a regex pattern to match URLs
url_pattern = re.compile(r'https?://\S+')

# Define a function to remove URLs from text
def remove_urls(text):
    return url_pattern.sub('', text)

In [6]:
# Apply the function to the 'text' column and create a new column 'clean_text'
df_nlp['transcript'] = df_nlp['transcript'].apply(remove_urls)
df_nlp['convenient'] = df_nlp['convenient'].apply(remove_urls)
df_nlp['speed'] = df_nlp['speed'].apply(remove_urls)
df_nlp['informative'] = df_nlp['informative'].apply(remove_urls)

In [7]:
# Replace all the punctuations and white space characters
df_nlp = df_nlp.replace(to_replace=r'[^\w\s]', value='', regex=True)

In [8]:
# Change word to lowercase
df_nlp = df_nlp.applymap(lambda x: x.lower() if isinstance(x, str) else x)

  df_nlp = df_nlp.applymap(lambda x: x.lower() if isinstance(x, str) else x)


In [9]:
# Remove the digits
df_nlp = df_nlp.replace(to_replace=r'\d', value='', regex=True)

In [10]:
df_nlp['convenient'] = df_nlp['convenient'].apply(lambda x: x if x != 'not mentioned' else '')
df_nlp['speed'] = df_nlp['speed'].apply(lambda x: x if x != 'not mentioned' else '')
df_nlp['informative'] = df_nlp['informative'].apply(lambda x: x if x != 'not mentioned' else '')

# Transform Data

In [11]:
df_nlp['tokens'] = df_nlp['transcript'].apply(lambda x: re.findall(r'[.?]|\w+', x))
# df_nlp['convenient'] = df_nlp['convenient'].apply(lambda x: re.findall(r'[.?]|\w+', x))
# df_nlp['speed'] = df_nlp['speed'].apply(lambda x: re.findall(r'[.?]|\w+', x))
# df_nlp['informative'] = df_nlp['informative'].apply(lambda x: re.findall(r'[.?]|\w+', x))

In [12]:
def map_tags(lst, type:str):
  if len(lst) == 0:
    return {}
  dct = {}
  tokens = lst.split(" ")
  if type == 'convenient':
    begin, inbet = 1, 2
  elif type == 'speed':
    begin, inbet = 3,4
  elif type == 'informative':
    begin,inbet = 5,6
  # Label the dictionary
  dct[tokens[0]] = begin
  for i in range(1, len(tokens)):
    dct[tokens[i]] = inbet
  print(dct)
  return dct
  # print(lst)
  # begin = ""
  # inbet = ""
  # if input == "convenient":
  #   begin = 1
  #   inbet = 2
  # elif input == "speed":
  #   begin = 3
  #   inbet = 4
  # elif input == "informative":
  #   begin = 5
  #   inbet = 6
  # dct[lst[0]] = begin
  # for i in range(1, len(lst)):
  #   dct[lst[i]] = inbet
  # # print(dct)
  # return dct

In [13]:

map_tags(df_nlp['convenient'][0], 'speed')


{'this': 3, 'bank': 4, 'is': 4, 'available': 4, 'in': 4, 'most': 4, 'of': 4, 'the': 4, 'areas': 4, 'so': 4, 'its': 4, 'very': 4, 'convenience': 4, 'for': 4, 'deposits': 4, 'and': 4, 'update': 4, 'passbook': 4}


{'this': 3,
 'bank': 4,
 'is': 4,
 'available': 4,
 'in': 4,
 'most': 4,
 'of': 4,
 'the': 4,
 'areas': 4,
 'so': 4,
 'its': 4,
 'very': 4,
 'convenience': 4,
 'for': 4,
 'deposits': 4,
 'and': 4,
 'update': 4,
 'passbook': 4}

In [14]:
df_nlp['convenient'] = df_nlp['convenient'].apply(lambda x: map_tags(x, 'convenient'))
df_nlp['speed'] = df_nlp['speed'].apply(lambda x: map_tags(x, 'speed'))
df_nlp['informative'] = df_nlp['informative'].apply(lambda x: map_tags(x, 'informative'))

{'this': 1, 'bank': 2, 'is': 2, 'available': 2, 'in': 2, 'most': 2, 'of': 2, 'the': 2, 'areas': 2, 'so': 2, 'its': 2, 'very': 2, 'convenience': 2, 'for': 2, 'deposits': 2, 'and': 2, 'update': 2, 'passbook': 2}
{'i': 2, 'can': 2, 'easily': 2, 'transact': 2, 'and': 2, 'withdraw': 2, 'money': 2, 'from': 2, 'anywhere': 2, 'without': 2, 'any': 2, 'charges': 2, 'atm': 2, 'their': 2, 'branches': 2, 'are': 2, 'near': 2, 'to': 2, 'my': 2, 'residence': 2}
{'net': 1, 'banking': 2, 'is': 2, 'also': 2, 'functioning': 2, 'smooth': 2, 'and': 2, 'convenient': 2, 'for': 2, 'me': 2}
{'each': 1, 'transaction': 2, 'will': 2, 'be': 2, 'safe': 2, 'and': 2, 'always': 2, 'secure': 2, 'i': 2, 'am': 2, 'using': 2, 'net': 2, 'banking': 2, 'as': 2, 'well': 2, 'mobile': 2, 'service': 2, 'there': 2, 'was': 2, 'no': 2, 'difficulties': 2, 'faced': 2, 'so': 2, 'far': 2, 'on': 2, 'this': 2, 'saving': 2, 'account': 2}
{'state': 1, 'bank': 2, 'of': 2, 'india': 2, 'is': 2, 'located': 2, 'nearby': 2, 'in': 2, 'our': 2, 'ar

In [15]:
df_nlp.head()

Unnamed: 0,transcript,convenient,speed,informative,tokens
0,i have my salary bank account in hdfc bank for...,"{'this': 1, 'bank': 2, 'is': 2, 'available': 2...","{'i': 3, 'got': 4, 'my': 4, 'bank': 4, 'statem...",{},"[i, have, my, salary, bank, account, in, hdfc,..."
1,close to around years i am holding this corpo...,"{'i': 2, 'can': 2, 'easily': 2, 'transact': 2,...",{},{},"[close, to, around, years, i, am, holding, thi..."
2,i have my salary account in sbi when i applied...,"{'net': 1, 'banking': 2, 'is': 2, 'also': 2, '...","{'when': 3, 'i': 4, 'applied': 4, 'for': 4, 't...",{},"[i, have, my, salary, account, in, sbi, when, ..."
3,i am using axis bank saving account for the p...,"{'each': 1, 'transaction': 2, 'will': 2, 'be':...",{},{},"[i, am, using, axis, bank, saving, account, fo..."
4,state bank of india is located nearby in our a...,"{'state': 1, 'bank': 2, 'of': 2, 'india': 2, '...","{'i': 3, 'got': 4, 'my': 4, 'bank': 4, 'statem...","{'the': 5, 'bank': 6, 'agent': 6, 'informed': ...","[state, bank, of, india, is, located, nearby, ..."


In [16]:
df = df_nlp[['tokens', 'convenient', 'speed', 'informative']]
# df = df.rename(columns={'key':'id'})

In [17]:
def map_num(row):
	curr_dict = row['convenient'].copy()
	curr_dict.update(row['speed'])
	curr_dict.update(row['informative'])
	return [0 if curr_dict.get(i) is None else curr_dict.get(i) for i in row['tokens']]


In [18]:
df['ner_tags'] = df.apply(map_num, axis=1)
df.head()

Unnamed: 0,tokens,convenient,speed,informative,ner_tags
0,"[i, have, my, salary, bank, account, in, hdfc,...","{'this': 1, 'bank': 2, 'is': 2, 'available': 2...","{'i': 3, 'got': 4, 'my': 4, 'bank': 4, 'statem...",{},"[3, 0, 4, 0, 4, 0, 2, 0, 4, 2, 0, 0, 3, 4, 4, ..."
1,"[close, to, around, years, i, am, holding, thi...","{'i': 2, 'can': 2, 'easily': 2, 'transact': 2,...",{},{},"[0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[i, have, my, salary, account, in, sbi, when, ...","{'net': 1, 'banking': 2, 'is': 2, 'also': 2, '...","{'when': 3, 'i': 4, 'applied': 4, 'for': 4, 't...",{},"[4, 0, 4, 0, 0, 0, 0, 3, 4, 4, 4, 4, 4, 4, 4, ..."
3,"[i, am, using, axis, bank, saving, account, fo...","{'each': 1, 'transaction': 2, 'will': 2, 'be':...",{},{},"[2, 2, 2, 0, 0, 2, 2, 0, 0, 0, 0, 1, 2, 2, 2, ..."
4,"[state, bank, of, india, is, located, nearby, ...","{'state': 1, 'bank': 2, 'of': 2, 'india': 2, '...","{'i': 3, 'got': 4, 'my': 4, 'bank': 4, 'statem...","{'the': 5, 'bank': 6, 'agent': 6, 'informed': ...","[1, 6, 2, 2, 2, 2, 2, 2, 2, 2, 0, 3, 0, 4, 0, ..."


In [19]:
label_list = ['O', 'B-CON', "I-CON", "B-SPD", "I-SPD", "B-INF", "I-INF"]
random_seed = 1
transformers.set_seed(seed=random_seed)

In [20]:
train_val, test = train_test_split(df, test_size=0.2, random_state=random_seed)

train_val = datasets.Dataset.from_pandas(train_val)
test = datasets.Dataset.from_pandas(test)

train_val = train_val.train_test_split(test_size=0.1, shuffle=True)
train, val = train_val['train'], train_val['test']

_nlp_data = {
	'train' : train,
	'validation' : val,
	'test' : test
}

nlp_data = datasets.DatasetDict(_nlp_data)

# Load in the dataset

# Tokenization

In [21]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

In [22]:
def tokenize_and_align_labels(examples, label_all_tokens=True):
    """
    Function to tokenize and align labels with respect to the tokens. This function is specifically designed for
    Named Entity Recognition (NER) tasks where alignment of the labels is necessary after tokenization.

    Parameters:
    examples (dict): A dictionary containing the tokens and the corresponding NER tags.
                     - "tokens": list of words in a sentence.
                     - "ner_tags": list of corresponding entity tags for each word.

    label_all_tokens (bool): A flag to indicate whether all tokens should have labels.
                             If False, only the first token of a word will have a label,
                             the other tokens (subwords) corresponding to the same word will be assigned -100.

    Returns:
    tokenized_inputs (dict): A dictionary containing the tokenized inputs and the corresponding labels aligned with the tokens.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        # word_ids() => Return a list mapping the tokens
        # to their actual word in the initial sentence.
        # It Returns a list indicating the word corresponding to each token.
        previous_word_idx = None
        label_ids = []
        # Special tokens like `<s>` and `<\s>` are originally mapped to None
        # We need to set the label to -100 so they are automatically ignored in the loss function.
        for word_idx in word_ids:
            if word_idx is None:
                # set –100 as the label for these special tokens
                label_ids.append(-100)
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            elif word_idx != previous_word_idx:
                # if current word_idx is != prev then its the most regular case
                # and add the corresponding token
                label_ids.append(label[word_idx])
            else:
                # to take care of sub-words which have the same word_idx
                # set -100 as well for them, but only if label_all_tokens == False
                label_ids.append(label[word_idx] if label_all_tokens else -100)
                # mask the subword representations after the first subword

            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [23]:
tokenized_datasets = nlp_data.map(tokenize_and_align_labels, batched=True)

Map: 100%|██████████| 720/720 [00:01<00:00, 409.05 examples/s]
Map: 100%|██████████| 80/80 [00:00<00:00, 148.47 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 730.39 examples/s] 


# Model Training

In [27]:
metric = evaluate.load("seqeval")

In [28]:
def compute_metrics(eval_preds):
    """
    Function to compute the evaluation metrics for Named Entity Recognition (NER) tasks.
    The function computes precision, recall, F1 score and accuracy.

    Parameters:
    eval_preds (tuple): A tuple containing the predicted logits and the true labels.

    Returns:
    A dictionary containing the precision, recall, F1 score and accuracy.
    """
    pred_logits, labels = eval_preds

    pred_logits = np.argmax(pred_logits, axis=2)
    # the logits and the probabilities are in the same order,
    # so we don’t need to apply the softmax

    # We remove all the values where the label is -100
    predictions = [
        [label_list[eval_preds] for (eval_preds, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(pred_logits, labels)
    ]

    true_labels = [
      [label_list[l] for (eval_preds, l) in zip(prediction, label) if l != -100]
       for prediction, label in zip(pred_logits, labels)
   ]
    results = metric.compute(predictions=predictions, references=true_labels)
    return {
   "precision": results["overall_precision"],
   "recall": results["overall_recall"],
   "f1": results["overall_f1"],
  "accuracy": results["overall_accuracy"],
  }

In [None]:
# import numpy as np
# import evaluate

# # Assuming label_list is defined elsewhere in your code
# metric = evaluate.load("seqeval")  # Load the seqeval metric

# def compute_metrics(eval_preds):
#     """
#     Function to compute the evaluation metrics for Named Entity Recognition (NER) tasks.
#     The function computes precision, recall, F1 score and accuracy.

#     Parameters:
#     eval_preds (tuple): A tuple containing the predicted logits and the true labels.

#     Returns:
#     A dictionary containing the precision, recall, F1 score and accuracy.
#     """
#     pred_logits, labels = eval_preds

#     # Get the predicted class labels by taking argmax
#     pred_logits = np.argmax(pred_logits, axis=2)

#     # Remove all values where the label is -100 and create predictions
#     predictions = [
#         [label_list[eval_pred] for (eval_pred, l) in zip(prediction, label) if l != -100 and 0 <= eval_pred < len(label_list)]
#         for prediction, label in zip(pred_logits, labels)
#     ]

#     # Remove all values where the label is -100 and create true labels
#     true_labels = [
#         [label_list[l] for (l, eval_pred) in zip(label, prediction) if l != -100 and 0 <= l < len(label_list)]
#         for label, prediction in zip(labels, pred_logits)
#     ]

#     if len(predictions)!= len(true_labels):
#       print(len(predictions), len(true_labels))
#       print("not equal len!")

#     # Calculate metrics
#     results = metric.compute(predictions=predictions, references=true_labels)
#     return {
#         "precision": results["overall_precision"],
#         "recall": results["overall_recall"],
#         "f1": results["overall_f1"],
#         "accuracy": results["overall_accuracy"],
#     }


Using the latest cached version of the module from C:\Users\Yang LiTing\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--seqeval\541ae017dc683f85116597d48f621abc7b21b88dc42ec937c71af5415f0af63c (last modified on Sun Oct  6 17:50:17 2024) since it couldn't be found locally at evaluate-metric--seqeval, or remotely on the Hugging Face Hub.


ModuleNotFoundError: No module named 'seqeval'

In [29]:
model = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels=7)

args = TrainingArguments(
	"test-ner",
	evaluation_strategy = "epoch",
	learning_rate=2e-5,
	per_device_train_batch_size=16,
	per_device_eval_batch_size=16,
	num_train_epochs=30,
	weight_decay=0.01,
)

data_collator = DataCollatorForTokenClassification(tokenizer)

# I changed this part to the evaluate function
metric = evaluate.load("seqeval")

trainer = Trainer(
    model,
	args,
	train_dataset=tokenized_datasets["train"],
	eval_dataset=tokenized_datasets["validation"],
	data_collator=data_collator,
	tokenizer=tokenizer,
	compute_metrics=compute_metrics
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


$$ F1-score = \frac{1}{Precision^{-1} + Recall^{-1}} $$

In [30]:
trainer.train()

                                                   
  3%|▎         | 45/1350 [06:04<3:04:00,  8.46s/it]

{'eval_loss': 1.1335071325302124, 'eval_precision': 0.06960950764006792, 'eval_recall': 0.05093167701863354, 'eval_f1': 0.05882352941176471, 'eval_accuracy': 0.5561214096557343, 'eval_runtime': 13.5351, 'eval_samples_per_second': 5.911, 'eval_steps_per_second': 0.369, 'epoch': 1.0}


                                                   
  7%|▋         | 90/1350 [12:23<3:00:09,  8.58s/it]

{'eval_loss': 1.1070010662078857, 'eval_precision': 0.08349900596421471, 'eval_recall': 0.05217391304347826, 'eval_f1': 0.06422018348623854, 'eval_accuracy': 0.5597881442248931, 'eval_runtime': 20.1626, 'eval_samples_per_second': 3.968, 'eval_steps_per_second': 0.248, 'epoch': 2.0}


                                                      
 10%|█         | 135/1350 [44:45<3:13:23,  9.55s/it]

{'eval_loss': 1.0696722269058228, 'eval_precision': 0.11785714285714285, 'eval_recall': 0.08198757763975155, 'eval_f1': 0.09670329670329672, 'eval_accuracy': 0.5923813403951925, 'eval_runtime': 20.6845, 'eval_samples_per_second': 3.868, 'eval_steps_per_second': 0.242, 'epoch': 3.0}


                                                    
 13%|█▎        | 180/1350 [52:31<3:00:41,  9.27s/it]

{'eval_loss': 1.0903074741363525, 'eval_precision': 0.14852941176470588, 'eval_recall': 0.12546583850931678, 'eval_f1': 0.13602693602693602, 'eval_accuracy': 0.5878997759217763, 'eval_runtime': 25.4319, 'eval_samples_per_second': 3.146, 'eval_steps_per_second': 0.197, 'epoch': 4.0}


                                                    
 17%|█▋        | 225/1350 [1:00:13<3:06:54,  9.97s/it]

{'eval_loss': 1.1514309644699097, 'eval_precision': 0.14087301587301587, 'eval_recall': 0.08819875776397515, 'eval_f1': 0.10847975553857907, 'eval_accuracy': 0.5740476675493991, 'eval_runtime': 22.304, 'eval_samples_per_second': 3.587, 'eval_steps_per_second': 0.224, 'epoch': 5.0}


                                                      
 20%|██        | 270/1350 [1:07:58<2:56:25,  9.80s/it]

{'eval_loss': 1.1883957386016846, 'eval_precision': 0.1350844277673546, 'eval_recall': 0.08944099378881988, 'eval_f1': 0.10762331838565023, 'eval_accuracy': 0.5750662049297209, 'eval_runtime': 22.6961, 'eval_samples_per_second': 3.525, 'eval_steps_per_second': 0.22, 'epoch': 6.0}


                                                      
 23%|██▎       | 315/1350 [1:16:03<3:04:20, 10.69s/it]

{'eval_loss': 1.2108676433563232, 'eval_precision': 0.182328190743338, 'eval_recall': 0.16149068322981366, 'eval_f1': 0.17127799736495386, 'eval_accuracy': 0.5905479731106131, 'eval_runtime': 23.976, 'eval_samples_per_second': 3.337, 'eval_steps_per_second': 0.209, 'epoch': 7.0}


                                                      
 27%|██▋       | 360/1350 [1:25:06<3:43:04, 13.52s/it]

{'eval_loss': 1.3038713932037354, 'eval_precision': 0.16139240506329114, 'eval_recall': 0.1267080745341615, 'eval_f1': 0.14196242171189977, 'eval_accuracy': 0.5746587899775922, 'eval_runtime': 27.2065, 'eval_samples_per_second': 2.94, 'eval_steps_per_second': 0.184, 'epoch': 8.0}


                                                      
 30%|███       | 405/1350 [1:35:59<4:42:40, 17.95s/it]

{'eval_loss': 1.3550693988800049, 'eval_precision': 0.21052631578947367, 'eval_recall': 0.18385093167701863, 'eval_f1': 0.19628647214854114, 'eval_accuracy': 0.5874923609696476, 'eval_runtime': 33.6368, 'eval_samples_per_second': 2.378, 'eval_steps_per_second': 0.149, 'epoch': 9.0}


                                                      
 33%|███▎      | 450/1350 [1:47:49<3:42:30, 14.83s/it]

{'eval_loss': 1.4245316982269287, 'eval_precision': 0.19877675840978593, 'eval_recall': 0.16149068322981366, 'eval_f1': 0.1782042494859493, 'eval_accuracy': 0.5862701161132614, 'eval_runtime': 22.679, 'eval_samples_per_second': 3.527, 'eval_steps_per_second': 0.22, 'epoch': 10.0}


                                                      
 37%|███▋      | 495/1350 [1:55:29<2:01:55,  8.56s/it]

{'eval_loss': 1.4686262607574463, 'eval_precision': 0.20678513731825526, 'eval_recall': 0.15900621118012423, 'eval_f1': 0.17977528089887643, 'eval_accuracy': 0.5936035852515787, 'eval_runtime': 19.369, 'eval_samples_per_second': 4.13, 'eval_steps_per_second': 0.258, 'epoch': 11.0}


 37%|███▋      | 500/1350 [1:56:12<2:25:35, 10.28s/it]

{'loss': 0.7606, 'grad_norm': 3.2864878177642822, 'learning_rate': 1.2592592592592593e-05, 'epoch': 11.11}


                                                      
 40%|████      | 540/1350 [2:02:03<2:05:10,  9.27s/it]

{'eval_loss': 1.532699704170227, 'eval_precision': 0.19781718963165076, 'eval_recall': 0.18012422360248448, 'eval_f1': 0.18855656697009107, 'eval_accuracy': 0.5895294357302913, 'eval_runtime': 17.9065, 'eval_samples_per_second': 4.468, 'eval_steps_per_second': 0.279, 'epoch': 12.0}


                                                      
 43%|████▎     | 585/1350 [2:08:20<2:01:31,  9.53s/it]

{'eval_loss': 1.5677850246429443, 'eval_precision': 0.18811881188118812, 'eval_recall': 0.16521739130434782, 'eval_f1': 0.17592592592592593, 'eval_accuracy': 0.5764921572621715, 'eval_runtime': 22.0503, 'eval_samples_per_second': 3.628, 'eval_steps_per_second': 0.227, 'epoch': 13.0}


                                                      
 47%|████▋     | 630/1350 [2:14:24<1:56:05,  9.67s/it]

{'eval_loss': 1.5649099349975586, 'eval_precision': 0.20136986301369864, 'eval_recall': 0.1826086956521739, 'eval_f1': 0.19153094462540718, 'eval_accuracy': 0.591566510490935, 'eval_runtime': 21.8067, 'eval_samples_per_second': 3.669, 'eval_steps_per_second': 0.229, 'epoch': 14.0}


                                                        
 50%|█████     | 675/1350 [2:31:01<1:31:12,  8.11s/it]

{'eval_loss': 1.6145893335342407, 'eval_precision': 0.20887728459530025, 'eval_recall': 0.19875776397515527, 'eval_f1': 0.20369191597708464, 'eval_accuracy': 0.5946221226319006, 'eval_runtime': 8.8725, 'eval_samples_per_second': 9.017, 'eval_steps_per_second': 0.564, 'epoch': 15.0}


                                                      
 53%|█████▎    | 720/1350 [2:36:11<1:35:11,  9.07s/it]

{'eval_loss': 1.6821467876434326, 'eval_precision': 0.20168067226890757, 'eval_recall': 0.17888198757763976, 'eval_f1': 0.18959842001316654, 'eval_accuracy': 0.5866775310653901, 'eval_runtime': 23.005, 'eval_samples_per_second': 3.478, 'eval_steps_per_second': 0.217, 'epoch': 16.0}


                                                      
 57%|█████▋    | 765/1350 [2:43:55<1:30:09,  9.25s/it]

{'eval_loss': 1.7209333181381226, 'eval_precision': 0.1958762886597938, 'eval_recall': 0.16521739130434782, 'eval_f1': 0.1792452830188679, 'eval_accuracy': 0.5730291301690772, 'eval_runtime': 20.0294, 'eval_samples_per_second': 3.994, 'eval_steps_per_second': 0.25, 'epoch': 17.0}


                                                      
 60%|██████    | 810/1350 [2:50:49<1:37:12, 10.80s/it]

{'eval_loss': 1.7061996459960938, 'eval_precision': 0.18115942028985507, 'eval_recall': 0.15527950310559005, 'eval_f1': 0.16722408026755853, 'eval_accuracy': 0.5891220207781626, 'eval_runtime': 22.7963, 'eval_samples_per_second': 3.509, 'eval_steps_per_second': 0.219, 'epoch': 18.0}


                                                      
 63%|██████▎   | 855/1350 [2:57:53<1:14:58,  9.09s/it]

{'eval_loss': 1.7474794387817383, 'eval_precision': 0.20854922279792745, 'eval_recall': 0.2, 'eval_f1': 0.20418516169942927, 'eval_accuracy': 0.5921776329191282, 'eval_runtime': 20.8093, 'eval_samples_per_second': 3.844, 'eval_steps_per_second': 0.24, 'epoch': 19.0}


                                                        
 67%|██████▋   | 900/1350 [3:29:41<44:12,  5.90s/it]

{'eval_loss': 1.710153341293335, 'eval_precision': 0.21882951653944022, 'eval_recall': 0.21366459627329193, 'eval_f1': 0.21621621621621623, 'eval_accuracy': 0.6007333469138317, 'eval_runtime': 18.0665, 'eval_samples_per_second': 4.428, 'eval_steps_per_second': 0.277, 'epoch': 20.0}


                                                      
 70%|███████   | 945/1350 [3:36:11<52:22,  7.76s/it]

{'eval_loss': 1.8074556589126587, 'eval_precision': 0.22476446837146702, 'eval_recall': 0.20745341614906831, 'eval_f1': 0.2157622739018088, 'eval_accuracy': 0.5917702179669994, 'eval_runtime': 19.3084, 'eval_samples_per_second': 4.143, 'eval_steps_per_second': 0.259, 'epoch': 21.0}


                                                      
 73%|███████▎  | 990/1350 [3:42:37<48:25,  8.07s/it]

{'eval_loss': 1.8406864404678345, 'eval_precision': 0.2127659574468085, 'eval_recall': 0.19875776397515527, 'eval_f1': 0.20552344251766216, 'eval_accuracy': 0.5942147076797718, 'eval_runtime': 18.9198, 'eval_samples_per_second': 4.228, 'eval_steps_per_second': 0.264, 'epoch': 22.0}


 74%|███████▍  | 1000/1350 [3:44:05<47:42,  8.18s/it] 

{'loss': 0.2943, 'grad_norm': 4.058948040008545, 'learning_rate': 5.185185185185185e-06, 'epoch': 22.22}


                                                     
 77%|███████▋  | 1035/1350 [3:49:17<46:39,  8.89s/it]

{'eval_loss': 1.863502860069275, 'eval_precision': 0.21578947368421053, 'eval_recall': 0.20372670807453416, 'eval_f1': 0.20958466453674118, 'eval_accuracy': 0.5899368506824201, 'eval_runtime': 18.7478, 'eval_samples_per_second': 4.267, 'eval_steps_per_second': 0.267, 'epoch': 23.0}


                                                       
 80%|████████  | 1080/1350 [3:55:56<35:34,  7.91s/it]

{'eval_loss': 1.8469524383544922, 'eval_precision': 0.21119592875318066, 'eval_recall': 0.2062111801242236, 'eval_f1': 0.2086737900691389, 'eval_accuracy': 0.595436952536158, 'eval_runtime': 18.9262, 'eval_samples_per_second': 4.227, 'eval_steps_per_second': 0.264, 'epoch': 24.0}


                                                       
 83%|████████▎ | 1125/1350 [4:02:34<31:04,  8.28s/it]

{'eval_loss': 1.8539445400238037, 'eval_precision': 0.21510883482714468, 'eval_recall': 0.20869565217391303, 'eval_f1': 0.21185372005044137, 'eval_accuracy': 0.5995111020574455, 'eval_runtime': 18.7819, 'eval_samples_per_second': 4.259, 'eval_steps_per_second': 0.266, 'epoch': 25.0}


                                                     
 87%|████████▋ | 1170/1350 [4:09:11<24:21,  8.12s/it]

{'eval_loss': 1.871177077293396, 'eval_precision': 0.21739130434782608, 'eval_recall': 0.21739130434782608, 'eval_f1': 0.21739130434782608, 'eval_accuracy': 0.6009370543898961, 'eval_runtime': 18.9467, 'eval_samples_per_second': 4.222, 'eval_steps_per_second': 0.264, 'epoch': 26.0}


                                                     
 90%|█████████ | 1215/1350 [4:15:51<18:13,  8.10s/it]

{'eval_loss': 1.9016809463500977, 'eval_precision': 0.22391857506361323, 'eval_recall': 0.2186335403726708, 'eval_f1': 0.22124450031426776, 'eval_accuracy': 0.6007333469138317, 'eval_runtime': 18.6769, 'eval_samples_per_second': 4.283, 'eval_steps_per_second': 0.268, 'epoch': 27.0}


                                                     
 93%|█████████▎| 1260/1350 [4:23:03<12:41,  8.47s/it]

{'eval_loss': 1.8991972208023071, 'eval_precision': 0.2141057934508816, 'eval_recall': 0.2111801242236025, 'eval_f1': 0.21263289555972484, 'eval_accuracy': 0.5984925646771236, 'eval_runtime': 23.1177, 'eval_samples_per_second': 3.461, 'eval_steps_per_second': 0.216, 'epoch': 28.0}


                                                     
 97%|█████████▋| 1305/1350 [4:30:37<07:05,  9.46s/it]

{'eval_loss': 1.8973066806793213, 'eval_precision': 0.22025316455696203, 'eval_recall': 0.21614906832298136, 'eval_f1': 0.2181818181818182, 'eval_accuracy': 0.5997148095335099, 'eval_runtime': 20.8033, 'eval_samples_per_second': 3.846, 'eval_steps_per_second': 0.24, 'epoch': 29.0}


                                                     
100%|██████████| 1350/1350 [4:39:04<00:00, 12.40s/it]


{'eval_loss': 1.9132938385009766, 'eval_precision': 0.2236180904522613, 'eval_recall': 0.22111801242236026, 'eval_f1': 0.22236102435977514, 'eval_accuracy': 0.598696272153188, 'eval_runtime': 25.8556, 'eval_samples_per_second': 3.094, 'eval_steps_per_second': 0.193, 'epoch': 30.0}
{'train_runtime': 16744.7003, 'train_samples_per_second': 1.29, 'train_steps_per_second': 0.081, 'train_loss': 0.43938456782588253, 'epoch': 30.0}


TrainOutput(global_step=1350, training_loss=0.43938456782588253, metrics={'train_runtime': 16744.7003, 'train_samples_per_second': 1.29, 'train_steps_per_second': 0.081, 'total_flos': 927596423088192.0, 'train_loss': 0.43938456782588253, 'epoch': 30.0})

# Save model

In [32]:
model.save_pretrained("ner_model")
tokenizer.save_pretrained("tokenizer")

('tokenizer\\tokenizer_config.json',
 'tokenizer\\special_tokens_map.json',
 'tokenizer\\vocab.txt',
 'tokenizer\\added_tokens.json',
 'tokenizer\\tokenizer.json')

In [33]:
id2label = {
    str(i): label for i,label in enumerate(label_list)
}
label2id = {
    label: str(i) for i,label in enumerate(label_list)
}

config = json.load(open("ner_model/config.json"))
config["id2label"] = id2label
config["label2id"] = label2id

# Model Prediction

In [34]:
json.dump(config, open("ner_model/config.json","w"))

In [35]:
model_fine_tuned = AutoModelForTokenClassification.from_pretrained("ner_model")

In [36]:
nlp = pipeline("ner", model=model_fine_tuned, tokenizer=tokenizer)

example = "The ATM at Bank XYZ was super convenient and had English instructions."

ner_results = nlp(example)

for i in ner_results:
	print(i)

{'entity': 'I-CON', 'score': 0.7842831, 'index': 1, 'word': 'the', 'start': 0, 'end': 3}
{'entity': 'I-CON', 'score': 0.9835593, 'index': 2, 'word': 'atm', 'start': 4, 'end': 7}
{'entity': 'I-CON', 'score': 0.93292665, 'index': 3, 'word': 'at', 'start': 8, 'end': 10}
{'entity': 'I-CON', 'score': 0.9725945, 'index': 4, 'word': 'bank', 'start': 11, 'end': 15}
{'entity': 'I-CON', 'score': 0.51641023, 'index': 5, 'word': 'x', 'start': 16, 'end': 17}
{'entity': 'I-CON', 'score': 0.48383063, 'index': 6, 'word': '##y', 'start': 17, 'end': 18}
{'entity': 'I-CON', 'score': 0.81034875, 'index': 7, 'word': '##z', 'start': 18, 'end': 19}
{'entity': 'I-CON', 'score': 0.9212654, 'index': 8, 'word': 'was', 'start': 20, 'end': 23}
{'entity': 'I-CON', 'score': 0.98793334, 'index': 9, 'word': 'super', 'start': 24, 'end': 29}
{'entity': 'I-CON', 'score': 0.99185646, 'index': 10, 'word': 'convenient', 'start': 30, 'end': 40}
{'entity': 'I-CON', 'score': 0.6610174, 'index': 11, 'word': 'and', 'start': 41, 

In [None]:
nlp = pipeline("ner", model=model_fine_tuned, tokenizer=tokenizer)

example = "I appreciated how quickly my transaction was processed by the teller."

ner_results = nlp(example)

for i in ner_results:
	print(i)