In [152]:
%pip install transformers -q
%pip install datasets transformers evaluate -q
%pip install accelerate -U



In [153]:
import numpy as np
from transformers import BertTokenizerFast
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification
from datasets import Dataset, DatasetDict
from transformers import TrainingArguments, Trainer, pipeline

from transformers import TFRobertaForTokenClassification, RobertaTokenizerFast, DataCollatorForTokenClassification, create_optimizer, TFBertForSequenceClassification

In [154]:
import json

with open("annotations.json", "r") as json_file:
  data = json.load(json_file)

In [155]:
data


{'classes': ['NONE',
  'B-ORG',
  'I-ORG',
  'B-DATE',
  'I-DATE',
  'B-TIME',
  'I-TIME',
  'B-PER',
  'I-PER',
  'B-CURR',
  'I-CURR',
  'B-LOC',
  'I-LOC',
  'B-AGRE',
  'I-AGRE',
  'B-MISC',
  'I-MISC'],
 'annotations': [['EXECUTION VERSION Inveniam Private Equity Fund Demo IV. . 8500 World Trade New York, New York 10022 January 1, 2019 Maria Sharapova 31 Blandford Street London, W1U 3DN Ladies and Gentlemen: RE: Inveniam Private Equity Fund Demo IV, a Delaware limited partnership (the "Fund") This letter agreement (this "Letter Agreement") is entered into in connection with the purchase by Sharapova (the "Investor") of a limited partnership interest in the Fund. Capitalized terms used and not defined herein shall have the meanings given to them in the Fourth Amended and Restated Limited Partnership Agreement of the Fund dated as of August 27, 2021 (the "Limited Partnership Agreement"), the Subscription Agreement between the Investor and the Fund dated as of the date hereof (the "S

In [156]:
text = data["annotations"][0][0]
classes=data['classes']
tags=data['annotations'][0][1]['entities']

In [157]:
print('classes are -->', classes)
print('tag are -->', tags)

classes are --> ['NONE', 'B-ORG', 'I-ORG', 'B-DATE', 'I-DATE', 'B-TIME', 'I-TIME', 'B-PER', 'I-PER', 'B-CURR', 'I-CURR', 'B-LOC', 'I-LOC', 'B-AGRE', 'I-AGRE', 'B-MISC', 'I-MISC']
tag are --> [[18, 26, 'B-ORG'], [27, 34, 'I-ORG'], [35, 41, 'B-MISC'], [42, 46, 'I-MISC'], [47, 51, 'I-MISC'], [52, 55, 'I-MISC'], [58, 62, 'B-LOC'], [63, 68, 'I-LOC'], [69, 74, 'I-LOC'], [75, 78, 'I-LOC'], [79, 83, 'I-LOC'], [85, 88, 'B-LOC'], [89, 93, 'I-LOC'], [94, 99, 'I-LOC'], [100, 107, 'B-DATE'], [108, 109, 'I-DATE'], [111, 115, 'I-DATE'], [116, 121, 'B-PER'], [122, 131, 'I-PER'], [132, 134, 'B-LOC'], [135, 144, 'I-LOC'], [145, 151, 'I-LOC'], [152, 158, 'I-LOC'], [194, 202, 'B-ORG'], [203, 210, 'I-ORG'], [211, 217, 'B-MISC'], [218, 222, 'I-MISC'], [223, 227, 'I-MISC'], [228, 230, 'I-MISC'], [234, 242, 'B-AGRE'], [243, 250, 'I-AGRE'], [251, 262, 'I-AGRE'], [281, 287, 'B-AGRE'], [288, 297, 'I-AGRE'], [304, 311, 'B-AGRE'], [312, 322, 'I-AGRE'], [375, 384, 'B-PER'], [390, 400, 'B-MISC'], [407, 414, 'B-AGRE'

In [158]:
print((classes))

['NONE', 'B-ORG', 'I-ORG', 'B-DATE', 'I-DATE', 'B-TIME', 'I-TIME', 'B-PER', 'I-PER', 'B-CURR', 'I-CURR', 'B-LOC', 'I-LOC', 'B-AGRE', 'I-AGRE', 'B-MISC', 'I-MISC']


In [159]:
classes_dict = {}
class_length = len(classes)

for i in range(class_length):
  classes_dict[classes[i]] = i

In [160]:
# class_dict

In [161]:
tokens=[]
named_tags=[]

In [162]:

for tag in tags:
  tokens.append(text[tag[0]:tag[1]])
  named_tags.append(tag[2])

In [163]:
print('tokens are:   ',tokens)
print('named_tag are:', named_tags)

tokens are:    ['Inveniam', 'Private', 'Equity', 'Fund', 'Demo', 'IV.', '8500', 'World', 'Trade', 'New', 'York', 'New', 'York', '10022', 'January', '1', '2019', 'Maria', 'Sharapova', '31', 'Blandford', 'Street', 'London', 'Inveniam', 'Private', 'Equity', 'Fund', 'Demo', 'IV', 'Delaware', 'limited', 'partnership', 'letter', 'agreement', '"Letter', 'Agreement"', 'Sharapova', '"Investor"', 'limited', 'partnership', 'Amended', 'Restated', 'Limited', 'Partnership', 'Agreement', 'August', '27', '2021', '"Limited', 'Partnership', 'Agreement"', 'Subscription', 'Agreement', 'Investor', '"Subscription', 'Agreement"', '$', '50,000,000', '"Capital', 'Commitment"', 'Inveniam', 'Capital', 'Partners', 'GP', 'L.L.C.', 'Delaware', 'limited', 'liability', 'company', '"General', 'Partner"', 'Investor', 'Investor', 'January', '1', '2019', '0.85', '%', 'General', 'Partner', 'Investor', 'Other', 'Agreements', 'Comparable', 'Investors', 'Inveniam', 'Private', 'Equity', 'Fund', 'Demo', 'IV.', '"Sister', 'Fund

In [164]:
dictionary=dict(zip(tokens,named_tags))

In [165]:
print(dictionary)

{'Inveniam': 'B-ORG', 'Private': 'I-ORG', 'Equity': 'B-MISC', 'Fund': 'I-ORG', 'Demo': 'I-MISC', 'IV.': 'I-MISC', '8500': 'B-LOC', 'World': 'I-LOC', 'Trade': 'I-LOC', 'New': 'B-LOC', 'York': 'I-LOC', '10022': 'I-LOC', 'January': 'B-DATE', '1': 'I-DATE', '2019': 'I-DATE', 'Maria': 'B-PER', 'Sharapova': 'B-PER', '31': 'B-LOC', 'Blandford': 'I-LOC', 'Street': 'I-LOC', 'London': 'I-LOC', 'IV': 'I-MISC', 'Delaware': 'B-ORG', 'limited': 'B-MISC', 'partnership': 'I-AGRE', 'letter': 'B-AGRE', 'agreement': 'I-AGRE', '"Letter': 'B-AGRE', 'Agreement"': 'I-AGRE', '"Investor"': 'B-MISC', 'Amended': 'B-AGRE', 'Restated': 'I-AGRE', 'Limited': 'B-MISC', 'Partnership': 'I-AGRE', 'Agreement': 'I-AGRE', 'August': 'B-DATE', '27': 'I-DATE', '2021': 'I-DATE', '"Limited': 'B-AGRE', 'Subscription': 'I-AGRE', 'Investor': 'B-MISC', '"Subscription': 'B-AGRE', '$': 'B-CURR', '50,000,000': 'I-CURR', '"Capital': 'B-AGRE', 'Commitment"': 'I-AGRE', 'Capital': 'I-MISC', 'Partners': 'I-MISC', 'GP': 'I-ORG', 'L.L.C.': '

In [166]:
all_tokens=text.split(" ")
named_label=[]
number_label=[]

In [167]:
for token in all_tokens:
  if token not in dictionary:
    named_label.append("NONE")
  else:
    named_label.append(dictionary[token])

In [168]:
for label in named_label:
  number_label.append(classes_dict[label])

In [169]:
print("anno_dict",dictionary)
print("tokens",all_tokens)
print("named_label", named_label)
print("number",number_label)

anno_dict {'Inveniam': 'B-ORG', 'Private': 'I-ORG', 'Equity': 'B-MISC', 'Fund': 'I-ORG', 'Demo': 'I-MISC', 'IV.': 'I-MISC', '8500': 'B-LOC', 'World': 'I-LOC', 'Trade': 'I-LOC', 'New': 'B-LOC', 'York': 'I-LOC', '10022': 'I-LOC', 'January': 'B-DATE', '1': 'I-DATE', '2019': 'I-DATE', 'Maria': 'B-PER', 'Sharapova': 'B-PER', '31': 'B-LOC', 'Blandford': 'I-LOC', 'Street': 'I-LOC', 'London': 'I-LOC', 'IV': 'I-MISC', 'Delaware': 'B-ORG', 'limited': 'B-MISC', 'partnership': 'I-AGRE', 'letter': 'B-AGRE', 'agreement': 'I-AGRE', '"Letter': 'B-AGRE', 'Agreement"': 'I-AGRE', '"Investor"': 'B-MISC', 'Amended': 'B-AGRE', 'Restated': 'I-AGRE', 'Limited': 'B-MISC', 'Partnership': 'I-AGRE', 'Agreement': 'I-AGRE', 'August': 'B-DATE', '27': 'I-DATE', '2021': 'I-DATE', '"Limited': 'B-AGRE', 'Subscription': 'I-AGRE', 'Investor': 'B-MISC', '"Subscription': 'B-AGRE', '$': 'B-CURR', '50,000,000': 'I-CURR', '"Capital': 'B-AGRE', 'Commitment"': 'I-AGRE', 'Capital': 'I-MISC', 'Partners': 'I-MISC', 'GP': 'I-ORG', '

In [170]:
tokens_array=[]
label_array=[]

temp1=[]
temp2=[]
for i in range(len(all_tokens)):

   temp1.append(all_tokens[i])
   temp2.append(number_label[i])
   if i%10==0 and i!=0:
    tokens_array.append(temp1)
    label_array.append(temp2)
    temp1=[]
    temp2=[]
tokens_array.append(temp1)
label_array.append(temp2)

In [171]:
print(tokens_array)
print(label_array)

[['EXECUTION', 'VERSION', 'Inveniam', 'Private', 'Equity', 'Fund', 'Demo', 'IV.', '.', '8500', 'World'], ['Trade', 'New', 'York,', 'New', 'York', '10022', 'January', '1,', '2019', 'Maria'], ['Sharapova', '31', 'Blandford', 'Street', 'London,', 'W1U', '3DN', 'Ladies', 'and', 'Gentlemen:'], ['RE:', 'Inveniam', 'Private', 'Equity', 'Fund', 'Demo', 'IV,', 'a', 'Delaware', 'limited'], ['partnership', '(the', '"Fund")', 'This', 'letter', 'agreement', '(this', '"Letter', 'Agreement")', 'is'], ['entered', 'into', 'in', 'connection', 'with', 'the', 'purchase', 'by', 'Sharapova', '(the'], ['"Investor")', 'of', 'a', 'limited', 'partnership', 'interest', 'in', 'the', 'Fund.', 'Capitalized'], ['terms', 'used', 'and', 'not', 'defined', 'herein', 'shall', 'have', 'the', 'meanings'], ['given', 'to', 'them', 'in', 'the', 'Fourth', 'Amended', 'and', 'Restated', 'Limited'], ['Partnership', 'Agreement', 'of', 'the', 'Fund', 'dated', 'as', 'of', 'August', '27,'], ['2021', '(the', '"Limited', 'Partnership',

In [172]:
total_data=len(tokens_array)

In [173]:
total_data

250

In [174]:
data_dict_train = {
    "tokens" : tokens_array[0:200],
    "labels" : label_array[0:200]
}

data_dict_val = {
    "tokens" : tokens_array[200:225],
    "labels" : label_array[200:225]
}

data_dict_test = {
    "tokens" : tokens_array[225:],
    "labels" : label_array[225:]
}

dataset_train = Dataset.from_dict(data_dict_train)
dataset_val = Dataset.from_dict(data_dict_val)
dataset_test = Dataset.from_dict(data_dict_test)

dataset_dict = DatasetDict({"train": dataset_train, "validate": dataset_val, "test": dataset_test})

print(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 200
    })
    validate: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 25
    })
    test: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 25
    })
})


In [175]:

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

In [176]:
inputs = tokenizer(dataset_dict["train"][15]["tokens"], is_split_into_words=True,)
tokens=tokenizer.convert_ids_to_tokens(inputs["input_ids"])
# print(tokens)
print(inputs.word_ids())


[None, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 5, 6, 6, 7, 7, 8, 8, 8, 8, 9, 9, 9, None]


In [177]:
inputs

{'input_ids': [101, 1002, 2753, 1010, 2199, 1010, 2199, 2004, 1997, 1996, 3058, 2182, 11253, 1006, 1996, 1000, 3007, 8426, 1000, 1007, 1010, 1999, 8159, 25107, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [178]:
def align_labels_with_tokens(labels, word_ids):
    for label in labels:
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(labels[word_idx])
            else:
                label_ids.append(labels[word_idx])
            previous_word_idx = word_idx

        return label_ids

In [179]:
labels = dataset_dict["train"][11]["labels"]
inputs = tokenizer(dataset_dict["train"][11]["tokens"], is_split_into_words=True,)
word_ids = inputs.word_ids()
print(dataset_dict["train"][11]["tokens"])
print(align_labels_with_tokens(labels, word_ids))
print(labels)

['Investor', 'and', 'the', 'Fund', 'dated', 'as', 'of', 'the', 'date', 'hereof']
[-100, 15, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, -100]
[15, 0, 0, 2, 0, 0, 0, 0, 0, 0]


In [180]:
example_tokens = dataset_dict["train"][11]
inputs = tokenizer(example_tokens["tokens"], is_split_into_words=True,)
print("Miss Matched Length!!")
print(len(inputs.tokens()))
print(len(example_tokens["labels"]))
print("Label matching successful!!")
print(len(align_labels_with_tokens(example_tokens["labels"], inputs.word_ids())))


Miss Matched Length!!
13
10
Label matching successful!!
13


In [181]:
def tokenizer_function(dataset):
  tokenized_dataset = tokenizer(dataset['tokens'],truncation=True,is_split_into_words=True,)
  tokenized_dataset['labels'] = align_labels_with_tokens(dataset['labels'],tokenized_dataset.word_ids())
  return tokenized_dataset


In [182]:
dataset_dict['train'][15]



{'tokens': ['$50,000,000',
  'as',
  'of',
  'the',
  'date',
  'hereof',
  '(the',
  '"Capital',
  'Commitment"),',
  'Inveniam'],
 'labels': [0, 0, 0, 0, 0, 0, 0, 13, 0, 1]}

In [183]:
# tokenized_dataset = data_dataset.map(tokenizer_function, remove_columns=['tokens'])
type(dataset_dict)

datasets.dataset_dict.DatasetDict

In [184]:
dataset_dict["train"].features["tokens"]

Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)

In [185]:
dataset_dict['train'].description


''

In [186]:

tokenized_dataset = dataset_dict.map(tokenizer_function, remove_columns=['tokens'])

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

In [187]:
model = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels=17)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: 

In [188]:
args = TrainingArguments(
    "test-ner",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
    num_train_epochs=5,
    weight_decay=0.01,
)

In [189]:
data_collator = DataCollatorForTokenClassification(tokenizer)

In [190]:
trainer = Trainer(
    model,
    args,
   train_dataset=tokenized_dataset["train"],
   eval_dataset=tokenized_dataset["validate"],
   data_collator=data_collator,
   tokenizer=tokenizer,
)

In [191]:
model.save_pretrained("name_entity_model")

In [192]:
tokenizer.save_pretrained("tokenizer")

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/vocab.txt',
 'tokenizer/added_tokens.json',
 'tokenizer/tokenizer.json')

In [193]:
classes

['NONE',
 'B-ORG',
 'I-ORG',
 'B-DATE',
 'I-DATE',
 'B-TIME',
 'I-TIME',
 'B-PER',
 'I-PER',
 'B-CURR',
 'I-CURR',
 'B-LOC',
 'I-LOC',
 'B-AGRE',
 'I-AGRE',
 'B-MISC',
 'I-MISC']

In [194]:
id2label = {
    str(i): label for i,label in enumerate(classes)
}
label2id = {
    label: str(i) for i,label in enumerate(classes)
}

In [195]:

id2label



{'0': 'NONE',
 '1': 'B-ORG',
 '2': 'I-ORG',
 '3': 'B-DATE',
 '4': 'I-DATE',
 '5': 'B-TIME',
 '6': 'I-TIME',
 '7': 'B-PER',
 '8': 'I-PER',
 '9': 'B-CURR',
 '10': 'I-CURR',
 '11': 'B-LOC',
 '12': 'I-LOC',
 '13': 'B-AGRE',
 '14': 'I-AGRE',
 '15': 'B-MISC',
 '16': 'I-MISC'}

In [196]:
import json
config = json.load(open("name_entity_model/config.json"))


In [197]:
config["id2label"] = id2label
config["label2id"] = label2id


In [198]:
json.dump(config, open("name_entity_model/config.json","w"))

In [199]:
model_fine_tuned = AutoModelForTokenClassification.from_pretrained("name_entity_model")

In [200]:
nlp = pipeline("ner", model=model_fine_tuned, tokenizer=tokenizer)

example = '''I'm Abhishek Ray employee of Vara Infrovate Pvt Ltd working as a  junior Associate from 10 july 2023'''

ner_results = nlp(example)

print(ner_results)

[{'entity': 'I-PER', 'score': 0.08692419, 'index': 1, 'word': 'i', 'start': 0, 'end': 1}, {'entity': 'B-CURR', 'score': 0.07575783, 'index': 2, 'word': "'", 'start': 1, 'end': 2}, {'entity': 'B-MISC', 'score': 0.11683419, 'index': 3, 'word': 'm', 'start': 2, 'end': 3}, {'entity': 'I-TIME', 'score': 0.088485666, 'index': 4, 'word': 'ab', 'start': 4, 'end': 6}, {'entity': 'B-PER', 'score': 0.091399714, 'index': 5, 'word': '##his', 'start': 6, 'end': 9}, {'entity': 'B-MISC', 'score': 0.10221719, 'index': 6, 'word': '##he', 'start': 9, 'end': 11}, {'entity': 'I-TIME', 'score': 0.095157586, 'index': 7, 'word': '##k', 'start': 11, 'end': 12}, {'entity': 'I-TIME', 'score': 0.09758566, 'index': 8, 'word': 'ray', 'start': 13, 'end': 16}, {'entity': 'B-ORG', 'score': 0.08936738, 'index': 9, 'word': 'employee', 'start': 17, 'end': 25}, {'entity': 'B-ORG', 'score': 0.12706904, 'index': 10, 'word': 'of', 'start': 26, 'end': 28}, {'entity': 'B-ORG', 'score': 0.10845241, 'index': 11, 'word': 'var', '