 # NER MODEL - RoBERTa

## Imports

In [2]:
# ! pip install transformers
# ! pip install datasets
# ! pip install evaluate
# ! pip install seqeval

In [3]:
import json
import os
import numpy as np
import pandas as pd
import datasets
import transformers
import evaluate
import tensorflow as tf
from google.colab import drive
drive.mount('/content/drive')
from google.colab import runtime

Mounted at /content/drive


## Data Preparation

In [4]:
datasetFolderPath = "/content/drive/MyDrive/ColabNotebooks/ScoreIT/Dataset/"
jsonFilePath = datasetFolderPath + "ResumeDataset.json"
csvFilePath = datasetFolderPath + "ResumeDataset.csv"
labelEncoderDecoderPath = datasetFolderPath + "LabelEncoderDecoder.json"

In [5]:
labelEncoderDecoder = json.load(open(labelEncoderDecoderPath))
labelEncoder = labelEncoderDecoder['labelEncoder']
labelDecoder = labelEncoderDecoder['labelDecoder']

In [6]:
len(labelEncoder), len(labelDecoder)

(73, 73)

In [7]:
dataset = datasets.Dataset.from_json(jsonFilePath)
dataset

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['id', 'text', 'annotations', 'input', 'NER_LABELS', 'NER_TAGS'],
    num_rows: 349
})

In [8]:
modelID = 'roberta-base'
tokenizer = transformers.RobertaTokenizerFast.from_pretrained(modelID, add_prefix_space=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [9]:
def alignLabelsWithTokens(labels, word_ids):
    newLabels = []
    currentWord = None
    for wordID in word_ids:
        if wordID is None:
            newLabels.append(-100)
        elif wordID != currentWord: # New Word
            currentWord = wordID
            newLabels.append(labels[wordID])
        else: # Same Word
            label = labels[wordID]
            newLabels.append(label)
    return newLabels

In [10]:
def tokenizerFunction(dataset):
    tokenized = tokenizer(dataset['input'], truncation=True, is_split_into_words=True, max_length=512)
    tokenized['labels'] =  alignLabelsWithTokens(dataset['NER_TAGS'], tokenized.word_ids())
    return tokenized

In [11]:
tokenizedDataset = dataset.map(tokenizerFunction, remove_columns=['id', 'input', 'NER_LABELS', 'NER_TAGS', 'text', 'annotations'])

Map:   0%|          | 0/349 [00:00<?, ? examples/s]

In [12]:
tokenizedDataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 349
})

In [13]:
dataCollator = transformers.DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors='tf')

In [14]:
BATCH_SIZE = 4

In [15]:
tfDataset = tokenizedDataset.to_tf_dataset(
    shuffle=True,
    batch_size=BATCH_SIZE,
    collate_fn=dataCollator,
)

In [16]:
trainDataset = tfDataset.take(int(len(tfDataset) * 0.9))
remDataset = tfDataset.skip(int(len(tfDataset) * 0.9))
valDataset = remDataset.take(int(len(remDataset) * 0.5))
testDataset = remDataset.skip(int(len(remDataset) * 0.5))

In [17]:
len(trainDataset), len(valDataset), len(testDataset)

(79, 4, 5)

## Modeling

In [18]:
modelID = 'roberta-base'
model = transformers.TFAutoModelForTokenClassification.from_pretrained(modelID, num_labels=73)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForTokenClassification: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing TFRobertaForTokenClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForTokenClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
model.summary()

Model: "tf_roberta_for_token_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 roberta (TFRobertaMainLaye  multiple                  124055040 
 r)                                                              
                                                                 
 dropout_37 (Dropout)        multiple                  0 (unused)
                                                                 
 classifier (Dense)          multiple                  56137     
                                                                 
Total params: 124111177 (473.45 MB)
Trainable params: 124111177 (473.45 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


## Training

In [20]:
earlyStopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=6, restore_best_weights=True)

In [21]:
learningRateScheduler = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, verbose=1)

In [22]:
NUM_EPOCHS = 200
batchesPerEpoch = len(tokenizedDataset) // BATCH_SIZE
totalTrainSteps = batchesPerEpoch * NUM_EPOCHS
optimizer, schedule = transformers.create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=totalTrainSteps)

In [23]:
model.compile(optimizer=optimizer, metrics=['accuracy'])

In [24]:
history = model.fit(
    trainDataset,
    validation_data=valDataset,
    epochs=NUM_EPOCHS,
    callbacks=[earlyStopping, learningRateScheduler]
)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 10: ReduceLROnPlateau reducing learning rate to 9.54655115492642e-06.
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 19: ReduceLROnPlateau reducing learning rate to 9.137930646829773e-06.
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 29: ReduceLROnPlateau reducing learning rate to 8.683908163220622e-06.
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 35: ReduceLROnPlateau reducing learning rate to 8.41149449115619e-06.
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 43: ReduceLROnPlateau reducing learning rate to 8.048275958572049e-06.
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Ep

In [25]:
modelFolderPath = "/content/drive/MyDrive/ColabNotebooks/ScoreIT/Models/"
model.save_pretrained(modelFolderPath + "ResumeNERModel-RoBERTaBase")

## Testing

In [26]:
modelID = 'roberta-base'
modelFolderPath = "/content/drive/MyDrive/ColabNotebooks/ScoreIT/Models/"
modelPath = modelFolderPath + "ResumeNERModel-RoBERTaBase"
tokenizer = transformers.RobertaTokenizerFast.from_pretrained(modelID, add_prefix_space=True)
model = transformers.TFAutoModelForTokenClassification.from_pretrained(modelPath)

Some layers from the model checkpoint at /content/drive/MyDrive/ColabNotebooks/ScoreIT/Models/ResumeNERModel-RoBERTaBase were not used when initializing TFRobertaForTokenClassification: ['dropout_37']
- This IS expected if you are initializing TFRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaForTokenClassification were initialized from the model checkpoint at /content/drive/MyDrive/ColabNotebooks/ScoreIT/Models/ResumeNERModel-RoBERTaBase.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForTokenClassifi

In [27]:
def getClasses(labelEncoder):
    classes = set()
    classes.add('O')
    for label in labelEncoder.keys():
        if label == 'O':
            continue
        l = label[2:]
        classes.add(l)
    classes = list(classes)
    return classes

In [44]:
def getResults(inputText, predictions, labelEncoderDecoder):
    labelEncoder = labelEncoderDecoder['labelEncoder']
    labelDecoder = labelEncoderDecoder['labelDecoder']
    labelDecoder = {int(k): v for k, v in labelDecoder.items()}
    classes = getClasses(labelEncoder)
    results = {}
    for label in classes:
        results[label] = ""
    for i, pred in enumerate(predictions):
        pred = labelDecoder[pred]
        if pred == 'O':
            results['O'] += tokenizer.decode(inputText['input_ids'][0][i])
        else:
            prefix = pred[0]
            pred = pred[2:]
            if prefix == 'I':
                results[pred] += tokenizer.decode(inputText['input_ids'][0][i])
            else:
                results[pred] = '\n'
                results[pred] = tokenizer.decode(inputText['input_ids'][0][i])
    return results

In [45]:
def infer(text, modelID, modelPath, labelEncoderDecoder):
    tokenizer = transformers.RobertaTokenizerFast.from_pretrained(modelID, add_prefix_space=True)
    model = transformers.TFAutoModelForTokenClassification.from_pretrained(modelPath)

    if isinstance(text, str):
        text = text.split(" ")
    inputText = tokenizer(text, is_split_into_words=True, truncation=True, padding='max_length', max_length=512, return_tensors='tf')

    predictions = model(**inputText).logits
    predictions = tf.argmax(predictions, axis=2).numpy()
    predictions = np.squeeze(predictions)

    results = getResults(inputText, predictions, labelEncoderDecoder)
    return results

In [46]:
testInputs = ["""
AMAN JAIN
Santa Clara, CA, USA | amanjn2003@gmail.com | +1 (425)380–6319
LinkedIn | GitHub | VISA Status: F1

EDUCATION

Santa Clara University, CA, USA (Sep 2024 – Present)
Master of Science in Computer Science
GPA: -/4.0
Related Coursework: Design and Analysis of Algorithms, Computer Architecture.

Jaypee Institute of Information Technology, India (Sep 2020 – May 2024)
Bachelor’s in Computer Science & Engineering
GPA: 3.2/4.0
Related Coursework: Software Development Fundamentals, Data Structures & Algorithms, Object Oriented Design, Operating Systems,
Computer Organization & Architecture, Computer Networks, Statistics & Probability, Soft Computing, Deep Learning, NLP & Big Data.

TECHNICAL SKILLS

Programming Languages: Python, C, C++, HTML, JavaScript, SQL
Tools & Technologies: Machine Learning, Deep Learning, Computer Vision, TensorFlow, Keras, Git, Google Cloud Platform & AWS

EXPERIENCE

AI/ML HUB OF JIIT | Founder & Instructor | Noida, India (Jan 2024 – May 2024)
• Headed the team for founding the first AI/ML hub at Jaypee Institute.
• Managed 50+ team members and delivered weekly ML training to 100 students.
• Created a 500+ member online AI/ML community.
• Recognized by the Vice-Chancellor for contributions to AI/ML awareness.
• Currently serving as an advisor to the hub.

TALLY SOLUTIONS | Machine Learning Intern | Bangalore, India (Jun 2023 – Jul 2023)
• Built an invoice parser converting invoice images to JSON using ML models.
• Worked with Google Cloud (Vertex AI, Document AI) and AWS (SageMaker).
• Optimized model compatibility and documented CPU/GPU performance.

ACHIEVEMENTS

Amazon ML Hackathon 2023 | Ranked 93rd out of 5000 participants
• Built LSTM model predicting product dimensions from a 2.2M sample dataset in 3 days.
• Gained experience in collaboration, ML, and leadership.

PROJECTS

Revive AI – Image Resolution Up-scaler (Jun 2024 – Jul 2024)
• Used SR-CNN and VDSR models on 85K+ images from LSDIR dataset.
• Improved PSNR; deployed with Tkinter + INNO setup as Windows app.

Posture Guard – Sitting Posture Detection System (Jan 2024 – May 2024)
• Used computer vision and ML to classify sitting posture in real-time.
• Built custom dataset of 30,000 images in 5 classes.
• Trained deep CNNs on Nvidia DGX; deployed as background Windows app with alerts.

GitHub: https://github.com/AmanJain2903
LinkedIn: https://www.linkedin.com/in/aman-jain-7b6478224/
Revive AI: https://github.com/AmanJain2903/Revive-AI.git
Posture Guard: https://github.com/AmanJain2903/Posture-Guard.git
Hackathon Leaderboard: https://www.hackerearth.com/challenges/competitive/amazon-ml-challenge-2023/leaderboard/
"""]

In [47]:
text = testInputs[0].replace("\n", " ")
result = infer(text, modelID, modelPath, labelEncoderDecoder)

Some layers from the model checkpoint at /content/drive/MyDrive/ColabNotebooks/ScoreIT/Models/ResumeNERModel-RoBERTaBase were not used when initializing TFRobertaForTokenClassification: ['dropout_37']
- This IS expected if you are initializing TFRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaForTokenClassification were initialized from the model checkpoint at /content/drive/MyDrive/ColabNotebooks/ScoreIT/Models/ResumeNERModel-RoBERTaBase.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForTokenClassifi

In [48]:
for key, value in result.items():
  if value == '':
    continue
  print(f"{key}: {value}")

applying_expertise:  Improved PSNR Guard
basic_education: s in Computer Science
place_higher_education: pee Institute of Information Technology
influencing:  delivered weekly ML training to 100 students
work_cities:  Bangalore
work_with_people: aged 50+ team members
achievement: rd
O: <s> AMAN JAIN Clara, CA, USA | amanjn2003@gmail.com | +1 (425)380–6319 LinkedIn | GitHub | VISA Status: F1 EDUCATION, USA (Sep 2024 – Present) in Computer Science GPA: -/4.0 Related Coursework:, Architecture., India (Sep 2020 – May 2024)� & Engineering GPA: 3.2/4.0 Related Coursework:,,,,,,,,,. TECHNICAL SKILLS Programming Languages:,, Tools & Technologies:,,,,,, & EXPERIENCE AI/ML HUB OF JIIT | Founder & Instructor |, India ( –) • for founding the first AI/ at Jaypee Institute. • and. • Created a 500+ member online AI/ML community. • for contributions to AI/ML. • Currently serving as an advisor to the hub. | |, India () • converting invoice images to JSON using models. • Worked with Cloud (,) and (). • a

## Evaluation

In [49]:
metric = evaluate.load('seqeval')

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [50]:
allPreds = []
allLabels = []

In [57]:
for batch in testDataset:
    logits = model.predict(batch)['logits']
    labels = batch['labels'].numpy()
    predictions = tf.argmax(logits, axis=-1).numpy()
    batchPred = []
    batchLabels = []
    for i in range(len(predictions)):
        for j in range(len(predictions[i])):
            if labels[i][j] != -100:
                batchPred.append(labelDecoder[predictions[i][j]])
                batchLabels.append(labelDecoder[labels[i][j]])
    allPreds.append(batchPred)
    allLabels.append(batchLabels)



In [58]:
metric.compute(predictions=allPreds, references=allLabels)

{'achievement': {'precision': np.float64(0.7222222222222222),
  'recall': np.float64(0.6842105263157895),
  'f1': np.float64(0.7027027027027027),
  'number': np.int64(19)},
 'adaption_to_change': {'precision': np.float64(0.8888888888888888),
  'recall': np.float64(0.8),
  'f1': np.float64(0.8421052631578948),
  'number': np.int64(10)},
 'analyzing': {'precision': np.float64(0.8888888888888888),
  'recall': np.float64(0.8),
  'f1': np.float64(0.8421052631578948),
  'number': np.int64(10)},
 'applying_expertise': {'precision': np.float64(0.8648648648648649),
  'recall': np.float64(0.8648648648648649),
  'f1': np.float64(0.8648648648648649),
  'number': np.int64(37)},
 'basic_education': {'precision': np.float64(0.875),
  'recall': np.float64(0.875),
  'f1': np.float64(0.875),
  'number': np.int64(16)},
 'birth_date': {'precision': np.float64(1.0),
  'recall': np.float64(1.0),
  'f1': np.float64(1.0),
  'number': np.int64(4)},
 'candidate_city': {'precision': np.float64(0.9629629629629629

In [59]:
runtime.unassign()