# MODEL INFERENCE

In [129]:
import json
import os
import numpy as np
import pandas as pd
import datasets
import transformers
import evaluate
import tensorflow as tf
import regex as re

## Functions

In [130]:
def getClasses(labelEncoder):
    classes = set()
    classes.add('O')
    for label in labelEncoder.keys():
        if label == 'O':
            continue
        l = label[2:]
        classes.add(l)
    classes = list(classes)
    return classes

In [131]:
def getResults(predictions, labelEncoderDecoder, tokenizer, inputText):
    labelEncoder = labelEncoderDecoder['labelEncoder']
    labelDecoder = labelEncoderDecoder['labelDecoder']
    labelDecoder = {int(k): v for k, v in labelDecoder.items()}
    classes = getClasses(labelEncoder)
    results = {}
    for label in classes:
        results[label] = ""
    currIndex = 0
    while currIndex < len(predictions):
        pred = labelDecoder[predictions[currIndex]]
        if pred == 'O':
            results['O'] += tokenizer.decode(inputText['input_ids'][0][currIndex])
        else:
            pred = pred[2:]
            results[pred] += tokenizer.decode(inputText['input_ids'][0][currIndex])
            
        currIndex += 1
    return results

In [132]:
def extract_tokens_from_resume(text):
    # Remove non-breaking spaces and normalize
    text = text.replace("\xa0", " ").strip()

    # Split on words, numbers, and punctuation
    tokens = re.findall(r'\w+|[^\w\s]', text, re.UNICODE)
    return tokens

In [133]:
def infer(text, modelID, modelPath, labelEncoderDecoder):
    tokenizer = transformers.RobertaTokenizerFast.from_pretrained(modelID, add_prefix_space=True)
    model = transformers.TFAutoModelForTokenClassification.from_pretrained(modelPath)

    if not isinstance(text, list):
        text = extract_tokens_from_resume(text)
    inputText = tokenizer(text, is_split_into_words=True, truncation=True, padding='max_length', max_length=512, return_tensors='tf')

    predictions = model(**inputText).logits
    predictions = tf.argmax(predictions, axis=2).numpy()
    predictions = np.squeeze(predictions)

    results = getResults(predictions, labelEncoderDecoder, tokenizer, inputText)
    return results

In [134]:
def postprocess_resume_ner_output(raw_output):
    processed = {}
    multi_value_fields = {
        'designation', 'company_name', 'technical_skills',
        'soft-skills', 'work_year', 'achievement', 'certification',
        'work_cities', 'languages_known'
    }
    for label, value in raw_output.items():
        if not value or value.strip() == "":
            continue

        # Remove any stray special tokens or junk characters
        cleaned = value.replace("<s>", "").replace("</s>", "").replace("<pad>", "").strip()

        # Remove excessive whitespace and punctuation artifacts
        cleaned = " ".join(cleaned.split())
        cleaned = cleaned.replace(" .", ".").replace(" ,", ",").replace(" :", ":").replace(" %", "%")

        # Remove duplicate adjacent words
        cleaned = re.sub(r'\b(\w+)( \1\b)+', r'\1', cleaned)

        # Remove excess punctuation and fix spacing
        cleaned = re.sub(r'\s+', ' ', cleaned)
        cleaned = re.sub(r'[\-\–\—]{2,}', '-', cleaned)
        cleaned = re.sub(r'([,:;])\1+', r'\1', cleaned)

        processed[label] = cleaned

    return processed

# Inference

In [135]:
modelID = 'roberta-base'
modelPath = 'Models/ResumeNERModel-RoBERTaBase'
labelEncoderDecoderPath = 'Dataset/LabelEncoderDecoder.json'
labelEncoderDecoder = json.load(open(labelEncoderDecoderPath, 'r'))

In [136]:
testInputs = ["""
John Doe  
1234 Innovation Way  
San Jose, CA 95129  
Email: john.doe@example.com  
Phone: +1-408-555-1234  
LinkedIn: linkedin.com/in/johndoe  
GitHub: github.com/johndoe

---

Objective  
To obtain a challenging position as a Software Engineer where I can contribute to innovative projects using my skills in full-stack development, AI, and cloud technologies.

---

Education  
Master of Science in Computer Science  
Santa Clara University — Expected Graduation: June 2025  
GPA: 3.85 / 4.0

Bachelor of Technology in Computer Engineering  
Vellore Institute of Technology — Graduated: June 2022  
GPA: 8.7 / 10

---

Experience  

Software Engineering Intern — Google LLC, Mountain View, CA  
May 2024 – August 2024  
- Worked on the Google Photos team to improve backend scalability using Go and Kubernetes  
- Reduced latency of image tagging services by 18% through API optimization  
- Collaborated with cross-functional teams using Agile and SCRUM practices

Data Scientist — Infosys Ltd, Bangalore, India  
July 2022 – March 2023  
- Built machine learning pipelines for financial fraud detection (95% precision)  
- Worked on NLP models using Hugging Face transformers for document classification  
- Deployed models using AWS SageMaker and monitored performance in production

---

Technical Skills  
Languages: Python, Java, JavaScript, Go, SQL  
Frameworks: TensorFlow, PyTorch, React.js, Flask  
Tools: Git, Docker, Kubernetes, AWS, GCP  
Databases: PostgreSQL, MongoDB  
Soft Skills: Team Leadership, Public Speaking, Project Management

---

Projects  

AI Resume Parser  
- Built a smart resume parsing tool using Named Entity Recognition with spaCy  
- Achieved 92% accuracy in extracting job titles, skills, and education entities  
- Deployed as a web app using Flask and hosted on Heroku

Movie Recommender System  
- Created a collaborative filtering-based recommender system using Python  
- Integrated with TMDb API and deployed using Streamlit

---

Certifications  
- AWS Certified Solutions Architect – Associate  
- TensorFlow Developer Certificate

---

Achievements  
- Top 2% in Amazon ML Hackathon 2023  
- First place in VIT’s AI DevJam Hackathon 2021  

---

Languages  
English – Fluent  
Hindi – Native  
Spanish – Intermediate

---

Interests  
AI Art, Hiking, Indie Game Development, Aviation Photography

---

References  
Available upon request
"""]

In [137]:
testData = testInputs[0]

In [138]:
result = infer(testData, modelID, modelPath, labelEncoderDecoder)
result = postprocess_resume_ner_output(result)

Some layers from the model checkpoint at Models/ResumeNERModel-RoBERTaBase were not used when initializing TFRobertaForTokenClassification: ['dropout_75']
- This IS expected if you are initializing TFRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaForTokenClassification were initialized from the model checkpoint at Models/ResumeNERModel-RoBERTaBase.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForTokenClassification for predictions without further training.


In [139]:
for key, value in result.items():
    if value == '':
        continue
    print(f"{key}: {value}")
    print('----------------------------------------')

O: John Doe 1234 Innovation Way Jose, CA 95129 Email: john. doe @ example. com Phone: + 1 - 408 - 555 - 1234 LinkedIn: linkedin. com / in / johndoe GitHub: github. com / johndoe - - - Objective To obtain a challenging position as a Software Engineer where I can contribute to innovative projects using my skills in, and. - - - Education — Expected Graduation: 2025 GPA: 3. 85 / 4. 0 — Graduated: June 2022 GPA: 8. 7 / 10 - - - Experience —, View, CA -ed on the Google to - through - using —, - for ( ) -ed for - using and - - - Technical Skills Languages:, SQL Frameworks: Tools, Soft Skills:, - - - Projects - using withCy - A in job titles, skills, and education entities - Deployed as a web app using Flask and - using - and - - - Certifications - - - - - Achievements - - - - - Languages – – – - - - Interests, - - - References Available upon request
----------------------------------------
place_higher_education: Santa Clara University
----------------------------------------
analyzing: monit

In [140]:
datasetPath = 'Dataset/ResumeDataset.json'
dataset = json.load(open(datasetPath, 'r'))
dataset = datasets.Dataset.from_dict(dataset)
dataset

Dataset({
    features: ['id', 'text', 'annotations', 'input', 'NER_LABELS', 'NER_TAGS'],
    num_rows: 224
})

In [142]:
modelID = 'roberta-base'
tokenizer = transformers.RobertaTokenizerFast.from_pretrained(modelID, add_prefix_space=True)

In [143]:
def alignLabelsWithTokens(labels, word_ids):
    newLabels = []
    currentWord = None
    for wordID in word_ids:
        if wordID is None:
            newLabels.append(-100)
        elif wordID != currentWord: # New Word
            currentWord = wordID
            newLabels.append(labels[wordID])
        else: # Same Word
            label = labels[wordID]
            newLabels.append(label)
    return newLabels

In [144]:
def tokenizerFunction(dataset):
    tokenized = tokenizer(dataset['input'], truncation=True, is_split_into_words=True, max_length=512, padding='max_length')
    tokenized['labels'] =  alignLabelsWithTokens(dataset['NER_TAGS'], tokenized.word_ids())
    return tokenized

In [151]:
tokenizedDataset = dataset.map(tokenizerFunction, remove_columns=['input', 'NER_TAGS',  'NER_LABELS',  'text', 'annotations'])
tokenizedDataset

Map:   0%|          | 0/224 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 224
})

In [152]:
dataCollator = transformers.DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors='tf')
BATCH_SIZE = 4

In [153]:
tfDataset = tokenizedDataset.to_tf_dataset(
    shuffle=True,
    batch_size=BATCH_SIZE,
    collate_fn=dataCollator,
)

In [154]:
trainDataset = tfDataset.take(int(len(tfDataset) * 0.9))
remDataset = tfDataset.skip(int(len(tfDataset) * 0.9))
valDataset = remDataset.take(int(len(remDataset) * 0.5))
testDataset = remDataset.skip(int(len(remDataset) * 0.5))

In [159]:
testInputs = []
for batch in testDataset:
    for id in batch['id']:
        id = id.numpy()
        index = dataset['id'].index(id)
        inputText = dataset['text'][index]
        testInputs.append(inputText)
len(testInputs)

12

In [161]:
results = []
for i in range(len(testInputs)):
    testData = testInputs[i]
    result = infer(testData, modelID, modelPath, labelEncoderDecoder)
    result = postprocess_resume_ner_output(result)
    results.append(result)

Some layers from the model checkpoint at Models/ResumeNERModel-RoBERTaBase were not used when initializing TFRobertaForTokenClassification: ['dropout_75']
- This IS expected if you are initializing TFRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaForTokenClassification were initialized from the model checkpoint at Models/ResumeNERModel-RoBERTaBase.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForTokenClassification for predictions without further training.
Some layers from the model checkpoint at Mo

In [163]:
for i in range(len(results)):
    print(f"Resume ID: {i+1}")
    for key, value in results[i].items():
        if value == '':
            continue
        print(f"{key}: {value}")
        print('----------------------------------------')

Resume ID: 1
O: Aniket Kumar Email me on Indeed httpwwwindeedcomrAniketKumar5bf7984ffc21f23c Willing to relocate Anywhere Work Experience Software Developer Oracle Bengaluru Karnataka December 2016 to Present I had been now working at oracle as a software developer and my experience here was quite good enough to be a exampler I had worked here from December 2016 to 2019 from 2019 i had left this job due to my personal reasons Education July 2012 to August 2016 Bachelors Skills IT Skills Languages
----------------------------------------
designation: Software Engineer
----------------------------------------
candidate_city: Patna Bihar
----------------------------------------
work_year: September 2020 to Present
----------------------------------------
researching: reaserch in mathematics and greater mathematician than rkm
----------------------------------------
company_name: Microsoft
----------------------------------------
basic_education: Bachelors in Computer Science
-------------