# RESUME DATASET PROCESSING

## Imports

In [29]:
import json
import os
import numpy as np
import pandas as pd
import datasets

## Functions

In [30]:
def readJsonFile(filePath):
    with open(filePath, 'r') as jsonFile:
        jsonData = json.load(jsonFile)
    return jsonData

In [31]:
def extractJsonStructureFromObject(data, indent=0):
    def printStructure(obj, indent):
        spacing = '  ' * indent
        if isinstance(obj, dict):
            for key, value in obj.items():
                print(f"{spacing}- {key}: {type(value).__name__}")
                printStructure(value, indent + 1)
        elif isinstance(obj, list):
            print(f"{spacing}- list of {len(obj)} items")
            if len(obj) > 0:
                printStructure(obj[0], indent + 1)
        else:
            pass

    print("JSON Structure:")
    printStructure(data, indent)

In [32]:
def preProcessDataRecord(jsonDataFile):
    keysToRemove = ['meta', 'created_at', 'updated_at', 'inner_id', 'total_annotations', 'cancelled_annotations', 'total_predictions', 'comment_count', 'unresolved_comment_count', 'last_comment_updated_at', 'project', 'updated_by', 'comment_authors', 'file_upload', 'drafts', 'predictions', 'agreement']
    annotationKeysToRemove = ['id', 'completed_by', 'reviews', 'was_cancelled', 'ground_truth', 'created_at', 'updated_at', 'draft_created_at', 'lead_time', 'prediction', 'result_count', 'unique_id', 'import_id', 'last_action', 'task', 'project', 'updated_by', 'parent_prediction', 'parent_annotation', 'last_created_by']
    resultKeysToRemove = ['id', 'origin', 'to_name', 'from_name']
    for key in keysToRemove:
        if key in jsonDataFile:
            del jsonDataFile[key]
    for key in annotationKeysToRemove:
        for i in range(len(jsonDataFile['annotations'])):
            if key in jsonDataFile['annotations'][i]:
                del jsonDataFile['annotations'][i][key]
    for key in resultKeysToRemove:
        for i in range(len(jsonDataFile['annotations'])):
            for j in range(len(jsonDataFile['annotations'][i]['result'])):
                if key in jsonDataFile['annotations'][i]['result'][j]:
                    del jsonDataFile['annotations'][i]['result'][j][key]
    jsonDataFile['text'] = jsonDataFile['data']['text']
    del jsonDataFile['data']    
    for annotation in jsonDataFile['annotations']:
        annotation = annotation['result']
        

In [33]:
def prepareDataset(datasetFolderPath):
    resumeDataset = []
    dataFiles = []
    for file in os.listdir(datasetFolderPath):
        if file.endswith('.json'):
            dataFiles.append(os.path.join(datasetFolderPath, file))
    for dataFile in dataFiles:
        jsonData = readJsonFile(dataFile)
        for record in jsonData:
            preProcessDataRecord(record)
            resumeDataset.append(record)
    return resumeDataset

In [34]:
def getLabels(resumeDataset):
    labels = set()
    for record in resumeDataset:
        for annotation in record['annotations']:
            for result in annotation['result']:
                if result['type'] == 'labels':
                    for i in range(len(result['value']['labels'])):
                        labels.add(result['value']['labels'][i])
    return list(labels)

In [35]:
def labelEncoderDecoder(labels):
    labelEncoder = {
        'UNK' : 0
    }
    tag = 1
    for label in labels:
        labelEncoder[label] = tag
        tag += 1
    labelDecoder = {key: value for value, key in labelEncoder.items()}
    return labelEncoder, labelDecoder

In [36]:
def mergeIntervals(intervals):
    if not intervals:
        return []
    intervals.sort(key=lambda x: x[0])
    merged = [intervals[0]]
    for current in intervals[1:]:
        last = merged[-1]
        if current[0] <= last[1]:
            merged[-1] = (last[0], max(last[1], current[1]))
        else:
            merged.append(current)
    return merged

In [37]:
def getAnnotations(record, labelEncoder):
    annotations = {}
    for label in labelEncoder.keys():
        annotations[label] = []

    for annotation in record['annotations']:
        for result in annotation['result']:
            if result['type'] == 'labels':
                label = result['value']['labels'][0]
                start = result['value']['start']
                end = result['value']['end']
                annotations[label].append((start, end))
    for label in annotations.keys():
        annotations[label] = mergeIntervals(annotations[label])
    return annotations

In [38]:
def getOffsetMapping(text):
    words = text.split(" ")
    offset_mapping = []
    currentIndex = 0
    for word in words:
        start = currentIndex
        end = currentIndex + len(word)
        offset_mapping.append((start, end))
        currentIndex = end + 1
    return offset_mapping

In [39]:
def updateAnnotationsWithOffsets(record, annotations):
    """
    Convert character-level entity spans into token-level indices.
    Returns: { label: [ [token_idx1, token_idx2, ...], ... ] }
    """
    text = record['text']
    tokens = text.split()
    offset_mapping = getOffsetMapping(text)  # list of (start, end) for each token

    updatedAnnotations = {label: [] for label in annotations}

    for label, spans in annotations.items():
        for span_start, span_end in spans:
            entity_token_indices = []
            for i, (token_start, token_end) in enumerate(offset_mapping):
                if token_start >= span_start and token_end <= span_end:
                    entity_token_indices.append(i)

            updatedAnnotations[label].append(entity_token_indices)

    return updatedAnnotations

In [None]:
def verifyAnnotations(resumeDataset, labelEncoder):
    mismatches = 0
    total = 0
    for record in resumeDataset:
        annotations = getAnnotations(record, labelEncoder)
        updatedAnnotations = updateAnnotationsWithOffsets(record, annotations)
        text = record['text']
        splitText = text.split(" ")
        for label in annotations:
            for i, interval in enumerate(annotations[label]):
                if len(interval) == 0 or len(updatedAnnotations[label][i]) == 0:
                    continue
                start, end = interval
                output1 = text[start:end]
                start2, end2 = updatedAnnotations[label][i][0], updatedAnnotations[label][i][-1]
                output2 = splitText[start2:end2+1]
                output2 = " ".join(output2)
                output1 = output1.strip()
                output2 = output2.strip()
                # print(output1, '-------', output2)
                # print('----------------------------------------------------')
                if output1 != output2:
                    print(f"Mismatch found in record: {record['id']}")
                    print(f"Original: {output1}")
                    print(f"Updated: {output2}")
                    print('----------------------------------------------------')
                    mismatches += 1
                total += 1
    errorRate = mismatches / total if total > 0 else 0
    errorRate *= 100
    print(f"Total Annotations: {total}")
    print(f"Total Mismatches: {mismatches}")
    print(f"Error Rate: {errorRate:.2f}%")
    return mismatches, total, errorRate

In [41]:
def labelEncoderDecoderIOB(labels):
    labelEncoder = {
        'O' : 0
    }
    tag = 1
    for label in labels:
        labelEncoder['B-' + label] = tag
        tag += 1
        labelEncoder['I-' + label] = tag
        tag += 1
    labelDecoder = {key: value for value, key in labelEncoder.items()}
    return labelEncoder, labelDecoder

In [42]:
def convertToIOB(tokens, nerLabels):
    iobLabels = []
    prevLabel = 'UNK'

    for i, label in enumerate(nerLabels):
        if label == 'UNK':
            iobLabels.append('O')
            prevLabel = 'UNK'
        else:
            if prevLabel != label:
                iobLabels.append('B-' + label)
            else:
                iobLabels.append('I-' + label)
            prevLabel = label

    return iobLabels

In [43]:
def getOutput(record, labelEncoder):
    tokens = record['input']
    annotations = record['annotations']
    ner_labels = ['O'] * len(tokens)

    for label, token_index_groups in annotations.items():
        for indices in token_index_groups:
            if not indices:
                continue
            ner_labels[indices[0]] = f"B-{label}"
            for i in indices[1:]:
                ner_labels[i] = f"I-{label}"

    ner_tags = [labelEncoder.get(tag, 0) for tag in ner_labels]
    return ner_labels, ner_tags

In [44]:
def buildDataset(datasetFolderPath):
    resumeDataset = prepareDataset(resumeDatasetPath)
    labels = getLabels(resumeDataset)

    for record in resumeDataset:
        labelEncoder, labelDecoder = labelEncoderDecoder(labels)
        annotations = getAnnotations(record, labelEncoder)
        updatedAnnotations = updateAnnotationsWithOffsets(record, annotations)
        record['annotations'] = updatedAnnotations
        record['input'] = record['text'].split(" ")
        labelEncoder, labelDecoder = labelEncoderDecoderIOB(labels)
        NER_LABELS, NER_TAGS = getOutput(record, labelEncoder)
        record['NER_LABELS'] = NER_LABELS
        record['NER_TAGS'] = NER_TAGS

    dataset = {
        'id' : [],
        'text' : [],
        'annotations' : [],
        'input' : [],
        'NER_LABELS' : [],
        'NER_TAGS' : [] 
    }

    for record in resumeDataset:
        dataset['id'].append(record['id'])
        dataset['text'].append(record['text'])
        dataset['annotations'].append(record['annotations'])
        dataset['input'].append(record['input'])
        dataset['NER_LABELS'].append(record['NER_LABELS'])
        dataset['NER_TAGS'].append(record['NER_TAGS'])

    return dataset, labelEncoder, labelDecoder

## Prepare Dataset

In [45]:
datasetFolderPath = 'Dataset/'
resumeDatasetPath = 'Dataset/DatasetFiles'

In [46]:
resumeDataset = prepareDataset(resumeDatasetPath)
labels = getLabels(resumeDataset)
labelEncoder, labelDecoder = labelEncoderDecoder(labels)

In [47]:
mismatches, total, errorRate = verifyAnnotations(resumeDataset, labelEncoder)

Mismatch found in record: 69389226
Original: Higher Secondary
Updated: Higher
----------------------------------------------------
Mismatch found in record: 69389229
Original: Higher Secondary
Updated: Higher
----------------------------------------------------
Mismatch found in record: 69512121
Original: Higher Secondary
Updated: Higher
----------------------------------------------------
Mismatch found in record: 69511645
Original: deas inspire collaboration
Updated: inspire collaboration
----------------------------------------------------
Mismatch found in record: 69511645
Original: Technical Product Development  Consultin
Updated: Technical Product Development
----------------------------------------------------
Mismatch found in record: 69511645
Original: delivery of endto end project
Updated: delivery of endto end
----------------------------------------------------
Mismatch found in record: 69511653
Original: uly 2014 to Present
Updated: 2014 to Present
------------------------

In [20]:
resumeDataset, labelEncoder, labelDecoder = buildDataset(resumeDatasetPath)

In [21]:
for key in resumeDataset.keys():
    print(key, ' : ', type(resumeDataset[key]), ' : ', len(resumeDataset[key]))

id  :  <class 'list'>  :  349
text  :  <class 'list'>  :  349
annotations  :  <class 'list'>  :  349
input  :  <class 'list'>  :  349
NER_LABELS  :  <class 'list'>  :  349
NER_TAGS  :  <class 'list'>  :  349


In [22]:
extractJsonStructureFromObject(resumeDataset)

JSON Structure:
- id: list
  - list of 349 items
- text: list
  - list of 349 items
- annotations: list
  - list of 349 items
    - UNK: list
      - list of 0 items
    - persuading: list
      - list of 0 items
    - commercial_thinking: list
      - list of 0 items
    - supervising: list
      - list of 0 items
    - place_higher_education: list
      - list of 2 items
        - list of 2 items
    - languages_known: list
      - list of 1 items
        - list of 1 items
    - place_basic_education: list
      - list of 2 items
        - list of 8 items
    - influencing: list
      - list of 0 items
    - higher_education: list
      - list of 1 items
        - list of 4 items
    - innovative: list
      - list of 0 items
    - result_higher_education: list
      - list of 0 items
    - researching: list
      - list of 0 items
    - work_year: list
      - list of 2 items
        - list of 4 items
    - work_cities: list
      - list of 2 items
        - list of 3 items
    - ce

In [23]:
resumeDataframe = pd.DataFrame(resumeDataset)
resumeDataframe.head()

Unnamed: 0,id,text,annotations,input,NER_LABELS,NER_TAGS
0,69389221,Kalpesh Panchal Azure Certified Cloud Engineer...,"{'UNK': [], 'persuading': [], 'commercial_thin...","[Kalpesh, Panchal, Azure, Certified, Cloud, En...","[O, O, O, O, O, O, B-candidate_city, I-candida...","[0, 0, 0, 0, 0, 0, 43, 44, 0, 0, 0, 0, 0, 0, 0..."
1,69389222,Kailash Nikam Thane Maharashtra Email me on In...,"{'UNK': [], 'persuading': [], 'commercial_thin...","[Kailash, Nikam, Thane, Maharashtra, Email, me...","[O, O, B-candidate_city, I-candidate_city, O, ...","[0, 0, 43, 44, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
2,69389223,Jose George CLOUD ENGINEER Kochi Kerala Email ...,"{'UNK': [], 'persuading': [], 'commercial_thin...","[Jose, George, CLOUD, ENGINEER, Kochi, Kerala,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,69389224,Job Seeker AWS Certified Solutions Architect A...,"{'UNK': [], 'persuading': [], 'commercial_thin...","[Job, Seeker, AWS, Certified, Solutions, Archi...","[O, O, O, O, O, O, O, B-candidate_city, I-cand...","[0, 0, 0, 0, 0, 0, 0, 43, 44, 44, 0, 0, 0, 0, ..."
4,69389225,JAY PATEL Ahmedabad Gujarat Email me on Indeed...,"{'UNK': [], 'persuading': [], 'commercial_thin...","[JAY, PATEL, Ahmedabad, Gujarat, Email, me, on...","[O, O, B-candidate_city, O, O, O, O, O, O, O, ...","[0, 0, 43, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."


In [24]:
csvDatasetPath = 'Dataset/ResumeDataset.csv'
jsonDatasetPath = 'Dataset/ResumeDataset.json'
labelEncoderDecoderPath = 'Dataset/LabelEncoderDecoder.json'
labelEncoderDecoder = {
    'labelEncoder' : labelEncoder,
    'labelDecoder' : labelDecoder
}
resumeDataframe.to_csv(csvDatasetPath, index=False)
json.dump(resumeDataset, open(jsonDatasetPath, 'w'), indent=4)
json.dump(labelEncoderDecoder, open(labelEncoderDecoderPath, 'w'), indent=4)

In [25]:
dataset = datasets.Dataset.from_dict(resumeDataset)
dataset

Dataset({
    features: ['id', 'text', 'annotations', 'input', 'NER_LABELS', 'NER_TAGS'],
    num_rows: 349
})

# Visualization

In [26]:
record = resumeDataframe.iloc[0]
record = resumeDataframe.iloc[0].to_dict()

In [27]:
output = {}
for label in record['annotations'].keys():
    if label not in output:
        output[label] = ""
    for interval in record['annotations'][label]:
        if len(interval) == 0:
            continue
        start, end = interval[0], interval[-1]
        for i in range(start, end + 1):
            output[label] += record['input'][i] + " "
        output[label] = output[label].strip()

In [28]:
for key in output.keys():
    print(key, ' : ', output[key])
    print("----------------------------------------------------")

UNK  :  
----------------------------------------------------
persuading  :  
----------------------------------------------------
commercial_thinking  :  
----------------------------------------------------
supervising  :  
----------------------------------------------------
place_higher_education  :  Mumbai UniversityMumbai Maharashtra
----------------------------------------------------
languages_known  :  English
----------------------------------------------------
place_basic_education  :  SK Somaiya College of Arts  Mumbai MaharashtraMaharashtra State Board
----------------------------------------------------
influencing  :  
----------------------------------------------------
higher_education  :  Masters in Computer Science
----------------------------------------------------
innovative  :  
----------------------------------------------------
result_higher_education  :  
----------------------------------------------------
researching  :  
-----------------------------------