# RESUME DATASET PROCESSING

## Imports

In [31]:
import json
import os
import numpy as np
import pandas as pd
import import_ipynb

## Functions

In [36]:
def readJsonFile(filePath):
    with open(filePath, 'r') as jsonFile:
        jsonData = json.load(jsonFile)
    return jsonData

In [32]:
def extractJsonStructureFromObject(data, indent=0):
    def printStructure(obj, indent):
        spacing = '  ' * indent
        if isinstance(obj, dict):
            for key, value in obj.items():
                print(f"{spacing}- {key}: {type(value).__name__}")
                printStructure(value, indent + 1)
        elif isinstance(obj, list):
            print(f"{spacing}- list of {len(obj)} items")
            if len(obj) > 0:
                printStructure(obj[0], indent + 1)
        else:
            pass

    print("JSON Structure:")
    printStructure(data, indent)

In [None]:
def preProcessDataRecord(jsonDataFile):
    keysToRemove = ['meta', 'created_at', 'updated_at', 'inner_id', 'total_annotations', 'cancelled_annotations', 'total_predictions', 'comment_count', 'unresolved_comment_count', 'last_comment_updated_at', 'project', 'updated_by', 'comment_authors', 'file_upload', 'drafts', 'predictions', 'agreement']
    annotationKeysToRemove = ['id', 'completed_by', 'reviews', 'was_cancelled', 'ground_truth', 'created_at', 'updated_at', 'draft_created_at', 'lead_time', 'prediction', 'result_count', 'unique_id', 'import_id', 'last_action', 'task', 'project', 'updated_by', 'parent_prediction', 'parent_annotation', 'last_created_by']
    resultKeysToRemove = ['id', 'origin', 'to_name', 'from_name']
    for key in keysToRemove:
        if key in jsonDataFile:
            del jsonDataFile[key]
    for key in annotationKeysToRemove:
        for i in range(len(jsonDataFile['annotations'])):
            if key in jsonDataFile['annotations'][i]:
                del jsonDataFile['annotations'][i][key]
    for key in resultKeysToRemove:
        for i in range(len(jsonDataFile['annotations'])):
            for j in range(len(jsonDataFile['annotations'][i]['result'])):
                if key in jsonDataFile['annotations'][i]['result'][j]:
                    del jsonDataFile['annotations'][i]['result'][j][key]
    jsonDataFile['text'] = jsonDataFile['data']['text']
    del jsonDataFile['data']    
    for annotation in jsonDataFile['annotations']:
        annotation = annotation['result']
        

In [137]:
def prepareDataset(datasetFolderPath):
    resumeDataset = []
    dataFiles = []
    for file in os.listdir(datasetFolderPath):
        if file.endswith('.json'):
            dataFiles.append(os.path.join(datasetFolderPath, file))
    for dataFile in dataFiles:
        jsonData = readJsonFile(dataFile)
        for record in jsonData:
            preProcessDataRecord(record)
            resumeDataset.append(record)
    return resumeDataset

In [146]:
def getLabels(resumeDataset):
    labels = set()
    for record in resumeDataset:
        for annotation in record['annotations']:
            for result in annotation['result']:
                if result['type'] == 'labels':
                    for i in range(len(result['value']['labels'])):
                        labels.add(result['value']['labels'][i])
    return list(labels)

In [147]:
def labelEncoderDecoder(labels):
    labelEncoder = {
        'UNK' : 0
    }
    tag = 1
    for label in labels:
        labelEncoder[label] = tag
        tag += 1
    labelDecoder = {key: value for value, key in labelEncoder.items()}
    return labelEncoder, labelDecoder

In [163]:
def mergeIntervals(intervals):
    if not intervals:
        return []
    intervals.sort(key=lambda x: x[0])
    merged = [intervals[0]]
    for current in intervals[1:]:
        last = merged[-1]
        if current[0] <= last[1]:
            merged[-1] = (last[0], max(last[1], current[1]))
        else:
            merged.append(current)
    return merged

In [164]:
def getAnnotations(record, labelEncoder):
    annotations = {}
    for label in labelEncoder.keys():
        annotations[label] = []

    for annotation in record['annotations']:
        for result in annotation['result']:
            if result['type'] == 'labels':
                label = result['value']['labels'][0]
                start = result['value']['start']
                end = result['value']['end']
                annotations[label].append((start, end))
    for label in annotations.keys():
        annotations[label] = mergeIntervals(annotations[label])
    return annotations

In [259]:
def getOffsetMapping(text):
    words = text.split(" ")
    offset_mapping = []
    currentIndex = 0
    for word in words:
        start = currentIndex
        end = currentIndex + len(word)
        offset_mapping.append((start, end))
        currentIndex = end + 1
    return offset_mapping

In [260]:
def updateAnnotationsWithOffsets(record, annotations):
    offset_mapping = getOffsetMapping(record['text'])
    updatedAnnotations = {}
    for label, intervals in annotations.items():
        for interval in intervals:
            start, end = interval
            found = False
            startPosition = 0
            endPosition = 0
            for i, (offset_start, offset_end) in enumerate(offset_mapping):
                if offset_start <= start and offset_end >= start and not found:
                    startPosition = i
                    found = True
                if found and offset_end >= end:
                    endPosition = i
                    break
            if label not in updatedAnnotations:
                updatedAnnotations[label] = []
            updatedAnnotations[label].append((startPosition, endPosition))
    return updatedAnnotations

In [None]:
# CURRENT ERROR RATE 18% - IMPROVE THIS
def verifyAnnotations(resumeDataset, labelEncoder):
    mismatches = 0
    total = 0
    for record in resumeDataset:
        annotations = getAnnotations(record, labelEncoder)
        updatedAnnotations = updateAnnotationsWithOffsets(record, annotations)
        text = record['text']
        splitText = text.split(" ")
        for label in annotations:
            for i, interval in enumerate(annotations[label]):
                start, end = interval
                output1 = text[start:end+1]
                start2, end2 = updatedAnnotations[label][i]
                output2 = splitText[start2:end2+1]
                output2 = " ".join(output2)
                output1 = output1.strip()
                output2 = output2.strip()
                # print(output1, '-------', output2)
                # print('----------------------------------------------------')
                if output1 != output2:
                    # print(f"Mismatch found in record: {record['id']}")
                    # print(f"Original: {output1}")
                    # print(f"Updated: {output2}")
                    # print('----------------------------------------------------')
                    mismatches += 1
                total += 1
    errorRate = mismatches / total if total > 0 else 0
    errorRate *= 100
    print(f"Total Annotations: {total}")
    print(f"Total Mismatches: {mismatches}")
    print(f"Error Rate: {errorRate:.2f}%")
    return mismatches, total, errorRate

In [314]:
def labelEncoderDecoderIOB(labels):
    labelEncoder = {
        'O' : 0
    }
    tag = 1
    for label in labels:
        labelEncoder['B-' + label] = tag
        tag += 1
        labelEncoder['I-' + label] = tag
        tag += 1
    labelDecoder = {key: value for value, key in labelEncoder.items()}
    return labelEncoder, labelDecoder

In [325]:
def convertToIOB(tokens, nerLabels):
    iobLabels = []
    prevLabel = 'UNK'

    for i, label in enumerate(nerLabels):
        if label == 'UNK':
            iobLabels.append('O')
            prevLabel = 'UNK'
        else:
            if prevLabel != label:
                iobLabels.append('B-' + label)
            else:
                iobLabels.append('I-' + label)
            prevLabel = label

    return iobLabels

In [326]:
def getOutput(record, labelEncoder):
    input = record['input']
    annotations = record['annotations']
    NER_LABELS = ['UNK' for _ in range(len(input))]
    NER_TAGS = []
    for annotation in annotations.keys():
        label = annotation
        for i, interval in enumerate(annotations[label]):
            start, end = interval
            for j in range(start, end + 1):
                NER_LABELS[j] = label
    NER_LABELS = convertToIOB(input, NER_LABELS)
    for label in NER_LABELS:
        NER_TAGS.append(labelEncoder[label])
    return NER_LABELS, NER_TAGS

In [327]:
def buildDataset(datasetFolderPath):
    resumeDataset = prepareDataset(resumeDatasetPath)
    labels = getLabels(resumeDataset)

    for record in resumeDataset:
        labelEncoder, labelDecoder = labelEncoderDecoder(labels)
        annotations = getAnnotations(record, labelEncoder)
        updatedAnnotations = updateAnnotationsWithOffsets(record, annotations)
        record['annotations'] = updatedAnnotations
        record['input'] = record['text'].split(" ")
        labelEncoder, labelDecoder = labelEncoderDecoderIOB(labels)
        NER_LABELS, NER_TAGS = getOutput(record, labelEncoder)
        record['NER_LABELS'] = NER_LABELS
        record['NER_TAGS'] = NER_TAGS

    dataset = {
        'id' : [],
        'text' : [],
        'annotations' : [],
        'input' : [],
        'NER_LABELS' : [],
        'NER_TAGS' : [] 
    }

    for record in resumeDataset:
        dataset['id'].append(record['id'])
        dataset['text'].append(record['text'])
        dataset['annotations'].append(record['annotations'])
        dataset['input'].append(record['input'])
        dataset['NER_LABELS'].append(record['NER_LABELS'])
        dataset['NER_TAGS'].append(record['NER_TAGS'])

    return dataset, labelEncoder, labelDecoder

## Prepare Dataset

In [328]:
datasetFolderPath = 'Dataset/'
resumeDatasetPath = 'Dataset/DatasetFiles'

In [329]:
resumeDataset, labelEncoder, labelDecoder = buildDataset(resumeDatasetPath)

In [330]:
for key in resumeDataset.keys():
    print(key, ' : ', type(resumeDataset[key]), ' : ', len(resumeDataset[key]))

id  :  <class 'list'>  :  349
text  :  <class 'list'>  :  349
annotations  :  <class 'list'>  :  349
input  :  <class 'list'>  :  349
NER_LABELS  :  <class 'list'>  :  349
NER_TAGS  :  <class 'list'>  :  349


In [331]:
extractJsonStructureFromObject(resumeDataset)

JSON Structure:
- id: list
  - list of 349 items
- text: list
  - list of 349 items
- annotations: list
  - list of 349 items
    - place_higher_education: list
      - list of 2 items
    - company_name: list
      - list of 1 items
    - technical_skills: list
      - list of 32 items
    - initiating_actions: list
      - list of 12 items
    - basic_education: list
      - list of 3 items
    - work_cities: list
      - list of 2 items
    - candidate_city: list
      - list of 1 items
    - designation: list
      - list of 2 items
    - work_with_people: list
      - list of 3 items
    - certification: list
      - list of 3 items
    - work_year: list
      - list of 2 items
    - languages_known: list
      - list of 1 items
    - higher_education: list
      - list of 1 items
    - place_basic_education: list
      - list of 2 items
    - applying_expertise: list
      - list of 21 items
- input: list
  - list of 349 items
    - list of 550 items
- NER_LABELS: list
  - list o

In [332]:
resumeDataframe = pd.DataFrame(resumeDataset)
resumeDataframe.head()

Unnamed: 0,id,text,annotations,input,NER_LABELS,NER_TAGS
0,69389221,Kalpesh Panchal Azure Certified Cloud Engineer...,"{'place_higher_education': [(457, 458), (465, ...","[Kalpesh, Panchal, Azure, Certified, Cloud, En...","[O, O, O, O, O, O, B-candidate_city, I-candida...","[0, 0, 0, 0, 0, 0, 31, 32, 0, 0, 0, 0, 0, 0, 0..."
1,69389222,Kailash Nikam Thane Maharashtra Email me on In...,"{'place_higher_education': [(401, 405)], 'comp...","[Kailash, Nikam, Thane, Maharashtra, Email, me...","[O, O, B-candidate_city, I-candidate_city, O, ...","[0, 0, 31, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
2,69389223,Jose George CLOUD ENGINEER Kochi Kerala Email ...,"{'company_name': [(27, 28), (107, 108)], 'tech...","[Jose, George, CLOUD, ENGINEER, Kochi, Kerala,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,69389224,Job Seeker AWS Certified Solutions Architect A...,"{'place_higher_education': [(100, 102)], 'comp...","[Job, Seeker, AWS, Certified, Solutions, Archi...","[O, O, O, O, O, O, O, B-candidate_city, I-cand...","[0, 0, 0, 0, 0, 0, 0, 31, 32, 32, 0, 0, 0, 0, ..."
4,69389225,JAY PATEL Ahmedabad Gujarat Email me on Indeed...,"{'company_name': [(28, 32), (200, 201), (250, ...","[JAY, PATEL, Ahmedabad, Gujarat, Email, me, on...","[O, O, B-candidate_city, O, O, O, O, O, O, O, ...","[0, 0, 31, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."


In [342]:
csvDatasetPath = 'Dataset/ResumeDataset.csv'
jsonDatasetPath = 'Dataset/ResumeDataset.json'
labelEncoderDecoderPath = 'Dataset/LabelEncoderDecoder.json'
labelEncoderDecoder = {
    'labelEncoder' : labelEncoder,
    'labelDecoder' : labelDecoder
}
resumeDataframe.to_csv(csvDatasetPath, index=False)
json.dump(resumeDataset, open(jsonDatasetPath, 'w'), indent=4)
json.dump(labelEncoderDecoder, open(labelEncoderDecoderPath, 'w'), indent=4)

# Visualization

In [334]:
record = resumeDataframe.iloc[0]
record = resumeDataframe.iloc[0].to_dict()

In [339]:
output = {}
for label in record['annotations'].keys():
    if label not in output:
        output[label] = ""
    for interval in record['annotations'][label]:
        start, end = interval
        for i in range(start, end + 1):
            output[label] += record['input'][i] + " "
        output[label] = output[label].strip()

In [341]:
for key in output.keys():
    print(key, ' : ', output[key])
    print("----------------------------------------------------")

place_higher_education  :  Mumbai UniversityMumbai Maharashtra
----------------------------------------------------
company_name  :  Cloudxchangeio
----------------------------------------------------
technical_skills  :  LB Application gateway traffic managerFront Door VNETsubnetUDR ExpressRoute BastionsCDN Virtual network gatewayPublic IP addressesNetwork security groupsAzure Site recoveryStorage accountsApp ServicesWeb appAPI Management servicesAzure Kubernetes servicesAzure Container RegistryServicenowBMC remedyFreshServiceAzure  Configuring backupAzure MigrateCloudBerryZscalerBarracudaMimecastCloudallySymantec endpoint ProtectionSentinel oneCisco MerakiSolarwinds RMMPrinterlogicExchangeOffice 365AzureActive DirectoryNetworking
----------------------------------------------------
initiating_actions  :  Implemented multiple network componentsDeployed Azure IaaS virtual machinesCloud services PaaS role instancesBuilding up the strategyImplemented Azure VDIManaging Azure Active Direct