# Install the required dependencies

In [None]:
# Pin the versions this notebook was written against:
#   tika         - used below (tika.parser) to extract text from PDF files
#   spacy        - the code below uses the spaCy 2.x API (e.g. spacy.gold)
#   scikit_learn - sklearn.metrics is imported in the next cell
!pip install tika==1.24
!pip install spacy==2.2.3
!pip install scikit_learn==0.23.0

In [None]:
import json
import os
import random
import logging
import spacy
import tika
from sklearn.metrics import classification_report, precision_recall_fscore_support
from spacy.gold import GoldParse
from spacy.scorer import Scorer
from spacy.util import minibatch, compounding
tika.initVM()
from tika import parser

# Define all the functions we will need

## Convert the input data or input file into a format that SpaCy accepts

In [None]:
def convert_data_to_spacy(JSON_File):

  '''
  Convert a JSON-lines annotation file into the training format that SpaCy accepts.

  # Parameter
  # JSON_File: Path of a UTF-8 text file that holds one JSON object per line. Every object must have
               a 'content' string (the text of the whole resume) and an 'annotation' entry that is
               either null or a list. Each annotation carries a 'label' (a string or a list of strings)
               and a 'points' list; only the 'start' and 'end' of the first point are used.
               Blank lines are ignored.

  # Return value
  # training_data: A list of tuples with 2 items each. The first item is the text of the whole resume.
                   The second item is a dictionary whose key "entities" maps to a list of
                   (start_point, end_point, label) tuples, one per label of every annotated span.
                   end_point is exclusive, i.e. 1 more than the 'end' stored in the file.
  # Returns None (after printing the error) if the file cannot be opened or any record cannot be processed
  '''

  training_data = []

  try:
    with open(JSON_File, 'r', encoding = 'utf-8') as f:
      for line in f:
        # A blank line (for example an extra empty line at the end of the file) is not a record.
        # Skip it instead of letting json.loads fail and discard the whole file.
        if not line.strip():
          continue

        data = json.loads(line)
        text = data['content']

        entities = []
        # A record whose annotation is null has no labelled spans, so it contributes an empty list
        for annotation in data['annotation'] or []:
          # A span can have more than 1 label but only one start point and one end point,
          # so keep point as a single value and labels as a list
          point = annotation['points'][0]
          labels = annotation['label']

          # Make sure that labels is a list
          if not isinstance(labels, list):
            labels = [labels]

          # If a span has 2 labels, 2 entity tuples are recorded for it
          # SpaCy needs the end point to be 1 more than the point where the span really ends
          for label in labels:
            entities.append((point['start'], point['end'] + 1, label))

        training_data.append((text, {"entities" : entities}))

  except Exception as e:
    # Formatting (rather than "+" concatenation) keeps this handler from raising
    # when JSON_File is not a str, e.g. a pathlib.Path
    print(f"Unable to process the file: {JSON_File}\nError: {e}")
    return None

  print("Successfully converted the input data into SpaCy format.\n")

  return training_data

## Check whether the model already exists

In [None]:
#@title
def check_model(model_name):

  '''
  Probe whether a SpaCy model can be loaded under the given name.

  # Parameter
  # model_name: The name that is handed to spacy.load

  # Return value
  # model_name: The same name, returned only when spacy.load succeeded
  # Returns None if spacy.load raised for any reason. The most likely reason is that the model doesn't exist.
  #
  # Note: the loaded model itself is discarded; only the name is reported back.
  '''

  found_name = None

  try:
    spacy.load(model_name)
    print("The model exists and is loaded successfully.\n")
    found_name = model_name

  except Exception:
    print("Model is not loaded successfully. Make sure to check if the model exists.\n")

  return found_name

## Build a SpaCy model (or update the model if it exists) and train it

In [None]:
# Mount Google Drive at /content/drive; the %cd cells further down switch into
# folders under "/content/drive/My Drive"
from google.colab import drive
drive.mount('/content/drive')

In [None]:
def build_spacy_model(training_data, model_name, n_iter = 500, dropout = 0.2, output_dir = "model"):
  
  '''
  Build a new SpaCy NER model (or update an existing one), train it and save it to disk.

  # Parameter
  # training_data: The list of data that is converted into SpaCy format. This will be what the function convert_data_to_spacy returns.
                   The list passed in is not modified; shuffling is done on a copy.
  # model_name: The name of the model that we want to load and update. Pass None to start from a blank English model.
  # n_iter: How many passes over the training data are made (default 500)
  # dropout: The dropout rate passed to nlp.update (default 0.2)
  # output_dir: The directory the trained model is written to (default "model", relative to the current working directory)

  # Return value
  # nlp: The SpaCy model that is built or updated

  # Raises
  # ValueError if training_data is None, which is what convert_data_to_spacy returns when it fails
  '''

  if training_data is None:
    raise ValueError("training_data is None; the input data was probably not converted successfully.")

  if model_name is not None:
    nlp = spacy.load(model_name)
    print("The model " + model_name + " exists and is loaded successfully.\n")

  else:
    # Create a SpaCy model that is based on the English language if model_name is None
    nlp = spacy.blank("en")
    print("A new, blank SpaCy model based on English is created.\n")

  # Create a built-in component "ner" and add it to the pipeline
  # ner is a built-in SpaCy pipeline component for recognizing entities
  if "ner" not in nlp.pipe_names:
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner, last = True)

  else:
    ner = nlp.get_pipe("ner")

  # Get the labels from the training data
  for _, annotations_dict in training_data:
    for entity in annotations_dict.get('entities', []):
      # entity contains 3 elements: start_point, end_point, and label of a word
      ner.add_label(entity[2])

  # Shuffle a private copy so that the caller's list keeps its order
  examples = list(training_data)

  # Disable other components of the pipeline before training
  other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
  with nlp.disable_pipes(*other_pipes):
    if model_name is None:
      # begin_training initializes the weights of a new model and returns an optimizer
      optimizer = nlp.begin_training()
    else:
      # An existing model keeps its weights; resume_training only supplies the optimizer.
      # Without this branch no optimizer existed for existing models, every update call
      # failed and the failure was silently ignored.
      optimizer = nlp.resume_training()
    
    for iteration in range(n_iter):
      print("Starting iteration " + str(iteration))

      random.shuffle(examples)
      losses = {}
      skipped = 0
      first_error = None
      for text, annotations_dict in examples:
        try:
          # Apply dropout so that it is harder for the model to memorize the data
          nlp.update([text], [annotations_dict], drop = dropout, sgd = optimizer, losses = losses)
        except Exception as e:
          # Training stays best-effort: an example that SpaCy rejects is skipped,
          # but it is counted so that the problem is visible in the output
          skipped += 1
          if first_error is None:
            first_error = e

      if skipped:
        print("Skipped " + str(skipped) + " example(s) that could not be used. First error: " + str(first_error))

      print("Losses: ", losses)

  # Save the model; a relative output_dir is resolved against the current working directory
  nlp.to_disk(output_dir)

  return nlp

## Convert PDF to text

In [None]:
def convert_pdf_to_text(dir):

  '''
  Extract the text of every PDF file found under a directory (sub-directories included).

  # Parameter
  # dir: The directory where the PDF files are 

  # Return value
  # output: A list that contains the texts of all the PDF files that are converted.
            Files whose extension is not ".pdf" (compared case-insensitively) are ignored.
            PDF files for which no text comes back are reported and left out, so every item is a non-empty string.
  '''

  output = []
  for dirpath, _, filenames in os.walk(dir):
    for file in filenames:
      # Can only work with PDF files; lower() also accepts names such as "CV.PDF"
      extension = os.path.splitext(file)[1]
      if extension.lower() != ".pdf":
        continue

      path_to_pdf_file = os.path.join(dirpath, file)

      # Get the content in text form
      pdf_content_text = parser.from_file(path_to_pdf_file, service = "text")
      content = pdf_content_text.get('content')

      # The parser can come back without content (e.g. nothing extractable in the file).
      # Appending None would break callers that treat every item as a string.
      if not content:
        print("No text could be extracted from: " + path_to_pdf_file)
        continue

      output.append(content)

  return output

## Use the model to predict

In [None]:
def predict(path, model_dir = "model"):

  '''
  Run the saved SpaCy model over every PDF file under a path and collect the recognized entities.

  # Parameter
  # path: The path where the PDF files are
  # model_dir: The directory the trained model is loaded from (default "model", relative to the current working directory)

  # Return value
  # output: A dictionary that contains the entities recognized. The key is the upper-cased entity label
            and the value is the text of the entity.
            NOTE(review): there is one slot per label, so a later entity with the same label (in the same
            or in a later PDF file) replaces the earlier one. Kept as is because callers may rely on this shape.
  '''

  output = {}
  nlp = spacy.load(model_dir)
  test_data = convert_pdf_to_text(path)
  
  for text in test_data:
    # Nothing to analyse when no text was extracted from a file
    if not text:
      continue

    text = text.replace("\n", " ")
    doc = nlp(text)
    print("doc: ", doc)

    for ent in doc.ents:
      label = ent.label_.upper()
      # Label padded to 30 characters, followed by the text of the entity itself
      print(f'{label:30}-{ent.text}')
      output[label] = ent.text

  return output

# Run all the functions defined above

In [None]:
# Switch to the input folder; the next cell opens the training file by a relative name
%cd /content/drive/My Drive/Resume Parser/input

In [None]:
# Convert the training data to SpaCy format
train = convert_data_to_spacy("Entity Recognition in Resumes.json")

# convert_data_to_spacy returns None when the file could not be read or parsed.
# Stop here in that case instead of reporting success and failing later during training.
if train is None:
  raise RuntimeError("The training data could not be loaded; see the error printed above.")

print("Data is converted to SpaCy format\n")

In [None]:
# Switch to the project folder; the model is loaded from and saved to the relative path "model"
%cd /content/drive/My Drive/Resume Parser

In [None]:
# Load the model if it exists
# check_model returns the name "model" when it can be loaded and None otherwise
model = check_model("model")

# TRAIN
# Build a new model (when model is None) or update the existing model
# From here on, model holds the SpaCy pipeline object returned by build_spacy_model, not a name
model = build_spacy_model(train, model)

In [None]:
# TEST
# Use the model to predict
# predict loads the saved "model" directory itself and reads the PDF files under the given folder
output = predict("/content/drive/My Drive/Resume Parser/test")