First, we need to install the `transformers` library, which provides pre-trained BERT models and tokenizers.

In [26]:
pip install transformers torch



Now, let's import the necessary modules from the `transformers` library and load the pre-trained BERT model and tokenizer we'll use (`bert-base-uncased`). The loading is wrapped in a helper, `load_bert()`, which returns the tokenizer and the model.

In [27]:
def load_bert():
    """Load the pre-trained ``bert-base-uncased`` tokenizer and model.

    The pair is loaded once and memoised on the function object, so repeated
    calls reuse the same instances instead of re-reading the weights each
    time an embedding is requested.

    Returns:
        A ``(tokenizer, model)`` tuple holding a ``BertTokenizer`` and a
        ``BertModel``.
    """
    cached = getattr(load_bert, "_cached", None)
    if cached is None:
        # Imported lazily so that merely defining this function does not
        # require `transformers` to be importable.
        from transformers import BertModel, BertTokenizer

        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertModel.from_pretrained('bert-base-uncased')
        print("BERT model and tokenizer loaded successfully.")

        cached = load_bert._cached = (tokenizer, model)
    return cached

Next, we will tokenize the sentences. Tokenization converts the sentences into numerical inputs that the BERT model can understand. We'll also ensure that all sentences are padded to the same length and attention masks are created.

In [28]:
def bert_processing(sentence):
    """Tokenize *sentence* for BERT and return the encoding with the model.

    Args:
        sentence: Text handed to the tokenizer unchanged.

    Returns:
        ``(encoded_input, model)``: the tokenizer output (padded, truncated,
        returned as PyTorch tensors) and the model from ``load_bert``.
    """
    bert_tokenizer, bert_model = load_bert()
    encoding = bert_tokenizer(
        sentence,
        padding=True,
        truncation=True,
        return_tensors='pt',
    )

    # Report the shape of every tensor the tokenizer produced.
    print("Tokenized input:")
    for field_name, tensor in encoding.items():
        print(f"{field_name}: {tensor.shape}")

    return encoding, bert_model


Finally, we pass the tokenized inputs through the BERT model to obtain the embeddings. We'll extract the embeddings corresponding to the `[CLS]` token, which are often used as sentence-level representations.

In [29]:
def bert_generated_embeddings(sentence):
    """Return the ``[CLS]`` embedding BERT produces for *sentence*.

    Args:
        sentence: Text forwarded to ``bert_processing`` for tokenization.

    Returns:
        The slice ``last_hidden_state[:, 0, :]`` of the model output, i.e.
        the hidden state at the first token position of every input row.
    """
    import torch

    inputs, bert = bert_processing(sentence)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        outputs = bert(**inputs)

    # last_hidden_state holds one vector per token; position 0 is the [CLS]
    # token, commonly used as a sentence-level representation.
    cls_embeddings = outputs.last_hidden_state[:, 0, :]
    return cls_embeddings


Next, we'll install the remaining libraries: `gradio` for the web interface and `PyMuPDF` for reading the text out of uploaded PDF files.

In [30]:
pip install gradio PyMuPDF



In [31]:
# Lemmatizing the text
# Importing the required libraries
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tokenize import sent_tokenize

In [32]:
# Download required resources, one after another in a fixed order
for _resource in (
    'wordnet',                         # WordNet dictionary
    'omw-1.4',                         # Multilingual WordNet support
    'averaged_perceptron_tagger',      # POS tagger
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(_resource)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [33]:
def remove_punctuations(sentence):
    r"""Delete every character that is neither a word character nor whitespace.

    Args:
        sentence: Text to clean.

    Returns:
        *sentence* with all characters outside ``\w`` and ``\s`` removed
        (nothing is inserted in their place).
    """
    import re

    non_word_or_space = re.compile(r'[^\w\s]')
    return non_word_or_space.sub('', sentence)

In [34]:
def preprocess(sentence):
    """Lower-case *sentence* and strip its punctuation.

    Args:
        sentence: Raw text.

    Returns:
        The lower-cased text after passing through ``remove_punctuations``.
    """
    return remove_punctuations(sentence.lower())

In [35]:
# When passed a word it will give its POS using wordnet
def get_wordnet_pos(word):
    """Map a single word to the WordNet part-of-speech constant for its tag.

    The word is tagged on its own with ``nltk.pos_tag`` and only the first
    letter of the resulting tag is inspected.

    Args:
        word: The token to classify.

    Returns:
        ``wordnet.ADJ``, ``wordnet.NOUN``, ``wordnet.VERB`` or ``wordnet.ADV``
        for tags starting with J, N, V or R respectively; ``wordnet.NOUN``
        for anything else.
    """
    tagged = nltk.pos_tag([word])
    treebank_tag = tagged[0][1]
    initial = treebank_tag[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag are both treated as nouns.
    return wordnet.NOUN

In [36]:
# Lemmatizing the sentence
def lemmatize_sentence(sentence):
    """Return the lemmas of *sentence* as a flat list of tokens.

    The text is lower-cased and stripped of punctuation by ``preprocess``,
    split into words, and each word is lemmatized using the part of speech
    reported by ``get_wordnet_pos``.

    Args:
        sentence: Raw text (may span several sentences).

    Returns:
        A list of lemmatized word strings, in their original order.
    """
    lemmatizer = WordNetLemmatizer()

    # preprocess() has already removed all punctuation, so a separate
    # sentence-splitting pass and a second punctuation strip are redundant;
    # tokenizing the cleaned text once yields the same word sequence.
    words = nltk.word_tokenize(preprocess(sentence))
    return [lemmatizer.lemmatize(w, pos=get_wordnet_pos(w)) for w in words]

In [37]:
# Example inputs for trying out the pipeline by hand:
# sentence_1 is a candidate profile summary, sentence_2 a job description.
sentence_1 = "Proficient in Injury Prevention, Motivation, Nutrition, Health Coaching, Strength Training, with mid-level experience in the field. Holds a Bachelors degree. Holds certifications such as Certified Personal Trainer (CPT) by NASM. Skilled in delivering results and adapting to dynamic environments."
sentence_2 = " A Fitness Coach is responsible for helping clients achieve their fitness goals by designing and leading group or individual fitness programs. You will provide instruction on exercises, proper form, and injury prevention techniques, encouraging clients to push their limits while maintaining a focus on their well-being. The role requires a passion for health and fitness, a strong understanding of exercise physiology, and the ability to motivate and inspire others. You will also monitor clients’ progress and make adjustments to their fitness plans as needed to ensure continuous improvement."

In [38]:
# Rebind both example texts to their lemmatized form. Note the type change:
# lemmatize_sentence returns a list of tokens, not a string.
sentence_1 = lemmatize_sentence(sentence_1)
sentence_2 = lemmatize_sentence(sentence_2)

In [39]:
def extract_data(resume_data):
    """Summarise the "Technical Skills" section of a resume as one sentence.

    The section is taken to be the text between the headings
    "Technical Skills" and "Extracurricular Activities". It is split into
    bullet entries (each starting with "•" at the beginning of a line), and
    every entry of the form ``Category: items`` is collected into a dict.

    Args:
        resume_data: Full plain text of the resume.

    Returns:
        A sentence built from the "Programming Languages",
        "Web Technologies" and "Engineering Software" entries.

    Raises:
        ValueError: If the Technical Skills section cannot be located.
        KeyError: If one of the three required categories is missing.
    """
    import re

    section = re.search(
        r"Technical Skills(.*?)Extracurricular Activities",
        resume_data,
        re.DOTALL,
    )
    if section is None:
        # Fail with a clear message instead of continuing with an
        # undefined section and crashing on an unbound local.
        raise ValueError("Technical Skills section not found.")

    technical_skills = section.group(1).strip()

    result_dict = {}
    for raw_item in re.split(r'\n(?=•)', technical_skills):
        # Collapse wrapped lines into one and drop the bullet marker.
        item = re.sub(r'\s+', ' ', raw_item).replace('•', '').strip()
        key, separator, value = item.partition(':')
        if not separator:
            # Not a "Category: items" entry (e.g. stray text); ignore it
            # rather than failing on the unpacking.
            continue
        result_dict[key.strip()] = value.strip()

    print(result_dict)

    return (
        f"Proficient in {result_dict['Programming Languages']}"
        f". Knowns Web Technologies which includes {result_dict['Web Technologies']}"
        f". Familiar with {result_dict['Engineering Software']}"
    )

In [50]:
def get_score(sentence_1, sentence_2):
    """Compute the pairwise cosine similarity of two sets of embeddings.

    Args:
        sentence_1: First input passed to scikit-learn's
            ``cosine_similarity``.
        sentence_2: Second input passed to ``cosine_similarity``.

    Returns:
        Whatever ``cosine_similarity(sentence_1, sentence_2)`` returns.
    """
    from sklearn.metrics.pairwise import cosine_similarity

    return cosine_similarity(sentence_1, sentence_2)

In [51]:
def similarity_score(sentence_1, sentence_2):
    """Score how similar two texts are using BERT ``[CLS]`` embeddings.

    Both texts are lemmatized, embedded with ``bert_generated_embeddings``
    and compared with ``get_score``.

    Args:
        sentence_1: First text (e.g. the resume summary).
        sentence_2: Second text (e.g. the job description).

    Returns:
        The result of ``get_score`` for the two embeddings; callers read the
        overall score at ``[0][0]``.
    """
    # lemmatize_sentence returns a list of tokens. Passing that list straight
    # to the tokenizer would be read as a batch of one-word sentences, so the
    # value at [0][0] would only compare the first word of each text. Joining
    # the lemmas back into a single string gives one embedding per text.
    text_1 = " ".join(lemmatize_sentence(sentence_1))
    text_2 = " ".join(lemmatize_sentence(sentence_2))

    emb_1 = bert_generated_embeddings(text_1)
    emb_2 = bert_generated_embeddings(text_2)

    return get_score(emb_1, emb_2)

In [56]:
import gradio as gr
import fitz  # PyMuPDF

# Function to read the PDF file using PyMuPDF
def read_pdf(file):
    """Extract a skills summary from an uploaded PDF and match it to a job.

    Args:
        file: The uploaded file, either an object exposing the path as
            ``.name`` or the path itself.

    Returns:
        A 2-tuple matching the interface's two outputs. On success:
        ``(skills_sentence, matching)`` where ``matching`` is 1 when the
        similarity score is at least 0.5 and 0 otherwise. On failure:
        ``(error_message, "")``.
    """
    if file is None:
        return "Error reading PDF: no file was uploaded.", ""

    try:
        # Accept both an upload object with a `.name` path and a plain path.
        path = getattr(file, "name", file)

        # The context manager closes the document even if extraction fails.
        with fitz.open(path) as doc:
            resume_data = "".join(page.get_text() for page in doc)

        sentence_1 = extract_data(resume_data)
        # NOTE(review): the trailing "T" looks like a leftover label and the
        # text contains typos; all of it is part of what gets embedded, so
        # confirm with the author before cleaning it up.
        sentence_2 = 'Requred a software developer who is proficient in C, Java, Python, JavaScript, PHP and related languages. Knowns Web Technologies which includes HTML, CSS, Django. Familiar with Visual Studio, GitHub, PyCharm, IntelliJ, MySQL, GNU 8085 Simulator.T'

        matching_score = similarity_score(sentence_1, sentence_2)
        matching = 1 if matching_score[0][0] >= 0.5 else 0

        return sentence_1, matching

    except Exception as e:
        # UI boundary: report the problem in the first output and keep the
        # number of returned values equal to the number of outputs.
        return f"Error reading PDF: {e}", ""

# Create a Gradio interface: one PDF upload in, two text boxes out
iface = gr.Interface(
    fn=read_pdf,  # Function to process the uploaded PDF
    inputs=gr.File(label="Upload a PDF File"),  # File input for uploading PDF
    outputs=[
        gr.Textbox(label="Parsed Text"),  # Display the extracted text
        gr.Textbox(label="Similarity"),  # Second value returned by read_pdf
    ],
)

# Launch the Gradio app
iface.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://ef9b4732a097de4c5a.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


