In [1]:
!pip install PyPDF2 fuzzywuzzy unidecode

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Collecting unidecode
  Downloading Unidecode-1.3.7-py3-none-any.whl.metadata (13 kB)
Downloading Unidecode-1.3.7-py3-none-any.whl (235 kB)
   ---------------------------------------- 0.0/235.5 kB ? eta -:--:--
   ---------------------------------------- 235.5/235.5 kB 4.8 MB/s eta 0:00:00
Installing collected packages: fuzzywuzzy, unidecode
Successfully installed fuzzywuzzy-0.18.0 unidecode-1.3.7


In [3]:
import PyPDF2
from fuzzywuzzy import fuzz
import pandas as pd
from unidecode import unidecode
import re
import os
from os.path import isfile, join



In [5]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, transliterated with ``unidecode``.

    Pages are read in order with ``PyPDF2.PdfReader``; the text of consecutive
    pages is separated by a single newline (nothing is added before the first
    page). The combined string is passed through ``unidecode`` before it is
    returned.
    """
    with open(pdf_path, 'rb') as handle:
        reader = PyPDF2.PdfReader(handle)
        # Extraction must happen while the file is still open.
        page_texts = [page.extract_text() for page in reader.pages]
    return unidecode("\n".join(page_texts))

In [6]:
def identify_headings(text):
    """Find the lines of ``text`` that look like resume section headings.

    A line counts as a heading for a section when, after stripping, it starts
    (case-insensitively) with one of that section's synonyms. The bare word
    "Experience" is only tried as a fallback, when no other Experience synonym
    matched anywhere in the text.

    Args:
        text: Full resume text; lines are separated by ``\\n``.

    Returns:
        A list of ``(stripped_line, section_name)`` tuples in the order they
        were first found. Each pair appears at most once, even when the line
        matches several synonyms of the same section or occurs repeatedly.
        A line may still be reported for more than one section (for example
        "Qualifications" belongs to both "Education" and "Achievements").
    """
    # Section name -> phrases that may introduce that section.
    heading_synonyms = {
        "Education": ["Education", "Qualifications", "Educational Qualifications", "Academic Background", "Educational Details", "Education and Training"],
        "Skills": ["Skills", "Technical Skills", "Key Competencies", "Skill Highlights", "Primary Skills", "Specializations", "Areas of Expertise", "Expertise", "Programming Languages"],
        "Experience": ["Work Experience", "Professional Background", "Professional Experience", "Work History", "Teaching Experience", "Employment History"],
        "Achievements": ["Accomplishments", "Achievements", "Notable Projects", "Qualifications"],
        "Others": ["Awards", "Honors", "Recognition", "Publications", "Certifications", "Presentations", "Volunteer Experience", "Leadership Experience","Interests","Hobbies", "Languages", "Licenses"],
        "Summary": ["Career Overview", "Summary", "About Me", "Profile Summary", "Highlights", "Objective"]
    }
    fallback_experience_synonyms = heading_synonyms["Experience"] + ["Experience"]

    def _prefix_pattern(synonyms):
        # One compiled alternation per section; re.escape keeps synonyms literal.
        alternatives = "|".join(re.escape(synonym) for synonym in synonyms)
        return re.compile(r'^\s*(?:{})'.format(alternatives), re.IGNORECASE)

    stripped_lines = [line.strip() for line in text.split('\n')]
    headings = []
    seen = set()

    def _record(line, heading):
        # Keep first-seen order while dropping repeated (line, heading) pairs.
        pair = (line, heading)
        if pair not in seen:
            seen.add(pair)
            headings.append(pair)

    section_patterns = {
        heading: _prefix_pattern(synonyms)
        for heading, synonyms in heading_synonyms.items()
    }
    for line in stripped_lines:
        for heading, pattern in section_patterns.items():
            if pattern.match(line):
                _record(line, heading)

    # Only when no explicit Experience heading exists, accept a bare "Experience".
    if not any(heading == "Experience" for _, heading in headings):
        fallback_pattern = _prefix_pattern(fallback_experience_synonyms)
        for line in stripped_lines:
            if fallback_pattern.match(line):
                _record(line, "Experience")

    return headings

In [7]:
def clean_text(text):
    """Collapse every run of whitespace in ``text`` to one space and trim the ends.

    Newlines and tabs are whitespace, so they are already covered by the
    single substitution; no separate replacement is needed for them.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string (empty if ``text`` contained only whitespace).
    """
    return re.sub(r'\s+', ' ', text).strip()

In [8]:
def parse_resume(pdf_path):
    """Parse one resume PDF into a single-row DataFrame of section texts.

    The PDF text is split into lines. A line reported by ``identify_headings``
    starts a new section and is itself not stored; every other line is
    appended to the section that is currently open. Lines before the first
    heading are discarded.

    Text is accumulated per section, so when several headings map to the same
    section (for example "Awards" and "Publications" both map to "Others"),
    or a heading occurs more than once, the content of all of them is kept
    rather than only the last one.

    Args:
        pdf_path: Path of the PDF file to parse.

    Returns:
        A one-row ``pandas.DataFrame`` with the columns ResumeID, Category,
        Education, Skills, Experience, Achievements, Others and Summary.
        ResumeID and Category are left as empty strings; section columns hold
        whitespace-normalised text (empty when the section was not found).
    """
    text = extract_text_from_pdf(pdf_path)
    # A set gives constant-time lookups of (line, section) pairs.
    heading_pairs = set(identify_headings(text))

    # When a line qualifies for several sections, the first one listed here wins.
    section_priority = ("Education", "Skills", "Experience", "Achievements", "Others", "Summary")

    resume_parts = {}
    current_heading = ""
    for raw_line in text.split('\n'):
        line = raw_line.strip()
        matched_section = next(
            (name for name in section_priority if (line, name) in heading_pairs),
            None,
        )
        if matched_section is not None:
            current_heading = matched_section
        elif current_heading:
            resume_parts[current_heading] = resume_parts.get(current_heading, "") + line + "\n"

    # Fixed column layout; sections that were not found stay empty.
    fixed_columns = {
        'ResumeID': '',
        'Category':'',
        'Education': '',
        'Skills': '',
        'Experience': '',
        'Achievements': '',
        'Others':'',
        'Summary':''
    }
    fixed_columns.update(resume_parts)

    # Normalise whitespace before building the frame (avoids the deprecated
    # element-wise DataFrame.applymap).
    row = {column: clean_text(value) for column, value in fixed_columns.items()}
    return pd.DataFrame([row])

In [9]:
root_dir = 'uploads/resumes'

# Column layout of the frames returned by parse_resume; used when nothing is found.
resume_columns = ['ResumeID', 'Category', 'Education', 'Skills', 'Experience', 'Achievements', 'Others', 'Summary']

resume_frames = []
for root, dirs, files in os.walk(root_dir):
    for file in files:
        # splitext removes only the final extension, so a ".pdf" elsewhere in
        # the file name is preserved; the comparison ignores case (".PDF").
        stem, extension = os.path.splitext(file)
        if extension.lower() != ".pdf":
            continue
        pdf_path = os.path.join(root, file)
        parsed_resume_df = parse_resume(pdf_path)
        parsed_resume_df["ResumeID"] = stem
        # The containing folder's name is used as the resume category.
        parsed_resume_df["Category"] = os.path.basename(root)
        resume_frames.append(parsed_resume_df)

if resume_frames:
    dfs = pd.concat(resume_frames).reset_index(drop=True)
else:
    # pd.concat raises on an empty list; produce an empty, correctly shaped table instead.
    print("No PDF files found under {!r}".format(root_dir))
    dfs = pd.DataFrame(columns=resume_columns)

In [10]:
# Write the combined resume table to CSV; index=False leaves the row index out of the file.
dfs.to_csv("Resume_extracted.csv", index=False)

In [11]:
# NOTE(review): in the recorded output of this cell, building the wheel for `tokenizers`
# failed, and a later cell's `from splade...` import raised ModuleNotFoundError — so the
# SPLADE package was presumably not installed by this command. Confirm the environment
# (Python version / build toolchain) before relying on the cells that follow.
!pip install -qU datasets transformers sentence-transformers git+https://github.com/naver/splade.git
!pip install einops

  error: subprocess-exited-with-error
  
  Building wheel for tokenizers (pyproject.toml) did not run successfully.
  exit code: 1
  
  [51 lines of output]
  running bdist_wheel
  running build
  running build_py
  creating build
  creating build\lib.win-amd64-cpython-311
  creating build\lib.win-amd64-cpython-311\tokenizers
  copying py_src\tokenizers\__init__.py -> build\lib.win-amd64-cpython-311\tokenizers
  creating build\lib.win-amd64-cpython-311\tokenizers\models
  copying py_src\tokenizers\models\__init__.py -> build\lib.win-amd64-cpython-311\tokenizers\models
  creating build\lib.win-amd64-cpython-311\tokenizers\decoders
  copying py_src\tokenizers\decoders\__init__.py -> build\lib.win-amd64-cpython-311\tokenizers\decoders
  creating build\lib.win-amd64-cpython-311\tokenizers\normalizers
  copying py_src\tokenizers\normalizers\__init__.py -> build\lib.win-amd64-cpython-311\tokenizers\normalizers
  creating build\lib.win-amd64-cpython-311\tokenizers\pre_tokenizers
  copying py_

Collecting einops
  Downloading einops-0.7.0-py3-none-any.whl.metadata (13 kB)
Downloading einops-0.7.0-py3-none-any.whl (44 kB)
   ---------------------------------------- 0.0/44.6 kB ? eta -:--:--
   ---------------------------------------- 44.6/44.6 kB ? eta 0:00:00
Installing collected packages: einops
Successfully installed einops-0.7.0


# SPLADE setup: imports and device selection

In [14]:
from datasets import load_dataset
import pandas as pd
import ast
import torch
from splade.models.transformer_rep import Splade
from transformers import AutoTokenizer
# Select 'cuda' when torch reports that CUDA is available, otherwise fall back to 'cpu'.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

ModuleNotFoundError: No module named 'splade'