##### Copyright 2018 The TensorFlow Authors.

In [1]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

### Parse CV data

In [None]:
!pip install pymupdf


import fitz  # PyMuPDF
import pandas as pd
import os
import re
import logging
from concurrent.futures import ThreadPoolExecutor
from collections import defaultdict
import re
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [3]:

# Configure logging
logging.basicConfig(filename='cv_parser.log', level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')

def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF file."""
    try:
        doc = fitz.open(pdf_path)
        text = "".join(page.get_text() for page in doc)
        return text
    except Exception as e:
        logging.error(f"Error extracting text from {pdf_path}: {e}")
        return ""

def detect_section_boundaries(text):
    """Detect section boundaries in the text."""
    section_keywords = {
        "Education": ["education", "degree", "university", "academic", "school", "qualification"],
        "Work Experience": ["experience", "employment", "job", "career", "internship", "work experience", "professional experience", "employment history"],
        "Skills and Competencies": ["skills", "competencies", "abilities", "expertise", "technical"],
        "Achievements and Awards": ["achievement", "award", "recognition", "accomplishment"],
        "References": ["references", "referees", "recommendations"],
    }

    section_boundaries = {}

    for section, keywords in section_keywords.items():
        for keyword in keywords:
            match = re.search(rf'\b{keyword}\b[:\-\s]', text, re.IGNORECASE)
            if match:
                section_boundaries[section] = match.start()
                break

    if not section_boundaries:
        logging.warning("No sections detected, assigning all text to Other Information.")
        return []

    sorted_sections = sorted(section_boundaries.items(), key=lambda x: x[1])
    return sorted_sections

def parse_sections(text, section_boundaries):
    """Parse text into structured sections based on detected boundaries."""
    parsed_data = defaultdict(str)

    if not section_boundaries:
        parsed_data["Other Information"] = re.sub(r'\s+', ' ', text.strip())
        return dict(parsed_data)

    for i, (section, start_idx) in enumerate(section_boundaries):
        end_idx = section_boundaries[i + 1][1] if i + 1 < len(section_boundaries) else len(text)
        content = text[start_idx:end_idx].strip()
        content = re.sub(r'^.*?:\s*', '', content, flags=re.IGNORECASE)
        content = re.sub(r'\uf0b7', '•', content)  # Replace bullet points
        parsed_data[section] = re.sub(r'\s+', ' ', content)

    # Extract remaining text as "Other Information"
    used_text = " ".join(parsed_data.values())
    other_info = text.replace(used_text, '').strip()
    if other_info:
        parsed_data["Other Information"] = re.sub(r'\s+', ' ', other_info)

    return dict(parsed_data)

def parse_cv_text(text):
    """Parse the entire CV text into structured sections."""
    text = text.replace('\n', ' ')
    section_boundaries = detect_section_boundaries(text)
    return parse_sections(text, section_boundaries)

def write_to_excel(cv_data_list, output_file):
    """Write the parsed CV data to an Excel file."""
    try:
        df = pd.DataFrame(cv_data_list)
        df.to_excel(output_file, index=False)
        logging.info(f"Data successfully written to {output_file}")
    except Exception as e:
        logging.error(f"Error writing to Excel file {output_file}: {e}")

def process_cv_file(pdf_path):
    """Process a single CV file."""
    try:
        text = extract_text_from_pdf(pdf_path)
        if text:
            parsed_data = parse_cv_text(text)
            parsed_data['File Name'] = os.path.basename(pdf_path)  # Add file name to parsed data
            return parsed_data
    except Exception as e:
        logging.error(f"Error processing file {pdf_path}: {e}")
    return None

# Main execution
if __name__ == "__main__":
    cv_folder_path = '/content/Test'  # Adjust to your folder path
    output_file = '/content/output.xlsx'  # Adjust to your desired output file path

    pdf_files = [os.path.join(cv_folder_path, f) for f in os.listdir(cv_folder_path) if f.endswith('.pdf')]
    cv_data_list = []

    with ThreadPoolExecutor() as executor:
        results = executor.map(process_cv_file, pdf_files)
        for result in results:
            if result:
                cv_data_list.append(result)

    write_to_excel(cv_data_list, output_file)


Collecting pymupdf
  Downloading pymupdf-1.25.1-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.1-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m47.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.25.1


In [4]:
# Only with Education to be deleted

import re
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Read data from Excel file
file_path = '/content/output.xlsx'  # Replace with your file path
df = pd.read_excel(file_path)

# Ensure the 'Education' column contains only string values
df['Education'] = df['Education'].astype(str)

# Sample job description requirement
job_requirement = "at least a bachelor degree in IT, computer science, or relevant field"

# Function to extract GPA from text
def extract_gpa(text):
    gpa_match = re.search(r'GPA:\s*(\d+\.\d+/\d+\.\d+)', text)
    if gpa_match:
        return float(gpa_match.group(1).split('/')[0])
    return None

# Function to encode text using BERT
def encode_text(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

# Encode job requirement
job_embedding = encode_text(job_requirement, tokenizer, model)

# Extract GPA and encode candidate education
candidate_data = []
for index, row in df.iterrows():
    candidate = row['Education']
    file_name = row['File Name']  # Extract the file name
    if pd.isna(candidate) or not isinstance(candidate, str):
        continue  # Skip rows with missing or non-string values

    gpa = extract_gpa(candidate)
    candidate_embedding = encode_text(candidate, tokenizer, model)
    candidate_data.append({
        'file_name': file_name,
        'candidate': candidate,
        'gpa': gpa,
        'embedding': candidate_embedding
    })

# Compute similarity scores
for data in candidate_data:
    data['similarity_score'] = cosine_similarity([job_embedding], [data['embedding']])[0][0]

# Combine similarity score with GPA to create a composite score
for data in candidate_data:
    if data['gpa'] is not None and data['gpa'] > 3.0:
        data['composite_score'] = data['similarity_score'] + (data['gpa'] - 3.0)  # Adjust the weighting as needed
    else:
        data['composite_score'] = data['similarity_score']

# Rank candidates based on composite score
ranked_candidates = pd.DataFrame(candidate_data).sort_values(by='composite_score', ascending=False)

# Print the ranked candidates along with the file name
print(ranked_candidates[['file_name', 'candidate', 'gpa', 'similarity_score', 'composite_score']])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

                              file_name  \
3                  Resume-Samples-2.pdf   
32                Resume-Samples-14.pdf   
36                Resume-Samples-35.pdf   
2   Systems_Engineer_resume_example.pdf   
13                Resume-Samples-15.pdf   
23                Resume-Samples-36.pdf   
10                Resume-Samples-13.pdf   
25                Resume-Samples-17.pdf   
9                  Resume-Samples-6.pdf   
28                 Resume-Samples-5.pdf   
27                Resume-Samples-32.pdf   
4                 Resume-Samples-18.pdf   
34                Resume-Samples-33.pdf   
11                Resume-Samples-24.pdf   
22                 Resume-Samples-7.pdf   
38                Resume-Samples-34.pdf   
7                                T3.pdf   
6                                T2.pdf   
12                Resume-Samples-23.pdf   
0                  Resume-Samples-9.pdf   
26                 Resume-Samples-3.pdf   
14                Resume-Samples-11.pdf   
20         

In [5]:


# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Read data from Excel file
file_path = '/content/output.xlsx'  # Replace with your file path
df = pd.read_excel(file_path)

# Ensure the 'Education', 'Work Experience', and 'Skills and Competencies' columns contain only string values
df['Education'] = df['Education'].astype(str)
df['Work Experience'] = df['Work Experience'].astype(str)
df['Skills and Competencies'] = df['Skills and Competencies'].astype(str)

# Job description requirement for education
education_requirement = "at least a bachelor degree in IT, computer science, or relevant field"

# Job description requirement for work experience
work_experience_requirement = """We are seeking a skilled and motivated Analyst Programmer to join our dynamic team. The ideal candidate will have a strong background in both programming and systems analysis, with the ability to design, develop, and implement software solutions that meet our business needs.

Key Responsibilities
System Analysis: Analyze business requirements and translate them into technical specifications.
Software Development: Design, code, test, and debug software applications using various programming languages and technologies.
Database Management: Develop and maintain database systems, ensuring data integrity and security.
Technical Support: Provide technical support and troubleshooting for software applications and systems.
Documentation: Create and maintain comprehensive documentation for all developed systems and applications.
Collaboration: Work closely with cross-functional teams, including business analysts, project managers, and other developers, to deliver high-quality software solutions."""

# Job description requirement for skills and competencies
skills_requirement = """Technical Skills:
Proficiency in programming languages such as Java, C#, Python, or JavaScript.
Experience with database management systems like SQL Server, Oracle, or MySQL.
Familiarity with software development methodologies such as Agile or Scrum.
Knowledge of web technologies (HTML, CSS, JavaScript) and frameworks (Angular, React).
Soft Skills:
Strong analytical and problem-solving skills.
Excellent communication and interpersonal skills.
Ability to work independently and as part of a team.
Attention to detail and a commitment to quality."""

# Function to extract GPA from text
def extract_gpa(text):
    gpa_match = re.search(r'GPA:\s*(\d+\.\d+/\d+\.\d+)', text)
    if gpa_match:
        return float(gpa_match.group(1).split('/')[0])
    return None

# Function to encode text using BERT
def encode_text(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

# Encode job requirements
education_embedding = encode_text(education_requirement, tokenizer, model)
work_experience_embedding = encode_text(work_experience_requirement, tokenizer, model)
skills_embedding = encode_text(skills_requirement, tokenizer, model)

# Extract GPA, encode candidate education, work experience, and skills and competencies
candidate_data = []
for index, row in df.iterrows():
    education = row['Education']
    work_experience = row['Work Experience']
    skills_competencies = row['Skills and Competencies']
    file_name = row['File Name']  # Extract the file name
    if pd.isna(education) or not isinstance(education, str) or pd.isna(work_experience) or not isinstance(work_experience, str) or pd.isna(skills_competencies) or not isinstance(skills_competencies, str):
        continue  # Skip rows with missing or non-string values
    gpa = extract_gpa(education)
    education_embedding_candidate = encode_text(education, tokenizer, model)
    work_experience_embedding_candidate = encode_text(work_experience, tokenizer, model)
    skills_competencies_embedding_candidate = encode_text(skills_competencies, tokenizer, model)
    candidate_data.append({
        'file_name': file_name,
        'education': education,
        'work_experience': work_experience,
        'skills_competencies': skills_competencies,
        'gpa': gpa,
        'education_embedding': education_embedding_candidate,
        'work_experience_embedding': work_experience_embedding_candidate,
        'skills_competencies_embedding': skills_competencies_embedding_candidate
    })

# Compute similarity scores
for data in candidate_data:
    data['education_similarity_score'] = cosine_similarity([education_embedding], [data['education_embedding']])[0][0]
    data['work_experience_similarity_score'] = cosine_similarity([work_experience_embedding], [data['work_experience_embedding']])[0][0]
    data['skills_competencies_similarity_score'] = cosine_similarity([skills_embedding], [data['skills_competencies_embedding']])[0][0]

# Combine similarity scores with GPA to create a composite score
for data in candidate_data:
    if data['gpa'] is not None and data['gpa'] > 3.0:
        data['composite_score'] = data['education_similarity_score'] * 0.1 + data['work_experience_similarity_score'] * 0.7 + data['skills_competencies_similarity_score'] * 0.2  # Adjust the weighting as needed
    else:
        data['composite_score'] = data['education_similarity_score'] * 0.1 + data['work_experience_similarity_score'] * 0.7 + data['skills_competencies_similarity_score'] * 0.2  # Adjust the weighting as needed

# Rank candidates based on composite score
ranked_candidates = pd.DataFrame(candidate_data).sort_values(by='composite_score', ascending=False)

# Print the ranked candidates along with the file name
print(ranked_candidates[['file_name', 'education', 'work_experience', 'skills_competencies', 'gpa', 'education_similarity_score', 'work_experience_similarity_score', 'skills_competencies_similarity_score', 'composite_score']])


                              file_name  \
2   Systems_Engineer_resume_example.pdf   
6                                T2.pdf   
7                                T3.pdf   
24                               T1.pdf   
0                  Resume-Samples-9.pdf   
28                 Resume-Samples-5.pdf   
26                 Resume-Samples-3.pdf   
36                Resume-Samples-35.pdf   
34                Resume-Samples-33.pdf   
31                Resume-Samples-21.pdf   
32                Resume-Samples-14.pdf   
11                Resume-Samples-24.pdf   
33                Resume-Samples-10.pdf   
5                 Resume-Samples-29.pdf   
9                  Resume-Samples-6.pdf   
38                Resume-Samples-34.pdf   
25                Resume-Samples-17.pdf   
16                Resume-Samples-28.pdf   
27                Resume-Samples-32.pdf   
29                Resume-Samples-20.pdf   
4                 Resume-Samples-18.pdf   
35                Resume-Samples-22.pdf   
14         