# Resume Matching Project

## Importing Libraries

In [1]:
import PyPDF2 as ppd
import torch
from transformers import BertTokenizer, BertForMaskedLM
from collections import Counter
import re

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForMaskedLM.from_pretrained(model_name)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    text = text.lower()  # Convert to lowercase
    return text

In [7]:
def extract_text_from_pdf(pdf_file_path):
    with open(pdf_file_path, 'rb') as pdf_file:
        pdf_reader = ppd.PdfReader(pdf_file)
        text = ""
        for page_number in range(len(pdf_reader.pages)):
            page = pdf_reader.pages[page_number]
            text += page.extract_text().lower()
            # text += preprocess_text(page.extract_text())
        return text

In [26]:
resumes=[]
pdf1 = 'Dataset/data/data/ACCOUNTANT/10554236.pdf'
pdf2 = 'Dataset/data/data/ACCOUNTANT/10674770.pdf'
pdf3 = 'Dataset/data/data/ACCOUNTANT/11163645.pdf'
pdf4 = 'Dataset/data/data/ENGINEERING/10219099.pdf'
resumes.append(extract_text_from_pdf(pdf1))
resumes.append(extract_text_from_pdf(pdf2))
resumes.append(extract_text_from_pdf(pdf3))
resumes.append(extract_text_from_pdf(pdf4))

In [11]:
resumes

['accountant\nsummary\nfinancial accountant specializing in financial planning reporting and analysis within the department of defense\nhighlights\naccount reconciliations\nresultsoriented\nfinancial reporting\ncritical thinking\naccounting operations professional\nanalysis of financial systems\nerp enterprise resource planning software\nexcellent facilitator\naccomplishments\nserved on a tiger team which identified and resolved general ledger postings in deams totaling b in accounting adjustments this allowed\nfor the first successful fiscal yearend close for \nin collaboration with dfas europe developed an automated tool that identified duplicate obligations this tool allowed hq usafe to\ndeobligate over m in duplicate obligations\nexperience\ncompany name\n \njuly \n \nto \nnovember \n \naccountant\n \ncity\n \n \nstate\nenterprise resource planning office ero\nin this position as an accountant assigned to the defense enterprise accounting and management system deams ero i was\nresp

In [28]:
page = resumes[3]
skills_index = page.find('skills')
print(skills_index)
page

142


"equipment engineering technician\nprofessional summary\nskilledâ maintenance mechanicâ technician with superb problem solving and multi-tasking skills. self-directed and motivated worker. to obtain\na position where teamwork, integrity and proffesionalism are a high standard in the company's mission statement. all while engaing in new\nchallenges and learning experiences.\nskills\nâ·â â 20 years of material management experienceâ  with state and federal government â· 13\nyears of law enforcement experience with the united states marine corps and army national\nguard. \nâ·â knowledge of mechanical test equipment. \nâ· work from written/verbal instructions, schematics, rough sketches, troubleshooting diagrams,\nlayouts and plans, interpretation and application of technical knowledge and understanding of\nmechanical theory and principles.\nâ· proficient in the use of pc's, msâ \noffice, 10key by touch and customer\nservice skills. \nâ·â forklift and crane certified\nâ·â self motivated to

### For skills extraction

In [22]:
import pandas as pd
import spacy

# Load the spaCy English model
nlp = spacy.load('en_core_web_sm')

def extract_skills(resume_text):
    nlp_text = nlp(resume_text)

    # Removing stop words and implementing word tokenization
    tokens = [token.text for token in nlp_text if not token.is_stop]
    
    # Read the CSV file containing skills data
    data = pd.read_csv("skills.csv") 
    
    # Extract skill values from the CSV file
    skills = list(data.columns.values)
    
    skillset = []
    
    # Check for one-grams (example: python)
    for token in tokens:
        if token.lower() in skills:
            skillset.append(token)
    
    # Check for bi-grams and tri-grams (example: machine learning)
    for chunk in nlp_text.noun_chunks:
        chunk_text = chunk.text.lower().strip()
        if chunk_text in skills:
            skillset.append(chunk_text)
    
    return [i.capitalize() for i in set([i.lower() for i in skillset])]


In [30]:
job_role = ''
for i in page:
    if i == '\n':
        break
    job_role += i

job_role

'equipment engineering technician'

In [31]:
import re

def extract_education(text):
    # Define a regex pattern to match education details
    education_pattern = r"(\b[\w\s]+[\.,]?\s+(?:University|College|School|Institute)[\w\s]*[\.,]?)\s+(\b[\w\s]+[\.,]?\s+(?:Degree|Diploma|Certificate)[\w\s]*[\.,]?)"

    # Search for education details in the text
    education_matches = re.findall(education_pattern, text, re.IGNORECASE)

    # Extracted education details
    education_details = [match for match in education_matches]

    return education_details


In [32]:
education_details = extract_education(page)
education_details

[]

In [29]:
skills = []
skills = extract_skills(page)
skills

['Logistics',
 'Coaching',
 'Repairs',
 'Scheduling',
 'Hardware',
 'Schematics',
 'Operations',
 'Research',
 'Process',
 'Electrical',
 'Internal customers',
 'Vendors',
 'Instrumentation',
 'Transport',
 'Supervisor',
 'Communication',
 'Programming',
 'System',
 'Regulations',
 'Engineering',
 'Installation',
 'Testing',
 'Safety',
 'Reports',
 'Compliance',
 'Analyze',
 'Procurement',
 'Plan',
 'Certification',
 'Technical',
 'Automation',
 'Iso',
 'Requests',
 'Technical knowledge',
 'Inventory',
 'Audit',
 'Schedule',
 'Troubleshooting']