In [16]:
import numpy as np
import pandas as pd
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
import io
from spacy.matcher import PhraseMatcher
import spacy
import docx2txt
import re
import os
from pymongo import MongoClient

In [18]:
# Connection string for the MongoDB instance that stores the parsed resumes.
# It can be overridden with the MONGODB_URI environment variable so hosts and
# credentials never have to be hard-coded in the notebook. The default points
# at localhost: "0.0.0.0" is a listen/bind wildcard rather than a destination
# address, and connecting to it is not portable across platforms.
MONGODB_URI = os.environ.get("MONGODB_URI", "mongodb://localhost:27017/")
client = MongoClient(MONGODB_URI)

# Database and collection that receive one document per parsed resume.
db = client['RESUME_DATA']
collection = db['user_data']

In [2]:
# Load spaCy's small English pipeline once at module level. The extraction
# helpers and the PhraseMatcher set-up further down all share this `nlp`
# object (its tokenizer, vocabulary and named-entity recogniser).
# The 'en_core_web_sm' model package must be installed for this call to succeed.
nlp = spacy.load('en_core_web_sm')



In [3]:
def get_files_in_directory(directory_path, extension='.pdf'):
    """List the entries of a directory whose names end with ``extension``.

    The directory path is echoed to stdout before the listing is read. The
    comparison is a plain, case-sensitive suffix test and the result keeps
    whatever order ``os.listdir`` produced.

    Args:
        directory_path: Directory to scan (not recursive).
        extension: Suffix an entry name must end with. Defaults to ``'.pdf'``.

    Returns:
        list[str]: ``directory_path`` joined with each matching entry name.
    """
    print("directory_path: ", directory_path)
    return [
        os.path.join(directory_path, entry)
        for entry in os.listdir(directory_path)
        if entry.endswith(extension)
    ]

In [4]:
def extract_text(file_path):
    """Return the plain text of a ``.docx`` or ``.pdf`` file.

    The format is chosen from the file-name suffix (compared
    case-insensitively). Extraction is best-effort: on any failure, such as an
    unsupported suffix, a missing or unreadable file, or a parser error, a
    message is printed and an empty string is returned instead of raising.

    Args:
        file_path: Path of the document to read.

    Returns:
        str: The extracted text, or ``""`` when extraction failed.
    """
    lowered_path = file_path.lower()
    try:
        if lowered_path.endswith('.docx'):
            text = docx2txt.process(file_path)
        elif lowered_path.endswith('.pdf'):
            # The context manager guarantees the file handle is released even
            # when pdfminer raises part-way through the document.
            with open(file_path, 'rb') as pdf_file:
                resource_manager = PDFResourceManager()
                buffer = io.StringIO()
                converter = TextConverter(resource_manager, buffer, laparams=LAParams())
                try:
                    interpreter = PDFPageInterpreter(resource_manager, converter)
                    for page in PDFPage.get_pages(pdf_file):
                        interpreter.process_page(page)
                    # Read the buffer once, after every page has been rendered;
                    # this also leaves `text` defined for a PDF with no pages.
                    text = buffer.getvalue()
                finally:
                    converter.close()
        else:
            raise ValueError('Unsupported file format')
    except Exception:
        # Deliberately broad (best-effort extraction), but no longer a bare
        # `except:` so KeyboardInterrupt / SystemExit still propagate.
        print("Error: Failed to extract text from file.")
        return ""

    return text


In [5]:
def extract_info(text):
    """Pull e-mail addresses, candidate names and phone numbers out of resume text.

    Regular expressions are tried first. Only when at least one of the three
    fields comes back empty is the spaCy pipeline run, and its entities are
    used to fill the missing fields.

    Args:
        text: Raw resume text.

    Returns:
        tuple: ``(email, name, phone)``. Each element is a list of strings
        (possibly empty), whether the values came from the regexes or from the
        entity fallback, so callers can index or test them uniformly.
    """
    email = re.findall(r'[\w\.-]+@[\w\.-]+', text)
    # Two adjacent capitalised words, e.g. "Jane Doe".
    name = re.findall(r'\b[A-Z][a-z]+\s[A-Z][a-z]+\b', text)
    # "(555) 123-4567"-style numbers, plus the "+1 -555-123-4567" variant.
    phone = (
        re.findall(r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}', text)
        + re.findall(r'\+1\s-\d{3}-\d{3}-\d{4}', text)
    )

    # The NER pass is only a fallback, so skip the (comparatively expensive)
    # pipeline run when the regexes already produced every field.
    if email and name and phone:
        return email, name, phone

    for ent in nlp(text).ents:
        # Each fallback value is wrapped in a list so the return type matches
        # the regex path; a bare string here would make `name[0]` in a caller
        # yield a single character. Only the first entity per field is kept.
        if ent.label_ == "PERSON" and not name:
            name = [ent.text]
        # NOTE(review): the stock en_core_web_sm label set does not appear to
        # include EMAIL or PHONE, so these two branches probably only fire
        # with a custom model — confirm.
        elif ent.label_ == "EMAIL" and not email:
            email = [ent.text]
        elif ent.label_ == "PHONE" and not phone:
            phone = [ent.text]

    return email, name, phone

In [6]:
# Skill vocabulary: one phrase per row of data/skills.csv.
# NOTE(review): the column arrives named '(ISC)2', which suggests the CSV has
# no header row and its first entry was consumed as the header. If so, that
# entry is missing from `skills` — confirm against the file.
skills_df = pd.read_csv("data/skills.csv").rename(columns={'(ISC)2': 'Skill'})
skills = list(skills_df["Skill"].values)

# Phrase matcher holding every skill as a pattern. The matcher uses its
# default (verbatim token text) comparison, so patterns only need to be
# tokenised: nlp.make_doc() does that without running the tagger, parser and
# NER on each phrase, which is what calling nlp() on them would do.
skill_matcher = PhraseMatcher(nlp.vocab)
patterns = [nlp.make_doc(str(skill)) for skill in skills]
skill_matcher.add("Skills", None, *patterns)

def extract_skills(resume_text):
    """Return every skill phrase that ``skill_matcher`` finds in the text.

    Args:
        resume_text: Raw resume text.

    Returns:
        list[str]: Matched spans' text, one entry per match in the order the
        matcher reports them (repeats are kept).
    """
    parsed = nlp(resume_text)
    found = []
    for _match_id, begin, stop in skill_matcher(parsed):
        found.append(parsed[begin:stop].text)
    return found

In [14]:
# Location vocabulary: state and city names read from data/location.csv.
location_df = pd.read_csv("data/location.csv", encoding="ISO-8859-1")

# Missing cells are dropped and everything is coerced to str up front, so both
# matchers receive clean text: a NaN would otherwise turn into the literal
# pattern "nan" (states) or be passed to spaCy as a float (cities).
States = list(set(location_df["state_name"].dropna().astype(str)))
# dict.fromkeys() removes repeated city names while keeping file order, so the
# same phrase is not registered with the matcher more than once.
Cities = list(dict.fromkeys(location_df["city"].dropna().astype(str)))

# Both matchers use the default (verbatim token text) comparison, so patterns
# only need tokenising; nlp.make_doc() avoids a full pipeline run per phrase.
state_matcher = PhraseMatcher(nlp.vocab)
state_patterns = [nlp.make_doc(State) for State in States]
state_matcher.add("States", None, *state_patterns)

# Create a PhraseMatcher for cities
city_matcher = PhraseMatcher(nlp.vocab)
city_patterns = [nlp.make_doc(city) for city in Cities]
city_matcher.add("Cities", None, *city_patterns)

def extract_location(resume_text):
    """Find state and city names mentioned in the text.

    Args:
        resume_text: Raw resume text.

    Returns:
        tuple[list[str], list[str]]: ``(states, cities)``. Each list holds the
        text of the spans reported by ``state_matcher`` and ``city_matcher``
        respectively, in match order.
    """
    parsed = nlp(resume_text)

    def _span_texts(matcher):
        # Turn (match_id, start, end) triples into the matched text.
        return [parsed[begin:stop].text for _match_id, begin, stop in matcher(parsed)]

    found_states = _span_texts(state_matcher)
    found_cities = _span_texts(city_matcher)
    return found_states, found_cities


([], [])


In [8]:
# Job-title vocabulary: the "Job Title" column of data/job.csv.
job_df = pd.read_csv("data/job.csv")
jobs = list(job_df["Job Title"].values)

# Phrase matcher holding every job title as a pattern. The matcher uses its
# default (verbatim token text) comparison, so patterns only need to be
# tokenised; nlp.make_doc() skips the tagger/parser/NER that nlp() would run
# on each title.
job_matcher = PhraseMatcher(nlp.vocab)
patterns = [nlp.make_doc(str(job)) for job in jobs]
job_matcher.add("Jobs", None, *patterns)

def extract_jobs(resume_text):
    """Return every job title that ``job_matcher`` finds in the text.

    Args:
        resume_text: Raw resume text.

    Returns:
        list[str]: Matched spans' text, one entry per match in the order the
        matcher reports them (repeats are kept).
    """
    parsed = nlp(resume_text)
    titles = []
    for _match_id, begin, stop in job_matcher(parsed):
        titles.append(parsed[begin:stop].text)
    return titles



In [9]:
def extract_school(resume_txt):
    """Find phrases that look like school names (University / Institute / School).

    The sub-patterns are combined into one alternation inside a single capture
    group. Their order matters, because the regex engine takes the first
    alternative that matches at a given position.

    Args:
        resume_txt: Raw resume text.

    Returns:
        list[str]: Distinct matches, in order of first appearance.
    """
    sub_patterns = (
        # "<word(s)> University"
        '[A-Za-zÀ-ȕ]* University',
        '[A-Za-zÀ-ȕ]* [A-Za-zÀ-ȕ]* University',
        '[A-Za-zÀ-ȕ]* [A-Za-zÀ-ȕ]* [A-Za-zÀ-ȕ]* University',
        # "<word(s)> Institute <word(s)>"
        '[A-Za-zÀ-ȕ]* Institute [A-Za-zÀ-ȕ]* [A-Za-zÀ-ȕ]*',
        '[A-Za-zÀ-ȕ]* [A-Za-zÀ-ȕ]* Institute *[A-Za-zÀ-ȕ] *[A-Za-zÀ-ȕ]',
        # "<word(s)> Institute"
        '[A-Za-zÀ-ȕ]* Institute',
        '[A-Za-zÀ-ȕ]* [A-Za-zÀ-ȕ]* Institute',
        '[A-Za-zÀ-ȕ]* [A-Za-zÀ-ȕ]* [A-Za-zÀ-ȕ]* Institute',
        # "Institute <word(s)>"
        'Institute [A-Za-zÀ-ȕ]* [A-Za-zÀ-ȕ]*',
        'Institute [A-Za-zÀ-ȕ]* [A-Za-zÀ-ȕ]*',
        'Institute [A-Za-zÀ-ȕ]*',
        # "University <word(s)>"
        'University *[A-Za-zÀ-ȕ] [A-Za-zÀ-ȕ]*',
        'University [A-Za-zÀ-ȕ]* [A-Za-zÀ-ȕ]*',
        'University [A-Za-zÀ-ȕ]*',
        # "<word(s)> School"
        '[A-Za-zÀ-ȕ]* School',
        '[A-Za-zÀ-ȕ]* [A-Z][a-z]* School',
        '[A-Za-zÀ-ȕ]* [A-Za-zÀ-ȕ]* [A-Za-zÀ-ȕ]* School',
    )
    alternation = '(' + '|'.join(sub_patterns) + ')'

    # Keep the first occurrence of each match and drop later repeats.
    unique_schools = []
    for found in re.findall(alternation, resume_txt):
        if found not in unique_schools:
            unique_schools.append(found)
    return unique_schools



In [10]:
def extract_experience(resume_txt):
    """Collect the numbers that precede the word "years" in the text.

    A number may have a decimal part and may be followed by an optional "+"
    (e.g. "3.5+ years"); only the numeric part is returned.

    Args:
        resume_txt: Raw resume text.

    Returns:
        list[str]: The numeric strings, in order of appearance; empty when
        nothing matches.
    """
    years_regex = re.compile(r"(\d+(\.\d+)?)(\s*\+)?\s*years")
    # Group 1 is the whole number, including any decimal part.
    return [hit.group(1) for hit in years_regex.finditer(resume_txt)]


In [21]:
def _first(values):
    """Return the first item of ``values`` (a bare string is returned whole), or ``None`` if empty."""
    if not values:
        return None
    if isinstance(values, str):
        return values
    return values[0]


def extract_resume(file_path):
    """Parse one resume file into a dictionary ready for storage.

    The text is extracted once and passed to each field extractor. Scalar
    fields hold the first value found, or ``None`` when nothing was found
    (for example when text extraction failed and returned ``""``), so a resume
    with missing details no longer raises ``IndexError``.

    Args:
        file_path: Path of the resume document.

    Returns:
        dict: Keys ``email``, ``name``, ``phone`` and ``location`` (each a
        string or ``None``) and ``education``, ``job_title``, ``skills`` and
        ``experience`` (each a list of strings).
    """
    text = extract_text(file_path)
    email, name, phone = extract_info(text)
    # extract_location() returns (states, cities); index 0 is the state list.
    # NOTE(review): the original expression `location[0][]` was not valid
    # Python. Storing the first matched state is the closest reading of it —
    # confirm that this is the intended value for 'location'.
    states = extract_location(text)[0]
    resume = {
        'email': _first(email),
        'name': _first(name),
        'phone': _first(phone),
        'education': extract_school(text),
        'location': _first(states),
        'job_title': extract_jobs(text),
        'skills': extract_skills(text),
        'experience': extract_experience(text)
    }
    return resume

In [23]:
def process_files(file_paths):
    """Run ``extract_resume`` on each path and gather the results.

    Args:
        file_paths: Iterable of resume file paths.

    Returns:
        list: One parsed resume per input path, in input order.
    """
    return [extract_resume(path) for path in file_paths]

# Parse every PDF in the resumes directory and store the results in MongoDB.
directory_path = 'resumes'
files = get_files_in_directory(directory_path)
documents = process_files(files)
# insert_many() rejects an empty list, so only write when at least one resume
# was parsed; `result` is None when there was nothing to insert.
result = collection.insert_many(documents) if documents else None

directory_path:  resumes
