In [1]:
import pathlib
from src.parsing.pdf_text import extract_text_from_pdf

# Input and output folders
input_folder = pathlib.Path(r"C:\\Users\\aayus\\Dev\\CV-Analyzer\\data\\resumes_raw")
output_folder = pathlib.Path(r"C:\\Users\\aayus\\Dev\\CV-Analyzer\\data\\resumes_text")
output_folder.mkdir(parents=True, exist_ok=True)

# Loop over all PDFs in the input folder
for pdf_file in input_folder.glob("*.pdf"):
    output_file = output_folder / f"{pdf_file.stem}.txt"
    text = extract_text_from_pdf(pdf_file, output_file)

from src.parsing.docx_text import extract_text_from_docx

for docx_file in input_folder.glob("*.docx"):
    output_file = output_folder / f"{docx_file.stem}.txt"
    text = extract_text_from_docx(docx_file, output_file)

In [2]:
from src.parsing.resume_parser import parse_resume

for file in output_folder.iterdir(): # only PDFs and DOCX
        print(file)
        parsed = parse_resume(str(file))
        print(f"{file.name}: {parsed}")



C:\Users\aayus\Dev\CV-Analyzer\data\resumes_text\Amit_Jangir.txt
<re.Match object; span=(145, 172), match='skills, hands-on experience'>
Amit_Jangir.txt: {'name': None, 'email': None, 'phone': None, 'education': ['B.Tech', 'BE', 'ME'], 'skills': ['hands', 'on']}
C:\Users\aayus\Dev\CV-Analyzer\data\resumes_text\Ananya_Sharma.txt
<re.Match object; span=(143, 170), match='skills, hands-on experience'>
Ananya_Sharma.txt: {'name': None, 'email': None, 'phone': None, 'education': ['B.Tech', 'BE', 'ME', 'University'], 'skills': ['hands', 'on']}
C:\Users\aayus\Dev\CV-Analyzer\data\resumes_text\CV_Aarav_Mehta.txt
<re.Match object; span=(279, 451), match='Skills JavaScript, React, Node.js, Python, SQL, A>
CV_Aarav_Mehta.txt: {'name': None, 'email': 'aarav.mehta@gmail.com', 'phone': '+91 9876543210', 'education': ['B.Tech', 'BE', 'ME'], 'skills': ['JavaScript', 'React', 'Node.js', 'Python', 'SQL', 'AWS', 'Git Problem Solving Skills Strong debugging skills', 'optimized database queries', 'designed

In [25]:
from src.nlp.spacy_pipe import extract_entities, extract_person_names, quick_ner_summary
from src.parsing.resume_parser import parse_resume
import pathlib

input_folder = pathlib.Path(r"C:\\Users\\aayus\\Dev\\CV-Analyzer\\data\\resumes_raw")

for file in input_folder.iterdir():
    parsed = parse_resume(str(file))
    combined_text = " ".join(
        str(value) for value in parsed.values() if value
    )
    print(extract_person_names(combined_text))
    print(quick_ner_summary(combined_text))
    print(extract_entities(combined_text)[:20])


['Kali Linux']
{'PERSON': 1}
[('Kali Linux', 'PERSON')]
[]
{'ORG': 1}
[('SQL', 'ORG')]
['Java']
{'PERSON': 1, 'ORG': 1}
[('Java', 'PERSON'), ('SQL', 'ORG')]
['Email arjun.sharma@gmail.com +91']
{'PERSON': 1, 'DATE': 1, 'WORK_OF_ART': 1}
[('Email arjun.sharma@gmail.com +91', 'PERSON'), ('9988776655', 'DATE'), ('MBA', 'WORK_OF_ART')]
['Priya Gupta']
{'PERSON': 1}
[('Priya Gupta', 'PERSON')]
['+91', 'Rahul Verma UX/']
{'PERSON': 2, 'PRODUCT': 1}
[('Rahul Verma UX/', 'PERSON'), ('+91', 'PERSON'), ('Excel', 'PRODUCT')]
['+91']
{'PERSON': 1, 'DATE': 1, 'ORG': 1, 'PRODUCT': 1}
[('+91', 'PERSON'), ('9876501234', 'DATE'), ('SQL', 'ORG'), ('Excel', 'PRODUCT')]
['David Wilson Role', 'Java', "Machine Learning'"]
{'PERSON': 3}
[('David Wilson Role', 'PERSON'), ('Java', 'PERSON'), ("Machine Learning'", 'PERSON')]
['Emily Johnson Role', 'Java']
{'PERSON': 2, 'WORK_OF_ART': 1, 'ORG': 1}
[('Emily Johnson Role', 'PERSON'), ('Java', 'PERSON'), ("C++'", 'WORK_OF_ART'), ('SQL', 'ORG')]
['James Anderson Rol

In [6]:
from src.nlp.skills import load_skills, compile_patterns, detect_skills

skills = load_skills("data/skills.csv")
print(skills)
patterns = compile_patterns(skills)
sample_text = """
Experienced Python and React developer with AWS and Docker.
Built REST APIs with Flask and PostgreSQL, CI/CD via GitHub Actions.
"""

print(detect_skills(sample_text, patterns))
# Expected: {'Python', 'React', 'AWS', 'Docker', 'REST API', 'Flask', 'PostgreSQL', 'CI/CD', 'Git'}


{'Python': ['cpython', 'py', 'python', 'python3'], 'Java': ['java', 'jdk', 'jre'], 'C++': ['c++', 'cpp'], 'C': ['c'], 'C#': ['c#', 'csharp'], 'JavaScript': ['ecmascript', 'javascript', 'js'], 'TypeScript': ['ts', 'typescript'], 'HTML': ['html', 'html5'], 'CSS': ['css', 'css3'], 'SQL': ['ansi sql', 'sql'], 'PostgreSQL': ['postgres', 'postgresql', 'psql'], 'MySQL': ['mysql'], 'SQLite': ['sqlite', 'sqlite3'], 'MongoDB': ['mongo', 'mongodb'], 'Redis': ['redis'], 'Django': ['django'], 'Flask': ['flask'], 'FastAPI': ['fastapi'], 'Spring': ['spring', 'spring boot', 'spring-boot'], 'Node.js': ['node', 'node.js', 'nodejs'], 'Express': ['express', 'express.js', 'expressjs'], 'React': ['react', 'react.js', 'reactjs'], 'Angular': ['angular', 'angular.js', 'angularjs'], 'Vue': ['vue', 'vue.js', 'vuejs'], 'Next.js': ['next', 'next.js', 'nextjs'], 'Tailwind CSS': ['tailwind', 'tailwind css', 'tailwindcss'], 'Bootstrap': ['bootstrap'], 'jQuery': ['jquery'], 'REST API': ['rest', 'rest api', 'restful'],