In [2]:
# Test all libraries
import sys
print(f"‚úÖ Python version: {sys.version}\n")

# Test each library
libraries = {
    'pandas': 'Data manipulation',
    'numpy': 'Numerical computing',
    'spacy': 'NLP and entity recognition',
    'pdfplumber': 'PDF text extraction',
    'docx': 'Word document reading',
    'transformers': 'Advanced NLP models',
    'sklearn': 'Machine learning',
    'matplotlib': 'Data visualization'
}

print("Testing library imports...\n")
success_count = 0
failed_libs = []

for lib, purpose in libraries.items():
    try:
        __import__(lib)
        print(f"‚úÖ {lib:15} - {purpose}")
        success_count += 1
    except ImportError as e:
        print(f"‚ùå {lib:15} - FAILED")
        failed_libs.append(lib)

# Test spaCy model
print("\n" + "="*50)
print("Testing spaCy language model...")
print("="*50)
try:
    import spacy
    nlp = spacy.load("en_core_web_sm")
    doc = nlp("John Doe works at Google in 2024")
    print("‚úÖ spaCy model loaded successfully")
    print(f"   Test sentence: 'John Doe works at Google in 2024'")
    print(f"   Entities found: {[(ent.text, ent.label_) for ent in doc.ents]}")
except Exception as e:
    print(f"‚ùå spaCy model failed: {e}")
    failed_libs.append('spacy_model')

print("\n" + "="*50)
print(f"üéâ RESULTS: {success_count}/{len(libraries)} libraries working")
print("="*50)

if failed_libs:
    print(f"\n‚ö†Ô∏è  Failed libraries: {', '.join(failed_libs)}")
    print("We'll need to reinstall these.")
else:
    print("\n‚úÖ All libraries working perfectly!")
    print("üöÄ Ready to start building your Resume Parser!")

‚úÖ Python version: 3.13.5 | packaged by Anaconda, Inc. | (main, Jun 12 2025, 16:37:03) [MSC v.1929 64 bit (AMD64)]

Testing library imports...

‚úÖ pandas          - Data manipulation
‚úÖ numpy           - Numerical computing
‚úÖ spacy           - NLP and entity recognition
‚úÖ pdfplumber      - PDF text extraction
‚úÖ docx            - Word document reading


None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


‚úÖ transformers    - Advanced NLP models
‚úÖ sklearn         - Machine learning
‚úÖ matplotlib      - Data visualization

Testing spaCy language model...
‚úÖ spaCy model loaded successfully
   Test sentence: 'John Doe works at Google in 2024'
   Entities found: [('John Doe', 'PERSON'), ('Google', 'ORG'), ('2024', 'DATE')]

üéâ RESULTS: 8/8 libraries working

‚úÖ All libraries working perfectly!
üöÄ Ready to start building your Resume Parser!


In [8]:
import os

# Create sample resumes
resumes_folder = "../data/resumes/"

# Sample Resume 1 - Software Engineer
resume1 = """
PRIYA SHARMA
Email: priya.sharma@email.com
Phone: +91-9876543210
Location: Bangalore, India

PROFESSIONAL SUMMARY
Software Engineer with 3 years of experience in Python, Machine Learning, and Data Analysis.

SKILLS
- Programming: Python, Java, SQL
- Machine Learning: Scikit-learn, TensorFlow, Pandas
- Tools: Git, Docker, Jupyter
- Soft Skills: Team Leadership, Problem Solving

WORK EXPERIENCE

Data Scientist | TechCorp India | June 2021 - Present
- Built machine learning models for customer segmentation
- Improved prediction accuracy by 25%
- Worked with cross-functional teams of 10+ members

Software Engineer Intern | StartupXYZ | Jan 2020 - May 2021
- Developed REST APIs using Python Flask
- Automated data pipelines reducing processing time by 40%

EDUCATION
Bachelor of Technology in Computer Science
Indian Institute of Technology (IIT) Delhi
Graduated: 2020
GPA: 8.5/10

CERTIFICATIONS
- AWS Certified Developer Associate (2022)
- Google Data Analytics Professional Certificate (2021)

PROJECTS
- Built a recommendation system using collaborative filtering
- Created a sentiment analysis tool for social media data
"""

# Sample Resume 2 - Marketing Manager
resume2 = """
RAHUL VERMA
rahul.verma@gmail.com | +91-9123456789 | Mumbai, Maharashtra

OBJECTIVE
Experienced Marketing Manager seeking to leverage 5+ years of digital marketing expertise.

CORE COMPETENCIES
Digital Marketing, SEO/SEM, Content Strategy, Social Media Management, Google Analytics
Team Management, Budget Planning, Campaign Optimization

PROFESSIONAL EXPERIENCE

Senior Marketing Manager | BrandCo Ltd. | March 2020 - Present
- Led digital marketing campaigns with $500K+ annual budget
- Increased website traffic by 150% through SEO optimization
- Managed team of 8 marketing professionals
- Launched 20+ successful product campaigns

Marketing Executive | MediaHub Agency | July 2018 - Feb 2020
- Coordinated social media strategy across 5 platforms
- Improved engagement rates by 80%
- Created content calendars and managed influencer partnerships

EDUCATION
MBA in Marketing
XLRI Jamshedpur
Completed: 2018

Bachelor of Commerce
Mumbai University
Completed: 2016

ACHIEVEMENTS
- Winner of Best Digital Campaign Award 2022
- Increased ROI by 200% for key client accounts
- Published articles in Marketing Today magazine
"""

# Sample Resume 3 - Fresh Graduate
resume3 = """
ANJALI PATEL
Email: anjali.patel2024@email.com
Phone: +91-9988776655
LinkedIn: linkedin.com/in/anjalipatel

EDUCATION
Master of Science in Data Science
Pune University
Expected Graduation: May 2024
CGPA: 9.2/10

Bachelor of Engineering in Information Technology  
Gujarat Technological University
Graduated: 2022
CGPA: 8.8/10

TECHNICAL SKILLS
- Languages: Python, R, SQL, JavaScript
- Libraries: Pandas, NumPy, Matplotlib, Scikit-learn, spaCy
- Tools: Jupyter, Git, Tableau, Power BI
- Databases: MySQL, MongoDB

ACADEMIC PROJECTS

Resume Parser using NLP | Jan 2024 - Present
- Building an automated resume parsing system using spaCy and transformers
- Extracting entities like skills, education, experience from resumes
- Achieved 85% accuracy in entity recognition

Customer Churn Prediction | Sep 2023 - Dec 2023
- Predicted customer churn using Random Forest and XGBoost
- Analyzed dataset of 50,000+ customer records
- Improved model performance to 92% accuracy

INTERNSHIPS

Data Analytics Intern | FinTech Solutions Pvt. Ltd. | Summer 2023
- Analyzed transaction data to identify fraud patterns
- Created dashboards using Tableau for executive reporting
- Reduced false positive alerts by 30%

CERTIFICATIONS
- Python for Data Science (Coursera, 2023)
- Machine Learning Specialization (DeepLearning.AI, 2023)

EXTRACURRICULAR
- Vice President, Data Science Club, Pune University
- Participated in 3 Kaggle competitions, highest rank: Top 15%
"""

# Save the resumes
resumes = {
    "resume_priya_sharma.txt": resume1,
    "resume_rahul_verma.txt": resume2,
    "resume_anjali_patel.txt": resume3
}

for filename, content in resumes.items():
    filepath = os.path.join(resumes_folder, filename)
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(content)
    print(f"‚úÖ Created: {filename}")

print(f"\nüéâ Successfully created {len(resumes)} sample resumes in {resumes_folder}")

‚úÖ Created: resume_priya_sharma.txt
‚úÖ Created: resume_rahul_verma.txt
‚úÖ Created: resume_anjali_patel.txt

üéâ Successfully created 3 sample resumes in ../data/resumes/


In [9]:
# Create sample job descriptions
jobs_folder = "../data/job_descriptions/"

# Job Description 1 - Data Scientist
job1 = """
JOB TITLE: Data Scientist
COMPANY: TechVentures Pvt. Ltd.
LOCATION: Bangalore, India
JOB TYPE: Full-time

ABOUT THE ROLE:
We are seeking a talented Data Scientist to join our AI team. You will work on cutting-edge 
machine learning projects and help drive data-driven decision making across the organization.

REQUIRED QUALIFICATIONS:
- Bachelor's or Master's degree in Computer Science, Statistics, or related field
- 2-4 years of experience in data science or machine learning roles
- Strong programming skills in Python
- Experience with machine learning libraries: Scikit-learn, TensorFlow, or PyTorch
- Proficiency in SQL and database management
- Experience with data visualization tools (Tableau, Power BI, or matplotlib)

PREFERRED SKILLS:
- Experience with NLP and text analytics
- Knowledge of big data technologies (Spark, Hadoop)
- Strong statistical analysis skills
- Experience with cloud platforms (AWS, Azure, or GCP)
- Published research or contributions to open-source projects

RESPONSIBILITIES:
- Build and deploy machine learning models for various business use cases
- Analyze large datasets to extract actionable insights
- Collaborate with cross-functional teams including engineers and product managers
- Present findings to stakeholders and leadership team
- Mentor junior data scientists

WHAT WE OFFER:
- Competitive salary: ‚Çπ15-25 LPA
- Health insurance and wellness benefits
- Flexible work hours and remote work options
- Learning and development budget
- Opportunity to work on innovative AI projects
"""

# Job Description 2 - Digital Marketing Manager
job2 = """
POSITION: Digital Marketing Manager
ORGANIZATION: GrowthMax Marketing Agency
LOCATION: Mumbai, Maharashtra (Hybrid)
EMPLOYMENT TYPE: Full-time

JOB SUMMARY:
GrowthMax is looking for an experienced Digital Marketing Manager to lead our client campaigns 
and drive measurable results. The ideal candidate has a proven track record in digital marketing 
and team leadership.

MINIMUM REQUIREMENTS:
- Bachelor's degree in Marketing, Business, or related field; MBA preferred
- 5+ years of experience in digital marketing
- Proven experience managing marketing budgets of $100K+
- Strong understanding of SEO, SEM, and social media marketing
- Experience with Google Analytics, Google Ads, and Facebook Ads Manager
- Excellent written and verbal communication skills

NICE TO HAVE:
- Google Ads Certification
- HubSpot or similar marketing automation platform experience
- Experience in B2B and B2C marketing
- Knowledge of content management systems (WordPress, etc.)
- Video marketing and production experience

KEY RESPONSIBILITIES:
- Develop and execute comprehensive digital marketing strategies
- Manage and mentor a team of 5-10 marketing professionals
- Oversee SEO/SEM, email marketing, social media, and content marketing campaigns
- Track and analyze campaign performance using analytics tools
- Manage client relationships and present campaign results
- Stay updated with latest marketing trends and technologies
- Allocate and optimize marketing budgets

BENEFITS:
- Annual salary: ‚Çπ12-18 LPA
- Performance-based bonuses
- Professional development opportunities
- Work-life balance initiatives
- Modern office with collaborative workspace
"""

# Save job descriptions
jobs = {
    "job_data_scientist.txt": job1,
    "job_marketing_manager.txt": job2
}

for filename, content in jobs.items():
    filepath = os.path.join(jobs_folder, filename)
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(content)
    print(f"‚úÖ Created: {filename}")

print(f"\nüéâ Successfully created {len(jobs)} sample job descriptions in {jobs_folder}")

‚úÖ Created: job_data_scientist.txt
‚úÖ Created: job_marketing_manager.txt

üéâ Successfully created 2 sample job descriptions in ../data/job_descriptions/


In [10]:
import os

# List all files
print("üìÅ RESUMES:")
print("-" * 50)
for file in os.listdir("../data/resumes/"):
    print(f"  ‚úì {file}")

print("\nüìÅ JOB DESCRIPTIONS:")
print("-" * 50)
for file in os.listdir("../data/job_descriptions/"):
    print(f"  ‚úì {file}")

print("\n‚úÖ All sample files created successfully!")


üìÅ RESUMES:
--------------------------------------------------
  ‚úì resume_anjali_patel.txt
  ‚úì resume_priya_sharma.txt
  ‚úì resume_rahul_verma.txt

üìÅ JOB DESCRIPTIONS:
--------------------------------------------------
  ‚úì job_data_scientist.txt
  ‚úì job_marketing_manager.txt

‚úÖ All sample files created successfully!


In [11]:
# Read a single resume file
resume_path = "../data/resumes/resume_priya_sharma.txt"

# Open and read the file
with open(resume_path, 'r', encoding='utf-8') as file:
    resume_text = file.read()

# Display the content
print("üìÑ RESUME CONTENT:")
print("=" * 70)
print(resume_text)
print("=" * 70)
print(f"\n‚úÖ Successfully loaded resume!")
print(f"üìä Resume length: {len(resume_text)} characters")
print(f"üìä Number of lines: {len(resume_text.split(chr(10)))}")

üìÑ RESUME CONTENT:

PRIYA SHARMA
Email: priya.sharma@email.com
Phone: +91-9876543210
Location: Bangalore, India

PROFESSIONAL SUMMARY
Software Engineer with 3 years of experience in Python, Machine Learning, and Data Analysis.

SKILLS
- Programming: Python, Java, SQL
- Machine Learning: Scikit-learn, TensorFlow, Pandas
- Tools: Git, Docker, Jupyter
- Soft Skills: Team Leadership, Problem Solving

WORK EXPERIENCE

Data Scientist | TechCorp India | June 2021 - Present
- Built machine learning models for customer segmentation
- Improved prediction accuracy by 25%
- Worked with cross-functional teams of 10+ members

Software Engineer Intern | StartupXYZ | Jan 2020 - May 2021
- Developed REST APIs using Python Flask
- Automated data pipelines reducing processing time by 40%

EDUCATION
Bachelor of Technology in Computer Science
Indian Institute of Technology (IIT) Delhi
Graduated: 2020
GPA: 8.5/10

CERTIFICATIONS
- AWS Certified Developer Associate (2022)
- Google Data Analytics Profession

In [12]:
import os

# Function to read all resumes from a folder
def load_all_resumes(folder_path):
    """
    Reads all .txt files from the resumes folder
    Returns a dictionary: {filename: resume_text}
    """
    resumes = {}
    
    # Get all files in the folder
    files = os.listdir(folder_path)
    
    # Loop through each file
    for filename in files:
        if filename.endswith('.txt'):  # Only read .txt files
            filepath = os.path.join(folder_path, filename)
            
            # Read the file
            with open(filepath, 'r', encoding='utf-8') as file:
                resumes[filename] = file.read()
    
    return resumes

# Load all resumes
resumes_folder = "../data/resumes/"
all_resumes = load_all_resumes(resumes_folder)

# Display summary
print("üìö LOADED RESUMES:")
print("=" * 70)
for filename, content in all_resumes.items():
    print(f"\n‚úÖ {filename}")
    print(f"   Characters: {len(content)}")
    print(f"   Lines: {len(content.split(chr(10)))}")
    print(f"   First 100 characters: {content[:100]}...")

print("\n" + "=" * 70)
print(f"üéâ Total resumes loaded: {len(all_resumes)}")

üìö LOADED RESUMES:

‚úÖ resume_anjali_patel.txt
   Characters: 1453
   Lines: 50
   First 100 characters: 
ANJALI PATEL
Email: anjali.patel2024@email.com
Phone: +91-9988776655
LinkedIn: linkedin.com/in/anja...

‚úÖ resume_priya_sharma.txt
   Characters: 1131
   Lines: 40
   First 100 characters: 
PRIYA SHARMA
Email: priya.sharma@email.com
Phone: +91-9876543210
Location: Bangalore, India

PROFES...

‚úÖ resume_rahul_verma.txt
   Characters: 1125
   Lines: 38
   First 100 characters: 
RAHUL VERMA
rahul.verma@gmail.com | +91-9123456789 | Mumbai, Maharashtra

OBJECTIVE
Experienced Mar...

üéâ Total resumes loaded: 3


In [13]:
# Load all job descriptions
jobs_folder = "../data/job_descriptions/"
all_jobs = load_all_resumes(jobs_folder)  # We can reuse the same function!

# Display summary
print("üíº LOADED JOB DESCRIPTIONS:")
print("=" * 70)
for filename, content in all_jobs.items():
    print(f"\n‚úÖ {filename}")
    print(f"   Characters: {len(content)}")
    print(f"   Lines: {len(content.split(chr(10)))}")
    
    # Extract job title (first line usually)
    first_line = content.split('\n')[0]
    print(f"   Job Title: {first_line}")

print("\n" + "=" * 70)
print(f"üéâ Total job descriptions loaded: {len(all_jobs)}")

üíº LOADED JOB DESCRIPTIONS:

‚úÖ job_data_scientist.txt
   Characters: 1534
   Lines: 39
   Job Title: 

‚úÖ job_marketing_manager.txt
   Characters: 1648
   Lines: 42
   Job Title: 

üéâ Total job descriptions loaded: 2


In [14]:
import pandas as pd

# Create a summary dataframe for resumes
resume_summary = []

for filename, content in all_resumes.items():
    # Extract candidate name (usually first line)
    lines = content.split('\n')
    name = lines[0].strip() if lines else "Unknown"
    
    resume_summary.append({
        'Filename': filename,
        'Candidate Name': name,
        'Character Count': len(content),
        'Line Count': len(lines),
        'Status': '‚úÖ Loaded'
    })

# Convert to DataFrame
df_resumes = pd.DataFrame(resume_summary)

print("üìä RESUME SUMMARY TABLE:")
print("=" * 70)
print(df_resumes.to_string(index=False))
print("=" * 70)

üìä RESUME SUMMARY TABLE:
               Filename Candidate Name  Character Count  Line Count   Status
resume_anjali_patel.txt                            1453          50 ‚úÖ Loaded
resume_priya_sharma.txt                            1131          40 ‚úÖ Loaded
 resume_rahul_verma.txt                            1125          38 ‚úÖ Loaded


In [16]:
import re
import spacy

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    """
    Clean and preprocess resume text
    """
    # Convert to lowercase
    text = text.lower()
    
    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text)
    
    # Remove special characters but keep important ones like @ . -
    text = re.sub(r'[^\w\s@.\-+(),:;]', '', text)
    
    # Strip leading/trailing whitespace
    text = text.strip()
    
    return text

def tokenize_text(text):
    """
    Tokenize text using spaCy
    """
    doc = nlp(text)
    tokens = [token.text for token in doc if not token.is_stop and not token.is_punct]
    return tokens

# Test preprocessing on one resume
sample_resume = all_resumes['resume_priya_sharma.txt']

print("üìÑ ORIGINAL TEXT (first 200 chars):")
print("=" * 70)
print(sample_resume[:200])

print("\n\nüßπ PREPROCESSED TEXT (first 200 chars):")
print("=" * 70)
preprocessed = preprocess_text(sample_resume)
print(preprocessed[:200])

print("\n\nüî§ TOKENIZED (first 30 tokens):")
print("=" * 70)
tokens = tokenize_text(preprocessed)
print(tokens[:30])

print(f"\n‚úÖ Preprocessing complete!")
print(f"üìä Total tokens extracted: {len(tokens)}")

üìÑ ORIGINAL TEXT (first 200 chars):

PRIYA SHARMA
Email: priya.sharma@email.com
Phone: +91-9876543210
Location: Bangalore, India

PROFESSIONAL SUMMARY
Software Engineer with 3 years of experience in Python, Machine Learning, and Data An


üßπ PREPROCESSED TEXT (first 200 chars):
priya sharma email: priya.sharma@email.com phone: +91-9876543210 location: bangalore, india professional summary software engineer with 3 years of experience in python, machine learning, and data anal


üî§ TOKENIZED (first 30 tokens):
['priya', 'sharma', 'email', 'priya.sharma@email.com', 'phone', '+91', '9876543210', 'location', 'bangalore', 'india', 'professional', 'summary', 'software', 'engineer', '3', 'years', 'experience', 'python', 'machine', 'learning', 'data', 'analysis', 'skills', 'programming', 'python', 'java', 'sql', 'machine', 'learning', 'scikit']

‚úÖ Preprocessing complete!
üìä Total tokens extracted: 131


In [17]:
def extract_entities_from_resume(text):
    """
    Extract key information from resume using spaCy NER
    """
    doc = nlp(text)
    
    entities = {
        'names': [],
        'organizations': [],
        'dates': [],
        'locations': [],
        'emails': [],
        'phones': [],
        'skills': []
    }
    
    # Extract named entities using spaCy
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            entities['names'].append(ent.text)
        elif ent.label_ == "ORG":
            entities['organizations'].append(ent.text)
        elif ent.label_ == "DATE":
            entities['dates'].append(ent.text)
        elif ent.label_ == "GPE":  # Geo-Political Entity (cities, countries)
            entities['locations'].append(ent.text)
    
    # Extract email using regex
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
    emails = re.findall(email_pattern, text)
    entities['emails'] = emails
    
    # Extract phone numbers using regex
    phone_pattern = r'[\+\(]?[1-9][0-9 .\-\(\)]{8,}[0-9]'
    phones = re.findall(phone_pattern, text)
    entities['phones'] = phones
    
    # Extract skills (simple keyword matching)
    skill_keywords = ['python', 'java', 'sql', 'machine learning', 'data science', 
                     'tensorflow', 'pandas', 'numpy', 'git', 'docker', 'aws',
                     'javascript', 'react', 'node', 'mongodb', 'tableau',
                     'power bi', 'excel', 'r programming', 'spark', 'hadoop',
                     'nlp', 'deep learning', 'scikit-learn', 'flask', 'django']
    
    text_lower = text.lower()
    for skill in skill_keywords:
        if skill in text_lower:
            entities['skills'].append(skill)
    
    # Remove duplicates
    for key in entities:
        entities[key] = list(set(entities[key]))
    
    return entities

# Test on one resume
print("üîç EXTRACTING ENTITIES FROM PRIYA SHARMA'S RESUME:")
print("=" * 70)

extracted = extract_entities_from_resume(sample_resume)

for category, items in extracted.items():
    print(f"\n{category.upper()}:")
    for item in items:
        print(f"  ‚úì {item}")

print("\n" + "=" * 70)
print("‚úÖ Entity extraction complete!")

üîç EXTRACTING ENTITIES FROM PRIYA SHARMA'S RESUME:

NAMES:
  ‚úì Machine Learning
  ‚úì Pandas
  ‚úì Java
  ‚úì Jupyter
- Soft
  ‚úì Docker

ORGANIZATIONS:
  ‚úì Data Scientist
  ‚úì CERTIFICATIONS
  ‚úì Google Data Analytics Professional Certificate
  ‚úì Bachelor of Technology
  ‚úì TensorFlow
  ‚úì IIT
  ‚úì GPA
  ‚úì Python
  ‚úì Data Analysis

DATES:
  ‚úì June 2021 - Present
  ‚úì Jan 2020 -
  ‚úì 10+
  ‚úì 2021
  ‚úì 3 years

LOCATIONS:
  ‚úì SHARMA
  ‚úì Flask
  ‚úì India
  ‚úì Delhi

EMAILS:
  ‚úì priya.sharma@email.com

PHONES:
  ‚úì +91-9876543210

SKILLS:
  ‚úì scikit-learn
  ‚úì docker
  ‚úì java
  ‚úì aws
  ‚úì machine learning
  ‚úì python
  ‚úì tensorflow
  ‚úì sql
  ‚úì git
  ‚úì pandas
  ‚úì flask

‚úÖ Entity extraction complete!


In [18]:
# Parse all resumes
parsed_resumes = {}

print("üîÑ PARSING ALL RESUMES...")
print("=" * 70)

for filename, content in all_resumes.items():
    print(f"\nüìÑ Processing: {filename}")
    entities = extract_entities_from_resume(content)
    parsed_resumes[filename] = entities
    print(f"   ‚úÖ Extracted {len(entities['emails'])} emails, {len(entities['skills'])} skills, {len(entities['organizations'])} organizations")

print("\n" + "=" * 70)
print(f"üéâ Successfully parsed {len(parsed_resumes)} resumes!")

üîÑ PARSING ALL RESUMES...

üìÑ Processing: resume_anjali_patel.txt
   ‚úÖ Extracted 1 emails, 14 skills, 10 organizations

üìÑ Processing: resume_priya_sharma.txt
   ‚úÖ Extracted 1 emails, 11 skills, 9 organizations

üìÑ Processing: resume_rahul_verma.txt
   ‚úÖ Extracted 1 emails, 1 skills, 10 organizations

üéâ Successfully parsed 3 resumes!


In [19]:
import json

# Save parsed data to JSON
output_file = "../outputs/parsed_resumes.json"

with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(parsed_resumes, f, indent=4, ensure_ascii=False)

print(f"üíæ Saved parsed resumes to: {output_file}")

# Display one example
print("\nüìã SAMPLE OUTPUT (Priya Sharma's Resume):")
print("=" * 70)
print(json.dumps(parsed_resumes['resume_priya_sharma.txt'], indent=2))

üíæ Saved parsed resumes to: ../outputs/parsed_resumes.json

üìã SAMPLE OUTPUT (Priya Sharma's Resume):
{
  "names": [
    "Machine Learning",
    "Pandas",
    "Java",
    "Jupyter\n- Soft",
    "Docker"
  ],
  "organizations": [
    "Data Scientist",
    "CERTIFICATIONS",
    "Google Data Analytics Professional Certificate",
    "Bachelor of Technology",
    "TensorFlow",
    "IIT",
    "GPA",
    "Python",
    "Data Analysis"
  ],
  "dates": [
    "June 2021 - Present",
    "Jan 2020 -",
    "10+",
    "2021",
    "3 years"
  ],
  "locations": [
    "SHARMA",
    "Flask",
    "India",
    "Delhi"
  ],
  "emails": [
    "priya.sharma@email.com"
  ],
  "phones": [
    "+91-9876543210"
  ],
  "skills": [
    "scikit-learn",
    "docker",
    "java",
    "aws",
    "machine learning",
    "python",
    "tensorflow",
    "sql",
    "git",
    "pandas",
    "flask"
  ]
}


In [20]:
# Convert to structured DataFrame
structured_data = []

for filename, entities in parsed_resumes.items():
    structured_data.append({
        'Filename': filename,
        'Name': entities['names'][0] if entities['names'] else 'Unknown',
        'Email': entities['emails'][0] if entities['emails'] else 'N/A',
        'Phone': entities['phones'][0] if entities['phones'] else 'N/A',
        'Skills': ', '.join(entities['skills'][:5]),  # First 5 skills
        'Organizations': ', '.join(entities['organizations'][:3]),  # First 3 orgs
        'Total Skills': len(entities['skills'])
    })

df_parsed = pd.DataFrame(structured_data)

print("üìä PARSED RESUME SUMMARY:")
print("=" * 70)
print(df_parsed.to_string(index=False))

# Save to CSV
csv_output = "../outputs/parsed_resumes.csv"
df_parsed.to_csv(csv_output, index=False)
print(f"\nüíæ Saved to CSV: {csv_output}")

üìä PARSED RESUME SUMMARY:
               Filename                  Name                      Email          Phone                                            Skills                                                                  Organizations  Total Skills
resume_anjali_patel.txt            Matplotlib anjali.patel2024@email.com +91-9988776655 nlp, scikit-learn, java, machine learning, python                          JavaScript, INTERNSHIPS\n\nData Analytics Intern, SQL            14
resume_priya_sharma.txt      Machine Learning     priya.sharma@email.com +91-9876543210 scikit-learn, docker, java, aws, machine learning Data Scientist, CERTIFICATIONS, Google Data Analytics Professional Certificate            11
 resume_rahul_verma.txt Jamshedpur\nCompleted      rahul.verma@gmail.com +91-9123456789                                               git                             MediaHub Agency, Marketing Today, Content Strategy             1

üíæ Saved to CSV: ../outputs/parsed_resumes.cs

In [29]:
import subprocess
import os

# Open the outputs folder
output_path = r"C:\Users\GURU IS GREAT\Documents\Resume_Parser_Project\outputs"
subprocess.Popen(f'explorer "{output_path}"')
print(f"üóÇÔ∏è Opening folder in File Explorer...")
print(f"üìç Location: {output_path}")
print("\nYour files are:")
print("  üìÑ parsed_resumes.json")
print("  üìÑ parsed_resumes.csv")

üóÇÔ∏è Opening folder in File Explorer...
üìç Location: C:\Users\GURU IS GREAT\Documents\Resume_Parser_Project\outputs

Your files are:
  üìÑ parsed_resumes.json
  üìÑ parsed_resumes.csv


In [28]:
def extract_job_requirements(text):
    """
    Extract requirements from job description
    """
    doc = nlp(text)
    
    job_info = {
        'job_title': '',
        'company': '',
        'location': '',
        'required_skills': [],
        'experience_required': [],
        'education_required': [],
        'organizations_mentioned': []
    }
    
    # Extract entities
    for ent in doc.ents:
        if ent.label_ == "ORG":
            job_info['organizations_mentioned'].append(ent.text)
        elif ent.label_ == "GPE":
            if not job_info['location']:
                job_info['location'] = ent.text
        elif ent.label_ == "DATE":
            job_info['experience_required'].append(ent.text)
    
    # Extract skills
    skill_keywords = ['python', 'java', 'sql', 'machine learning', 'data science', 
                     'tensorflow', 'pandas', 'numpy', 'git', 'docker', 'aws',
                     'javascript', 'react', 'node', 'mongodb', 'tableau',
                     'power bi', 'excel', 'r programming', 'spark', 'hadoop',
                     'nlp', 'deep learning', 'scikit-learn', 'flask', 'django',
                     'seo', 'sem', 'digital marketing', 'analytics', 'marketing']
    
    text_lower = text.lower()
    for skill in skill_keywords:
        if skill in text_lower:
            job_info['required_skills'].append(skill)
    
    # Extract job title (usually first line or after "TITLE:")
    lines = text.split('\n')
    for line in lines[:5]:
        if 'title:' in line.lower() or 'position:' in line.lower():
            job_info['job_title'] = line.split(':')[-1].strip()
            break
    
    # Remove duplicates
    for key in ['required_skills', 'organizations_mentioned']:
        job_info[key] = list(set(job_info[key]))
    
    return job_info

# Parse all job descriptions
parsed_jobs = {}

print("üîÑ PARSING JOB DESCRIPTIONS...")
print("=" * 70)

for filename, content in all_jobs.items():
    print(f"\nüìÑ Processing: {filename}")
    job_data = extract_job_requirements(content)
    parsed_jobs[filename] = job_data
    print(f"   ‚úÖ Found {len(job_data['required_skills'])} required skills")
    print(f"   üìç Location: {job_data['location']}")

print("\n" + "=" * 70)
print(f"üéâ Successfully parsed {len(parsed_jobs)} job descriptions!")

# Display one example
print("\nüìã SAMPLE JOB PARSING:")
print(json.dumps(parsed_jobs['job_data_scientist.txt'], indent=2))

üîÑ PARSING JOB DESCRIPTIONS...

üìÑ Processing: job_data_scientist.txt
   ‚úÖ Found 13 required skills
   üìç Location: India

üìÑ Processing: job_marketing_manager.txt
   ‚úÖ Found 7 required skills
   üìç Location: Mumbai

üéâ Successfully parsed 2 job descriptions!

üìã SAMPLE JOB PARSING:
{
  "job_title": "Data Scientist",
  "company": "",
  "location": "India",
  "required_skills": [
    "nlp",
    "scikit-learn",
    "aws",
    "machine learning",
    "python",
    "tensorflow",
    "data science",
    "spark",
    "power bi",
    "sql",
    "analytics",
    "tableau",
    "hadoop"
  ],
  "experience_required": [],
  "education_required": [],
  "organizations_mentioned": [
    "NLP",
    "GCP",
    "TensorFlow",
    "Computer Science, Statistics",
    "PyTorch",
    "TechVentures Pvt",
    "Data Scientist",
    "Tableau, Power BI",
    "Spark, Hadoop",
    "AI"
  ]
}


In [25]:
def calculate_match_score(resume_entities, job_requirements):
    """
    Calculate how well a resume matches a job description
    """
    resume_skills = set(resume_entities['skills'])
    job_skills = set(job_requirements['required_skills'])
    
    if not job_skills:
        return 0
    
    # Calculate skill match percentage
    matching_skills = resume_skills.intersection(job_skills)
    match_score = (len(matching_skills) / len(job_skills)) * 100
    
    return round(match_score, 2), list(matching_skills)

# Match all resumes to all jobs
print("üéØ MATCHING RESUMES TO JOBS...")
print("=" * 70)

matching_results = []

for resume_file, resume_data in parsed_resumes.items():
    candidate_name = resume_data['names'][0] if resume_data['names'] else 'Unknown'
    
    for job_file, job_data in parsed_jobs.items():
        job_title = job_data['job_title'] or job_file.replace('.txt', '')
        
        score, matched_skills = calculate_match_score(resume_data, job_data)
        
        matching_results.append({
            'Candidate': candidate_name,
            'Resume File': resume_file,
            'Job Title': job_title,
            'Job File': job_file,
            'Match Score (%)': score,
            'Matched Skills': ', '.join(matched_skills),
            'Total Matched Skills': len(matched_skills)
        })

# Convert to DataFrame
df_matches = pd.DataFrame(matching_results)
df_matches = df_matches.sort_values('Match Score (%)', ascending=False)

print(df_matches.to_string(index=False))
print("\n" + "=" * 70)

# Save matching results
match_csv = "resume_job_matches.csv"
df_matches.to_csv(match_csv, index=False)
print(f"üíæ Saved matching results to: {match_csv}")

üéØ MATCHING RESUMES TO JOBS...
            Candidate             Resume File                 Job Title                  Job File  Match Score (%)                                                                    Matched Skills  Total Matched Skills
           Matplotlib resume_anjali_patel.txt            Data Scientist    job_data_scientist.txt            61.54 nlp, scikit-learn, machine learning, python, data science, power bi, sql, tableau                     8
     Machine Learning resume_priya_sharma.txt            Data Scientist    job_data_scientist.txt            46.15                      scikit-learn, aws, machine learning, python, tensorflow, sql                     6
           Matplotlib resume_anjali_patel.txt Digital Marketing Manager job_marketing_manager.txt            14.29                                                                               git                     1
     Machine Learning resume_priya_sharma.txt Digital Marketing Manager job_marketing_manag

In [26]:
# Calculate overall statistics
print("üìä PARSING STATISTICS:")
print("=" * 70)

print(f"\nüìÑ RESUMES PROCESSED: {len(parsed_resumes)}")
for filename, data in parsed_resumes.items():
    print(f"\n  {filename}:")
    print(f"    Emails found: {len(data['emails'])}")
    print(f"    Skills found: {len(data['skills'])}")
    print(f"    Organizations: {len(data['organizations'])}")
    print(f"    Dates extracted: {len(data['dates'])}")

print(f"\n\nüíº JOB DESCRIPTIONS PROCESSED: {len(parsed_jobs)}")
for filename, data in parsed_jobs.items():
    print(f"\n  {filename}:")
    print(f"    Required skills: {len(data['required_skills'])}")
    print(f"    Location: {data['location']}")

print("\n\nüéØ MATCHING RESULTS:")
print("=" * 70)
print(f"Total matches calculated: {len(df_matches)}")
print(f"Best match score: {df_matches['Match Score (%)'].max()}%")
print(f"Average match score: {df_matches['Match Score (%)'].mean():.2f}%")

print("\nüèÜ TOP 3 MATCHES:")
print(df_matches.head(3)[['Candidate', 'Job Title', 'Match Score (%)']].to_string(index=False))

üìä PARSING STATISTICS:

üìÑ RESUMES PROCESSED: 3

  resume_anjali_patel.txt:
    Emails found: 1
    Skills found: 14
    Organizations: 10
    Dates extracted: 5

  resume_priya_sharma.txt:
    Emails found: 1
    Skills found: 11
    Organizations: 9
    Dates extracted: 5

  resume_rahul_verma.txt:
    Emails found: 1
    Skills found: 1
    Organizations: 10
    Dates extracted: 7


üíº JOB DESCRIPTIONS PROCESSED: 2

  job_data_scientist.txt:
    Required skills: 13
    Location: India

  job_marketing_manager.txt:
    Required skills: 7
    Location: Mumbai


üéØ MATCHING RESULTS:
Total matches calculated: 6
Best match score: 61.54%
Average match score: 25.09%

üèÜ TOP 3 MATCHES:
       Candidate                 Job Title  Match Score (%)
      Matplotlib            Data Scientist            61.54
Machine Learning            Data Scientist            46.15
      Matplotlib Digital Marketing Manager            14.29


In [27]:
# Save all outputs to the outputs folder
output_folder = r"C:\Users\GURU IS GREAT\Documents\Resume_Parser_Project\outputs"

# Save parsed resumes
with open(os.path.join(output_folder, "parsed_resumes.json"), 'w') as f:
    json.dump(parsed_resumes, f, indent=4)

# Save parsed jobs
with open(os.path.join(output_folder, "parsed_jobs.json"), 'w') as f:
    json.dump(parsed_jobs, f, indent=4)

# Save matches
df_matches.to_csv(os.path.join(output_folder, "resume_job_matches.csv"), index=False)

# Save resume summary
df_parsed.to_csv(os.path.join(output_folder, "parsed_resumes_summary.csv"), index=False)

print("‚úÖ All files saved to outputs folder!")
print("\nFiles created:")
print("  üìÑ parsed_resumes.json")
print("  üìÑ parsed_jobs.json")
print("  üìÑ resume_job_matches.csv")
print("  üìÑ parsed_resumes_summary.csv")

# Open folder
subprocess.Popen(f'explorer "{output_folder}"')

‚úÖ All files saved to outputs folder!

Files created:
  üìÑ parsed_resumes.json
  üìÑ parsed_jobs.json
  üìÑ resume_job_matches.csv
  üìÑ parsed_resumes_summary.csv


<Popen: returncode: None args: 'explorer "C:\\Users\\GURU IS GREAT\\Document...>