# Install spaCy

In [21]:
# Install the pinned spaCy 3.6.1 release through the %pip line magic.
%pip install spacy==3.6.1

Collecting spacy==3.6.1
  Downloading spacy-3.6.1-cp311-cp311-win_amd64.whl.metadata (26 kB)
Collecting thinc<8.2.0,>=8.1.8 (from spacy==3.6.1)
  Downloading thinc-8.1.12-cp311-cp311-win_amd64.whl.metadata (15 kB)
Collecting pathy>=0.10.0 (from spacy==3.6.1)
  Downloading pathy-0.11.0-py3-none-any.whl.metadata (16 kB)
Collecting blis<0.8.0,>=0.7.8 (from thinc<8.2.0,>=8.1.8->spacy==3.6.1)
  Using cached blis-0.7.11-cp311-cp311-win_amd64.whl.metadata (7.6 kB)
Collecting pathlib-abc==0.1.1 (from pathy>=0.10.0->spacy==3.6.1)
  Downloading pathlib_abc-0.1.1-py3-none-any.whl.metadata (18 kB)
Downloading spacy-3.6.1-cp311-cp311-win_amd64.whl (12.0 MB)
   ---------------------------------------- 0.0/12.0 MB ? eta -:--:--
   - -------------------------------------- 0.5/12.0 MB 3.3 MB/s eta 0:00:04
   ------ --------------------------------- 1.8/12.0 MB 5.6 MB/s eta 0:00:02
   --------- ------------------------------ 2.9/12.0 MB 5.4 MB/s eta 0:00:02
   ------------ --------------------------- 3.

In [25]:
# Remove the installed thinc and spaCy packages; -y answers the confirmation prompt.
%pip uninstall thinc spacy -y

Found existing installation: thinc 8.1.12
Uninstalling thinc-8.1.12:
  Successfully uninstalled thinc-8.1.12
Found existing installation: spacy 3.6.1
Uninstalling spacy-3.6.1:
  Successfully uninstalled spacy-3.6.1
Note: you may need to restart the kernel to use updated packages.


In [28]:
# Reinstall thinc and spaCy, bypassing pip's local cache.
# NOTE(review): no versions are pinned here, so this resolves to whatever is current.
%pip install --no-cache-dir thinc spacy

Note: you may need to restart the kernel to use updated packages.


# Install the en_core_web_sm English model directly from its GitHub release wheel

In [32]:
# Install the en_core_web_sm 3.7.1 model wheel straight from the spacy-models GitHub release.
%pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB ? eta -:--:--
     ----- ---------------------------------- 1.8/12.8 MB 6.3 MB/s eta 0:00:02
     ---------- ----------------------------- 3.4/12.8 MB 6.7 MB/s eta 0:00:02
     -------------- ------------------------- 4.7/12.8 MB 6.8 MB/s eta 0:00:02
     -------------------- ------------------- 6.6/12.8 MB 6.9 MB/s eta 0:00:01
     ---------------------- ----------------- 7.3/12.8 MB 6.4 MB/s eta 0:00:01
     ------------------------- -------------- 8.1/12.8 MB 6.1 MB/s eta 0:00:01
     ---------------------------- ----------- 9.2/12.8 MB 5.8 MB/s eta 0:00:01
     ------------------------------- -------- 10.0/12.8 MB 5.6 MB/s eta 0:00:01
     ---------------------------------- ----- 1

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
opencv-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.


# PARSING LOGIC

In [42]:
import re
from pdfminer.high_level import extract_text
import spacy
from spacy.matcher import Matcher

def extract_text_from_pdf(pdf_path):
    """Return the text pdfminer's ``extract_text`` produces for ``pdf_path``."""
    pdf_text = extract_text(pdf_path)
    return pdf_text

def extract_contact_number_from_resume(text):
    """Return the first phone-number-like substring in ``text``, or ``None``.

    The accepted shape is an optional country code (optional ``+`` and 1-3
    digits) followed by 3 + 3 + 4 digits, where the groups may be separated
    by a single ``-``, ``.`` or whitespace character and the first group may
    be wrapped in parentheses.

    Args:
        text: Resume text to scan.

    Returns:
        The matched substring exactly as it appears in ``text`` (including a
        leading ``+`` or ``(`` when present), or ``None`` if nothing matches.
    """
    # The match is anchored with a negative lookbehind rather than ``\b``:
    # ``\b`` can never match between whitespace and ``+`` or ``(``, which
    # silently dropped the leading ``+`` of "+91 ..." and the opening
    # parenthesis of "(555) ...". ``(?<!\w)`` still refuses to start in the
    # middle of a word or a longer digit run.
    pattern = r"(?<!\w)(?:\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b"
    match = re.search(pattern, text)
    return match.group() if match else None

def extract_email_from_resume(text):
    """Return the first email-address-like substring of ``text``, or ``None``."""
    email_regex = re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b")
    found = email_regex.search(text)
    # Only the first hit is reported; later addresses in the text are ignored.
    return found.group() if found else None

def extract_skills_from_resume(text, skills_list):
    """Return the entries of ``skills_list`` that occur in ``text``.

    Matching is case-insensitive and requires that the skill is not embedded
    in a longer word (``Java`` does not match inside ``JavaScript``).

    Args:
        text: Resume text to scan.
        skills_list: Iterable of skill names to look for.

    Returns:
        A list of the matching skills, spelled as in ``skills_list`` and in
        the same order.
    """
    skills = []
    for skill in skills_list:
        # Lookarounds are used instead of ``\b`` because ``\b`` needs a word
        # character on one side: a skill that starts or ends with punctuation
        # (for example "C++") could only match when glued to another word
        # character, so it was never found in ordinary text.
        # NOTE: a short skill such as "C" still matches the "C" of "C++",
        # because "+" is not a word character.
        pattern = r"(?<!\w){}(?!\w)".format(re.escape(skill))
        if re.search(pattern, text, re.IGNORECASE):
            skills.append(skill)
    return skills

def extract_education_from_resume(text):
    """Return every degree-like phrase found in ``text``.

    A phrase starts with a degree token (``Bsc``, ``B.<word>``, ``M.<word>``,
    ``Bachelor``/``Bachelor's``, ``Master``/``Master's``, ``Ph.D`` ...),
    matched case-insensitively, and continues over the whitespace-separated
    words that follow it. Each match is returned with surrounding whitespace
    stripped.
    """
    degree_regex = r"(?i)(?:Bsc|\bB\.\w+|\bM\.\w+|\bPh\.D\.\w+|\bBachelor(?:'s)?|\bMaster(?:'s)?|\bPh\.D)\s(?:\w+\s)*\w+"
    return [phrase.strip() for phrase in re.findall(degree_regex, text)]

# spaCy pipelines already loaded by this cell, keyed by model name.
_SPACY_MODEL_CACHE = {}


def _load_spacy_model(model_name='en_core_web_sm'):
    """Return the spaCy pipeline ``model_name``, calling ``spacy.load`` only on first use."""
    if model_name not in _SPACY_MODEL_CACHE:
        _SPACY_MODEL_CACHE[model_name] = spacy.load(model_name)
    return _SPACY_MODEL_CACHE[model_name]


def extract_name(resume_text):
    """Guess the candidate's name from the top of a resume.

    The first 15 lines are scanned until a section header is hit. Lines that
    look like URLs, emails or numbers, or that mention two or more
    organization keywords, are skipped. From each remaining line the
    capitalized words are collected; if there are 2-4 of them, spaCy is asked
    whether they form a PERSON entity or at least two proper nouns. PERSON
    hits are preferred over proper-noun hits, and earlier lines over later
    ones. If nothing is found, a plain regex fallback looks at the first five
    lines for 2-4 capitalized (or all-caps) words.

    Args:
        resume_text: Full text of the resume.

    Returns:
        The best name candidate as a string, or ``None`` when no candidate is
        found or ``resume_text`` is empty/``None``.
    """
    if not resume_text:
        return None

    # Loading the pipeline is the expensive step; reuse it across calls.
    nlp = _load_spacy_model('en_core_web_sm')

    lines = resume_text.strip().split('\n')

    # Common section headers in resumes (to stop searching after we hit them)
    section_headers = [
        'objective', 'summary', 'education', 'experience', 'employment',
        'work history', 'skills', 'projects', 'certifications', 'achievements',
        'publications', 'awards', 'interests', 'hobbies', 'references',
        'professional', 'technical', 'personal'
    ]

    # Keywords that indicate organization names, not person names
    organization_indicators = [
        'college', 'university', 'institute', 'school', 'academy',
        'company', 'corporation', 'technologies', 'systems', 'solutions',
        'services', 'consulting', 'group', 'pvt', 'ltd', 'inc', 'llc',
        'department', 'organization', 'foundation', 'center', 'laboratory'
    ]
    # Organization keywords are matched as whole words (a trailing plural "s"
    # is tolerated). Plain substring tests rejected real names such as
    # "Vincent" or "Prince" because they contain "inc".
    organization_word = re.compile(
        r'\b(' + '|'.join(re.escape(word) for word in organization_indicators) + r')s?\b',
        re.IGNORECASE,
    )

    # Patterns to skip (definitely not names)
    skip_patterns = [
        r'https?://',  # URLs
        r'www\.',      # Web addresses
        r'@',          # Email addresses
        r'\d{3,}',     # Lines with 3+ consecutive digits (phone numbers, dates)
    ]

    # Words that commonly sit next to a name on a resume but are not part of it
    resume_noise_words = ['resume', 'cv', 'curriculum', 'vitae', 'phone',
                          'email', 'address', 'linkedin', 'github', 'portfolio']

    # Each candidate is (line index, name text, detection method)
    candidates = []

    # Search through first 15 lines (names are typically at the top)
    for i, line in enumerate(lines[:15]):
        line = line.strip()

        # Skip empty lines or very short lines
        if not line or len(line) < 3:
            continue

        # Stop if we hit a section header
        if any(header in line.lower() for header in section_headers):
            break

        # Skip lines matching skip patterns
        if any(re.search(pattern, line, re.IGNORECASE) for pattern in skip_patterns):
            continue

        # Skip lines that mention two or more distinct organization keywords
        org_hits = {hit.group(1).lower() for hit in organization_word.finditer(line)}
        if len(org_hits) >= 2:
            continue

        # Remove special characters except spaces and hyphens (some names have hyphens)
        cleaned_line = re.sub(r'[^\w\s\-]', '', line)
        words = cleaned_line.split()

        # Filter words that look like names
        potential_name_words = []
        for word in words:
            word_clean = word.strip('-')

            # Skip single-character words; punctuation was already removed
            # above, so a bare initial such as "J." is dropped here too.
            if len(word_clean) < 2:
                continue

            # Skip words that are all digits
            if word_clean.isdigit():
                continue

            # Skip common resume words
            if word_clean.lower() in resume_noise_words:
                continue

            # Names typically start with uppercase (or are all uppercase)
            if word_clean[0].isupper() or word_clean.isupper():
                potential_name_words.append(word_clean)

        # Names are typically 2-4 words (First Last, First Middle Last, etc.)
        if 2 <= len(potential_name_words) <= 4:
            potential_name = ' '.join(potential_name_words)

            # Skip if any word is an organization keyword
            if organization_word.search(potential_name):
                continue

            # Use spaCy to verify these are likely person names
            doc = nlp(potential_name)

            # Check for PERSON entities (spaCy's named entity recognition)
            person_entities = [ent.text for ent in doc.ents if ent.label_ == 'PERSON']
            if person_entities:
                candidates.append((i, person_entities[0], 'PERSON_ENTITY'))
                continue

            # Check for proper nouns (POS tagging)
            proper_nouns = [token.text for token in doc if token.pos_ == 'PROPN']
            if len(proper_nouns) >= 2:
                name = ' '.join(proper_nouns[:4])  # Take up to 4 words
                candidates.append((i, name, 'PROPN'))

    # Return the best candidate (prioritize PERSON entities, then earlier lines)
    if candidates:
        candidates.sort(key=lambda x: (0 if x[2] == 'PERSON_ENTITY' else 1, x[0]))
        return candidates[0][1]

    # Fallback: Look for simple pattern of 2-4 capitalized words in first few lines
    for line in lines[:5]:
        line = line.strip()
        if not line:
            continue

        # Skip lines with common non-name patterns
        if re.search(r'https?://|@|linkedin|github|phone|email', line, re.IGNORECASE):
            continue

        # Whole line must be 2-4 words, each one capital followed by lowercase
        name_pattern = r'^([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,3})$'
        match = re.match(name_pattern, line)
        if match:
            return match.group(1)

        # Check for all-caps name (2-4 words); returned in Title Case
        if line.isupper():
            words = line.split()
            if 2 <= len(words) <= 4 and all(word.isalpha() for word in words):
                return ' '.join(word.title() for word in words)

    return None

if __name__ == '__main__':
    # Driver: run every extractor on each resume and print one line per field.
    resume_paths = [r"C:\Users\nitin\images\shubham-resume.pdf"]

    def _report(label, value, missing_message):
        """Print ``label value`` when ``value`` is truthy, else ``missing_message``."""
        if value:
            print(label, value)
        else:
            print(missing_message)

    for resume_path in resume_paths:
        text = extract_text_from_pdf(resume_path)
        print("Resume:", resume_path)

        name = extract_name(text)
        _report("Name:", name, "Name not found")

        contact_number = extract_contact_number_from_resume(text)
        _report("Contact Number:", contact_number, "Contact Number not found")

        email = extract_email_from_resume(text)
        _report("Email:", email, "Email not found")

        # Skill vocabulary searched for in the resume text
        skills_list = ['Python', 'C', 'C++', 'Data Analysis', 'Machine Learning',
                       'Communication', 'Project Management', 'Deep Learning', 'SQL', 'Tableau', 'javascript']
        extracted_skills = extract_skills_from_resume(text, skills_list)
        _report("Skills:", extracted_skills, "No skills found")

        extracted_education = extract_education_from_resume(text)
        _report("Education:", extracted_education, "No education information found")

        # Blank line between resumes
        print()

Resume: C:\Users\nitin\images\shubham-resume.pdf
Name: SHUBHAM JHA
Contact Number: 91 8305054597
Email: jha525200@gmail.com
Skills: ['Python', 'C', 'SQL', 'javascript']
No education information found



In [37]:
import re
from pdfminer.high_level import extract_text
import spacy
from spacy.matcher import Matcher

def extract_text_from_pdf(pdf_path):
    """Return the text pdfminer's ``extract_text`` produces for ``pdf_path``."""
    pdf_text = extract_text(pdf_path)
    return pdf_text

def extract_contact_number_from_resume(text):
    """Return the first phone-number-like substring in ``text``, or ``None``.

    The accepted shape is an optional country code (optional ``+`` and 1-3
    digits) followed by 3 + 3 + 4 digits, where the groups may be separated
    by a single ``-``, ``.`` or whitespace character and the first group may
    be wrapped in parentheses.

    Args:
        text: Resume text to scan.

    Returns:
        The matched substring exactly as it appears in ``text`` (including a
        leading ``+`` or ``(`` when present), or ``None`` if nothing matches.
    """
    # The match is anchored with a negative lookbehind rather than ``\b``:
    # ``\b`` can never match between whitespace and ``+`` or ``(``, which
    # silently dropped the leading ``+`` of "+91 ..." and the opening
    # parenthesis of "(555) ...". ``(?<!\w)`` still refuses to start in the
    # middle of a word or a longer digit run.
    pattern = r"(?<!\w)(?:\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b"
    match = re.search(pattern, text)
    return match.group() if match else None

def extract_email_from_resume(text):
    """Return the first email-address-like substring of ``text``, or ``None``."""
    email_regex = re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b")
    found = email_regex.search(text)
    # Only the first hit is reported; later addresses in the text are ignored.
    return found.group() if found else None

def extract_skills_from_resume(text, skills_list):
    """Return the entries of ``skills_list`` that occur in ``text``.

    Matching is case-insensitive and requires that the skill is not embedded
    in a longer word (``Java`` does not match inside ``JavaScript``).

    Args:
        text: Resume text to scan.
        skills_list: Iterable of skill names to look for.

    Returns:
        A list of the matching skills, spelled as in ``skills_list`` and in
        the same order.
    """
    skills = []
    for skill in skills_list:
        # Lookarounds are used instead of ``\b`` because ``\b`` needs a word
        # character on one side: a skill that starts or ends with punctuation
        # (for example "C++") could only match when glued to another word
        # character, so it was never found in ordinary text.
        # NOTE: a short skill such as "C" still matches the "C" of "C++",
        # because "+" is not a word character.
        pattern = r"(?<!\w){}(?!\w)".format(re.escape(skill))
        if re.search(pattern, text, re.IGNORECASE):
            skills.append(skill)
    return skills

def extract_education_from_resume(text):
    """Return every degree-like phrase found in ``text``.

    A phrase starts with a degree token (``Bsc``, ``B.<word>``, ``M.<word>``,
    ``Bachelor``/``Bachelor's``, ``Master``/``Master's``, ``Ph.D`` ...),
    matched case-insensitively, and continues over the whitespace-separated
    words that follow it. Each match is returned with surrounding whitespace
    stripped.
    """
    degree_regex = r"(?i)(?:Bsc|\bB\.\w+|\bM\.\w+|\bPh\.D\.\w+|\bBachelor(?:'s)?|\bMaster(?:'s)?|\bPh\.D)\s(?:\w+\s)*\w+"
    return [phrase.strip() for phrase in re.findall(degree_regex, text)]

# spaCy pipelines already loaded by this cell, keyed by model name.
_SPACY_MODEL_CACHE = {}


def _load_spacy_model(model_name='en_core_web_sm'):
    """Return the spaCy pipeline ``model_name``, calling ``spacy.load`` only on first use."""
    if model_name not in _SPACY_MODEL_CACHE:
        _SPACY_MODEL_CACHE[model_name] = spacy.load(model_name)
    return _SPACY_MODEL_CACHE[model_name]


def extract_name(resume_text):
    """Guess the candidate's name from the top of a resume.

    The first 10 lines are scanned in order. Lines containing URLs, emails,
    pipes, 10-digit runs or any exclude keyword are skipped, as are all-caps
    lines longer than 30 characters. The first remaining line that yields
    2-4 capitalized words, enough of which spaCy tags as proper nouns, is
    returned. If none qualifies, the first five lines are searched for an
    all-caps line of 2-4 alphabetic words, which is returned in Title Case.

    Args:
        resume_text: Full text of the resume.

    Returns:
        The name as a string, or ``None`` when no line qualifies or
        ``resume_text`` is empty/``None``.
    """
    if not resume_text:
        return None

    # Loading the pipeline is the expensive step; reuse it across calls.
    nlp = _load_spacy_model('en_core_web_sm')

    # Get first few lines
    lines = resume_text.strip().split('\n')

    # Keywords to exclude (college/school/company indicators).
    # NOTE(review): 'vidyalaya' and 'rewa' look specific to one resume, and
    # all keywords are tested as substrings - confirm this is intended.
    exclude_keywords = [
        'college', 'university', 'institute', 'school', 'company',
        'technologies', 'systems', 'solutions', 'services', 'pvt', 'ltd',
        'github', 'linkedin', 'email', 'education', 'experience',
        'engineering', 'vidyalaya', 'rewa'
    ]

    for line in lines[:10]:  # Check first 10 lines
        line = line.strip()

        # Skip empty or very short lines
        if not line or len(line) < 3:
            continue

        # Skip lines with URLs, emails, pipes, link labels, or a run of 10 digits
        if re.search(r'https?://|www\.|@|\||GITHUB|LINKEDIN|EMAIL|\d{10}', line, re.IGNORECASE):
            continue

        # Skip lines with exclude keywords (likely institution names)
        if any(keyword in line.lower() for keyword in exclude_keywords):
            continue

        # Skip lines that are all uppercase and very long (likely headers)
        if line.isupper() and len(line) > 30:
            continue

        # Look for name pattern: 2-4 words, each starting with a capital
        words = line.split()

        # Keep words that look like names once non-letters are removed
        name_words = []
        for word in words:
            # Remove everything except ASCII letters
            clean_word = re.sub(r'[^a-zA-Z]', '', word)
            # Must start with a capital and have at least 2 letters
            if clean_word and clean_word[0].isupper() and len(clean_word) >= 2:
                name_words.append(clean_word)

        # Valid name should have 2-4 words
        if 2 <= len(name_words) <= 4:
            potential_name = ' '.join(name_words)

            # Additional check: shouldn't contain exclude keywords
            if not any(keyword in potential_name.lower() for keyword in exclude_keywords):
                # Verify with spaCy that these are proper nouns
                doc = nlp(potential_name)
                proper_nouns = [token.text for token in doc if token.pos_ == 'PROPN']

                # Threshold is half the word count rounded DOWN, so a single
                # proper noun is enough for 2- or 3-word candidates.
                if len(proper_nouns) >= len(name_words) // 2:
                    return potential_name

    # Fallback: Look for all-caps name at the very beginning (like "AYUSH SHARMA")
    for line in lines[:5]:
        line = line.strip()
        if not line:
            continue

        # Skip lines with links/emails
        if re.search(r'GITHUB|LINKEDIN|EMAIL|https?://|@|\|', line):
            continue

        # Check if line is all caps and looks like a name (2-4 words)
        if line.isupper():
            words = line.split()
            if 2 <= len(words) <= 4 and all(word.isalpha() for word in words):
                return line.title()  # Convert to Title Case

    return None

if __name__ == '__main__':
    # Driver: run every extractor on each resume and print one line per field.
    resume_paths = [r"C:\Users\nitin\images\shubham-resume.pdf"]

    def _report(label, value, missing_message):
        """Print ``label value`` when ``value`` is truthy, else ``missing_message``."""
        if value:
            print(label, value)
        else:
            print(missing_message)

    for resume_path in resume_paths:
        text = extract_text_from_pdf(resume_path)
        print("Resume:", resume_path)

        name = extract_name(text)
        _report("Name:", name, "Name not found")

        contact_number = extract_contact_number_from_resume(text)
        _report("Contact Number:", contact_number, "Contact Number not found")

        email = extract_email_from_resume(text)
        _report("Email:", email, "Email not found")

        # Skill vocabulary searched for in the resume text
        skills_list = ['Python', 'C', 'C++', 'Data Analysis', 'Machine Learning',
                       'Communication', 'Project Management', 'Deep Learning', 'SQL', 'Tableau']
        extracted_skills = extract_skills_from_resume(text, skills_list)
        _report("Skills:", extracted_skills, "No skills found")

        extracted_education = extract_education_from_resume(text)
        _report("Education:", extracted_education, "No education information found")

        # Blank line between resumes
        print()

Resume: C:\Users\nitin\images\shubham-resume.pdf
Name: SHUBHAM JHA
Contact Number: 91 8305054597
Email: jha525200@gmail.com
Skills: ['Python', 'C', 'SQL']
No education information found

