In [None]:
%pip install PyPDF2
%pip install python-docx
%pip install spacy
%pip install pandas
!python -m spacy download en_core_web_sm

In [None]:
from PyPDF2 import PdfReader
from docx import Document
import os

# extract text from PDF or DOCX
def extract_text(file_path):
    """Extract the plain text of a PDF or DOCX document.

    Args:
        file_path: Path to the document (``str`` or ``os.PathLike``). The
            format is selected from the file extension, compared
            case-insensitively, so ``resume.PDF`` and ``cv.Docx`` are accepted.

    Returns:
        str: The extracted text. Every PDF page that produced text, and every
        DOCX paragraph (including empty ones), is followed by a newline.

    Raises:
        ValueError: If the extension is neither ``.pdf`` nor ``.docx``.
    """
    path = os.fspath(file_path)
    # Compare on a lower-cased copy so upper/mixed-case extensions are accepted,
    # while the original path is what actually gets opened.
    lowered = path.lower()

    if lowered.endswith('.pdf'):
        reader = PdfReader(path)
        page_texts = (page.extract_text() for page in reader.pages)
        # Skip pages for which no text was extracted.
        return "".join(page_text + "\n" for page_text in page_texts if page_text)

    if lowered.endswith('.docx'):
        doc = Document(path)
        return "".join(para.text + "\n" for para in doc.paragraphs)

    raise ValueError("Unsupported file format")

# Extract one sample document; `sample_text` is the input of the cleaning cell below.
sample_text = extract_text("YourResumeFile.pdf") # Replace with your file path (PDF or DOCX)
# Preview only the first 500 characters to keep the cell output short.
print(sample_text[:500])


In [None]:
import re

# remove extra spaces and newlines from extracted text
def clean_text(text):
    """Collapse repeated newlines and repeated spaces, then trim both ends.

    Args:
        text: Raw text to normalise.

    Returns:
        str: ``text`` with every run of newlines reduced to one newline, every
        run of spaces reduced to one space, and leading/trailing whitespace
        removed.
    """
    # Applied in order: newline runs first, then space runs.
    collapse_rules = (
        (r'\n+', '\n'),
        (r' +', ' '),
    )
    for pattern, replacement in collapse_rules:
        text = re.sub(pattern, replacement, text)
    return text.strip()

# Normalise the sample text; `cleaned` is the input of the extraction cells below.
cleaned = clean_text(sample_text)
# Preview only the first 500 characters of the cleaned text.
print(cleaned[:500])

In [None]:
# email and phoneNumber extraction
def extract_emails(text):
    """Find every e-mail address in ``text``.

    Args:
        text: Text to search.

    Returns:
        list[str] | None: All matches in order of appearance, or ``None`` when
        the text contains no match.
    """
    email_pattern = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')
    addresses = email_pattern.findall(text)
    if not addresses:
        return None
    return addresses

def extract_phones(text):
    """Return the first phone number found in ``text``.

    Two formats are recognised: ``+91`` followed by two groups of five digits
    (each optionally preceded by a space or hyphen), or a bare run of exactly
    ten digits.

    Args:
        text: Text to search.

    Returns:
        str | None: The first match exactly as written in the text, or
        ``None`` when nothing matches.
    """
    # The digit look-arounds stop the pattern from carving a ten-digit
    # "phone number" out of a longer digit run such as an account or ID number.
    match = re.search(
        r'(?<!\d)(?:\+91[\s-]?\d{5}[\s-]?\d{5}|\d{10})(?!\d)',
        text,
    )
    return match.group(0) if match else None

# Run both contact extractors on the cleaned sample text.
# `email` is a list of matches or None; `phone` is the first match or None.
email = extract_emails(cleaned)
phone = extract_phones(cleaned)
print("Email:", email, "Phone:", phone)

In [None]:
# name extraction using spaCy
import spacy
nlp = spacy.load('en_core_web_sm')

def extract_name(text):
    """Return the text of the first entity labelled ``PERSON`` in ``text``.

    The text is processed with the module-level ``nlp`` pipeline.

    Args:
        text: Text to analyse.

    Returns:
        str | None: Text of the first ``PERSON`` entity, or ``None`` when the
        pipeline reports no such entity.
    """
    person_mentions = (
        entity.text for entity in nlp(text).ents if entity.label_ == 'PERSON'
    )
    return next(person_mentions, None)

In [None]:
# extract skills with the help of predefined skills list from resume
SKILLS = ['Python', 'Java', 'C++', 'Machine Learning', 'Data Analysis', 'SQL', 'Excel', 'Communication', 'Leadership']
def extract_skills(text, skills=SKILLS):
    """Return the skills from ``skills`` that are mentioned in ``text``.

    Matching is case-insensitive and requires that the skill is not embedded
    in a longer alphabetic word, so "Excellent" does not count as "Excel" and
    "JavaScript" does not count as "Java". Digits and punctuation next to the
    skill are allowed (e.g. "Python3", "SQL,").

    Args:
        text: Text to search.
        skills: Iterable of skill names to look for. Defaults to ``SKILLS``.

    Returns:
        list[str]: The matching skills, without duplicates, in the order in
        which they appear in ``skills`` (deterministic across runs).
    """
    found = []
    for skill in skills:
        if skill in found:
            continue
        # re.escape keeps names such as "C++" literal; the look-arounds reject
        # matches that are glued to another letter on either side.
        pattern = r'(?<![A-Za-z])' + re.escape(skill) + r'(?![A-Za-z])'
        if re.search(pattern, text, re.IGNORECASE):
            found.append(skill)
    return found
# Show the skills detected in the cleaned sample text.
print(extract_skills(cleaned))

In [None]:
#extract education details from resume using predefined keywords
EDU_KEYWORDS = ['B.Tech', 'M.Tech', 'B.Sc', 'M.Sc', 'PhD', 'Bachelor', 'Master', 'University', 'College', 'School', 'Degree']

def extract_education(text):
    """Collect the lines of ``text`` that mention an education keyword.

    A line is selected when it contains any entry of ``EDU_KEYWORDS``
    (case-insensitive substring match). The line directly after a selected
    line is collected as well, provided it is not blank.

    Args:
        text: Newline-separated text to scan.

    Returns:
        list[str]: The collected lines, stripped, de-duplicated and sorted.
    """
    needles = [keyword.lower() for keyword in EDU_KEYWORDS]
    rows = text.split('\n')
    last_index = len(rows) - 1
    hits = set()

    for index, row in enumerate(rows):
        lowered = row.lower()
        if not any(needle in lowered for needle in needles):
            continue
        hits.add(row.strip())
        # Also keep the following line when there is one and it is not blank.
        if index < last_index:
            following = rows[index + 1].strip()
            if following:
                hits.add(following)

    return sorted(hits)

# Show the education-related lines found in the cleaned sample text.
print(extract_education(cleaned))


In [None]:
# extract experience details from resume using predefined keywords
def extract_experience(text):
    """Collect the lines of ``text`` that mention an experience keyword.

    A line is selected when it contains any of the keywords below
    (case-insensitive substring match).

    Args:
        text: Newline-separated text to scan.

    Returns:
        list[str]: The selected lines, stripped and de-duplicated, in the
        order in which they first appear in ``text``. Using first-seen order
        (instead of set order) keeps the result identical from run to run.
    """
    experience_keywords = ('experience', 'worked', 'employment', 'career', 'professional')

    # A dict keeps insertion order, so it de-duplicates without shuffling.
    matches = {}
    for line in text.split('\n'):
        lower_line = line.lower()
        if any(keyword in lower_line for keyword in experience_keywords):
            matches.setdefault(line.strip(), None)
    return list(matches)
# Show the experience-related lines found in the cleaned sample text.
print(extract_experience(cleaned))


In [None]:
# Combine all extraction functions
# Run every extractor on the cleaned sample text and gather the results
# in a single dict keyed by field name.
parsed_resume = {
    "Name": extract_name(cleaned),
    "Email": extract_emails(cleaned),
    "Phone": extract_phones(cleaned),
    "Skills": extract_skills(cleaned),
    "Education": extract_education(cleaned),
    "Experience": extract_experience(cleaned)
}
print(parsed_resume)

In [None]:
# Process  resumes in a folder and save results to CSV
import pandas as pd
import os

def process_folder(folder_path):
    """Parse every ``.pdf`` / ``.docx`` file in a folder into one DataFrame.

    Args:
        folder_path: Path of the directory that contains the resumes. Only
            its direct entries are considered.

    Returns:
        pandas.DataFrame: One row per processed file with the columns
        ``File``, ``Name``, ``Email``, ``Phone``, ``Skills``, ``Education``
        and ``Experience``. The columns are present even when no file
        matched, so the frame (and a CSV written from it) always has a header.
    """
    columns = ["File", "Name", "Email", "Phone", "Skills", "Education", "Experience"]
    results = []

    # os.listdir() returns entries in arbitrary order; sort them so that the
    # row order is reproducible between runs and machines.
    for file in sorted(os.listdir(folder_path)):
        if not file.endswith((".pdf", ".docx")):
            continue
        path = os.path.join(folder_path, file)
        text = clean_text(extract_text(path))
        results.append({
            "File": file,
            "Name": extract_name(text),
            "Email": extract_emails(text),
            "Phone": extract_phones(text),
            "Skills": extract_skills(text),
            "Education": extract_education(text),
            "Experience": extract_experience(text),
        })

    return pd.DataFrame(results, columns=columns)

# process_folder() lists this path as a directory, so it must point to the
# FOLDER that holds the resumes, not to a single resume file.
folder_path = "YourResumeFolder" # Replace with the folder that contains your PDF/DOCX resumes

output_path = "parsed_resume.csv" # Replace with your desired output path

df = process_folder(folder_path)
df.to_csv(output_path, index=False)

# Confirm that the CSV is present at the requested location.
if os.path.exists(output_path):
    print("CSV saved successfully!")
else:
    print("CSV not found.")

# A relative output_path is resolved against this directory.
print("Current working directory:", os.getcwd())
