In [None]:
%pip install pdfplumber

In [1]:
import pdfplumber
import re
import pandas as pd
import nltk
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.tree import Tree

# Ensure NLTK corpora are downloaded
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

def extract_name(text):
    """Extract name using Named Entity Recognition."""
    sentences = nltk.sent_tokenize(text)
    print(sentences)
    for sentence in sentences:
        tokens = nltk.word_tokenize(sentence) #tokenizes the input sentence into individual words and punctuation
        print(tokens)
        tags = nltk.pos_tag(tokens)  #assigns each token a grammatical category
        print(tags)
        chunks = ne_chunk(tags) #which detects entities like names (PERSON), locations (GPE), organizations (ORG)
        print(chunks)
        for chunk in chunks:
            if isinstance(chunk, Tree) and chunk.label() == 'PERSON':
                return " ".join(c[0] for c in chunk)
    return "Not Found"

def extract_email(text):  
    """Extract email using regex."""
    email_match = re.search(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+', text) #  RE package is used to extract the proper email pattern  
    print(email_match)
    print(email_match.group(0))
    return email_match.group(0) if email_match else "Not Found"

def extract_qualification(text):
    """Extract qualifications by matching common degree terms."""
    qualifications = re.findall(r'\b(B(?:\.|achelor)?|M(?:\.|aster)?|Ph\.?D|Diploma|High School|HSC|UG|PG|CS|Engineering|Science)\b', text, re.IGNORECASE)
    print(qualifications)
    return ", ".join(set(qualifications)) if qualifications else "Not Found"

def extract_resume_details_nltk(file_path):
    """Extract details using pdfplumber and NLTK."""
    with pdfplumber.open(file_path) as pdf:
        text = ""
        for page in pdf.pages:
            text += page.extract_text()
        print(text)
    
    name = extract_name(text)
    print(name)
    email = extract_email(text)
    print(email)
    qualification = extract_qualification(text)
    print(qualification)
    
    print({"Name": name, "Qualification": qualification, "Email": email})
    return {"Name": name, "Qualification": qualification, "Email": email}

# Process all resumes
resume_files = ["akashkumar_CV.pdf,Resume01.pdf,Resume02.pdf"]  # List of resume file paths,u can add more files here
resume_data = []
print(resume_files)
print(resume_data)

for file in resume_files:
    details = extract_resume_details_nltk(file)
    print(details)
    resume_data.append(details)
    print(resume_data)

print(resume_data)
# Convert to DataFrame and save as CSV
df = pd.DataFrame(resume_data)
print(df)
output_path = "extracted_resume_data_nltk.csv"
df.to_csv(output_path, index=False)

print(f"Data saved to {output_path}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Vinit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Vinit\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\Vinit\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Vinit\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


['akashkumar_CV.pdf']
[]
AKASH KUMAR P
Trichy, Tamil Nadu | +916383417457 | Akashmani1936@gmail.com | Linkedin|
Objective
Recent graduate with a Bachelor's degree in Artificial Intelligence and Data Science,
equipped with a strong foundation in machine learning, data analysis, and statistical
modeling. Seeking an entry-level position to apply my technical skills and analytical abilities
to solve real-world problems and contribute to innovative projects in a dynamic and growth-
oriented organization.
Education
B.TECH IN ARTIFICIAL INTELLIGENCE AND DATA SCIENCE
| 2021–2024
M.A.M College of Engineering Trichy.
GCPA:7.5
Skills
• Programming Languages: Python, R, SQL
• Machine Learning & AI: Scikit-learn, Seaborn, TensorFlow, Keras, PyTorch
• Deep Learning & NLP
• Data Visualization: Tableau, Power BI, Matplotlib
• Data Analysis: Pandas, NumPy, Spacy
• Tools & IDEs: VSCode, Jupyter notebook ,google colob
• Web Scraping | GUI Development: Tkinter
• Microsoft Office: Excel, Word, PowerPoint
P