In [32]:
!pip install kagglehub[pandas-datasets] pymupdf
!pip install spacy nltk
!python -m nltk.downloader stopwords
!python -m spacy download en_core_web_sm

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m69.4 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [41]:
import os
import re
import nltk
import spacy
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from collections import defaultdict


In [4]:
# Load the resume CSV from the shared drive into a DataFrame.
# NOTE(review): `df_resumes` is reassigned later in this notebook from
# structured_resumes.csv — confirm this earlier load is still needed.
df_resumes = pd.read_csv("/content/drive/Shareddrives/NLP_Capstone/resume/Resume/Resume.csv")

In [13]:
import fitz  # PyMuPDF

def extract_text_from_resumes(root_folder="data", output_folder="output"):
    """Convert every PDF under ``root_folder`` into a UTF-8 ``.txt`` file.

    The directory tree below ``root_folder`` is walked recursively. For each
    file whose name ends in ``.pdf`` (case-insensitive) the text of all pages
    is concatenated in page order and written to ``output_folder``, mirroring
    the sub-directory the PDF was found in and replacing the extension with
    ``.txt``.

    Args:
        root_folder: Directory that is searched recursively for PDF files.
        output_folder: Directory that receives the mirrored ``.txt`` files.
            Sub-directories are created on demand.

    Returns:
        None. Progress is reported with ``print``.
    """
    print(f"Processing folder: {root_folder}")
    for dirpath, _, filenames in os.walk(root_folder):
        print(f"Processing folder: {dirpath}")

        # The destination directory depends only on the directory being
        # visited, so compute it once per directory rather than once per file.
        relative_path = os.path.relpath(dirpath, root_folder)
        output_subfolder = os.path.join(output_folder, relative_path)

        for filename in filenames:
            if not filename.lower().endswith(".pdf"):
                continue

            pdf_path = os.path.join(dirpath, filename)

            # Extract text using PyMuPDF. The context manager closes the
            # document even if a page fails to extract, so file handles do not
            # accumulate across a large batch of resumes.
            with fitz.open(pdf_path) as doc:
                text = "".join(page.get_text() for page in doc)

            # Only create the output directory once there is something to
            # write into it.
            os.makedirs(output_subfolder, exist_ok=True)

            txt_filename = os.path.splitext(filename)[0] + ".txt"
            txt_path = os.path.join(output_subfolder, txt_filename)
            with open(txt_path, "w", encoding="utf-8") as f:
                f.write(text)

            print(f"Saved: {txt_path}")

In [None]:
# Source tree that is searched for PDF resumes, and the destination tree for
# the extracted .txt files (both on the shared drive).
root_folder = "/content/drive/Shareddrives/NLP_Capstone/resume/data/data"
output_folder = "/content/drive/Shareddrives/NLP_Capstone/resume/data/output"

# Walk root_folder and write one .txt per PDF under output_folder.
extract_text_from_resumes(root_folder, output_folder)

In [33]:
# Canonical section name -> header spellings that open that section.
# Spellings are lowercase because detect_sections lowercases each line before
# comparing it against them.
SECTION_HEADERS = {
    "education": [
        "education",
        "academic background",
        "academic qualifications",
        "education and training",
        "education details",
    ],
    "experience": [
        "experience",
        "professional experience",
        "work experience",
        "employment history",
    ],
    "skills": [
        "skills",
        "technical skills",
        "key skills",
        "core competencies",
    ],
    "projects": [
        "projects",
        "personal projects",
    ],
    "certifications": [
        "certifications",
        "licenses",
    ],
    # NOTE(review): "career_focus" uses an underscore; if the extracted text
    # spells this header "career focus" it will never match — confirm.
    "summary": [
        "summary",
        "profile",
        "objective",
        "career_focus",
    ],
    "interests": [
        "interests",
        "hobbies",
        "interest",
    ],
    "publications": [
        "publications",
        "publication",
    ],
}

In [38]:
# Make sure the NLTK stopword corpus is available, then keep the English
# stopwords as a set; preprocess() checks lemmas against it.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Load spaCy model
# (`nlp` is the module-level pipeline that preprocess() calls on each text.)
nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    """Normalise a block of resume text into a space-separated lemma string.

    Steps, in order: drop non-ASCII characters, lowercase, replace every
    character that is not a letter, digit or whitespace with a space (so all
    punctuation is removed), collapse whitespace, then run the module-level
    spaCy pipeline ``nlp`` and keep each token's lemma unless the token is a
    spaCy stop word, the lemma is in the module-level ``stop_words`` set, or
    the lemma is a single character.

    Args:
        text: Raw text; any falsy value (``""``, ``None``) yields ``""``.

    Returns:
        The kept lemmas joined by single spaces.
    """
    if not text:
        return ""

    # ASCII-only, lowercase.
    ascii_lower = text.encode("ascii", errors="ignore").decode().lower()

    # Everything except letters, digits and whitespace becomes a space ...
    alnum_only = re.sub(r"[^a-zA-Z0-9\s]", " ", ascii_lower)
    # ... and runs of whitespace shrink to a single space.
    normalised = re.sub(r"\s+", " ", alnum_only).strip()

    # Lemmatise with spaCy and filter out stop words / one-character lemmas.
    kept_lemmas = []
    for tok in nlp(normalised):
        lemma = tok.lemma_
        if tok.is_stop:
            continue
        if lemma.lower() in stop_words:
            continue
        if len(lemma) <= 1:
            continue
        kept_lemmas.append(lemma)

    return " ".join(kept_lemmas)

def detect_sections(lines, section_headers):
    """Group resume lines under the section header that precedes them.

    A line counts as a header when, after lowercasing and stripping, it
    consists solely of one of the configured header spellings, optionally
    followed by whitespace and a single colon (e.g. ``"Skills"``,
    ``"skills :"``). Header spellings are matched literally, so spellings that
    contain regex metacharacters (``"c++ skills"``) are safe.

    Args:
        lines: Iterable of text lines in document order.
        section_headers: Mapping of section name to a list of lowercase header
            spellings. Sections are tried in the mapping's iteration order and
            the first one that matches wins.

    Returns:
        Dict mapping each section name that collected at least one line to
        those lines joined with ``"\\n"``. Header lines themselves and any
        lines before the first header are not included. If a section's header
        occurs more than once, its lines are appended in order.
    """
    # Compile one pattern per section up front instead of rebuilding a regex
    # for every (line, header) pair. Headers are escaped so they are treated
    # as plain text; sections with no headers can never match and are skipped.
    matchers = []
    for section, headers in section_headers.items():
        if not headers:
            continue
        alternatives = "|".join(re.escape(h) for h in headers)
        matchers.append((section, re.compile(rf"(?:{alternatives})\s*:?")))

    current_section = None
    sections = defaultdict(list)

    for line in lines:
        line_clean = line.lower().strip()

        matched_section = None
        for section, pattern in matchers:
            if pattern.fullmatch(line_clean):
                matched_section = section
                break

        if matched_section is not None:
            # Header line: switch section, but do not store the header itself.
            current_section = matched_section
        elif current_section:
            sections[current_section].append(line)

    return {sec: "\n".join(content) for sec, content in sections.items()}


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [39]:
import csv

def process_and_save_csv(input_folder, output_csv):
    """Build a structured CSV from the extracted resume ``.txt`` files.

    Every ``.txt`` file under ``input_folder`` (searched recursively) is split
    into sections with ``detect_sections`` using ``SECTION_HEADERS``; each
    section's text is normalised with ``preprocess`` and written as one column
    of the file's row. The name of the directory containing the file is stored
    as its ``category``.

    Args:
        input_folder: Directory tree containing the ``.txt`` files.
        output_csv: Path of the CSV file to (over)write.

    Returns:
        None. A summary line is printed. If no ``.txt`` file is found the CSV
        is still written, containing only the header row.
    """
    # CSV column name -> key in the dict returned by detect_sections.
    # Note the "interest" column is filled from the "interests" section.
    section_columns = {
        "summary": "summary",
        "experience": "experience",
        "skills": "skills",
        "education": "education",
        "interest": "interests",
        "projects": "projects",
        "certifications": "certifications",
        "publications": "publications",
    }
    # Fixed column order, independent of whether any row was produced.
    fieldnames = ["filename", "path", "category", *section_columns]

    rows = []

    for dirpath, _, filenames in os.walk(input_folder):
        category = os.path.basename(dirpath)
        for filename in filenames:
            if not filename.endswith(".txt"):
                continue

            txt_path = os.path.join(dirpath, filename)

            with open(txt_path, "r", encoding="utf-8") as f:
                text = f.read()
            # Drop blank lines and surrounding whitespace before sectioning.
            lines = [line.strip() for line in text.splitlines() if line.strip()]
            sections = detect_sections(lines, SECTION_HEADERS)

            row = {
                "filename": filename,
                "path": txt_path,
                "category": category,
            }
            for column, section in section_columns.items():
                row[column] = preprocess(sections.get(section, ""))
            rows.append(row)

    with open(output_csv, "w", encoding="utf-8", newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)

    print(f"✅ Saved {len(rows)} resumes to {output_csv}")

# Read the extracted .txt resumes from the output tree and write one
# structured CSV next to it on the shared drive.
input_folder = "/content/drive/Shareddrives/NLP_Capstone/resume/data/output"
output_csv = "/content/drive/Shareddrives/NLP_Capstone/resume/data/structured_resumes.csv"
process_and_save_csv(input_folder, output_csv)


✅ Saved 2494 resumes to /content/drive/Shareddrives/NLP_Capstone/resume/data/structured_resumes.csv


In [40]:
# Reload the structured CSV written by process_and_save_csv and preview the
# first rows. This rebinds `df_resumes`, replacing the earlier Resume.csv load.
df_resumes = pd.read_csv("/content/drive/Shareddrives/NLP_Capstone/resume/data/structured_resumes.csv")
df_resumes.head()

Unnamed: 0,filename,path,category,summary,experience,skills,education,interest,projects,certifications,publications
0,36625776.txt,/content/drive/Shareddrives/NLP_Capstone/resum...,HEALTHCARE,certify spin instructor personal trainer energ...,healthcare consultant october 2014 current com...,acquisition cms content contract negotiation c...,master business administration bachelor scienc...,,,,
1,16132195.txt,/content/drive/Shareddrives/NLP_Capstone/resum...,HEALTHCARE,,06 2013 12 2013 personal healthcare assistant ...,patient focus care excellent interpersonal ski...,2012 professional healthcare service adult beh...,interest include run read painting play piano ...,,cpr certification adult aid certification chil...,
2,18484846.txt,/content/drive/Shareddrives/NLP_Capstone/resum...,HEALTHCARE,,director compound sale specialist july 1997 ma...,administrative function basic benefit chart cl...,high school diploma 1971 griffithville high sc...,,,,
3,10251432.txt,/content/drive/Shareddrives/NLP_Capstone/resum...,HEALTHCARE,successful administrative professional 15 year...,corporate administrator january 2009 january 2...,organize detail orient efficient prioritize mu...,complete 45 credit course include logic ethic ...,,,,
4,17624934.txt,/content/drive/Shareddrives/NLP_Capstone/resum...,HEALTHCARE,,senior manager specialist leader healthcare 19...,seasoned healthcare advisor excellent communic...,bachelor science nursing ursuline college city...,,,,
