In [2]:
# Importing necessary libraries
import os
import csv
import PyPDF2
from PyPDF2 import PdfReader
import re

In [3]:
# defining regular expressions for skills and education

# Lazily captures (group 1) everything that follows the literal word "Skills"
# up to the next occurrence of "Education" or, failing that, the end of the
# string. No flags are embedded here; extract_skills_and_education applies
# re.DOTALL only, so matching is case-sensitive and "." also spans newlines.
skills_pattern = r"Skills(.*?)(Education|$)"
# Mirror image of the pattern above: captures everything that follows
# "Education" up to the next "Skills" or the end of the string.
education_pattern = r"Education(.*?)(Skills|$)"


In [4]:
# 1.Function to extract text from the pdf using pypdf2
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF joined into a single string.

    Args:
        pdf_path: Path of the PDF file to read (opened in binary mode).

    Returns:
        The per-page text joined with single spaces. If anything goes wrong
        while opening or parsing the file, the error is printed and an empty
        string is returned instead of raising.
    """
    try:
        with open(pdf_path, "rb") as pdf_file:
            pdf = PdfReader(pdf_file)
            page_count = len(pdf.pages)
            # Extract while the file handle is still open; the reader pulls
            # its data from it.
            page_texts = [
                pdf.pages[page_no].extract_text()
                for page_no in range(page_count)
            ]
            return ' '.join(page_texts)
    except Exception as e:
        # Best-effort: report the problem and let the caller carry on with "".
        print(f"Error reading PDF: {str(e)}")
        return ""

# 2.Function to extract skills and education from text extracted from pdf using re


def extract_skills_and_education(text):
    """Pull the "Skills" and "Education" sections out of resume text.

    Each section is located with the module-level ``skills_pattern`` /
    ``education_pattern`` regular expressions, searched with ``re.DOTALL``
    so a section may span several lines.

    Args:
        text: The resume text to search.

    Returns:
        A ``(skills, education)`` tuple of stripped strings; a section that
        is not found is returned as an empty string.
    """
    def _section(pattern):
        # First capture group of the first match, or "" when nothing matches.
        found = re.search(pattern, text, re.DOTALL)
        if found is None:
            return ""
        return found.group(1).strip()

    skills = _section(skills_pattern)
    education = _section(education_pattern)
    return skills, education

# 3.Function to process pdfs and extract text,skills,category,id


def process_pdfs_in_directory(directory):
    """Walk ``directory`` recursively and build one record per PDF file.

    For every file whose name ends with ".pdf", the text is extracted with
    ``extract_text_from_pdf`` and the skills/education sections with
    ``extract_skills_and_education``.

    Args:
        directory: Root folder to search; sub-folders are visited through
            ``os.walk``.

    Returns:
        A list with one ``[id, text, skills, education, category]`` row per
        PDF found, where ``id`` is the file name without its extension and
        ``category`` is the name of the folder that directly contains the
        file. Files that are not PDFs contribute no row.
    """
    data = []

    for root, _dirs, files in os.walk(directory):
        # The containing folder's name is the category for all files in it.
        category = os.path.basename(root)
        for file in files:
            if not file.endswith(".pdf"):
                # Non-PDF files must not produce a row. Previously the append
                # ran for every file, duplicating the preceding PDF's row (or
                # failing on unbound locals if no PDF had been seen yet).
                continue

            pdf_path = os.path.join(root, file)
            resume_id = os.path.splitext(file)[0]

            text = extract_text_from_pdf(pdf_path)
            skills, education = extract_skills_and_education(text)
            data.append([resume_id, text, skills, education, category])
    return data


def save_to_csv(data, csv_filename):
    """Write the extracted resume rows to a CSV file with a header line.

    Args:
        data: Iterable of rows, each in the order
            ID, resume_str, Skills, Education, Category.
        csv_filename: Destination path; the file is created or overwritten
            and written as UTF-8.
    """
    header = ["ID", "resume_str", "Skills", "Education", "Category"]
    # newline="" lets the csv module control line endings itself.
    with open(csv_filename, mode="w", newline="", encoding="utf-8") as out_file:
        writer = csv.writer(out_file)
        writer.writerow(header)
        for record in data:
            writer.writerow(record)


# Driver: extract every resume PDF under the dataset folder and save the result as a CSV.
if __name__ == "__main__":
    # NOTE(review): hard-coded absolute Windows path — this only runs on a
    # machine with this exact folder layout; consider making it configurable.
    root_directory = r"D:\MyProjects\resume_recommendation\resume_data\data\data"
    # One row per PDF found anywhere below root_directory.
    data = process_pdfs_in_directory(root_directory)
    # Relative file name, so the CSV is written to the current working directory.
    save_to_csv(data, "resumes.csv")


# The resumes, which are in PDF format, are extracted from the folder and saved to resumes.csv in CSV format,
# which contains the following columns:
        ID
        resume_str
        Skills
        Education
        Category