In [4]:
import os
import re
import docx
from pdfminer.high_level import extract_text as extract_pdf_text
from tqdm import tqdm

In [5]:
def read_txt(file_path):
    """Return the full contents of a plain-text file as a string.

    The file is decoded as UTF-8; bytes that cannot be decoded are
    dropped (``errors="ignore"``) instead of raising.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The decoded file contents.
    """
    with open(file_path, mode="r", encoding="utf-8", errors="ignore") as handle:
        contents = handle.read()
    return contents

def read_docx(file_path):
    """Return the text of a .docx file, one paragraph per line.

    Opens the document with ``docx.Document`` and joins the ``text`` of
    every entry in its ``paragraphs`` collection with newlines. Only
    what ``paragraphs`` exposes is included.

    Args:
        file_path: Path of the .docx file to read.

    Returns:
        The paragraph texts joined by ``"\\n"``.
    """
    document = docx.Document(file_path)
    return "\n".join(paragraph.text for paragraph in document.paragraphs)

def read_pdf(file_path):
    """Return the text extracted from a PDF file.

    Thin wrapper around pdfminer's high-level ``extract_text`` (imported
    in this notebook as ``extract_pdf_text``).

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        Whatever ``extract_pdf_text`` returns for the file.
    """
    pdf_text = extract_pdf_text(file_path)
    return pdf_text

def extract_text(file_path):
    """Read a file with the reader that matches its extension.

    The extension check is case-insensitive and is done on the end of
    the whole path string: ``.pdf`` goes to ``read_pdf``, ``.docx`` to
    ``read_docx`` and ``.txt`` to ``read_txt``.

    Args:
        file_path: Path of the file to read.

    Returns:
        The text produced by the matching reader, or ``None`` when the
        path ends in none of the supported extensions.
    """
    lowered = file_path.lower()
    if lowered.endswith(".pdf"):
        text = read_pdf(file_path)
    elif lowered.endswith(".docx"):
        text = read_docx(file_path)
    elif lowered.endswith(".txt"):
        text = read_txt(file_path)
    else:
        # Unsupported type: callers receive None rather than an exception.
        text = None
    return text

def clean_text(text):
    """Normalize extracted text to a single line with single spaces.

    Two passes are applied: first every character in the ranges
    ``\\x00-\\x1f`` and ``\\x7f-\\x9f`` is replaced by a space, then every
    run of whitespace is collapsed to one space. Leading and trailing
    whitespace is stripped from the result.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string (line breaks are flattened into spaces).
    """
    without_controls = re.sub(r"[\x00-\x1f\x7f-\x9f]", " ", text)
    single_spaced = re.sub(r"\s+", " ", without_controls)
    return single_spaced.strip()

def extract_all_texts(input_folder, output_folder):
    """Extract, clean and save the text of every supported file in a folder.

    Each ``.pdf``, ``.docx`` or ``.txt`` file directly inside
    ``input_folder`` (extension matched case-insensitively) is read with
    ``extract_text``, normalized with ``clean_text`` and written as UTF-8
    to ``<output_folder>/<stem>.txt``. Sub-directories are not descended
    into.

    Files whose output already exists are skipped, so re-running only
    processes new inputs. Because the output name is derived from the
    stem alone, two inputs that share a stem (e.g. ``cv.pdf`` and
    ``cv.docx``) map to the same output; the first one in sorted order is
    extracted and the other is skipped.

    Args:
        input_folder: Directory containing the source documents.
        output_folder: Directory receiving the ``.txt`` files; it is
            created if it does not exist.

    Returns:
        None.

    Raises:
        OSError: If ``input_folder`` cannot be listed or an output file
            cannot be written. Errors raised by the readers propagate
            unchanged.
    """
    supported_extensions = (".pdf", ".docx", ".txt")
    # List first so a missing input folder fails before anything is created.
    # Sorting makes the processing order (and the winner of a stem
    # collision) deterministic; os.listdir order is arbitrary.
    filenames = sorted(os.listdir(input_folder))
    # open(..., "w") does not create missing parent directories.
    os.makedirs(output_folder, exist_ok=True)
    for filename in tqdm(filenames, desc="Extracting files"):
        if not filename.lower().endswith(supported_extensions):
            continue
        input_path = os.path.join(input_folder, filename)
        # A directory can be named like "scans.pdf"; the readers need a file.
        if not os.path.isfile(input_path):
            continue
        output_filename = f"{os.path.splitext(filename)[0]}.txt"
        output_path = os.path.join(output_folder, output_filename)
        if os.path.exists(output_path):
            continue
        text = clean_text(extract_text(input_path))
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(text)

In [6]:
# Run the batch extraction from ./data/resume into ./data/resume_extract;
# inputs whose .txt output already exists there are skipped.
extract_all_texts("./data/resume", "./data/resume_extract")

Extracting files: 100%|██████████| 2549/2549 [00:00<00:00, 28320.07it/s]
