In [None]:
import os
import textract
import pypandoc
from tqdm import tqdm

# Batch conversion: extract plain text from every supported document found
# directly inside `input_folder` and write it, UTF-8 encoded, to
# `output_folder` as one ".txt" file per source document.
input_folder = "data"
output_folder = "converted_txt"

os.makedirs(output_folder, exist_ok=True)

supported_formats = [".pdf", ".docx", ".doc", ".rtf", ".html", ".htm", ".txt"]

# Formats that need a real extractor; every other supported format is read
# directly as UTF-8 text.
_EXTRACTOR_FORMATS = (".pdf", ".docx", ".doc", ".rtf", ".html", ".htm")

# Suffixes for which the pypandoc fallback is attempted, mapped to the explicit
# pandoc reader name so that ".htm" is not left to extension guessing.
# ".pdf" and ".doc" are deliberately absent: pandoc has no reader for them, so
# a fallback attempt would only replace the real textract error with an
# unrelated "invalid input format" one.
_PANDOC_READERS = {".docx": "docx", ".rtf": "rtf", ".html": "html", ".htm": "html"}


def _extract_text(path, suffix):
    """Return the plain-text content of the document at *path*.

    Args:
        path: Location of the source document.
        suffix: Lower-cased file extension including the leading dot.

    Returns:
        The extracted text. Bytes that are not valid UTF-8 are dropped.

    Raises:
        Exception: Whatever textract raised, when no pandoc fallback exists
            for *suffix*.
        RuntimeError: When both textract and the pypandoc fallback failed; the
            message carries both underlying errors.
    """
    if suffix not in _EXTRACTOR_FORMATS:
        with open(path, "r", encoding="utf-8", errors="ignore") as f:
            return f.read()

    try:
        return textract.process(path).decode("utf-8", errors="ignore")
    except Exception as textract_error:
        reader = _PANDOC_READERS.get(suffix)
        if reader is None:
            # Nothing else can read this format: surface the original failure.
            raise
        try:
            return pypandoc.convert_file(path, "plain", format=reader)
        except Exception as pandoc_error:
            # Report both failures instead of silently losing the first one.
            raise RuntimeError(
                f"textract failed ({textract_error}); "
                f"pypandoc failed ({pandoc_error})"
            ) from pandoc_error


# Output names successfully written during this run (case-normalised), used to
# keep e.g. "report.pdf" and "report.docx" from both landing in "report.txt".
written_outputs = set()

# sorted() makes the processing order, and therefore the outcome of any name
# collision, reproducible; os.listdir() returns entries in arbitrary order.
for file in tqdm(sorted(os.listdir(input_folder))):
    file_path = os.path.join(input_folder, file)
    name, ext = os.path.splitext(file)
    suffix = ext.lower()

    if suffix not in supported_formats:
        print(f"Skipping unsupported format: {file}")
        continue

    if not os.path.isfile(file_path):
        # A directory whose name happens to end in a supported extension.
        print(f"Skipping non-file entry: {file}")
        continue

    output_name = name + ".txt"
    if os.path.normcase(output_name) in written_outputs:
        # Another source with the same stem was already converted: fold the
        # source extension into the name instead of overwriting that result.
        stem = f"{name}_{suffix[1:]}"
        output_name = stem + ".txt"
        counter = 2
        while os.path.normcase(output_name) in written_outputs:
            output_name = f"{stem}_{counter}.txt"
            counter += 1
    output_file = os.path.join(output_folder, output_name)

    try:
        text = _extract_text(file_path, suffix)

        with open(output_file, "w", encoding="utf-8") as out:
            out.write(text)
        # Reserve the name only after a successful write, so a failed
        # conversion does not block a later file from using it.
        written_outputs.add(os.path.normcase(output_name))
        print(f"Converted: {file} → {output_file}")

    except Exception as e:
        # Best-effort batch: report the failure and move on to the next file.
        print(f"Could not convert {file}: {e}")