In [1]:
# Folder holding the raw sample documents that the extraction cells read (relative path).
data_raw_dir = "../data/sample/raw/"
# Folder for the extracted JSON files; same value as the default `extract_dir`
# of `save_text_to_json`.
data_extracted_dir = "../data/sample/extracted/"

In [2]:
import os
import json
import re


def save_text_to_json(
    filename_orignial: str,
    current_dir: str,
    text: str,
    extract_dir: str = "../data/sample/extracted/",
) -> None:
    """Write extracted text plus folder-derived keywords to a JSON file.

    The output file is named after the source file, with its extension
    replaced by ``.json``, and is placed in ``extract_dir``. The directory is
    created when it does not exist yet.

    Args:
        filename_orignial: Name of the source file, e.g. ``"notes.pdf"``.
            The (misspelled) parameter name is kept so that existing
            keyword-argument callers keep working.
        current_dir: Folder path of the source file. Its components become
            the keywords, e.g. ``Analysis/3/Lectures`` ->
            ``["Analysis", "3", "Lectures"]``. Both ``/`` and ``\\`` are
            accepted as separators.
        text: Extracted text, stored under the ``"content"`` key.
        extract_dir: Directory that receives the JSON file. A trailing
            separator is optional.

    Returns:
        None. The result is the file written to disk (UTF-8, indented,
        non-ASCII characters kept verbatim).
    """
    stem = os.path.splitext(filename_orignial)[0]
    # os.path.join instead of "+" so extract_dir works with or without a
    # trailing separator.
    filename_json = os.path.join(extract_dir, stem + ".json")

    target_dir = os.path.dirname(filename_json)
    if target_dir:  # os.makedirs("") would raise
        os.makedirs(target_dir, exist_ok=True)

    # Keywords are the folder structure. Relative-path markers ("." / "..")
    # carry no meaning as keywords and are dropped.
    keywords = [
        part
        for part in re.split(r"[\\/]", current_dir)
        if part and part not in (".", "..")
    ]

    data = {"Key-Words": keywords, "content": text}
    with open(filename_json, "w", encoding="utf-8") as file:
        json.dump(data, file, ensure_ascii=False, indent=4)

In [3]:
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException


def detect_language(text: str) -> str:
    """Return the language code detected for ``text``.

    Args:
        text: The text whose language should be identified.

    Returns:
        Whatever ``langdetect.detect`` reports (e.g. ``"en"`` or ``"de"``),
        or the literal ``"unknown"`` when detection raises
        ``LangDetectException``.
    """
    # Pin the detector seed so repeated calls give consistent results.
    DetectorFactory.seed = 0
    try:
        detected_code = detect(text)
    except LangDetectException:
        detected_code = "unknown"
    return detected_code


# Quick check with one English and one German sentence; the cell evaluates to
# a tuple holding the detected code for each sample, in order.
tuple(
    detect_language(text=sample)
    for sample in ("Hello this is a text", "Hallo das ist ein text")
)

('en', 'de')

In [5]:
from deep_translator import GoogleTranslator


def translate_text(text: str, src_lang: str, dest_lang: str) -> str:
    """Translate ``text`` from ``src_lang`` to ``dest_lang`` with GoogleTranslator.

    Args:
        text: The text to translate.
        src_lang: Source language, passed as ``source=`` to the translator.
        dest_lang: Target language, passed as ``target=`` to the translator.

    Returns:
        The translated text. Exceptions are not propagated: if creating the
        translator or translating raises, the string
        ``"Translation failed: <error>"`` is returned instead, so callers
        that need to tell failure from success must check for that prefix.
    """
    try:
        translator = GoogleTranslator(source=src_lang, target=dest_lang)
        outcome = translator.translate(text)
    except Exception as error:
        outcome = f"Translation failed: {error}"
    return outcome


# Smoke test: translate two short German sentences into English.
translate_text(
    text="Das ist ein Satz. Das ist ein Test.",
    src_lang="de",
    dest_lang="en",
)

'This is a sentence. This is a test.'

In [None]:
import PyPDF2

# Show the working directory: the relative data paths used in this notebook
# are resolved against it.
print(os.getcwd())


def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract the text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages in document order, each page terminated by a
        newline so that words at a page boundary do not run together (the
        same convention ``extract_text_from_docx`` uses for paragraphs).
    """
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # Extraction must happen while the file is still open.
        # `or ""` is defensive: a page yielding no text contributes an empty
        # line instead of breaking the concatenation.
        page_texts = [page.extract_text() or "" for page in reader.pages]
    return "".join(page_text + "\n" for page_text in page_texts)


# Extract the sample PDF and store the result in the configured output folder
# instead of relying on the hard-coded default of `save_text_to_json`.
filename = "Einheitliches Referenz Skript.pdf"
save_text_to_json(
    filename_orignial=filename,
    current_dir=data_raw_dir,
    text=extract_text_from_pdf(pdf_path=os.path.join(data_raw_dir, filename)),
    extract_dir=data_extracted_dir,
)

c:\Users\damia\Github\ZHAW\PM4\ByteMentor\data_utils


In [None]:
from pptx import Presentation


def extract_text_from_pptx(pptx_path: str) -> str:
    """Extract the text of all text-bearing shapes of a PowerPoint file.

    Args:
        pptx_path: Path of the ``.pptx`` file to read.

    Returns:
        The ``text`` of every shape that has such an attribute, in slide and
        shape order. Each shape's text is terminated by a newline so that
        e.g. a title and the following body do not run together (the same
        convention ``extract_text_from_docx`` uses for paragraphs).
    """
    presentation = Presentation(pptx_path)
    return "".join(
        shape.text + "\n"
        for slide in presentation.slides
        for shape in slide.shapes
        # Shapes without a `text` attribute are skipped.
        if hasattr(shape, "text")
    )


# Extract the sample presentation and store the result in the configured
# output folder instead of relying on the hard-coded default of
# `save_text_to_json`.
filename = "DENT_SW2_Market and Environment Analysis_ppt templates.pptx"
save_text_to_json(
    filename_orignial=filename,
    current_dir=data_raw_dir,
    text=extract_text_from_pptx(pptx_path=os.path.join(data_raw_dir, filename)),
    extract_dir=data_extracted_dir,
)

In [None]:
from docx import Document


def extract_text_from_docx(docx_path: str) -> str:
    """Extract the paragraph text of a Word document.

    Args:
        docx_path: Path of the ``.docx`` file to read.

    Returns:
        The text of every entry in the document's ``paragraphs``, in order,
        each followed by a newline. Nothing else is read from the document.
    """
    paragraphs = Document(docx_path).paragraphs
    return "".join(f"{paragraph.text}\n" for paragraph in paragraphs)


# Extract the sample Word document and store the result in the configured
# output folder instead of relying on the hard-coded default of
# `save_text_to_json`.
filename = "3.1 Projektcharta Vorlage.docx"
save_text_to_json(
    filename_orignial=filename,
    current_dir=data_raw_dir,
    text=extract_text_from_docx(docx_path=os.path.join(data_raw_dir, filename)),
    extract_dir=data_extracted_dir,
)