In [1]:
# %% 1 – Basic configuration
import pathlib, os
import re

# Directory holding the PDFs. Set the VOC_PDF_DIR environment variable to
# point somewhere else instead of editing the notebook.
PDF_DIR = pathlib.Path(os.environ.get("VOC_PDF_DIR", r"C:\Daten\vocdata\uploads\pdf"))
assert PDF_DIR.exists(), f"Verzeichnis {PDF_DIR} nicht gefunden"

# Database URL. Set VOC_DB_URL to supply real credentials from the
# environment; the literal below is only the local development default.
DB_URL = os.environ.get(
    "VOC_DB_URL",
    "mysql+pymysql://root:voc_root@localhost:3306/vocdata?charset=utf8mb4",
)

print("PDF_DIR =", PDF_DIR)
# Cell output is stored with the notebook, so never print the password.
print("DB_URL  =", re.sub(r"(?<=://)([^:/@]+):[^@]*@", r"\1:***@", DB_URL))


AssertionError: Verzeichnis C:\Daten\vocdata\uploads\pdf nicht gefunden

In [None]:
# %% 2 – Imports and connection
import sqlalchemy as sa
import pandas as pd
from sqlalchemy import inspect

engine = sa.create_engine(DB_URL)

# Smoke test: open one connection and run a trivial statement so that a wrong
# URL or an unreachable server is reported here instead of in a later cell.
try:
    with engine.connect() as conn:
        conn.exec_driver_sql("SELECT 1")
    print("Datenbankverbindung funktioniert")
except Exception as exc:
    # Abort the run with a short message
    raise SystemExit(f"Verbindung fehlgeschlagen: {exc}")


In [None]:
# %% 3 – Table overview
# `insp` and `tables` are reused by the following cells.
insp = sa.inspect(engine)
tables = insp.get_table_names()
print("Tabellen im Schema:", tables)


In [None]:
# %% 4 – Structure of qual_docs
if "qual_docs" not in tables:
    print("Tabelle qual_docs existiert NOCH NICHT")
else:
    cols = insp.get_columns("qual_docs")
    # One row per column: name, type rendered as text, nullability flag
    df_cols = pd.DataFrame(
        data=[(col["name"], str(col["type"]), col["nullable"]) for col in cols],
        columns=["Spalte", "Typ", "NULL?"],
    )
    display(df_cols)


In [None]:
# %% 5 – Document count
# n_total is always assigned (0 when the table is missing) because the
# following cells use it as their guard.
if "qual_docs" in tables:
    # Engine.scalar() ("connectionless" execution) no longer exists in
    # SQLAlchemy 2.0; run the query on an explicit, short-lived connection.
    with engine.connect() as conn:
        n_total = conn.scalar(sa.text("SELECT COUNT(*) FROM qual_docs"))
    print("Datensätze in qual_docs:", n_total)
else:
    n_total = 0


In [None]:
# %% 6 – Duplicates by file name
if n_total:
    # At most 20 file names that occur more than once, most frequent first
    sql = """
        SELECT filename, COUNT(*) AS n
        FROM qual_docs
        GROUP BY filename
        HAVING n > 1
        ORDER BY n DESC
        LIMIT 20
    """
    dup = pd.read_sql(sql=sql, con=engine)
    if not dup.empty:
        print("Mögliche Duplikate (Dateiname):")
        display(dup)
    else:
        print("Keine Dateinamen‑Duplikate")


In [None]:
# %% 7 – Duplicates by sha256
# Skipped when the table is empty or has no sha256 column.
if n_total and "sha256" in [c["name"] for c in cols]:
    # Rows without a hash are excluded: GROUP BY collects every NULL into a
    # single group, which would otherwise be reported as a "duplicate".
    sql = """
        SELECT sha256, COUNT(*) AS n, MIN(filename) AS beispiel
        FROM qual_docs
        WHERE sha256 IS NOT NULL
        GROUP BY sha256
        HAVING n > 1
        ORDER BY n DESC
        LIMIT 20
    """
    dup2 = pd.read_sql(sql, engine)
    if dup2.empty:
        print("Keine Hash‑Duplikate")
    else:
        print("Mögliche Duplikate (Hash):")
        display(dup2)


In [None]:
# %% 8 – Document types
# Runs only when the table has rows and carries a doc_type column.
if n_total and any(c["name"] == "doc_type" for c in cols):
    typ_stat = pd.read_sql(
        sql="SELECT doc_type, COUNT(*) AS n FROM qual_docs GROUP BY doc_type",
        con=engine,
    )
    display(typ_stat)


In [None]:
# %% 9 – Comparison file system ↔ DB
# File names (without directory) of all PDFs directly inside PDF_DIR
pdf_fs   = {p.name for p in PDF_DIR.glob("*.pdf")}
print("PDFs im Ordner:", len(pdf_fs))

if n_total:
    # Engine.execute() was removed in SQLAlchemy 2.0; an explicit connection
    # works on 1.4 and 2.0 and releases the result cursor deterministically.
    with engine.connect() as conn:
        pdf_db = {r.filename for r in conn.execute(sa.text("SELECT filename FROM qual_docs"))}
    only_fs  = pdf_fs - pdf_db
    only_db  = pdf_db - pdf_fs

    print("Nur im Ordner, nicht in DB:", len(only_fs))
    if only_fs:
        print(sorted(only_fs)[:10])   # at most 10 examples

    print("Nur in DB, nicht im Ordner:", len(only_db))
    if only_db:
        print(sorted(only_db)[:10])   # at most 10 examples


In [None]:
# %% 10 – Search for embedding/chunk tables
# Candidate tables are those whose name contains "chunk" or "embed"
cand = [name for name in tables if any(key in name for key in ("chunk", "embed"))]
print("Mögliche Tabellen für Embeddings/Chunks:", cand or "keine gefunden")
