# META transcription

Gabriel Bonnin

In [None]:
! pip install -r requirements.txt

# Optional, for GPU support (PyTorch wheels from the cu128 index):
! pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128

In [None]:
from dotenv import load_dotenv
import os
import torch

# === Device setup ===
# Probe the accelerators in priority order: Apple Metal (MPS) first, then
# NVIDIA CUDA, otherwise fall back to the CPU. Each branch picks the device
# string and its status text; the text is printed once at the end.
if torch.backends.mps.is_available():
    device = "mps"
    device_message = "Apple GPU (Metal) erkannt – Whisper läuft auf der GPU."
elif torch.cuda.is_available():
    device = "cuda"
    # The GPU name is only queried once CUDA is known to be available.
    device_message = f"NVIDIA GPU erkannt ({torch.cuda.get_device_name(0)}) – Whisper läuft auf der GPU."
else:
    device = "cpu"
    device_message = "Keine GPU erkannt – Whisper läuft auf dem CPU."
print(device_message)

# === Paths ===
# DATA_ROOT is read from the environment; load_dotenv() is called first so a
# .env file can supply it.
load_dotenv()
data_root = os.getenv("DATA_ROOT")

# Validate BEFORE building any path: os.path.join(None, ...) raises a bare
# TypeError, which would hide the actual configuration problem and make the
# descriptive error below unreachable.
if data_root is None:
    raise EnvironmentError("DATA_ROOT ist nicht gesetzt! Bitte .env anlegen.")

raw_audio_path = os.path.join(data_root, "raw/raw_audio")  # folder containing the recorded audio files
processed_path = os.path.join(data_root, "processed/processed_transcriptions")  # output folder for the transcription CSV and the log file
os.makedirs(processed_path, exist_ok=True)

processed_log_path = os.path.join(processed_path, "processed_log.txt")
output_csv = os.path.join(processed_path, "Transcriptions.csv")

# === Whisper model setup ===
model_name = "large-v2"  # Choose according to GPU memory / the machine the code runs on. Options: "tiny", "base", "small", "medium", "large", "large-v2", "large-v3"

In [None]:
import os
import pandas as pd
import re
import whisper
import torch
import librosa

# === Helper: load the set of already-processed files ===
# Every non-blank line of the log is one file path. Paths are made absolute
# and converted to forward slashes so they compare equal to the paths that
# are collected from the audio folder.
processed_files = set()
if os.path.exists(processed_log_path):
    with open(processed_log_path, "r", encoding="utf-8") as f:
        processed_files.update(
            os.path.abspath(logged).replace("\\", "/")
            for logged in (raw_line.strip() for raw_line in f)
            if logged
        )

# === Clear the GPU cache (if needed) ===
# Only relevant when the CUDA device was selected during device setup.
if device == "cuda":
    torch.cuda.empty_cache()

# === Load the Whisper model ===
# NOTE(review): `device` is not passed to load_model here, so the library
# presumably applies its own default device selection. Confirm that this
# matches the device announced during setup, in particular for "mps".
model = whisper.load_model(model_name)

# === NATO alphabet ===
# Maps each lower-case code word to its upper-case letter. Every code word
# listed here starts with the letter it stands for, so the value is derived
# from the word's first character.
nato_alphabet = {
    code_word: code_word[0].upper()
    for code_word in (
        "alpha", "bravo", "charlie", "delta", "echo",
        "foxtrot", "golf", "hotel", "india", "juliett",
        "kilo", "lima", "mike", "november", "oscar",
        "papa", "quebec", "romeo", "sierra", "tango",
        "uniform", "victor", "whiskey", "xray", "yankee", "zulu",
    )
}

# === Expected questions ===
# Spoken markers ("Frage <number>") whose answers are extracted from each
# transcript; the order here is the order of the result columns.
expected_markers = [
    f"Frage {question_number}"
    for question_number in (5, 6, 7, 8, 9, 10, 11, 12, 13, 40, 60, 61, 62)
]

def clean_answer(answer: str) -> str:
    """Strip the lead-in phrase and surrounding punctuation from an answer.

    A case-insensitive "hier folgt die antwort auf" is removed when it sits at
    the very start and/or the very end of ``answer``; afterwards spaces,
    periods, commas and hyphens are trimmed from both ends.
    """
    answer = re.sub(r'(?i)^\s*hier folgt die antwort auf\s*', '', answer)
    answer = re.sub(r'(?i)\s*hier folgt die antwort auf\s*$', '', answer)
    return answer.strip(" .,-")


# Matches a spoken question marker; the number is captured so the marker can
# be canonicalized independently of spacing and letter case.
_MARKER_PATTERN = re.compile(r'Frage\s*(\d+)', re.IGNORECASE)


def _marker_key(marker: str) -> str:
    """Return the comparison key for a question marker.

    "Frage 5", "frage5" and "FRAGE   5" all yield "frage 5". Text that is not
    of the form "Frage <number>" is only lower-cased and whitespace-collapsed.
    """
    match = re.fullmatch(r'\s*frage\s*(\d+)\s*', marker, re.IGNORECASE)
    if match:
        return f"frage {match.group(1)}"
    return re.sub(r'\s+', ' ', marker).strip().lower()


def robust_extract_questions(text: str, expected_markers: list) -> dict:
    """Extract the answer that follows each expected question marker.

    ``text`` is split at every "Frage <number>" occurrence (any letter case,
    optional whitespace before the number). The answer to a marker is the text
    between that marker and the next marker of any number, or the end of the
    text, cleaned with ``clean_answer``.

    Args:
        text: Transcript to search.
        expected_markers: Markers such as "Frage 5" to report on.

    Returns:
        A dict keyed by the entries of ``expected_markers``; the value is the
        cleaned answer, or ``None`` when the marker does not occur. When a
        marker occurs several times, the last occurrence wins.
    """
    matches = list(_MARKER_PATTERN.finditer(text))
    # Each answer ends where the next marker starts; the last one runs to the
    # end of the text.
    span_ends = [m.start() for m in matches[1:]] + [len(text)]

    # Canonical key -> (start, end) of the answer text. Later occurrences
    # overwrite earlier ones on purpose, so a repeated answer replaces the
    # first attempt.
    latest_span = {}
    for m, span_end in zip(matches, span_ends):
        # Building the key from the captured number makes "Frage5" (no space)
        # resolve to the same key as "Frage 5".
        latest_span[f"frage {m.group(1)}"] = (m.end(), span_end)

    answers = {marker: None for marker in expected_markers}
    for marker in expected_markers:
        span = latest_span.get(_marker_key(marker))
        if span is not None:
            start, end = span
            answers[marker] = clean_answer(text[start:end].strip())
    return answers

# === Collect all new audio files ===
# Walk the raw-audio tree and keep every file with a supported audio
# extension that is not yet listed in the processed log. Names starting with
# "._" are skipped (presumably macOS sidecar files; verify if in doubt).
audio_files = []
for root, _, files in os.walk(raw_audio_path):
    for f in files:
        if f.startswith('._'):
            continue
        if not f.lower().endswith(('.wav', '.mp3', '.m4a', '.flac')):
            continue
        # Same normalization as the log entries: absolute, forward slashes.
        full_path = os.path.abspath(os.path.join(root, f)).replace("\\", "/")
        if full_path in processed_files:
            continue
        audio_files.append(full_path)

print(f"{len(audio_files)} neue Dateien gefunden.")

# === Load the existing CSV, if there is one ===
# Start from an empty result list and replace it with the rows of a previous
# run when the output CSV already exists, so new rows are appended to them.
data = []
if os.path.exists(output_csv):
    df = pd.read_csv(output_csv)
    data = df.to_dict("records")

# === Process the new files ===
# For each new audio file: transcribe it, extract the participant code and
# the answers to the expected questions, then persist the result.
# NOTE(review): there is no per-file error handling; an exception while
# loading or transcribing one file stops the whole run.
for audio_path in audio_files:
    # Same normalization as used for the log entries (absolute, forward slashes).
    audio_path = os.path.abspath(audio_path).replace("\\", "/")

    # print(f"Processing: {audio_path}")  # Uncomment to show the path being processed

    # --- Transcription: load the audio with librosa and pass the samples to Whisper ---
    audio, sr = librosa.load(audio_path, sr=16000)  # resample to 16 kHz
    result = model.transcribe(audio, language="de")
    text = result['text']

    # --- Code extraction ---
    # Looks for a word of letters, optional separators (whitespace, colon,
    # comma, hyphen) and then six digits, each optionally followed by
    # whitespace or commas. Only the first such match in the transcript is
    # considered.
    # NOTE(review): if that first match is not a NATO word, `code` stays None
    # even when a valid code follows later in the text.
    code_match = re.search(r'\b([A-Za-z]+)[\s:,-]*((?:\d[\s,]*){6})\b', text)
    code = None
    if code_match:
        nato_word = code_match.group(1).lower()
        # Drop the separators between the digits.
        digits = re.sub(r'[\s,]+', '', code_match.group(2))
        if len(digits) == 6 and nato_word in nato_alphabet:
            # Code = letter of the NATO word followed by the six digits.
            code = f"{nato_alphabet[nato_word]}{digits}"

    # --- Question extraction ---
    question_answers = robust_extract_questions(text, expected_markers)

    # Store the result: one column per expected question, named "q<number>".
    entry = {"file_path": audio_path, "code": code, "full_transcript": text}
    entry.update({f"q{m.split()[1]}": question_answers.get(m) for m in expected_markers})
    data.append(entry)

    # --- Update the CSV ---
    # The complete CSV is rewritten after every file.
    df = pd.DataFrame(data)
    df.to_csv(output_csv, index=False, encoding="utf-8")

    # --- Update the log ---
    # Appended only after the CSV has been written.
    with open(processed_log_path, "a", encoding="utf-8") as log:
        log.write(audio_path + "\n")

# === Final message ===
print("Fertig mit der Verarbeitung aller neuen Dateien.")