In [19]:
import pandas as pd
import numpy as np
import librosa
import glob

In [4]:
# Creating a session
from pyspark.sql import SparkSession

def create_session():
    """Create (or reuse) the Spark session used by this notebook.

    Builds a session with the application name "processing_core" via
    ``getOrCreate`` and sets the Spark context log level to "ERROR" to keep
    cell output readable.

    Returns:
        The SparkSession object produced by ``SparkSession.builder``.
    """
    spark = (
        SparkSession.builder
        .appName("processing_core")
        .getOrCreate()
    )
    # Only messages at ERROR level are logged from here on.
    spark.sparkContext.setLogLevel("ERROR")
    # Hand the session back; otherwise the caller has no handle to use it.
    return spark

In [None]:
import os
import glob
import torch
from faster_whisper import WhisperModel

# --- Folder with audio files ---
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
folder = "/home/sofiemeyer/Projects/hackathon/data_samples_eduzmena/Data samples"

# List of common audio extensions (without the leading dot)
audio_exts = ["wav", "mp3", "flac", "ogg", "m4a"]

# Collect all audio files.
# The folder is globbed once and filtered on the lowercased extension, so that
#   * mixed-case extensions such as ".Wav" are picked up, and
#   * a file is never listed twice on case-insensitive filesystems, where
#     separate "*.wav" and "*.WAV" patterns would both match it.
# glob.escape keeps special characters in the folder name from acting as
# wildcards; the "*" pattern still skips hidden dot-files. Directories are
# filtered out, and the result is sorted for a reproducible processing order.
wanted_suffixes = {f".{ext.lower()}" for ext in audio_exts}
audio_files = sorted(
    path
    for path in glob.glob(os.path.join(glob.escape(folder), "*"))
    if os.path.isfile(path)
    and os.path.splitext(path)[1].lower() in wanted_suffixes
)

print(f"Found {len(audio_files)} audio files.")

# --- Output folder for transcripts ---
output_folder = os.path.join(folder, "transcripts")
os.makedirs(output_folder, exist_ok=True)

# --- Detect device ---
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# --- Load Whisper model ---
model = WhisperModel("tiny", device=device)

# --- Transcribe only files without existing transcript ---
# NOTE: transcripts are keyed by the audio file's base name only, so two
# recordings that differ only in extension (e.g. "a.wav" and "a.mp3") share
# "a.txt"; the second one encountered is skipped as already transcribed.
for audio_path in audio_files:
    filename = os.path.basename(audio_path)
    txt_filename = os.path.splitext(filename)[0] + ".txt"
    txt_path = os.path.join(output_folder, txt_filename)

    if os.path.exists(txt_path):
        print(f"Skipping {filename}, transcript already exists.")
        continue

    print(f"Transcribing {filename}...")

    segments, info = model.transcribe(audio_path)

    # Build one "[start -> end] text" line per segment and join them in a
    # single pass instead of growing a string with repeated "+=".
    # The file is opened only after all segments have been consumed, so a
    # failure while iterating leaves no transcript file behind to be skipped
    # on the next run.
    transcript_text = "".join(
        f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}\n"
        for segment in segments
    )

    # Save transcript to a text file
    with open(txt_path, "w", encoding="utf-8") as f:
        f.write(transcript_text)

    print(f"Saved transcript to {txt_path}")

print("All missing audio transcripts have been generated.")

[]