In [5]:
import os
import glob
import pandas as pd
import librosa
import torch
from tqdm import tqdm
from transformers import AutoProcessor, SeamlessM4Tv2ForSpeechToText

# Pick the compute device: CUDA when torch reports it available, otherwise CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Resolve working locations relative to the current working directory:
#   base_dir    -> parent of the working directory
#   uploads_dir -> <cwd>/uploads
#   data_dir    -> <cwd>/data
base_dir = os.path.dirname(os.getcwd())
uploads_dir, data_dir = (
    os.path.join(os.getcwd(), folder) for folder in ("uploads", "data")
)

# Make sure the data directory exists (no error if it is already there)
os.makedirs(data_dir, exist_ok=True)

print(f"Looking for MP3 files in: {uploads_dir}")

Looking for MP3 files in: c:\Users\Adria\dev\blindwikiapp\notebooks\uploads


In [4]:

# Checkpoint identifier shared by both from_pretrained calls below
model_name = "facebook/seamless-m4t-v2-large"

# Load the processor, then the speech-to-text model, and move the model
# onto the device selected in the setup cell
processor = AutoProcessor.from_pretrained(model_name)
model = SeamlessM4Tv2ForSpeechToText.from_pretrained(model_name)
model = model.to(device)


Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s]


In [8]:
def transcribe_audio(file_path, target_lang="eng"):
    """
    Produce text for one audio file using the SeamlessM4T speech-to-text model.

    The file is loaded as 16 kHz mono audio, passed through the notebook-level
    ``processor`` and ``model``, and the first decoded sequence is returned.
    Text is generated with ``tgt_lang=target_lang``.

    Args:
        file_path (str): Path to the audio file.
        target_lang (str): Target language code forwarded to
            ``model.generate`` (default: 'eng').

    Returns:
        str | None: The decoded text, or None when any step raised an
        exception (the error is printed rather than re-raised).
    """
    try:
        # Load the audio resampled to 16 kHz mono; librosa returns a NumPy array
        waveform, sample_rate = librosa.load(file_path, sr=16000, mono=True)

        # Give the NumPy waveform to the processor directly. Feature
        # extraction works on host-side arrays, and a tensor that has already
        # been moved to a CUDA device cannot be converted back to NumPy
        # implicitly, so only the processor's output is moved to `device`.
        inputs = processor(
            audios=waveform,
            sampling_rate=sample_rate,
            return_tensors="pt"
        ).to(device)

        # Generation needs no gradients
        with torch.no_grad():
            outputs = model.generate(**inputs, tgt_lang=target_lang)

        # A single input was supplied, so take the first decoded sequence
        return processor.batch_decode(outputs, skip_special_tokens=True)[0]

    except Exception as e:
        # Deliberate best-effort: report the failure and let the caller
        # continue with the next file
        print(f"Error procesando {file_path}: {str(e)}")
        return None


In [9]:
# Find all MP3 files in the uploads directory. glob returns matches in an
# unspecified order, so sort them to keep the processing order (and the row
# order of the results) reproducible between runs.
mp3_files = sorted(glob.glob(os.path.join(uploads_dir, "*.mp3")))
print(f"Found {len(mp3_files)} MP3 files to process")

# Column-oriented store for the results: one entry per processed file
results = {
    'file': [],
    'transcription': []
}

# Process each file
for file_path in tqdm(mp3_files):  # Using regular tqdm, not tqdm.notebook
    file_name = os.path.basename(file_path)
    print(f"Processing: {file_name}")

    transcription = transcribe_audio(file_path)

    # transcribe_audio signals failure with None only; an empty string is a
    # valid (empty) transcription and must not be recorded as an error.
    succeeded = transcription is not None

    results['file'].append(file_name)
    results['transcription'].append(transcription if succeeded else "ERROR")

    # Report progress for this file
    if succeeded:
        print(f"Transcripción completada para: {file_name}")
    else:
        print("Transcripción fallida.")
    print("-" * 80)

# Build a DataFrame from the collected results
df = pd.DataFrame(results)

# Display the first few rows
print(df.head())

Found 6 MP3 files to process


  0%|          | 0/6 [00:00<?, ?it/s]

Processing: acoruna_barco_m67780_a84524_audio_converted.mp3


 17%|█▋        | 1/6 [00:08<00:42,  8.49s/it]

Transcripción completada para: acoruna_barco_m67780_a84524_audio_converted.mp3
--------------------------------------------------------------------------------
Processing: barcelona_JuanNuez_m70566_a87310_audio_converted.mp3


 33%|███▎      | 2/6 [00:13<00:24,  6.23s/it]

Transcripción completada para: barcelona_JuanNuez_m70566_a87310_audio_converted.mp3
--------------------------------------------------------------------------------
Processing: barcelona_Ovvero_m68255_a84999_audio_converted.mp3


 50%|█████     | 3/6 [00:23<00:24,  8.25s/it]

Transcripción completada para: barcelona_Ovvero_m68255_a84999_audio_converted.mp3
--------------------------------------------------------------------------------
Processing: berlin_Dirk_m67384_a84128_audio_converted.mp3


 67%|██████▋   | 4/6 [00:41<00:24, 12.03s/it]

Transcripción completada para: berlin_Dirk_m67384_a84128_audio_converted.mp3
--------------------------------------------------------------------------------
Processing: cuenca_Amalia_m69879_a86623_audio_converted.mp3


 83%|████████▎ | 5/6 [01:02<00:15, 15.22s/it]

Transcripción completada para: cuenca_Amalia_m69879_a86623_audio_converted.mp3
--------------------------------------------------------------------------------
Processing: elbarcelonC3A8s_Martagosa_m71623_a88367_audio_converted.mp3


100%|██████████| 6/6 [01:18<00:00, 13.16s/it]

Transcripción completada para: elbarcelonC3A8s_Martagosa_m71623_a88367_audio_converted.mp3
--------------------------------------------------------------------------------
                                                file  \
0    acoruna_barco_m67780_a84524_audio_converted.mp3   
1  barcelona_JuanNuez_m70566_a87310_audio_convert...   
2  barcelona_Ovvero_m68255_a84999_audio_converted...   
3      berlin_Dirk_m67384_a84128_audio_converted.mp3   
4    cuenca_Amalia_m69879_a86623_audio_converted.mp3   

                                       transcription  
0      In this area, we have three disability seats.  
1                          Well, we are in the mall.  
2  It's already recorded, so it's best to get on ...  
3  So there is a crossing here and next to the tr...  
4  As you walk down the street you find a plastic...  





In [10]:
# Write the results table to <data_dir>/transcriptions.csv, omitting the index column
output_path = os.path.join(data_dir, "transcriptions.csv")
df.to_csv(output_path, index=False)
print(f"Resultados guardados en {output_path}")

# Rows whose transcription is not the "ERROR" placeholder count as successes
success_count = int(df['transcription'].ne("ERROR").sum())
print(f"Se transcribieron exitosamente {success_count} de {len(mp3_files)} archivos")

Resultados guardados en c:\Users\Adria\dev\blindwikiapp\notebooks\data\transcriptions.csv
Se transcribieron exitosamente 6 de 6 archivos
