In [5]:
import speech_recognition as sr
import math
from tqdm import tqdm
import os
import pandas as pd
import librosa
from functions import * 

In [3]:
def convert2text(audios, seconds=30, noise=False):
    r = sr.Recognizer()
    archivos = os.listdir(audios)
    datos = []

    for archivo in archivos:
        if archivo.endswith('.wav'):
            audio_path = os.path.join(audios, archivo)
            with sr.AudioFile(audio_path) as source:
                duration = source.DURATION
                splits = math.ceil(duration / seconds)
            
            transcripcion = ""  
            
            for i in tqdm(range(splits)):
                with source as src:
                    if noise:
                        r.adjust_for_ambient_noise(src)
                    audio = r.record(src, offset=i * seconds, duration=seconds)
                
                try:
                    texto = r.recognize_google(audio, language="es-MX")
                    transcripcion += texto + " "
                    speech_speed = get_speech_speed(audio_path)
                except:
                    pass
                
            datos.append([archivo, transcripcion, speech_speed])
            
    df = pd.DataFrame(datos, columns=["Archivo", "Transcripcion", "Velocidad"])
    df = df.loc[df['Transcripcion'].notna() & (df['Transcripcion'] != '')]
    df.to_csv("transcripciones.csv", index=False, encoding='latin-1')


#%time
#carpeta_audio = "C:/Users/DSTHREE/Documents/GITHUB/Speech2Text/GRABACIONES"
#convert2text(carpeta_audio, seconds=10) 
  

In [15]:
def get_speech_speed(audio_path):
    # Se carga el audio
    audio, sr = librosa.load(audio_path)
            
    # Extraer segmentos del audio
    speech_segments = librosa.effects.split(audio, top_db=20)
            
    # Calcular la duración de cada segmento
    segment_durations = librosa.frames_to_time(
        [segment[1] - segment[0] for segment in speech_segments],
        sr=sr
         )
            
    # Calculate the average speed of speech
    average_speed = sum(segment_durations) / len(segment_durations)
            
    return average_speed
        

In [16]:
%time
carpeta_audio = "Z:/CALIDAD/INTELIGENCIA_ARTIFICIAL/LLAMADAS_DIMEX"
convert2text(carpeta_audio, seconds=10)

CPU times: total: 0 ns
Wall time: 0 ns


  0%|          | 0/36 [00:00<?, ?it/s]Exception ignored on calling ctypes callback function: <function ExecutionEngine._raw_object_cache_notify at 0x000001DC16234F70>
Traceback (most recent call last):
  File "c:\Users\DSTHREE\Documents\GITHUB\Speech2Text\speech2txt\lib\site-packages\llvmlite\binding\executionengine.py", line 171, in _raw_object_cache_notify
    def _raw_object_cache_notify(self, data):
KeyboardInterrupt: 
 11%|█         | 4/36 [00:14<01:58,  3.72s/it]

In [4]:
import os
import math
import speech_recognition as sr
import pandas as pd
from tqdm import tqdm
import librosa
from pyAudioAnalysis import audioSegmentation
import speech_recognition as sr

def convertir_audio_a_texto(segmentos_audio):
    r = sr.Recognizer()
    texto_transcripcion = ""
    for segmento in segmentos_audio:
        try:
            texto = r.recognize_google(segmento, language="es-MX")
            texto_transcripcion += texto + " "
        except sr.UnknownValueError:
            print("No se pudo reconocer el audio.")
        except sr.RequestError as e:
            print("Error en la solicitud al servicio de reconocimiento de voz: {0}".format(e))
    return texto_transcripcion

def get_speech_speed(audio_path):
    # Se carga el audio
    audio, sr = librosa.load(audio_path)
            
    # Extraer segmentos del audio
    speech_segments = librosa.effects.split(audio, top_db=20)
            
    # Calcular la duración de cada segmento
    segment_durations = librosa.frames_to_time(
        [segment[1] - segment[0] for segment in speech_segments],
        sr=sr
         )
            
    # Calculate the average speed of speech
    average_speed = sum(segment_durations) / len(segment_durations)
            
    return average_speed

def convert2txt(audios, seconds=30, noise=False):
    archivos = os.listdir(audios)
    datos = []

    for archivo in archivos:
        if archivo.endswith('.wav'):
            audio_path = os.path.join(audios, archivo)
            with sr.AudioFile(audio_path) as source:
                duration = librosa.get_duration(path=audio_path)
                splits = math.ceil(duration / seconds)

            # Obtener los segmentos de audio usando pyAudioAnalysis
            audio, sampling_rate = librosa.load(audio_path, sr=None)
            segmentos_habla = audioSegmentation.speaker_diarization(audio, sampling_rate)

            segmentos_por_persona = {}
            for segmento in segmentos_habla:
                inicio, fin = segmento[0], segmento[1]
                persona_id = segmento[2]
                if persona_id not in segmentos_por_persona:
                    segmentos_por_persona[persona_id] = []
                segmentos_por_persona[persona_id].append(audio[inicio:fin])

            # Transcribir los segmentos de cada persona
            transcripciones_por_persona = {}
            for persona_id, segmentos_persona in segmentos_por_persona.items():
                transcripcion = convertir_audio_a_texto(segmentos_persona)
                transcripciones_por_persona[persona_id] = transcripcion

            speech_speed = get_speech_speed(audio_path)

            datos.append([archivo, transcripciones_por_persona, speech_speed])

    df = pd.DataFrame(datos, columns=["Archivo", "Transcripciones", "Velocidad"])
    df.to_csv("transcripciones.csv", index=False, encoding='utf-8')

# Llamada a la función



In [5]:
convert2txt("Z:/CALIDAD/INTELIGENCIA_ARTIFICIAL/LLAMADAS_DIMEX")

Error: file not found or other I/O error. (DECODING FAILED)


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
  sig_array_norm -= sig_array_norm.mean()
  ret = ret.dtype.type(ret / rcount)


ValueError: zero-size array to reduction operation maximum which has no identity

In [8]:
from google.cloud import speech_v1p1beta1 as speech  

In [11]:
client = speech.SpeechClient()
speech_file = "Z:/CALIDAD/INTELIGENCIA_ARTIFICIAL/LLAMADAS_DIMEX/ALLOPEZR_DIMEXENOJO.wav"

with open(speech_file, 'rb') as audio_file:
    content = audio_file.read()
audio = speech.RecognitionAudio(content = content)

diarization_config = speech.SpeakerDiarizationConfig(
    enable_speaker_diarization = True,
    min_speaker_count=2,
    max_speaker_count=10,
)

config = speech.RecognitionConfig(
    encoding = speech.RecognitionConfig.AudioEncoding.LINEAR16, 
    sample_rate_hertz=8000,
    language_code="es-MX",
    diarization_config=diarization_config,
)

print("Esperando a que la operación se complete")
response = client.recognize(config=config, audio=audio)
result = response.results[-1]

words_info = result.alternatives[0].words

for word_info in words_info:
    print(
        f"word: '{word_info.word}', speaker_tag: {word_info.speaker_tag}"
    )
    
return result

DefaultCredentialsError: Your default credentials were not found. To set up Application Default Credentials, see https://cloud.google.com/docs/authentication/external/set-up-adc for more information.