## **Procesar y cortar Audio**

In [7]:
import librosa
import IPython.display as ipd
import numpy as np
import os
%matplotlib inline
import matplotlib.pyplot as plt
import librosa.display
import scipy
from scipy.io.wavfile import write
import shutil

def delete_splited_folder():
    if os.path.exists("splited"):
        shutil.rmtree("splited")

delete_splited_folder()

#@markdown - Sample Rate:
sr = 22050 #@param {type:"integer"}
#@markdown - Ruta de archivo + nombre:
ruta_de_archivo_de_audio = "/content/drive/MyDrive/Killjoy.wav" #@param {type:"string"}
#@markdown - prefijo, se puede dejar en blanco:
prefijo = "" #@param {type:"string"}
#@markdown ----
#@markdown - Descartar audios cortados de menos de X segundos:
descartar_menos_de = 1 #@param {type:"integer"}
#@markdown - Aplicar preenfasis al audio:
preemph = False #@param {type:"boolean"}
#@markdown - Coeficiente de preenfasis:
preemphasis = 0.8 #@param {type:"number"}

#@markdown - Tiempo mínimo entre segmentos (milisegundos):
min_diff = 400 #@param {type:"integer"}

#@markdown - Tamaño mínimo de segmentos (milisegundos):
min_size = 5000 #@param {type:"integer"}

#@markdown - Umbral de dB para detectar cortes:
db = 60 #@param {type:"number"}
def split_audio_to_list(source, preemph=True, preemphasis=0.8, min_diff=3000, min_size=3000, db=50):
    if preemph:
        source = np.append(source[0], source[1:] - preemphasis * source[:-1])
    split_list = librosa.effects.split(source, top_db=db).tolist()
    i = len(split_list) - 1
    while i > 0:
        if split_list[i][-1] - split_list[i][0] > min_size:
            now = split_list[i][0]
            prev = split_list[i - 1][1]
            diff = now - prev
            if diff < min_diff:
                split_list[i - 1] = [split_list[i - 1][0], split_list.pop(i)[1]]
        else:
            split_list.pop(i)
        i -= 1
    return [x for x in split_list if x[-1] - x[0] > min_size]

def trim_custom(audio, begin_db=25, end_db=30):
    begin = librosa.effects.trim(audio, top_db=begin_db)[1][0]
    end = librosa.effects.trim(audio, top_db=end_db)[1][1]
    return audio[begin:end]
# ...

def save_splits(audio, output_dir, i):
    output_path = os.path.join(output_dir, f"{prefijo}{i}.wav")
    scipy.io.wavfile.write(output_path, sr, audio)
    

y, _ = librosa.load(ruta_de_archivo_de_audio, sr=sr)
y_split = split_audio_to_list(y, preemph=preemph, preemphasis=preemphasis, min_diff=min_diff, min_size=min_size, db=db)
output_dir = "splited"
if not os.path.exists(output_dir):
    os.mkdir(output_dir)
for i, part in enumerate(y_split):
    audio = trim_custom(y[slice(*part)])
    
    # Descartar segmentos de audio de menos de un segundo
    if len(audio) < sr*descartar_menos_de:
        continue
    
    save_splits(audio, output_dir, i)
print("[1;32marchivos guardados correctamente")

[1;32marchivos guardados correctamente


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#@markdown #(Opcional)Mostrar audios
audio_dir = "splited"

# Obtener la lista de archivos de audio
audio_files = os.listdir(audio_dir)
audio_files = [x for x in audio_files if os.path.splitext(x)[1] == ".wav"]
# Ordenar la lista de archivos de audio por número
audio_files = sorted(audio_files, key=lambda x: int(os.path.splitext(x)[0]))

# Mostrar cada archivo de audio
for audio_file in audio_files:
    audio_path = os.path.join(audio_dir, audio_file)
    print(f"Reproduciendo {audio_file}:")
    ipd.display(ipd.Audio(audio_path))


In [47]:
from IPython.display import clear_output 
import os
#@markdown #(Opcional)Mostrar audios y elimina no deseados (de momento el boton con el nombre del audio lo elimina, esto es una opcion muy experimental, usala solo si entiendes de errores de programacion)
audio_dir = "splited"
def on_delete_clicked(b):
    # Obtener el nombre del archivo de audio a eliminar
    audio_file = b.original_description
    # Construir la ruta del archivo de audio
    audio_path = os.path.join(audio_dir, audio_file)
    # Eliminar el archivo
    os.remove(audio_path)
    # Refrescar el directorio de archivos de audio
    audio_files = os.listdir(audio_dir)
    audio_files = [x for x in audio_files if os.path.splitext(x)[1] == ".wav"]
    audio_files = sorted(audio_files, key=lambda x: int(os.path.splitext(x)[0]))
    # Refrescar el valor de start y end
    start = 0
    end = 10
    # Mostrar los archivos de audio
    clear_output()
    display(next_button, previous_button)
    show_audio_files(audio_files, start, end)

def show_audio_files(audio_files, start, end):
    print(f'mostrando audios del {start} al {end}')
    for audio_file in audio_files[start:end-1]:
        audio_path = os.path.join(audio_dir, audio_file)
        print(f"Reproduciendo {audio_file}:")
        ipd.display(ipd.Audio(audio_path))
        delete_button = widgets.Button(description=f"Eliminar {audio_file}")
        delete_button.original_description = audio_file
        delete_button.on_click(on_delete_clicked)
        display(delete_button)
        
def show_next_audio_files(b):
    global start, end
    start += 10
    end += 10
    audio_files = os.listdir(audio_dir)
    audio_files = [x for x in audio_files if os.path.splitext(x)[1] == ".wav"]
    audio_files = sorted(audio_files, key=lambda x: int(os.path.splitext(x)[0]))
    clear_output()
    display(next_button, previous_button)
    show_audio_files(audio_files, start, end)

def show_previous_audio_files(b):
    global start, end
    start -= 10
    end -= 10
    audio_files = os.listdir(audio_dir)
    audio_files = [x for x in audio_files if os.path.splitext(x)[1] == ".wav"]
    audio_files = sorted(audio_files, key=lambda x: int(os.path.splitext(x)[0]))
    clear_output()
    display(next_button, previous_button)
    show_audio_files(audio_files, start, end)

next_button = widgets.Button(description="Siguiente")
previous_button = widgets.Button(description="Anterior")

next_button.on_click(show_next_audio_files)
previous_button.on_click(show_previous_audio_files)

display(next_button, previous_button)
show_audio_files(audio_files, start, end)


Button(description='Siguiente', style=ButtonStyle())

Button(description='Anterior', style=ButtonStyle())

mostrando audios del 0 al 10
Reproduciendo 2.wav:


Button(description='Eliminar 2.wav', style=ButtonStyle())

Reproduciendo 4.wav:


Button(description='Eliminar 4.wav', style=ButtonStyle())

Reproduciendo 5.wav:


Button(description='Eliminar 5.wav', style=ButtonStyle())

Reproduciendo 6.wav:


Button(description='Eliminar 6.wav', style=ButtonStyle())

Reproduciendo 8.wav:


Button(description='Eliminar 8.wav', style=ButtonStyle())

Reproduciendo 10.wav:


Button(description='Eliminar 10.wav', style=ButtonStyle())

Reproduciendo 11.wav:


Button(description='Eliminar 11.wav', style=ButtonStyle())

Reproduciendo 13.wav:


Button(description='Eliminar 13.wav', style=ButtonStyle())

Reproduciendo 15.wav:


Button(description='Eliminar 15.wav', style=ButtonStyle())

In [None]:
#@markdown #(Opcional)Descarga todos los wavs separados en un zip
import shutil
from google.colab import files

def zip_and_download(folder_path, zip_file_name):
    shutil.make_archive(zip_file_name, "zip", folder_path)
    files.download(f"{zip_file_name}.zip")

# Crear ZIP y descargar archivo
zip_and_download("splited", "splited_files")

## **Whisper**

In [4]:
#@markdown # Dependencias
!pip install git+https://github.com/openai/whisper.git 
!sudo apt update && sudo apt install ffmpeg
!pip install librosa

import whisper
import time
import librosa
import soundfile as sf
import re
import os

# model = whisper.load_model("tiny.en")
# model = whisper.load_model("base.en")   
#model = whisper.load_model("small.en") # load the small model
#model = whisper.load_model("medium")
model = whisper.load_model("large")

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-mqg5_khn
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-mqg5_khn
  Resolved https://github.com/openai/whisper.git to commit 28769fcfe50755a817ab922a7bc83483159600a9
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers>=4.19.0
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m86.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ffmpeg-python==0.2.0
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl (25 kB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

100%|██████████████████████████████████████| 2.87G/2.87G [00:23<00:00, 133MiB/s]


In [5]:
#@markdown # Procesaremos todos los audios cortados anteriormente

# This will create the WhisperAudio files if they don't exist.

import shutil

eliminar_textos_anteriores = True #@param {type:"boolean"}

def delete_whisper_folder():
    if os.path.exists("/content/WhisperAudio/TextFiles"):
        print("borrando")
        shutil.rmtree("/content/WhisperAudio/TextFiles")

if eliminar_textos_anteriores:
  delete_whisper_folder()

folders =  ["WhisperAudio/", "WhisperAudio/TextFiles/"]
for folder in folders:
  path = folder
  if not os.path.exists(path): # Create the folder if it does not exist
    os.mkdir(path)

# Assuming the audio files are in a folder called "splited" in the root of the drive
audio_folder = "splited"  #@param {type:"string"}
select_language = 'es'#@param {type:"string"}
# Get a list of all the file paths and names in the folder
import os
audio_files = []
audio_names = []
for file in os.listdir(audio_folder):
  if file.endswith(".m4a") or file.endswith(".mp3") or file.endswith(".wav"):
    audio_files.append(audio_folder + "/" + file)
    audio_names.append(file)

for f in audio_files:    
  print(f)

if len(audio_files) == 0:
  print("You have no files.")

# Loop through the audio files, split each audio file based on pauses in speech then transcribe them with Whisper.
for i, file in enumerate(audio_files): # For each audio file
  print(f"Processing {audio_names[i]}...")
  # Load the audio file and convert it to 16 kHz mono
  audio, sr = librosa.load(file, sr=16000, mono=True)
  # Detect pauses and split the audio. We use a threshold of -30 dB and a minimum pause length of 0.5 seconds.
  pauses = librosa.effects.split(audio, top_db=30, frame_length=2048, hop_length=128)
  # Transcribe each segment and concatenate the results
  transcription = ""
  for start, end in pauses: # For each segment
    segment = audio[start:end]
    # Save the segment as a temporary wav file
    temp_file = "temp.wav"
    sf.write(temp_file, segment, sr, subtype='PCM_16')
    if os.path.getsize(temp_file) > 10000:
      #continue
      # Transcribe the segment with Whisper
      result = model.transcribe(temp_file, language= select_language)
      text = result["text"]
      # Append the text to the transcription
      print(len(transcription.split(" ")), "words processed")
      transcription += text.strip() + " "
      # Delete the temporary file
      os.remove(temp_file)
  # Print the transcription
  print(f"Transcription of {audio_names[i]}:\n")
  print(transcription)
  print("\n")
 
  # Convert the spaces between sections into paragraph breaks and save the transcription as a txt document in the same folder as MyAudio.
  transcription = re.sub(r"\s\s+", "\n\n", transcription) # Replace multiple spaces with newlines
  text_file = 'WhisperAudio' + "/TextFiles/" + audio_names[i][:-4] + ".txt" # Create the text file name
  with open(text_file, "w") as f: # Write the transcription to the text file
    f.write(transcription)
  print(f"Saved transcription as {text_file}")


splited/171.wav
splited/94.wav
splited/206.wav
splited/161.wav
splited/70.wav
splited/6.wav
splited/191.wav
splited/95.wav
splited/111.wav
splited/255.wav
splited/20.wav
splited/174.wav
splited/97.wav
splited/226.wav
splited/167.wav
splited/179.wav
splited/44.wav
splited/139.wav
splited/58.wav
splited/149.wav
splited/109.wav
splited/5.wav
splited/145.wav
splited/219.wav
splited/203.wav
splited/117.wav
splited/187.wav
splited/243.wav
splited/8.wav
splited/47.wav
splited/85.wav
splited/1.wav
splited/182.wav
splited/35.wav
splited/208.wav
splited/185.wav
splited/259.wav
splited/150.wav
splited/177.wav
splited/66.wav
splited/121.wav
splited/36.wav
splited/91.wav
splited/69.wav
splited/261.wav
splited/137.wav
splited/190.wav
splited/143.wav
splited/193.wav
splited/250.wav
splited/197.wav
splited/267.wav
splited/209.wav
splited/119.wav
splited/204.wav
splited/134.wav
splited/217.wav
splited/146.wav
splited/194.wav
splited/78.wav
splited/199.wav
splited/225.wav
splited/133.wav
splited/258.wav

In [None]:
#@markdown # Juntamos todos los textos generados en uno solo y agregamos el prefijo necesario para el dataset

import glob
import re

def reorder_text_file(text_dir, output_file, encoding="utf-8"):
    # Abrir el archivo de salida para escribir
    with open(output_file, "w", encoding=encoding) as out:
        # Obtener la lista de archivos de texto
        text_files = glob.glob(os.path.join(text_dir, "*.txt"))
        
        # Ordenar la lista de archivos de texto por número
        text_files = sorted(text_files, key=lambda x: int(os.path.splitext(os.path.basename(x))[0].split("_")[-1]))
        
        # Recorrer la lista de archivos de texto
        for text_file in text_files:
            # Abrir el archivo de texto para leer
            with open(text_file, encoding=encoding) as f:
                # Leer el contenido del archivo de texto
                text = f.read()
                
                # Extraer el nombre del archivo sin la extensión
                file_name = os.path.splitext(os.path.basename(text_file))[0]
                
                # Escribir el prefijo y el contenido del archivo de texto en el archivo de salida
                out.write(f"wavs/{file_name}.wav|{text}\n")

# Llamar a la función para mezclar los archivos de texto
reorder_text_file("WhisperAudio/TextFiles", "output.txt")

def clean_text(text):
    # Reemplazar tildes
    text = text.replace("á", "a").replace("é", "e").replace("í", "i").replace("ó", "o").replace("ú", "u")
    text = text.replace("Á", "A").replace("É", "E").replace("Í", "I").replace("Ó", "O").replace("Ú", "U")
    # Quitar signos de interrogación y admiración
    text = text.replace("?", "").replace("!", "").replace("¡", "").replace("¿", "").replace("...", "").replace(" .", ".")
    # Quitar ñ remplazando por ni
    text = text.replace("ñ", "ni")
    # Asegurar que hay un punto al final de cada línea
    text = [line.strip() + "." for line in text.split("\n")]
    return "\n".join(text)


def clean_output_file(input_file, output_file, encoding="utf-8"):
    # Abrir el archivo de entrada para leer
    with open(input_file, encoding=encoding) as f:
        # Leer el contenido del archivo de entrada
        text = f.read()
        
    # Limpiar el texto
    text = clean_text(text)
    
    # Abrir el archivo de salida para escribir
    with open(output_file, "w", encoding=encoding) as f:
        # Escribir el texto limpio en el archivo de salida
        f.write(text)

# Utilizar la función
input_file = "output.txt"
output_file = "output_clean.txt"
clean_output_file(input_file, output_file)
