In [None]:
!pip install pydub

## Splitting Audio File

In [None]:
from pydub import AudioSegment
import os

In [None]:
# Path to the large source recording that the cells below cut into fixed-length clips.
# Replace it with the location of your own file.
audio_path = "audio.wav" # 1 hour length

In [None]:
# Duration of every exported clip, expressed in milliseconds: 20 seconds x 1000 ms.
clip_length = 20 * 1000

In [None]:
# Load the file at audio_path into a pydub AudioSegment.
# The clip-extraction loop further down slices this object by start/end time.
audio = AudioSegment.from_file(audio_path)

In [None]:
# Total length of the recording in milliseconds (the same unit as clip_length),
# used to work out how many clips are needed and where the last one ends.
audio_length = len(audio)

In [None]:
# Number of clips needed to cover the whole recording: ceiling of audio_length / clip_length.
# Integer ceiling division avoids the extra, empty clip that "int(a / b) + 1" produces when
# the recording is an exact multiple of clip_length (and yields 0 clips for empty audio).
num_clips = (audio_length + clip_length - 1) // clip_length

In [None]:
# Directory that receives the exported clips.
output_dir = "Audio"
# exist_ok=True makes this safe to re-run, and it raises right away if "Audio" exists
# but is not a directory instead of failing later when the first clip is exported.
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Walk the recording in clip_length-sized windows and write each window out as its own WAV file.
for clip_index in range(num_clips):
    clip_start = clip_index * clip_length
    # The final window is cut short so it never runs past the end of the recording.
    clip_end = min(clip_start + clip_length, audio_length)

    # Files are named voice_0.wav, voice_1.wav, ... inside output_dir.
    clip_path = os.path.join(output_dir, f"voice_{clip_index}.wav")
    audio[clip_start:clip_end].export(clip_path, format="wav")

## Audio Transcription

In [None]:
# Install the required code libraries
!pip install git+https://github.com/openai/whisper.git 
!sudo apt update && sudo apt install ffmpeg
!pip install librosa

In [None]:
import whisper
import time
import librosa
import soundfile as sf
import re
import os

In [None]:
# Whisper checkpoint used for transcription. Alternatives kept for reference:
# "tiny.en", "base.en", "small.en" and "large" - change model_name to switch.
model_name = "medium.en"
model = whisper.load_model(model_name)

In [None]:
# Mount Google Drive so the notebook can read and write files under /content/drive.
# force_remount=True asks Colab for a fresh mount each time this cell runs.
from google.colab import drive

drive_mount_point = "/content/drive"
drive.mount(drive_mount_point, force_remount=True)

In [None]:
# Make sure the WhisperAudio folder tree exists on Drive.
# Parents are listed before their children because os.mkdir creates one level at a time.
folders = ["WhisperAudio/", "WhisperAudio/ProcessedAudio/", "WhisperAudio/TextFiles/"]
drive_root = "/content/drive/MyDrive/"

for folder in folders:
    path = drive_root + folder
    if os.path.exists(path):
        continue  # already present, nothing to create
    os.mkdir(path)

Upload the audio files you want transcribed (`.wav` or `.mp3`) to the "WhisperAudio" folder in your Google Drive.

In [None]:
# Folder that is scanned for audio to transcribe: "WhisperAudio" at the root of the mounted Drive.
# It is written with a trailing slash so a file name can be appended to it directly.
audio_folder = "/content/drive/MyDrive/WhisperAudio/"

In [None]:
# Collect the audio files waiting in audio_folder.
# audio_names holds the bare file names and audio_files the matching full paths, index for index.
import os

# Extensions are compared case-insensitively so uploads such as "Talk.WAV" are not skipped,
# and the listing is sorted because os.listdir returns entries in arbitrary order.
audio_extensions = (".wav", ".mp3")
audio_names = sorted(
    name for name in os.listdir(audio_folder)
    if name.lower().endswith(audio_extensions)
)
audio_files = [os.path.join(audio_folder, name) for name in audio_names]

# Show what was found so it can be checked before the slow transcription step.
for f in audio_files:
    print(f)

if not audio_files:
    print("You have no files.")

In [None]:
# Transcribe every collected audio file with Whisper.
# Each file is resampled to 16 kHz mono, cut into its non-silent intervals, and every interval
# is transcribed on its own; the pieces are joined and saved as a .txt file in the "TextFiles"
# subfolder of audio_folder.
temp_file = "temp.wav"  # scratch file used to hand one interval at a time to Whisper
min_segment_bytes = 10000  # intervals whose WAV file is not larger than this are skipped

for file, audio_name in zip(audio_files, audio_names):  # For each audio file
    print(f"Processing {audio_name}...")
    # Load the audio file and convert it to 16 kHz mono
    audio, sr = librosa.load(file, sr=16000, mono=True)
    # Despite the name, `pauses` holds the (start, end) sample indices of the NON-silent
    # intervals: anything more than 30 dB below the peak is treated as silence.
    # No minimum pause length is enforced by this call.
    pauses = librosa.effects.split(audio, top_db=30, frame_length=2048, hop_length=128)
    # Transcribe each segment and concatenate the results
    transcription = ""
    words_processed = 0
    for start, end in pauses:  # For each segment
        segment = audio[start:end]
        # Save the segment as a temporary wav file
        sf.write(temp_file, segment, sr, subtype='PCM_16')
        try:
            if os.path.getsize(temp_file) > min_segment_bytes:
                # Transcribe the segment with Whisper
                result = model.transcribe(temp_file)
                text = result["text"].strip()
                # Append the text to the transcription. An empty result still adds a space,
                # which the paragraph-break substitution below relies on.
                transcription += text + " "
                words_processed += len(text.split())
                print(words_processed, "words processed")
        finally:
            # Always delete the temporary file, including for skipped segments and on errors.
            os.remove(temp_file)
    # Print the transcription
    print(f"Transcription of {audio_name}:\n")
    print(transcription)
    print("\n")

    # Runs of two or more whitespace characters (left where a segment produced no text)
    # become paragraph breaks.
    transcription = re.sub(r"\s\s+", "\n\n", transcription)
    # Build "<audio_folder>/TextFiles/<name without extension>.txt"
    base_name = os.path.splitext(audio_name)[0]
    text_file = os.path.join(audio_folder, "TextFiles", base_name + ".txt")
    # Write the transcription to the text file; UTF-8 keeps non-ASCII characters intact
    # regardless of the platform's default encoding.
    with open(text_file, "w", encoding="utf-8") as f:
        f.write(transcription)
    print(f"Saved transcription as {text_file}")

In [None]:
# Move the audio files into "/content/drive/MyDrive/WhisperAudio/ProcessedAudio/" so they are
# no longer in the folder that is scanned for files to transcribe.
import shutil
processed_folder = "/content/drive/MyDrive/WhisperAudio/ProcessedAudio/"

# Create the folder if it does not exist
if not os.path.exists(processed_folder):
    os.mkdir(processed_folder)

# Move each audio file to the processed folder
for file in audio_files:
    if not os.path.exists(file):
        # Already moved (for example, this cell was run twice); skip instead of raising.
        print(f"Skipping {file}: not found")
        continue
    shutil.move(file, os.path.join(processed_folder, os.path.basename(file)))
    print(f"Moved {file} to {processed_folder}")

In [None]:
# Locations used from here on, all inside the WhisperAudio folder on Drive:
#   audio_folder  - the "wavs/" subfolder (presumably the audio samples; confirm against later cells)
#   txt_folder    - the "TextFiles" subfolder where text files are stored
#   metadata_file - path of the metadata text file
# NOTE(review): this rebinds audio_folder, which an earlier cell set to the WhisperAudio root;
# re-running the earlier cells after this one makes them scan "wavs/" instead - confirm intended.
# NOTE(review): audio_folder ends with "/" while txt_folder does not; build paths from them with
# os.path.join rather than string concatenation to avoid a missing separator.
audio_folder = "/content/drive/MyDrive/WhisperAudio/wavs/"
txt_folder = "/content/drive/MyDrive/WhisperAudio/TextFiles"
metadata_file = "/content/drive/MyDrive/WhisperAudio/metadata.txt"