In [None]:
from pydub import AudioSegment, utils
import speech_recognition as sr
import os
from tqdm.notebook import tqdm

def convert_and_chunk(m4a_file_path, output_folder, chunk_ms=30000):
    """Transcribe an M4A recording chunk by chunk and append the text to a file.

    The audio is loaded with pydub and cut into consecutive slices of
    ``chunk_ms`` milliseconds. Each slice is exported to a temporary WAV file
    in ``output_folder``, read back through ``speech_recognition`` and passed
    to ``Recognizer.recognize_google``; the temporary file is then removed.
    Recognised text is appended (each chunk followed by a single space) to
    ``<output_folder>/<input base name>_transcription.txt``. Chunks that the
    service cannot understand, or for which the request fails, are reported
    on stdout and skipped.

    Args:
        m4a_file_path: Path of the M4A file to transcribe.
        output_folder: Directory that receives the transcription file and the
            temporary chunk files. It is not created here.
        chunk_ms: Length of each chunk as an integer number of milliseconds.
            Defaults to 30000 (30 seconds).

    Raises:
        ValueError: If ``chunk_ms`` is not positive.
    """
    if chunk_ms <= 0:
        raise ValueError("chunk_ms must be a positive number of milliseconds")

    # Derive the transcription file name from the input file's base name
    base_name = os.path.splitext(os.path.basename(m4a_file_path))[0]
    output_text_file = os.path.join(output_folder, f"{base_name}_transcription.txt")

    # Decode the m4a file into an in-memory audio segment
    audio = AudioSegment.from_file(m4a_file_path, format="m4a")
    total_ms = len(audio)

    # Initialise the recognizer
    r = sr.Recognizer()

    # Append mode keeps text from earlier runs; explicit UTF-8 so non-ASCII
    # transcriptions do not depend on the platform's default encoding.
    with open(output_text_file, 'a', encoding="utf-8") as file:
        # The context manager closes the progress bar even if a chunk raises
        with tqdm(total=total_ms, desc=f"Processing {m4a_file_path}") as progress:
            for index, start in enumerate(range(0, total_ms, chunk_ms)):
                chunk = audio[start:start + chunk_ms]
                chunk_name = os.path.join(output_folder, f"{base_name}_chunk{index}.wav")
                try:
                    chunk.export(chunk_name, format="wav")
                    with sr.AudioFile(chunk_name) as source:
                        audio_data = r.record(source)
                    text = r.recognize_google(audio_data)
                except sr.UnknownValueError:
                    print(f"Chunk {index}: Google Speech Recognition could not understand audio")
                except sr.RequestError as e:
                    print(f"Chunk {index}: Could not request results from Google Speech Recognition service; {e}")
                else:
                    print(f"Writing Chunk {index} transcription to file.")
                    file.write(text + " ")
                    file.flush()  # Flush the written text to disk
                finally:
                    # Always delete the temporary chunk file, even when an
                    # unexpected error propagates out of this iteration.
                    try:
                        os.remove(chunk_name)
                    except FileNotFoundError:
                        pass  # export failed before the file was created

                # Advance the bar by the real chunk length (the last one may be shorter)
                progress.update(len(chunk))

# Recordings are read from "Data"; transcripts are written to "Data/Transcription"
input_folder = "Data"
output_folder = os.path.join(input_folder, "Transcription")

# Create the transcript directory if needed (no error when it already exists)
os.makedirs(output_folder, exist_ok=True)

# Keep only the directory entries whose name ends in ".m4a"
m4a_files = list(
    filter(lambda name: name.endswith(".m4a"), os.listdir(input_folder))
)

# Transcribe every recording found, one after another
for m4a_file in m4a_files:
    m4a_file_path = os.path.join(input_folder, m4a_file)
    convert_and_chunk(m4a_file_path, output_folder)