# GPT Speech To Text
---

### Imports

In [None]:
# imports 

import json
import os
import shutil

import librosa
import openai
import soundfile as sf

import yt_dlp as youtube_dl
# DownloadError has to come from yt_dlp itself. The alias above only renames
# the module object; `from youtube_dl.utils import ...` would import the
# separate `youtube_dl` package, whose exception class yt_dlp never raises.
from yt_dlp.utils import DownloadError

from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
load_dotenv()
print(openai.__version__)

# Explicit raise instead of `assert`, so the check still runs under `python -O`.
if os.getenv("OPENAI_API_KEY") is None:
    raise RuntimeError("GO TO .env FILE AND SET OPENAI_API_KEY")

### Increment File

In [None]:
def file_incr(path, extension=".mp3"):
    """Collect every file under ``path`` whose name ends with ``extension``.

    The directory tree is walked recursively with ``os.walk``; results are
    returned in the order the walk yields them (no sorting is applied).

    Args:
        path: Root directory to search.
        extension: Filename suffix to match. Defaults to ``".mp3"``.

    Returns:
        A list of paths, each built by joining the containing directory with
        the matching filename.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(path)
        for name in names
        if name.endswith(extension)
    ]

### Youtube to Mp3 Conversion

In [None]:
def youtube_to_mp3(youtube_url: str, output_dir: str) -> str:
    """Download a video's audio track and convert it to an MP3 file.

    The download is attempted up to two times: a first failure with
    ``DownloadError`` triggers one retry, a second failure propagates.

    Args:
        youtube_url: URL of the video to download.
        output_dir: Directory the MP3 is written to; created if missing.

    Returns:
        Path of the first ``.mp3`` file that ``file_incr`` finds under
        ``output_dir``.

    Raises:
        yt_dlp.utils.DownloadError: If both download attempts fail.
        FileNotFoundError: If no ``.mp3`` file exists under ``output_dir``
            once the download has finished.
    """
    ydl_config = {
        "format": "bestaudio/best",
        "postprocessors": [
            {
                "key": "FFmpegExtractAudio",
                "preferredcodec": "mp3",
                "preferredquality": "192",
            }
        ],
        "outtmpl": os.path.join(output_dir, "%(title)s.%(ext)s"),
        "verbose": True,
    }

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    print(f"Capturing mp3 from: {youtube_url}")

    max_attempts = 2
    for attempt in range(1, max_attempts + 1):
        try:
            with youtube_dl.YoutubeDL(ydl_config) as ydl:
                ydl.download([youtube_url])
            break
        # Resolve the exception through the imported downloader module so the
        # handler matches the class this downloader actually raises.
        except youtube_dl.utils.DownloadError:
            if attempt == max_attempts:
                raise
            print("Download failed, retrying once...")

    audio_files = file_incr(output_dir)
    if not audio_files:
        raise FileNotFoundError(f"No .mp3 file was produced in {output_dir!r}")
    return audio_files[0]

### Break down files 
The OpenAI transcription API caps uploads at 25 MB per request, so larger audio files need to be split into smaller chunks first.

In [None]:
def quantize_mp3(filename, segment_length: int, output_dir):
    """Split an audio file into consecutive fixed-length MP3 segments.

    The audio is loaded with ``librosa`` at 44100 Hz and written out as
    ``segment_0.mp3``, ``segment_1.mp3``, ... inside ``output_dir``. The last
    segment holds whatever audio remains and may be shorter than the others.

    Args:
        filename: Path of the audio file to split.
        segment_length: Length of each segment in seconds; must be positive.
        output_dir: Directory the segments are written to; created if missing.

    Returns:
        Paths of the written segments in playback order. An empty list is
        returned when the loaded audio contains no samples.

    Raises:
        ValueError: If ``segment_length`` is not positive.
    """
    if segment_length <= 0:
        raise ValueError("segment_length must be a positive number of seconds")

    print(f"Chunking audio to {segment_length} second segments...")

    # makedirs (not mkdir) so a missing parent directory is not an error.
    os.makedirs(output_dir, exist_ok=True)

    audio, sr = librosa.load(filename, sr=44100)

    samples_per_segment = max(1, int(segment_length * sr))
    # Stepping over the sample count gives exactly ceil(total / per_segment)
    # segments, so audio whose length is an exact multiple of the segment
    # length does not get an empty trailing segment.
    segment_starts = range(0, len(audio), samples_per_segment)

    print(f"Quantizing {len(segment_starts)} chunks...")

    # Record paths as they are written instead of re-scanning and sorting the
    # directory: a lexicographic sort puts segment_10 before segment_2, and a
    # scan would also pick up stale .mp3 files from earlier runs.
    chunked_audio_files = []
    for i, start in enumerate(segment_starts):
        segment_path = os.path.join(output_dir, f"segment_{i}.mp3")
        sf.write(segment_path, audio[start:start + samples_per_segment], sr)
        chunked_audio_files.append(segment_path)

    return chunked_audio_files

### Transcribe audio with the OpenAI Whisper API

In [None]:
def transcribe_audio(audio_files: list, output_file=None, model="whisper-1") -> list:
    """Transcribe each audio file and optionally save the results to disk.

    Every file is sent, one request per file, to
    ``openai.audio.transcriptions.create`` and the ``text`` attribute of each
    response is collected.

    Args:
        audio_files: Paths of the audio files to transcribe, in order.
        output_file: Optional path of a text file that receives one transcript
            per line. Nothing is written when ``None``.
        model: Name of the transcription model passed to the API.

    Returns:
        The transcript texts, in the same order as ``audio_files``.
    """
    print("Converting audio to text...")

    transcripts = []
    for audio_file in audio_files:
        with open(audio_file, "rb") as audio:
            response = openai.audio.transcriptions.create(
                model=model,
                file=audio,
                response_format="json",
                # TODO: look at response_format="verbose_json" with
                # timestamp_granularities=["word"] -> may have some
                # interesting applications.
            )
        print(response)
        transcripts.append(response.text)

    if output_file is not None:
        # Explicit UTF-8 so non-ASCII transcript text is written identically
        # regardless of the platform's default encoding.
        with open(output_file, "w", encoding="utf-8") as file:
            file.writelines(transcript + "\n" for transcript in transcripts)

    return transcripts

### Raw Transcription

In [None]:
def print_transcriptions(youtube_url, outputs_dir, segment_length=10 * 60):
    """Run the full pipeline: download, split into chunks, and transcribe.

    WARNING: any existing ``outputs_dir`` is deleted before the run starts.

    Inside ``outputs_dir`` the function creates ``raw_audio/`` (downloaded
    MP3), ``chunks/`` (split segments) and ``transcripts.txt`` (one transcript
    per line).

    Args:
        youtube_url: URL of the video to transcribe.
        outputs_dir: Working directory for all generated files.
        segment_length: Chunk length in seconds. Defaults to 10 minutes.

    Returns:
        The list of transcript texts returned by ``transcribe_audio``.
    """
    raw_audio_dir = os.path.join(outputs_dir, "raw_audio")
    chunks_dir = os.path.join(outputs_dir, "chunks")
    transcripts_file = os.path.join(outputs_dir, "transcripts.txt")

    # Start from scratch: drop results of any previous run, then always
    # (re)create the directory so later steps can rely on it existing.
    if os.path.exists(outputs_dir):
        shutil.rmtree(outputs_dir)
    os.makedirs(outputs_dir)

    audio_filename = youtube_to_mp3(youtube_url, output_dir=raw_audio_dir)

    chunked_audio_files = quantize_mp3(
        audio_filename, segment_length=segment_length, output_dir=chunks_dir
    )

    return transcribe_audio(chunked_audio_files, transcripts_file)
    
    
# Video to transcribe and the working directory for all generated files.
youtube_url = "https://www.youtube.com/watch?v=fGdmjSYJtb8"
outputs_dir = "outputs/"
transcription = print_transcriptions(youtube_url, outputs_dir)

# Serialize the list of chunk transcripts into one string for post-processing.
# ensure_ascii=False keeps non-ASCII characters as-is; the default would turn
# them into \uXXXX escapes in the text later sent for spelling correction.
pre_processed_transcript = json.dumps(transcription, ensure_ascii=False)


print(pre_processed_transcript)
    

### Post-processing with gpt 
Optional 

In [None]:
# Instruction given to the chat model for the optional clean-up pass.
system_prompt = (
    "You are a helpful assistant. "
    "Your task is to correct any spelling discrepancies in the transcribed text. "
    "Only add necessary punctuation such as periods, commas, and capitalization, "
    "and use only the context provided."
)

#TODO: work on how to work around token limit 
def gen_enhanced_transcript(temperature, system_prompt, pre_processed_transcipt):
    """Ask the ``gpt-4o`` chat model to clean up a raw transcript.

    Args:
        temperature: Sampling temperature forwarded to the API.
        system_prompt: Text sent as the system message.
        pre_processed_transcipt: Transcript text sent as the user message.

    Returns:
        Whatever ``openai.chat.completions.create`` returns, unmodified (the
        response object, not an extracted text string).
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": pre_processed_transcipt},
    ]
    return openai.chat.completions.create(
        model="gpt-4o",
        temperature=temperature,
        messages=conversation,
    )

# Run the clean-up pass with temperature 0 and show the returned object.
final_transcript = gen_enhanced_transcript(
    temperature=0,
    system_prompt=system_prompt,
    pre_processed_transcipt=pre_processed_transcript,
)
print(final_transcript)