# Test tts and joining audio files

In [57]:
from dotenv import load_dotenv
load_dotenv()

True

In [18]:
from pathlib import Path
from openai import OpenAI
client = OpenAI()

speech_file_path = "/Users/phamanhdung/Documents/AI/multiple_speaker_podcast/audio/host.wav"
with client.audio.speech.with_streaming_response.create(
  model="tts-1",
  voice="alloy",
  input="Hi everyone, welcome to the podcast. Today we have a special guest, who is a very talented person. Let's welcome him!",
  response_format="wav"
) as response:
    response.stream_to_file(speech_file_path)

In [19]:
from pathlib import Path
from openai import OpenAI
client = OpenAI()

speech_file_path = "/Users/phamanhdung/Documents/AI/multiple_speaker_podcast/audio/guest.wav"
with client.audio.speech.with_streaming_response.create(
  model="tts-1",
  voice="ash",
  input="Hi, I'm Ash. Thanks for having me on the podcast. I'm excited to be here.",
  response_format="wav"
) as response:
    response.stream_to_file(speech_file_path)

In [22]:
import wave

def concatenate_audio_wave(audio_clip_paths, output_path):
    """Concatenates several audio files into one audio file using Python's built-in wav module
    and save it to `output_path`. Note that extension (wav) must be added to `output_path`"""
    data = []
    for clip in audio_clip_paths:
        w = wave.open(clip, "rb")
        data.append([w.getparams(), w.readframes(w.getnframes())])
        w.close()
    output = wave.open(output_path, "wb")
    output.setparams(data[0][0])
    for i in range(len(data)):
        output.writeframes(data[i][1])
    output.close()

In [24]:
from pydub import AudioSegment

def concatenate_audio(audio_clip_paths, output_path):
    """Concatenates multiple audio files into one and saves it as a WAV file."""
    
    combined = AudioSegment.empty()  # Start with an empty AudioSegment

    for clip in audio_clip_paths:
        audio = AudioSegment.from_file(clip)  # Automatically detects format
        combined += audio  # Append to the final audio

    combined.export(output_path, format="wav")  # Save as WAV
    print(f"Audio saved to {output_path}")

# Example usage:
audio_files = [
    "/Users/phamanhdung/Documents/AI/multiple_speaker_podcast/audio/host.wav",
    "/Users/phamanhdung/Documents/AI/multiple_speaker_podcast/audio/guest.wav"
]
output_file = "/Users/phamanhdung/Documents/AI/multiple_speaker_podcast/audio/merged.wav"

concatenate_audio(audio_files, output_file)


Audio saved to /Users/phamanhdung/Documents/AI/multiple_speaker_podcast/audio/merged.wav


In [55]:
import boto3
from loguru import logger
from botocore.exceptions import NoCredentialsError

def upload_to_s3(file_name, bucket, object_name=None):
    """Upload a file to an S3 bucket

    :param file_name: File to upload
    :param bucket: Bucket to upload to
    :param object_name: S3 object name. If not specified, file_name is used
    :return: True if file was uploaded, else False
    """
    s3 = boto3.client('s3', aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"), aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"))
    if object_name is None:
        object_name = file_name

    try:
        # Upload the file to S3
        s3.upload_file(file_name, bucket, object_name)
        logger.info(f"File {file_name} uploaded successfully to {bucket}/{object_name}")
        public_url = f"https://{bucket}.s3.us-east-1.s3.amazonaws.com/{object_name}"
        return public_url
    except FileNotFoundError:
        logger.error("The file was not found")
        return False
    except NoCredentialsError:
        logger.error("Credentials not available")
        return False

In [56]:
upload_to_s3(file_name="/Users/phamanhdung/Documents/AI/multiple_speaker_podcast/audio/merged.wav", bucket="oigpt", object_name="merged")

[32m2025-02-20 22:51:23.024[0m | [1mINFO    [0m | [36m__main__[0m:[36mupload_to_s3[0m:[36m20[0m - [1mFile /Users/phamanhdung/Documents/AI/multiple_speaker_podcast/audio/merged.wav uploaded successfully to oigpt/merged[0m


'https://oigpt.s3.us-east-1.s3.amazonaws.com/merged'

# Script generation

In [61]:
from pydantic import Field, BaseModel
from langchain_openai import ChatOpenAI
from typing import List

In [72]:
class TurnConversation(BaseModel):
    speaker: str = Field(description="Name of the speaker.")
    text: str = Field(description="Text spoken by the speaker.")

class ScriptResponse(BaseModel):
    """Always use this tool to structure your response to the user."""
    script_title: str = Field(description="Short title of generated script. For podcast purpose.")
    turns: List[TurnConversation] = Field(description="List of turns in the conversation.")

def gen_script(words_to_review: List[str]) -> ScriptResponse:
    model = ChatOpenAI(model="gpt-4o-mini", temperature=0.8)
    model_with_structure = model.with_structured_output(ScriptResponse)
    prompt = '''
    With given words to review in tripple backticks. Generate a script for a podcast episode which must contains
    all the given words in the conversation and talking about technology or AI. 
    The script is a conversation between two people. The first person is the host of the podcast. 
    The second person is the guest. The host name is Quinn and the guest name is AD.
    The script must be around 1000 words long. And each word in the provided list must be used at least twice.
    The script must be engaging, funny and informative.
    ```Words to review: {}```
    '''.format(", ".join(words_to_review))
    structured_output = model_with_structure.invoke(prompt)
    return structured_output

In [73]:
words_to_reivew = ["factotum", "even", "abreast", "abrogation", "abruptly", "juggle", "jungle"]
script = gen_script(words_to_review=words_to_reivew)

In [74]:
script

ScriptResponse(script_title='Juggling Technology in the Jungle of AI', turns=[TurnConversation(speaker='Quinn', text='Welcome back to another episode of Tech Talk Today! I’m your host, Quinn, and today we have an exciting guest who knows a thing or two about navigating the jungle of technology. Please welcome AD, a true factotum in the world of artificial intelligence!'), TurnConversation(speaker='AD', text="Thanks for having me, Quinn! It's great to be here. I love talking about tech, especially when it involves juggling all these new advancements."), TurnConversation(speaker='Quinn', text='Absolutely! Technology evolves so quickly. It’s like trying to keep abreast of a never-ending jungle of information. How do you manage to stay on top of it all?'), TurnConversation(speaker='AD', text="Well, I have to admit, sometimes I feel like I’m just a factotum, juggling different projects and trying not to drop the ball! But I believe it's essential to keep learning and adapting."), TurnConver