In [7]:
#!pip install pydub

In [8]:
from transformers import AutoProcessor, MusicgenForConditionalGeneration
import scipy
from pydub import AudioSegment
import os

def generate_music(music_prompt):
    """
    Generate music based on a given prompt and save it as a WAV and MP3 file.

    Parameters:
        music_prompt (str): The prompt for generating music.

    This function initializes a processor and a model from the 'facebook/musicgen-small' pre-trained model. It prepares the inputs, generates audio, saves it as a WAV file, converts it to MP3, and then deletes the WAV file.

    The generated music is saved in the current directory as 'musicgen_out.mp3'.
    """
    
    # Initialize the processor and model
    processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
    model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")

    # Prepare inputs
    inputs = processor(
        text=[f'{music_prompt}'],
        padding=True,
        return_tensors="pt",
    )

    # Generate audio
    audio_values = model.generate(**inputs, max_new_tokens=512)

    # Save as WAV file
    sampling_rate = model.config.audio_encoder.sampling_rate
    scipy.io.wavfile.write("musicgen_out.wav", rate=sampling_rate, data=audio_values[0, 0].numpy())

    # Convert WAV to MP3
    wav_audio = AudioSegment.from_wav("musicgen_out.wav")
    wav_audio.export("musicgen_out.mp3", format="mp3")

    # Delete the WAV file
    os.remove("musicgen_out.wav")

In [9]:
from langchain.llms import Cohere
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from dotenv import load_dotenv
import os

load_dotenv(dotenv_path='.env')

def generate_music_descriptions(video_prompt):
    """
    Generate music descriptions based on a given video prompt using a language model and template.

    Parameters:
        video_prompt (str): The prompt for generating music descriptions.

    This function creates a template for providing music descriptions for given situations, fills in the template with the video prompt, and then sends the template to a language model for generating music descriptions. It prints the generated response from the language model.
    """

    # Create prompt template
    template = """Instructions: Give a description for music that can be used for a given situation\n\nInput: 80s Coffee shop\nOutput: 80s pop track with bassy drums and synth, heavy drumming, pop music, rock music, electric.\n\n\nInput: {video_prompt}\nOutput:"""
    print("Prompt being sent to the language model")
    print(template)

    prompt = PromptTemplate(template=template, input_variables=["video_prompt"])

    llm = Cohere(cohere_api_key=os.getenv("COHERE_API_KEY") , stop=['\n\n\n'], temperature=0.5, model='command')

    llm_chain = LLMChain(prompt=prompt, llm=llm)

    response = llm_chain.run(video_prompt)
    print("response from language model")
    print(response)

music_description = generate_music_descriptions("Car Chase")
generate_music(music_description)

Prompt being sent to the language model
Instructions: Give a description for music that can be used for a given situation

Input: 80s Coffee shop
Output: 80s pop track with bassy drums and synth, heavy drumming, pop music, rock music, electric.


Input: {video_prompt}
Output:
response from language model
 Car chase music is typically fast-paced and high-energy to mimic the intensity of a high-speed pursuit. It can be anything from dubstep to electronic dance music to house music. 




In [None]:
import os
import shutil
import requests
from dotenv import load_dotenv

load_dotenv(dotenv_path='.env')

def voice_generation(dialogue, gender, output_audio_destination_path):
    """
    Generate voice using the Elevenlabs Text-to-Speech API and save the output audio to a destination path.

    Parameters:
        dialogue (str): The text to be converted into speech.
        gender (str): The gender of the voice (either 'female' or 'male').
        output_audio_destination_path (str): The path where the generated audio will be saved.
    """

    # Create Voice using Elevenlabs
    CHUNK_SIZE = 1024
    id = ""
    if gender == "female":
        id = "21m00Tcm4TlvDq8ikWAM"
    elif gender == "male":
        id = "ODq5zmih8GrVes37Dizd"

    url = f"https://api.elevenlabs.io/v1/text-to-speech/{id}"

    headers = {
    "Accept": "audio/mpeg",
    "Content-Type": "application/json",
    "xi-api-key": os.getenv("ELEVENLABS_API_KEY")
    }

    data = {
    "text": dialogue,
    "model_id": "eleven_monolingual_v1",
    "voice_settings": {
        "stability": 0.5,
        "similarity_boost": 0.5
    }
    }

    response = requests.post(url, json=data, headers=headers)
    with open('output.mp3', 'wb') as f:
        for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
            if chunk:
                f.write(chunk)

    # Copy the generated audio to the unreal engine project
    shutil.copy('output.mp3', output_audio_destination_path)