In [None]:
!pip install torch audiocraft transformers pyttsx3 gradio numpy scipy diffusers matplotlib librosa soundfile

In [None]:
import torch
!pip install --upgrade diffusers

from audiocraft.models import MusicGen
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import pyttsx3
import gradio as gr
from tempfile import NamedTemporaryFile
import numpy as np
import scipy.io.wavfile as wavfile
from diffusers import StableDiffusionPipeline
import matplotlib.pyplot as plt
import librosa.display
import librosa
import soundfile as sf
from PIL import Image
import os

# Set device: prefer the GPU when one is available, otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize Models
# MusicGen (text-to-music), loaded directly onto the selected device.
music_model = MusicGen.get_pretrained("small", device=device)

# GPT-2 for conversation
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)

# Stable Diffusion for image generation.
# Half precision is meant for GPU inference; when no GPU is available fall
# back to float32 so the pipeline can still run on the CPU.
sd_dtype = torch.float16 if device.type == "cuda" else torch.float32
pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=sd_dtype)
pipe = pipe.to(device)

# Emotion detection for Text-to-Audio
def get_emotion_tone(text):
    """Classify *text* into a coarse emotion label by keyword lookup.

    The text is lower-cased and searched for keyword substrings. Emotions
    are tried in the order happy, sad, angry; the first emotion with any
    keyword present wins.

    Args:
        text: The string to inspect.

    Returns:
        One of "happy", "sad", "angry", or "neutral" when nothing matches.
    """
    lowered = text.lower()
    # Order matters: earlier rows take precedence over later ones.
    keyword_table = (
        ("happy", ("happy", "joy", "excited")),
        ("sad", ("sad", "down", "melancholy")),
        ("angry", ("angry", "frustrated")),
    )
    for tone, keywords in keyword_table:
        for keyword in keywords:
            if keyword in lowered:
                return tone
    return "neutral"

# Image generation using Stable Diffusion
def generate_image(prompt, style="realistic"):
    """Render *prompt* with the Stable Diffusion pipeline and save it as a PNG.

    Args:
        prompt: Text description of the desired image.
        style: Style keyword; the pipeline receives "<style> style <prompt>".

    Returns:
        Path of a temporary ".png" file on success. The file is created with
        ``delete=False`` and is not removed by this function. On any failure
        an "Error generating image: ..." message is returned instead of
        raising, so callers must not assume the result is a valid path.
    """
    styled_prompt = f"{style} style {prompt}"
    try:
        image = pipe(styled_prompt).images[0]
        # Reserve a unique file name, then close the handle before writing:
        # leaving it open leaks a descriptor and prevents the file from being
        # re-opened by name on Windows.
        with NamedTemporaryFile(delete=False, suffix=".png") as temp_image:
            image_path = temp_image.name
        image.save(image_path)
        return image_path
    except Exception as e:
        return f"Error generating image: {e}"

# Convert Text to Audio with Emotion
def text_to_audio(text):
    """Synthesize *text* to a temporary audio file with pyttsx3.

    The speaking rate and volume are picked from the emotion label that
    ``get_emotion_tone`` assigns to the text.

    Args:
        text: The text to speak.

    Returns:
        Path of the temporary file the speech was saved to. The file is
        created with ``delete=False`` and is not removed by this function.
    """
    # (rate, volume) per emotion label.
    voice_settings = {
        "neutral": (150, 0.8),
        "happy": (180, 1.0),
        "sad": (100, 0.5),
        "angry": (200, 1.0),
    }
    emotion = get_emotion_tone(text)
    rate, volume = voice_settings.get(emotion, voice_settings["neutral"])

    engine = pyttsx3.init()
    engine.setProperty('rate', rate)
    engine.setProperty('volume', volume)

    # Reserve a unique file name and close the handle right away so the
    # speech engine can open the path itself and no descriptor is leaked.
    # NOTE(review): the ".mp3" suffix may not match the encoding the speech
    # backend actually writes - confirm against the pyttsx3 driver in use.
    with NamedTemporaryFile(delete=False, suffix=".mp3") as temp_file:
        audio_path = temp_file.name
    engine.save_to_file(text, audio_path)
    engine.runAndWait()
    return audio_path

# Music generation using MusicGen
def generate_music(prompt):
    """Generate a music clip for *prompt* with MusicGen and save it as WAV.

    Args:
        prompt: Text description of the music to generate.

    Returns:
        Path of a temporary ".wav" file on success (created with
        ``delete=False``; not removed by this function). On any failure an
        "Error generating music: ..." message is returned instead of raising.
    """
    try:
        wav = music_model.generate([prompt])
        # Keep only element [0, 0] of the generated tensor.
        # assumes the layout is (batch, channel, samples) - TODO confirm
        samples = wav.cpu().numpy()[0, 0]
        # Close the handle before writing by name: avoids a leaked descriptor
        # and a second open of a still-open file.
        with NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
            audio_path = temp_file.name
        wavfile.write(audio_path, music_model.sample_rate, samples)
        return audio_path
    except Exception as e:
        return f"Error generating music: {e}"

# Spectrogram generation from audio
def generate_spectrogram(audio_path):
    """Plot the mel spectrogram of an audio file and save it as a PNG.

    Args:
        audio_path: Path of the audio file to analyse; it is loaded at its
            native sample rate (``sr=None``).

    Returns:
        Path of a temporary ".png" file on success (created with
        ``delete=False``; not removed by this function). On any failure an
        "Error generating spectrogram: ..." message is returned instead of
        raising.
    """
    try:
        y, sr = librosa.load(audio_path, sr=None)
        # The signal must be passed by keyword: recent librosa releases made
        # the arguments of melspectrogram keyword-only.
        S = librosa.feature.melspectrogram(y=y, sr=sr)
        S_dB = librosa.power_to_db(S, ref=np.max)

        fig = plt.figure(figsize=(10, 4))
        try:
            librosa.display.specshow(S_dB, sr=sr, x_axis='time', y_axis='mel', cmap='coolwarm')
            plt.colorbar(format='%+2.0f dB')
            plt.title('Mel-frequency spectrogram')
            # Reserve a file name and release the handle before saving.
            with NamedTemporaryFile(delete=False, suffix=".png") as temp_image:
                image_path = temp_image.name
            plt.savefig(image_path)
        finally:
            # Always release the figure, even when plotting or saving fails.
            plt.close(fig)
        return image_path
    except Exception as e:
        return f"Error generating spectrogram: {e}"

# Chat with AI (GPT-2)
def chat_with_ai(user_input):
    """Continue *user_input* with GPT-2 and return the decoded text.

    Args:
        user_input: Prompt text to continue.

    Returns:
        The decoded first generated sequence (special tokens skipped). On any
        failure an "Error in chat generation: ..." message is returned
        instead of raising.
    """
    try:
        # Calling the tokenizer (rather than .encode) also yields the
        # attention mask, which generate() would otherwise have to guess.
        encoded = tokenizer(user_input, return_tensors="pt").to(device)
        outputs = gpt2_model.generate(
            **encoded,
            # Budget the *new* tokens: max_length also counts the prompt, so
            # a long prompt would leave no room for a reply.
            max_new_tokens=50,
            num_return_sequences=1,
            # GPT-2 defines no pad token; reuse EOS for padding.
            pad_token_id=tokenizer.eos_token_id,
        )
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return response
    except Exception as e:
        return f"Error in chat generation: {e}"

# Simulate Video Generation using a Sequence of Images
def generate_video(prompt):
    """Simulate a video by stitching five generated images into a looping GIF.

    Each frame is produced by ``generate_image`` from the prompt
    "<prompt> frame <n>" and shown for 500 ms.

    Args:
        prompt: Text description shared by all frames.

    Returns:
        Path of a temporary ".gif" file (created with ``delete=False``; not
        removed by this function).

    Raises:
        RuntimeError: If a frame could not be generated, i.e. the value
            returned by ``generate_image`` is not an existing file.
    """
    frames = []
    try:
        for i in range(5):  # Generate 5 frames as a sequence
            frame_prompt = f"{prompt} frame {i+1}"
            frame_path = generate_image(frame_prompt)
            # generate_image reports failure by returning a message instead
            # of a path; surface that clearly rather than trying to open it.
            if not os.path.isfile(frame_path):
                raise RuntimeError(f"frame {i+1} failed: {frame_path}")
            frames.append(Image.open(frame_path))

        # Reserve a file name and release the handle before saving by name.
        with NamedTemporaryFile(delete=False, suffix=".gif") as temp_video:
            video_path = temp_video.name
        frames[0].save(video_path, save_all=True, append_images=frames[1:], duration=500, loop=0)
        return video_path
    finally:
        # Release the image files whether or not the GIF was written.
        for frame in frames:
            frame.close()

# Main interface logic
def _existing_path(result):
    """Return *result* if it names an existing file, otherwise ``None``."""
    if isinstance(result, str) and os.path.isfile(result):
        return result
    return None


def main_interface(input_text, task_type, style):
    """Dispatch a UI request to the matching generator.

    Args:
        input_text: Prompt or text entered by the user.
        task_type: One of "Conversation", "Music", "Text to Audio" or
            "Video Generation".
        style: Image style keyword; a falsy value falls back to "realistic".

    Returns:
        A ``(message, audio_path, image_path)`` tuple. ``audio_path`` and
        ``image_path`` are existing file paths or ``None``. When a generator
        returned an error message instead of a path, that message is appended
        to ``message``. An unsupported ``task_type`` or a raised exception
        yields ``("Error: ...", None, None)``.
    """
    # Nothing selected in the style selector arrives as None.
    style = style or "realistic"
    try:
        if task_type == "Conversation":
            message = chat_with_ai(input_text)
            audio_result = None
            visual_result = generate_image(f"conversation about {input_text}", style)

        elif task_type == "Music":
            message = "Music Generated"
            audio_result = generate_music(input_text)
            visual_result = generate_spectrogram(audio_result)

        elif task_type == "Text to Audio":
            message = "Audio Generated"
            audio_result = text_to_audio(input_text)
            visual_result = generate_image(f"text-to-audio conversion for {input_text}", style)

        elif task_type == "Video Generation":
            message = "Video Generated"
            visual_result = generate_video(input_text)
            audio_result = generate_music(input_text)

        else:
            # Always hand back three values, even when no task matches.
            return f"Error: unsupported task type {task_type!r}", None, None
    except Exception as e:
        return f"Error: {e}", None, None

    # The generators signal failure by returning a message instead of a path.
    # Only real files may reach the media outputs; messages go to the text box.
    audio_path = _existing_path(audio_result)
    image_path = _existing_path(visual_result)
    failures = [
        str(result)
        for result, path in ((audio_result, audio_path), (visual_result, image_path))
        if result is not None and path is None
    ]
    if failures:
        message = "\n".join([message, *failures])
    return message, audio_path, image_path

# Gradio interface setup
# The task and style selectors start with a concrete choice so the callback
# never receives None when the user submits without touching them.
interface = gr.Interface(
    fn=main_interface,
    inputs=[
        gr.Textbox(label="Enter Text or Prompt"),
        gr.Radio(
            ["Conversation", "Music", "Text to Audio", "Video Generation"],
            value="Conversation",
            label="Select Task",
        ),
        gr.Dropdown(["realistic", "abstract", "comic"], value="realistic", label="Select Style"),
    ],
    # Order matches the (message, audio_path, image_path) tuple returned by
    # main_interface.
    outputs=[
        gr.Textbox(label="Generated Output"),
        gr.Audio(label="Generated Audio", type="filepath"),
        gr.Image(label="Generated Image", type="filepath"),
    ],
    live=False,
)

interface.launch()


In [2]:
#working code
#!pip install transformers diffusers gradio librosa audiocraft pyttsx3
#!pip install --upgrade torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
import torch
from audiocraft.models import MusicGen
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import pyttsx3
import gradio as gr
from tempfile import NamedTemporaryFile
import numpy as np
import scipy.io.wavfile as wavfile
from diffusers import StableDiffusionPipeline
import matplotlib.pyplot as plt
import librosa.display
import librosa
import soundfile as sf
from PIL import Image
import os

# Set device: prefer the GPU when one is available, otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# MusicGen (text-to-music), loaded directly onto the selected device.
music_model = MusicGen.get_pretrained("small", device=device)

# GPT-2 for conversation
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)

# Stable Diffusion for image generation.
# Half precision is meant for GPU inference; when no GPU is available fall
# back to float32 so the pipeline can still run on the CPU.
sd_dtype = torch.float16 if device.type == "cuda" else torch.float32
pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=sd_dtype)
pipe = pipe.to(device)

# Emotion detection for Text-to-Audio
def get_emotion_tone(text):
    """Classify *text* into a coarse emotion label by keyword lookup.

    The text is lower-cased and searched for keyword substrings. Emotions
    are tried in the order happy, sad, angry; the first emotion with any
    keyword present wins.

    Args:
        text: The string to inspect.

    Returns:
        One of "happy", "sad", "angry", or "neutral" when nothing matches.
    """
    lowered = text.lower()
    # Order matters: earlier rows take precedence over later ones.
    keyword_table = (
        ("happy", ("happy", "joy", "excited")),
        ("sad", ("sad", "down", "melancholy")),
        ("angry", ("angry", "frustrated")),
    )
    for tone, keywords in keyword_table:
        for keyword in keywords:
            if keyword in lowered:
                return tone
    return "neutral"

# Image generation using Stable Diffusion
def generate_image(prompt, style="realistic"):
    """Render *prompt* with the Stable Diffusion pipeline and save it as a PNG.

    Args:
        prompt: Text description of the desired image.
        style: Style keyword; the pipeline receives "<style> style <prompt>".

    Returns:
        Path of a temporary ".png" file on success. The file is created with
        ``delete=False`` and is not removed by this function. On any failure
        an "Error generating image: ..." message is returned instead of
        raising, so callers must not assume the result is a valid path.
    """
    styled_prompt = f"{style} style {prompt}"
    try:
        image = pipe(styled_prompt).images[0]
        # Reserve a unique file name, then close the handle before writing:
        # leaving it open leaks a descriptor and prevents the file from being
        # re-opened by name on Windows.
        with NamedTemporaryFile(delete=False, suffix=".png") as temp_image:
            image_path = temp_image.name
        image.save(image_path)
        return image_path
    except Exception as e:
        return f"Error generating image: {e}"

# Convert Text to Audio with Emotion
def text_to_audio(text):
    """Synthesize *text* to a temporary audio file with pyttsx3.

    The speaking rate and volume are picked from the emotion label that
    ``get_emotion_tone`` assigns to the text.

    Args:
        text: The text to speak.

    Returns:
        Path of the temporary file the speech was saved to. The file is
        created with ``delete=False`` and is not removed by this function.
    """
    # (rate, volume) per emotion label.
    voice_settings = {
        "neutral": (150, 0.8),
        "happy": (180, 1.0),
        "sad": (100, 0.5),
        "angry": (200, 1.0),
    }
    emotion = get_emotion_tone(text)
    rate, volume = voice_settings.get(emotion, voice_settings["neutral"])

    engine = pyttsx3.init()
    engine.setProperty('rate', rate)
    engine.setProperty('volume', volume)

    # Reserve a unique file name and close the handle right away so the
    # speech engine can open the path itself and no descriptor is leaked.
    # NOTE(review): the ".mp3" suffix may not match the encoding the speech
    # backend actually writes - confirm against the pyttsx3 driver in use.
    with NamedTemporaryFile(delete=False, suffix=".mp3") as temp_file:
        audio_path = temp_file.name
    engine.save_to_file(text, audio_path)
    engine.runAndWait()
    return audio_path

# Music generation using MusicGen
def generate_music(prompt):
    """Generate a music clip for *prompt* with MusicGen and save it as WAV.

    Args:
        prompt: Text description of the music to generate.

    Returns:
        Path of a temporary ".wav" file on success (created with
        ``delete=False``; not removed by this function). On any failure an
        "Error generating music: ..." message is returned instead of raising.
    """
    try:
        wav = music_model.generate([prompt])
        # Keep only element [0, 0] of the generated tensor.
        # assumes the layout is (batch, channel, samples) - TODO confirm
        samples = wav.cpu().numpy()[0, 0]
        # Close the handle before writing by name: avoids a leaked descriptor
        # and a second open of a still-open file.
        with NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
            audio_path = temp_file.name
        wavfile.write(audio_path, music_model.sample_rate, samples)
        return audio_path
    except Exception as e:
        return f"Error generating music: {e}"

# Spectrogram generation from audio
def generate_spectrogram(audio_path):
    """Plot the mel spectrogram of an audio file and save it as a PNG.

    Args:
        audio_path: Path of the audio file to analyse; it is loaded at its
            native sample rate (``sr=None``).

    Returns:
        Path of a temporary ".png" file on success (created with
        ``delete=False``; not removed by this function). On any failure an
        "Error generating spectrogram: ..." message is returned instead of
        raising.
    """
    try:
        y, sr = librosa.load(audio_path, sr=None)
        # The signal must be passed by keyword: recent librosa releases made
        # the arguments of melspectrogram keyword-only.
        S = librosa.feature.melspectrogram(y=y, sr=sr)
        S_dB = librosa.power_to_db(S, ref=np.max)

        fig = plt.figure(figsize=(10, 4))
        try:
            librosa.display.specshow(S_dB, sr=sr, x_axis='time', y_axis='mel', cmap='coolwarm')
            plt.colorbar(format='%+2.0f dB')
            plt.title('Mel-frequency spectrogram')
            # Reserve a file name and release the handle before saving.
            with NamedTemporaryFile(delete=False, suffix=".png") as temp_image:
                image_path = temp_image.name
            plt.savefig(image_path)
        finally:
            # Always release the figure, even when plotting or saving fails.
            plt.close(fig)
        return image_path
    except Exception as e:
        return f"Error generating spectrogram: {e}"

# Chat with AI (GPT-2)
def chat_with_ai(user_input):
    """Continue *user_input* with GPT-2 and return the decoded text.

    Args:
        user_input: Prompt text to continue.

    Returns:
        The decoded first generated sequence (special tokens skipped). On any
        failure an "Error in chat generation: ..." message is returned
        instead of raising.
    """
    try:
        # Calling the tokenizer (rather than .encode) also yields the
        # attention mask, which generate() would otherwise have to guess.
        encoded = tokenizer(user_input, return_tensors="pt").to(device)
        outputs = gpt2_model.generate(
            **encoded,
            # Budget the *new* tokens: max_length also counts the prompt, so
            # a long prompt would leave no room for a reply.
            max_new_tokens=50,
            num_return_sequences=1,
            # GPT-2 defines no pad token; reuse EOS for padding.
            pad_token_id=tokenizer.eos_token_id,
        )
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return response
    except Exception as e:
        return f"Error in chat generation: {e}"

# Simulate Video Generation using a Sequence of Images
def generate_video(prompt):
    """Simulate a video by stitching five generated images into a looping GIF.

    Each frame is produced by ``generate_image`` from the prompt
    "<prompt> frame <n>" and shown for 500 ms.

    Args:
        prompt: Text description shared by all frames.

    Returns:
        Path of a temporary ".gif" file (created with ``delete=False``; not
        removed by this function).

    Raises:
        RuntimeError: If a frame could not be generated, i.e. the value
            returned by ``generate_image`` is not an existing file.
    """
    frames = []
    try:
        for i in range(5):  # Generate 5 frames as a sequence
            frame_prompt = f"{prompt} frame {i+1}"
            frame_path = generate_image(frame_prompt)
            # generate_image reports failure by returning a message instead
            # of a path; surface that clearly rather than trying to open it.
            if not os.path.isfile(frame_path):
                raise RuntimeError(f"frame {i+1} failed: {frame_path}")
            frames.append(Image.open(frame_path))

        # Reserve a file name and release the handle before saving by name.
        with NamedTemporaryFile(delete=False, suffix=".gif") as temp_video:
            video_path = temp_video.name
        frames[0].save(video_path, save_all=True, append_images=frames[1:], duration=500, loop=0)
        return video_path
    finally:
        # Release the image files whether or not the GIF was written.
        for frame in frames:
            frame.close()

# Main interface logic
def _existing_path(result):
    """Return *result* if it names an existing file, otherwise ``None``."""
    if isinstance(result, str) and os.path.isfile(result):
        return result
    return None


def main_interface(input_text, task_type, style):
    """Dispatch a UI request to the matching generator.

    Args:
        input_text: Prompt or text entered by the user.
        task_type: One of "Conversation", "Music", "Text to Audio" or
            "Video Generation".
        style: Image style keyword; a falsy value falls back to "realistic".

    Returns:
        A ``(message, audio_path, image_path)`` tuple. ``audio_path`` and
        ``image_path`` are existing file paths or ``None``. When a generator
        returned an error message instead of a path, that message is appended
        to ``message``. An unsupported ``task_type`` or a raised exception
        yields ``("Error: ...", None, None)``.
    """
    # Nothing selected in the style selector arrives as None.
    style = style or "realistic"
    try:
        if task_type == "Conversation":
            message = chat_with_ai(input_text)
            audio_result = None
            visual_result = generate_image(f"conversation about {input_text}", style)

        elif task_type == "Music":
            message = "Music Generated"
            audio_result = generate_music(input_text)
            visual_result = generate_spectrogram(audio_result)

        elif task_type == "Text to Audio":
            message = "Audio Generated"
            audio_result = text_to_audio(input_text)
            visual_result = generate_image(f"text-to-audio conversion for {input_text}", style)

        elif task_type == "Video Generation":
            message = "Video Generated"
            visual_result = generate_video(input_text)
            audio_result = generate_music(input_text)

        else:
            # Always hand back three values, even when no task matches.
            return f"Error: unsupported task type {task_type!r}", None, None
    except Exception as e:
        return f"Error: {e}", None, None

    # The generators signal failure by returning a message instead of a path.
    # Only real files may reach the media outputs; messages go to the text box.
    audio_path = _existing_path(audio_result)
    image_path = _existing_path(visual_result)
    failures = [
        str(result)
        for result, path in ((audio_result, audio_path), (visual_result, image_path))
        if result is not None and path is None
    ]
    if failures:
        message = "\n".join([message, *failures])
    return message, audio_path, image_path

# Gradio interface setup
# The task and style selectors start with a concrete choice so the callback
# never receives None when the user submits without touching them.
interface = gr.Interface(
    fn=main_interface,
    inputs=[
        gr.Textbox(label="Enter Text or Prompt"),
        gr.Radio(
            ["Conversation", "Music", "Text to Audio", "Video Generation"],
            value="Conversation",
            label="Select Task",
        ),
        gr.Dropdown(["realistic", "abstract", "comic"], value="realistic", label="Select Style"),
    ],
    # Order matches the (message, audio_path, image_path) tuple returned by
    # main_interface.
    outputs=[
        gr.Textbox(label="Generated Output"),
        gr.Audio(label="Generated Audio", type="filepath"),
        gr.Image(label="Generated Image", type="filepath"),
    ],
    live=False,
)

interface.launch()



  return torch.load(file, map_location=device)
  return torch.load(file, map_location=device)
  WeightNorm.apply(module, name, dim)


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://f6806ba319a838d2bf.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


