<a href="https://colab.research.google.com/github/Chinmaysahoo03/Multimodal_GenAI_Assistant/blob/main/multimodal_appp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
%%writefile app.py
import streamlit as st
import torch
from diffusers import StableDiffusionPipeline, TextToVideoSDPipeline
from PIL import Image
import os
from datetime import datetime
import warnings
warnings.filterwarnings("ignore")  # Suppress minor warnings for cleaner output

# Setup Groq + LangChain for Chatbot
from langchain_groq import ChatGroq
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

# Page header: the app title followed by short usage instructions.
_USAGE_NOTES = """
Enter your Groq API key below to enable text generation. Then, use the input field to:
- Chat with the assistant
- Generate images with: **generate image: [description]**
- Generate videos with: **generate video: [description]**
"""
st.title("Multimodal AI Assistant")
st.markdown(_USAGE_NOTES)

# Groq API key entry. Everything below needs a key, so the script halts
# here until one has been typed into the password field.
if "groq_api_key" not in st.session_state:
    st.session_state["groq_api_key"] = ""

groq_api_key = st.text_input("Enter your Groq API Key:", type="password", key="groq_api_input")
if not groq_api_key:
    st.warning("Please enter a valid Groq API key to proceed.")
    st.stop()

# Reached only when a non-empty key was entered (st.stop() ends the run above).
st.session_state["groq_api_key"] = groq_api_key
st.success("Groq API key set successfully!")

# Build the Groq-backed chat chain (Gemma-2-9B-IT); stop the run if setup fails.
try:
    # Conversational prompt: the transcript so far, then the new human turn.
    prompt_template = PromptTemplate(
        template="{history}\nHuman: {input}\nAssistant:",
        input_variables=["history", "input"],
    )
    llm = ChatGroq(
        model="gemma2-9b-it",
        temperature=0.7,
        api_key=st.session_state.groq_api_key,
    )
    chain = LLMChain(prompt=prompt_template, llm=llm)
except Exception as init_error:
    st.error(f"Failed to initialize Groq model: {init_error}")
    st.stop()

# Setup Text-to-Image: Stable Diffusion
@st.cache_resource
def _load_image_pipeline():
    """Load the Stable Diffusion pipeline once and reuse it across reruns.

    Streamlit re-executes this script on every widget interaction; without
    caching, the model weights would be reloaded (and device memory
    re-allocated) each time the user clicks a button or edits the input.

    Returns:
        The pipeline, moved to CUDA when a GPU is available.
    """
    use_cuda = torch.cuda.is_available()
    pipeline = StableDiffusionPipeline.from_pretrained(
        "CompVis/stable-diffusion-v1-4",
        # Half precision is only requested when running on a GPU.
        torch_dtype=torch.float16 if use_cuda else torch.float32,
    )
    return pipeline.to("cuda") if use_cuda else pipeline


try:
    pipe_image = _load_image_pipeline()
    if torch.cuda.is_available():
        st.info("GPU enabled for faster image generation!")
    else:
        st.info("Running image generation on CPU.")
except Exception as e:
    # Keep the app usable for chat even when the image model is unavailable;
    # generate_image() checks for None before using the pipeline.
    st.error(f"Failed to load Stable Diffusion model: {e}")
    pipe_image = None

# Setup Text-to-Video: ModelScope Text-to-Video
@st.cache_resource
def _load_video_pipeline():
    """Load the ModelScope text-to-video pipeline once and reuse it across reruns.

    Streamlit re-executes this script on every widget interaction; without
    caching, the model weights would be reloaded (and device memory
    re-allocated) each time the user clicks a button or edits the input.

    Returns:
        The pipeline, moved to CUDA when a GPU is available.
    """
    use_cuda = torch.cuda.is_available()
    pipeline = TextToVideoSDPipeline.from_pretrained(
        "damo-vilab/text-to-video-ms-1.7b",
        # Half precision is only requested when running on a GPU.
        torch_dtype=torch.float16 if use_cuda else torch.float32,
        variant="fp16",
    )
    return pipeline.to("cuda") if use_cuda else pipeline


try:
    pipe_video = _load_video_pipeline()
    if torch.cuda.is_available():
        st.info("GPU enabled for faster video generation!")
    else:
        st.info("Running video generation on CPU.")
except Exception as e:
    # Keep the app usable for chat even when the video model is unavailable;
    # generate_video() checks for None before using the pipeline.
    st.error(f"Failed to load Text-to-Video model: {e}")
    pipe_video = None

# Only announce full readiness when both media pipelines are available.
if pipe_image and pipe_video:
    st.success("Models loaded successfully! Ready for chat, image, and video generation.")

def generate_chat_response(user_input, history=""):
    """
    Ask the Groq-backed chain for a reply and extend the conversation transcript.

    Returns a ``(reply, updated_history)`` pair. If anything goes wrong, the
    error is shown in the app and a fallback message is returned together
    with the unchanged ``history``.
    """
    try:
        result = chain.invoke({"input": user_input, "history": history})
        # Prefer the 'text' key; otherwise fall back to 'output'.
        if 'text' in result:
            reply = result['text']
        else:
            reply = result.get('output', 'No valid response')
        transcript = f"{history}\nHuman: {user_input}\nAssistant: {reply}"
        return reply.strip(), transcript
    except Exception as chat_error:
        st.error(f"Error generating text response: {chat_error}")
        return "Error generating response.", history

def generate_image(prompt, negative_prompt="blurry, low quality", num_steps=50):
    """
    Render ``prompt`` with the Stable Diffusion pipeline and save it as a PNG.

    Returns ``(image, image_path)``; the file name carries a timestamp with
    one-second resolution. Returns ``(None, None)`` when the pipeline is not
    loaded or generation fails (the error is shown in the app).
    """
    if pipe_image is None:
        st.error("Image generation model not loaded.")
        return None, None

    try:
        output = pipe_image(
            prompt,
            negative_prompt=negative_prompt,
            num_inference_steps=num_steps,
        )
        image = output.images[0]
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        image_path = "generated_image_" + stamp + ".png"
        image.save(image_path)
    except Exception as image_error:
        st.error(f"Error generating image: {image_error}")
        return None, None

    return image, image_path

def generate_video(prompt, num_frames=16, num_steps=25, fps=8):
    """
    Render ``prompt`` with the ModelScope text-to-video pipeline and export an MP4.

    Frames are generated at a fixed 320x512 (height x width). Returns the path
    of the written file, or ``None`` when the pipeline is not loaded or
    generation fails (the error is shown in the app).
    """
    if pipe_video is None:
        st.error("Video generation model not loaded.")
        return None

    try:
        output = pipe_video(
            prompt,
            num_inference_steps=num_steps,
            height=320,
            width=512,
            num_frames=num_frames,
        )
        video_frames = output.frames[0]
        # Imported here, as in the original layout, so an import failure is
        # reported through the same error path as a generation failure.
        from diffusers.utils import export_to_video
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        video_path = "generated_video_" + stamp + ".mp4"
        export_to_video(video_frames, video_path, fps=fps)
    except Exception as video_error:
        st.error(f"Error generating video: {video_error}")
        return None

    return video_path

# Conversation transcript persists across reruns in session state.
if "history" not in st.session_state:
    st.session_state["history"] = ""

# Free-text query box
user_input = st.text_input("Your Input:", key="user_input")

# One action button per column: chat, image, video
chat_col, image_col, video_col = st.columns(3)
chat_button = chat_col.button("Generate Text Response")
image_button = image_col.button("Generate Image")
video_button = video_col.button("Generate Video")

# Process user input
#
# Streamlit re-executes this script on every widget interaction. The most
# recent chat exchange is therefore kept in session state so that a rerun
# caused by an unrelated button (e.g. "Generate Image from Response") shows
# the existing answer again instead of calling the LLM a second time and
# appending a duplicate exchange to the history.
IMAGE_PREFIX = "generate image:"
VIDEO_PREFIX = "generate video:"
MAX_MEDIA_PROMPT_CHARS = 100


def _render_generated_image(prompt):
    """Generate an image for ``prompt`` and show it with a download button."""
    with st.spinner("Generating image..."):
        image, image_path = generate_image(prompt)
    if image is not None:
        st.image(image, caption="Generated Image")
        st.write(f"Image saved as {image_path}")
        with open(image_path, "rb") as file:
            st.download_button("Download Image", file.read(), file_name=image_path)


def _render_generated_video(prompt):
    """Generate a video for ``prompt`` and show it with a download button."""
    with st.spinner("Generating video (this may take a while)..."):
        video_path = generate_video(prompt)
    if video_path:
        st.video(video_path)
        st.write(f"Video saved as {video_path}")
        with open(video_path, "rb") as file:
            st.download_button("Download Video", file.read(), file_name=video_path)


def _media_prompt_from(response):
    """Shorten a chat response to a prompt of at most 100 characters plus an ellipsis."""
    if len(response) > MAX_MEDIA_PROMPT_CHARS:
        return response[:MAX_MEDIA_PROMPT_CHARS] + "..."
    return response


if "last_chat_input" not in st.session_state:
    st.session_state.last_chat_input = None
    st.session_state.last_chat_response = ""

if user_input:
    command = user_input.lower()
    is_media_command = command.startswith((IMAGE_PREFIX, VIDEO_PREFIX))

    if chat_button or not is_media_command:
        # Call the LLM only for a new message or an explicit button press;
        # any other rerun re-displays the cached answer.
        if chat_button or st.session_state.last_chat_input != user_input:
            with st.spinner("Generating text response..."):
                response, st.session_state.history = generate_chat_response(
                    user_input, st.session_state.history
                )
            st.session_state.last_chat_input = user_input
            st.session_state.last_chat_response = response
        else:
            response = st.session_state.last_chat_response

        st.write("**Assistant Response:**")
        st.write(response)

        # Offer to generate image or video based on response
        st.write("Generate media based on this response?")
        col4, col5 = st.columns(2)
        with col4:
            if st.button("Generate Image from Response"):
                _render_generated_image(_media_prompt_from(response))
        with col5:
            if st.button("Generate Video from Response"):
                _render_generated_video(_media_prompt_from(response))

    elif command.startswith(IMAGE_PREFIX) and image_button:
        # Handle direct image generation
        _render_generated_image(user_input[len(IMAGE_PREFIX):].strip())

    elif command.startswith(VIDEO_PREFIX) and video_button:
        # Handle direct video generation
        _render_generated_video(user_input[len(VIDEO_PREFIX):].strip())

# Demo section: replays a fixed script of chat, image and video requests.
st.subheader("Run Demo")
if st.button("Run Demo"):
    # The demo always starts from an empty conversation.
    st.session_state.history = ""
    for demo_input in (
        "Hi, what's generative AI?",
        "Can you describe a scene of AI creating art?",
        "generate image: AI painting a masterpiece",
        "generate video: AI robot dancing",
    ):
        st.write(f"**Demo Input:** {demo_input}")
        lowered = demo_input.lower()

        if lowered.startswith("generate image:"):
            # Everything after the 15-character prefix is the image prompt.
            image_prompt = demo_input[15:].strip()
            with st.spinner("Generating demo image..."):
                image, image_path = generate_image(image_prompt)
                if image:
                    st.image(image, caption=f"Demo Image: {image_prompt}")
                    st.write(f"Image saved as {image_path}")
                    with open(image_path, "rb") as file:
                        st.download_button("Download Demo Image", file, file_name=image_path)
            continue

        if lowered.startswith("generate video:"):
            # Everything after the 15-character prefix is the video prompt.
            video_prompt = demo_input[15:].strip()
            with st.spinner("Generating demo video..."):
                video_path = generate_video(video_prompt)
                if video_path:
                    st.video(video_path)
                    st.write(f"Video saved as {video_path}")
                    with open(video_path, "rb") as file:
                        st.download_button("Download Demo Video", file, file_name=video_path)
            continue

        # Anything else is treated as a chat turn and extends the history.
        with st.spinner("Generating demo text response..."):
            response, st.session_state.history = generate_chat_response(
                demo_input, st.session_state.history
            )
            st.write("**Assistant Response:**")
            st.write(response)

Overwriting app.py


In [11]:
# Install dependencies
!pip install streamlit langchain-groq transformers torch diffusers accelerate pillow moviepy opencv-python pyngrok -q

# Import required libraries
from pyngrok import ngrok
from google.colab import userdata
import subprocess

# Set ngrok authtoken from Colab Secrets
ngrok_auth_token = userdata.get('ngrok_auth_token')
if not ngrok_auth_token:
    print("Error: Please set the 'ngrok_auth_token' in Colab Secrets.")
else:
    ngrok.set_auth_token(ngrok_auth_token)

    # Start Streamlit server in the background
    subprocess.Popen(["streamlit", "run", "app.py", "--server.port", "8501"])

    # Create ngrok tunnel
    public_url = ngrok.connect(8501)
    print(f"Access the Streamlit app at: {public_url}")



PyngrokNgrokHTTPError: ngrok client exception, API returned 502: {"error_code":103,"status_code":502,"msg":"failed to start tunnel","details":{"err":"failed to start tunnel: The authtoken credential '32wwqziCfTj28VWfkF0SccsvuIo' has been revoked\nand is no longer valid.\r\n\r\nERR_NGROK_300\r\n"}}
