# **Step 1: Installing Dependencies**

In [None]:
!pip install -qU google-generativeai
!pip install -qU google-ai-generativelanguage
!pip install -qU langchain
!pip install -qU langchain-google-genai
!pip install -qU murf
!pip install -qU pdfplumber
!pip install -qU pdfminer.six

[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain-google-genai 2.1.9 requires google-ai-generativelanguage<0.7.0,>=0.6.18, but you have google-ai-generativelanguage 0.6.15 which is incompatible.[0m[31m
[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-generativeai 0.8.5 requires google-ai-generativelanguage==0.6.15, but you have google-ai-generativelanguage 0.6.18 which is incompatible.[0m[31m
[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-generativeai 0.8.5 requires google-ai-generativelanguage==0.6.15, but you have google-ai-generativelanguage 0.6.18 which is

# **Step 2: Importing Libraries**

In [None]:
import os
import base64
import mimetypes
import ipywidgets as widgets
from IPython.display import display, Audio, HTML
from google.colab import userdata, auth
from getpass import getpass
import google.generativeai as genai
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import HumanMessage
from murf import Murf
import requests

# **Step 3: Setting up API Key & Environment Variable**

In [None]:
# Ask for each credential only when it is not already present in the process
# environment, so re-running this cell does not prompt again. Input is read
# with getpass, which keeps the typed key out of the cell output.
_CREDENTIAL_PROMPTS = (
    ("GOOGLE_API_KEY", "🔑 Enter your Google API Key: "),
    ("MURF_API_KEY", "🔊 Enter your Murf API Key: "),
)

for _env_name, _prompt_text in _CREDENTIAL_PROMPTS:
    if _env_name not in os.environ:
        os.environ[_env_name] = getpass(_prompt_text)

🔑 Enter your Google API Key: ··········
🔊 Enter your Murf API Key: ··········


# **Step 4: Initializing the Agents**

In [None]:
# Both clients read the keys stored in the environment by the previous step.
_google_key = os.environ["GOOGLE_API_KEY"]
_murf_key = os.environ["MURF_API_KEY"]

# Agent 1: Gemini chat model (through LangChain) that analyzes the upload.
llm = ChatGoogleGenerativeAI(
    model="models/gemini-1.5-flash-latest",
    temperature=0.8,
    google_api_key=_google_key,
)

# Agent 2: Murf client that turns the analysis text into speech.
murf_client = Murf(api_key=_murf_key)

# **Step 5: Creating UI Components**

In [None]:
# File picker: one file at a time, with no file-type filter set (`accept=''`).
_upload_layout = widgets.Layout(width='50%', height='60px')
upload_widget = widgets.FileUpload(
    description="📤 Upload File",
    accept='',
    multiple=False,
    layout=_upload_layout,
)

# Colour the upload button via its style object.
_upload_style = upload_widget.style
_upload_style.button_color = '#ff6600'
_upload_style.font_color = 'black'


def _action_button(label, kind, icon_name, **extra):
    """Build a 200px-wide button with the given caption, button style and icon.

    Any additional keyword arguments are forwarded to ``widgets.Button``.
    Each button gets its own ``Layout`` instance.
    """
    return widgets.Button(
        description=label,
        button_style=kind,
        icon=icon_name,
        layout=widgets.Layout(width='200px'),
        **extra,
    )


# Starts the analysis of the uploaded file.
submit_button = _action_button("📄 Analyze File", 'primary', 'search')

# Created disabled; handle_analysis re-enables it once text is available.
tts_button = _action_button(
    "🔊 Text to Speech", 'success', 'volume-up', disabled=True
)

# One layout object shared by both output panes.
black_style = widgets.Layout(
    width='auto',
    padding='10px',
    border='solid 2px white',
    background_color='black',
    color='white',
)

# Pane for the analysis text, then pane for the audio player.
output_area, audio_area = (widgets.Output(layout=black_style) for _ in range(2))

# **Agent 1: Processes Uploaded File**

In [None]:
def _first_upload(value):
    """Return ``(filename, raw_bytes)`` for the first file in a FileUpload value.

    Two shapes are accepted:

    * a dict keyed by filename whose entries hold ``metadata`` and ``content``
      (the shape this notebook was originally written against), and
    * a sequence of flat dicts with ``name`` and ``content`` keys, where
      ``content`` may be a memoryview (the ipywidgets 8 shape).
    """
    if isinstance(value, dict):
        entry = next(iter(value.values()))
        name = entry['metadata']['name']
    else:
        entry = value[0]
        name = entry['name']
    # bytes() normalizes both plain bytes and memoryview payloads.
    return name, bytes(entry['content'])


def _pdf_to_text(file_bytes):
    """Return the text of every PDF page, joined with newlines."""
    import io
    import pdfplumber

    with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
        # extract_text() may return None for a page; treat that as empty.
        return "\n".join(page.extract_text() or "" for page in pdf.pages)


def _docx_to_text(file_bytes):
    """Return the paragraph text of a .docx file using only the standard library.

    Raises:
        ValueError: if the payload is not a zip container (e.g. a legacy
            binary ``.doc`` file), which cannot be read this way.
    """
    import io
    import zipfile
    import xml.etree.ElementTree as ET

    buffer = io.BytesIO(file_bytes)
    if not zipfile.is_zipfile(buffer):
        raise ValueError(
            "only .docx Word documents are supported (legacy .doc is not)"
        )

    w = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
    with zipfile.ZipFile(buffer) as archive:
        # NOTE(review): ElementTree is not hardened against maliciously
        # crafted XML; acceptable for a user's own uploads.
        root = ET.fromstring(archive.read("word/document.xml"))

    # One output line per <w:p>; a paragraph's text is its <w:t> runs joined.
    return "\n".join(
        "".join(run.text or "" for run in para.iter(w + "t"))
        for para in root.iter(w + "p")
    )


def _document_text(file_bytes, mime_type):
    """Turn a PDF, Word or plain-text upload into a string."""
    if "pdf" in mime_type:
        return _pdf_to_text(file_bytes)
    if "word" in mime_type:
        return _docx_to_text(file_bytes)
    # text/*: undecodable bytes are replaced rather than aborting the run.
    return file_bytes.decode("utf-8", errors="replace")


def handle_analysis(btn):
    """Click handler for the "Analyze File" button.

    Reads the uploaded file, sends it to the LLM and shows the reply in
    ``output_area``. On success the reply is stored in the module-level
    ``extracted_text`` and the text-to-speech button is enabled.

    Documents (text, PDF, Word) are converted to text locally and inlined in
    the prompt; images are sent as a base64 data URL next to the prompt in a
    single message. Any other MIME type is reported as unsupported.

    Args:
        btn: the button that fired the event (unused).
    """
    global extracted_text

    output_area.clear_output()
    audio_area.clear_output()
    tts_button.disabled = True

    if not upload_widget.value:
        with output_area:
            print("Please upload a file first.")
        return

    try:
        filename, file_bytes = _first_upload(upload_widget.value)
        mime_type = mimetypes.guess_type(filename)[0] or "application/octet-stream"

        with output_area:
            print(f"📄 Processing `{filename}` (type: {mime_type})")

        if mime_type.startswith("text/") or "pdf" in mime_type or "word" in mime_type:
            document_text = _document_text(file_bytes, mime_type)
            if not document_text.strip():
                # e.g. a scanned PDF without a text layer: nothing to send.
                with output_area:
                    print(f"No extractable text found in `{filename}`.")
                return
            prompt = (
                f"Here is the extracted text from the document '{filename}'. "
                f"Simply, print the full raw text from it.\n\n{document_text}"
            )
            messages = [HumanMessage(content=prompt)]

        elif mime_type.startswith("image/"):
            prompt = f"Analyze the visual content of the image '{filename}'."
            base64_image = base64.b64encode(file_bytes).decode('utf-8')
            messages = [
                HumanMessage(content=[
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:{mime_type};base64,{base64_image}"},
                    },
                ])
            ]

        else:
            with output_area:
                print(f"Unsupported file type: `{mime_type}`")
            return

        response = llm.invoke(messages)
        extracted_text = response.content

        with output_area:
            print("\n")
            print(extracted_text)

        tts_button.disabled = False

    except Exception as e:
        # Report inside the widget instead of losing the error in the callback.
        with output_area:
            print(f"Error during processing: {e}")

# **Agent 2: Convert Agent 1 Output to Speech**

In [None]:
def handle_tts(btn):
    """Click handler for the "Text to Speech" button.

    Sends the module-level ``extracted_text`` (set by ``handle_analysis``) to
    Murf, downloads the generated audio to ``output.mp3`` in the working
    directory and shows a player in ``audio_area``. Failures are printed in
    ``audio_area`` rather than raised.

    Args:
        btn: the button that fired the event (unused).
    """
    audio_area.clear_output()

    # The global only exists after a successful analysis; look it up
    # defensively so a premature click yields a readable message.
    text = globals().get("extracted_text")
    if not text:
        with audio_area:
            print("Nothing to convert yet. Analyze a file first.")
        return

    try:
        res = murf_client.text_to_speech.generate(
            text=text,
            voice_id="en-IN-rohan",
            style="Conversational",
            format="MP3",
            variation=3
        )
        audio_url = res.audio_file
        if not audio_url:
            with audio_area:
                print("Failed to generate audio.")
            return

        # Without a timeout a stalled download would block the callback forever.
        response = requests.get(audio_url, timeout=60)
        response.raise_for_status()
        with open("output.mp3", "wb") as f:
            f.write(response.content)
        with audio_area:
            display(Audio("output.mp3"))
    except Exception as e:
        with audio_area:
            print(f"TTS Error: {e}")

# **Step 6: Customizing UI & Final Output Layout**

In [None]:
def _centered_row(*children):
    """Place the given widgets in a horizontally centered, padded row."""
    return widgets.HBox(
        list(children),
        layout=widgets.Layout(justify_content='center', padding='10px'),
    )


def _section_label(text):
    """Create the caption shown above an output pane."""
    return widgets.Label(value=text, style={'description_width': 'initial'})


# Connect each button to its handler.
for _button, _callback in (
    (submit_button, handle_analysis),
    (tts_button, handle_tts),
):
    _button.on_click(_callback)

# Page-level CSS plus the title banner.
_banner = HTML("""
    <style>
        body {
            background-color: black !important;
            color: white !important;
        }
        .widget-label {
            color: white !important;
        }
        .widget-button {
            margin: 10px;
        }
    </style>
    <div style="text-align:center; padding:20px; background-color:black; color:white; border-radius:10px;">
        <h2>👁️‍🗨️ VocalEyes <br> Hear the World, See the Possibilities</h2>
        <p style="font-size:16px;"><br>Upload a document or image ➜ Analyze with Our AI Agent ➜ Convert Result to Audio</p>
        <p style="font-size:14px;"><br>Supported File Types: .txt .pdf .docx .jpg .jpeg .png .gif  <br> Maximum Upload size: ~100 MB</p>
    </div>
""")
display(_banner)

# Stack, top to bottom: upload control, action buttons, text pane, audio pane.
_app_rows = [
    _centered_row(upload_widget),
    _centered_row(submit_button, tts_button),
    _section_label("📃 Text Output"),
    output_area,
    _section_label("🎧 Audio Output"),
    audio_area,
]
display(widgets.VBox(_app_rows))

VBox(children=(HBox(children=(FileUpload(value={}, description='📤 Upload File', layout=Layout(height='60px', w…