# Text Input

In [None]:
# Load variables from a local .env file into the process environment before
# any model client is created.
# NOTE(review): presumably this is where the model provider's API key comes
# from — confirm against the project's .env file.
from dotenv import load_dotenv
load_dotenv()

In [None]:
from langchain.agents import create_agent

# Text-only agent: the system prompt sets the persona used for every reply.
agent = create_agent(
    model="gpt-5-nano",
    system_prompt=(
        "You are a Hindi Fiction Story Writer, "
        "Create a capital city of anything."
    ),
)

In [None]:
from langchain.messages import HumanMessage

# Wrap the prompt as a single text content block inside a human message.
question = HumanMessage(
    content=[{"type": "text", "text": "What is Capital of Mars?"}],
)

# Run the agent on a one-message conversation.
response = agent.invoke({"messages": [question]})

# The reply is taken from the last message of the returned state.
final_message = response["messages"][-1]
print(final_message.content)

# Image Input

In [None]:
from ipywidgets import FileUpload
from IPython.display import display

# Restrict the file picker to JPEG images. Both common extensions are listed
# because many JPEG files are saved as ".jpg", which a ".jpeg"-only filter
# hides from the picker. Only one file may be selected at a time.
uploader = FileUpload(accept=".jpg,.jpeg", multiple=False)
display(uploader)

In [None]:
# Inspect what the upload widget currently holds. The next cell reads
# uploader.value[0]["content"], so this shows whether a file has arrived yet.
print(uploader.value)

In [None]:
import base64

# uploader.value is indexed below as a sequence of per-file dicts. It is empty
# until a file has been chosen, so fail early with an actionable message
# instead of a bare "index out of range".
if not uploader.value:
    raise ValueError(
        "No file uploaded yet: choose an image in the upload widget, "
        "then re-run this cell."
    )

# Get the first (and only) uploaded file dict
uploaded_file = uploader.value[0]

# The raw payload is a memoryview; bytes() copies it into an immutable buffer
content_mv = uploaded_file["content"]
img_bytes = bytes(content_mv)

# Base64-encode, then decode to str so it can be embedded in a message block
img_64 = base64.b64encode(img_bytes).decode("utf-8")

In [None]:
# One human message carrying two content blocks: the instruction text and the
# uploaded picture as base64 data tagged as JPEG.
text_block = {"type": "text", "text": "Tell me about this"}
image_block = {"type": "image", "base64": img_64, "mime_type": "image/jpeg"}
multimodal_question = HumanMessage(content=[text_block, image_block])

# Send the message to the agent.
response = agent.invoke({"messages": [multimodal_question]})

# The reply is taken from the last message of the returned state.
final_message = response["messages"][-1]
print(final_message.content)


# Audio Input

In [None]:
!pip install pyaudio

In [None]:
import sounddevice as sd
from scipy.io.wavfile import write
import base64
import io
import time
from tqdm import tqdm


# Capture parameters
duration = 5         # length of the recording, in seconds
sample_rate = 44100  # samples per second

print("Recording...")

# Request a single-channel recording of duration * sample_rate frames.
frame_count = int(duration * sample_rate)
audio = sd.rec(frame_count, samplerate=sample_rate, channels=1)

# Tick a progress bar in 0.1 s steps for the planned duration, then wait on
# the recording itself before continuing.
for _ in tqdm(range(duration * 10)):
    time.sleep(0.1)
sd.wait()
print("Done")

# Serialize the samples as a WAV file held in memory rather than on disk.
buf = io.BytesIO()
write(buf, sample_rate, audio)
wav_bytes = buf.getvalue()

# Base64 text form of the WAV bytes, ready to embed in a message block.
aud_64 = str(base64.b64encode(wav_bytes), "utf-8")

In [None]:
# Rebuild the agent on an audio-capable model. The identifier is spelled with
# the letter "o" ("gpt-4o"), not the digit zero; the zero spelling is not a
# valid model name and the request is rejected.
agent = create_agent(
    model="gpt-4o-audio-preview",
)

# One human message carrying two content blocks: the instruction text and the
# recorded clip as base64 data tagged as WAV.
multimodal_question = HumanMessage(content=[
    {"type": "text", "text": "Tell me about this audio file."},
    {"type": "audio", "base64": aud_64, "mime_type": "audio/wav"},
])

response = agent.invoke({"messages": [multimodal_question]})

# The reply is taken from the last message of the returned state.
print(response["messages"][-1].content)