# Install and Imports
This code demonstration is copied from a Kaggle notebook.

In [None]:
# !pip install langgraph
# !pip install -q timm==1.0.17
# !pip install git+https://github.com/huggingface/transformers.git

In [None]:
import gc
import os
from typing import Optional, Sequence

import kagglehub
import torch
from langchain_core.messages import AIMessage, BaseMessage, HumanMessage
from langgraph.graph import StateGraph, START, END, add_messages # (we can remove if not needed, we will not using langgraph only the add_message is needed we can mimic that)
from transformers import AutoProcessor, AutoModelForImageTextToText
from typing_extensions import TypedDict, Annotated


# Agent State / Model

In [None]:
class AgentState(TypedDict):
    """State dictionary handed from one pipeline node to the next.

    Each node reads the keys it needs and returns a partial dict that the
    caller merges back into the state.
    """
    # Path of the audio file to transcribe (read by transcribe_audio_node).
    audio_path: Optional[str]
    # Path of the image to describe (read by describe_image_tool).
    image_path: Optional[str]
    # Text produced by the transcription node.
    transcribed_text: Optional[str]
    # Text produced by the image-description node.
    image_description: Optional[str]
    # Report drafts and feedback as messages, annotated with the
    # `add_messages` reducer.
    news_report: Annotated[Sequence[BaseMessage], add_messages]
    current_feedback: Optional[str]


# --- Model setup -----------------------------------------------------------
# These assignments must live at module level, NOT inside the class body:
# `generate` looks up `device`, `processor` and `model` as globals, and names
# assigned inside a class body only become class attributes.

# instruct version
gemma3n_2b_model_path = kagglehub.model_download("google/gemma-3n/transformers/gemma-3n-e2b-it")

# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

processor = AutoProcessor.from_pretrained(gemma3n_2b_model_path)
model = AutoModelForImageTextToText.from_pretrained(gemma3n_2b_model_path, torch_dtype="auto").to(device)

# Tools

### Function helper

In [None]:
# Function helper to call gemma
def generate(messages, max_new_tokens=512):
    """Send chat-formatted messages to the loaded model and return its reply.

    Args:
        messages: Chat messages in the structure accepted by
            ``processor.apply_chat_template``. The callers in this notebook
            pass a list holding one ``{"role": "user", "content": [...]}``
            dict whose content items are text, image or audio entries.
        max_new_tokens: Upper bound on the number of generated tokens.
            Defaults to 512, the value that used to be hard-coded.

    Returns:
        The decoded text of the newly generated tokens only; the prompt
        tokens are sliced off and special tokens are dropped.
    """
    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt"
    ).to(device, dtype=model.dtype)
    # Remember where the prompt ends so only the continuation is decoded.
    prompt_length = inputs["input_ids"].shape[-1]

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, disable_compile=True)
    # skip_special_tokens=True keeps control tokens (e.g. an end-of-turn
    # marker) out of the text that is later shown in the UI and saved to disk.
    text = processor.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # Clean up to free GPU RAM: drop the references, collect garbage first so
    # the tensors are actually released, then return cached blocks to the GPU.
    del inputs
    del outputs
    gc.collect()
    torch.cuda.empty_cache()

    return text

### Transcribe

In [None]:
# Sample audio
import requests
from IPython.display import Audio

audio_url='https://drive.google.com/uc?export=download&id=1sJrtX0N_Das3LnKOwIMagOrVG7mX9EVG'

# Download the audio file. The timeout stops the cell from hanging forever,
# and raise_for_status() fails loudly on an HTTP error instead of silently
# writing an error page into sample_audio.wav.
response = requests.get(audio_url, timeout=60)
response.raise_for_status()
with open("sample_audio.wav", "wb") as f:
    f.write(response.content)

# Play the downloaded audio file
Audio("sample_audio.wav")

In [None]:
# transcribe audio
def transcribe_audio_node(state: AgentState) -> dict:
    """Transcribe the audio file referenced by ``state['audio_path']``.

    Args:
        state: Agent state; only the ``audio_path`` key is read.

    Returns:
        A partial state update ``{"transcribed_text": <text>}``. When no
        audio path is set (missing, ``None`` or empty) the model is not
        called and the value is ``None``.
    """
    print("--- 🎤 TRANSCRIBING AUDIO ---")
    audio_path = state.get('audio_path')

    # Image-only runs have no audio; skip instead of sending None to the model.
    if not audio_path:
        print("   > No audio provided; skipping transcription.")
        return {"transcribed_text": None}

    messages = [{
        "role": "user",
        "content": [
            {"type": "audio", "audio": audio_path},
            {"type": "text", "text": "Transcribe the following audio. Provide only the transcribed text."}
        ]
    }]

    transcribed_text = generate(messages)
    # Only the first 800 characters are echoed to keep the log readable.
    print(f"   > Transcription: {transcribed_text[:800]}...")
    return {"transcribed_text": transcribed_text}

In [None]:
#### Testing ####
# Minimal state: the transcription node only reads `audio_path`.
test_state = dict(audio_path="sample_audio.wav")

# Run the node directly on the sample clip downloaded above.
print("--- 🧪 TESTING transcribe_audio_node ---")
transcription_result = transcribe_audio_node(test_state)

# The bare last expression renders the returned state update as cell output.
print("\n--- ✅ TEST COMPLETE ---")
transcription_result

### Describe Image

In [None]:
# Sample image
import requests
from IPython.display import Image

image_url = 'https://drive.google.com/uc?export=download&id=1-p8xLxUPZzoAwNTUFV29b90N08t6wZfl'

# Download the image. The timeout stops the cell from hanging forever, and
# raise_for_status() fails loudly on an HTTP error instead of silently
# writing an error page into river.jpg.
response = requests.get(image_url, timeout=60)
response.raise_for_status()
with open("river.jpg", "wb") as f:
    f.write(response.content)

# Only the last expression of a cell is rendered, so the local copy is the
# single image displayed here (the earlier mid-cell Image(url=...) expression
# produced no output and was removed).
Image('river.jpg')


In [None]:
def describe_image_tool(state: AgentState) -> dict:
    """Describe the image referenced by ``state['image_path']``.

    The image is sent to the multimodal model together with a plain
    "describe in detail" text prompt.

    Args:
        state: Agent state; only the ``image_path`` key is read.

    Returns:
        A partial state update ``{"image_description": <text>}``. When no
        image path is set (missing, ``None`` or empty) the model is not
        called and the value is ``None``.
    """
    print("--- 🖼️ DESCRIBING IMAGE ---")
    image_path = state.get('image_path')

    # Audio-only runs have no image; skip instead of sending None to the model.
    if not image_path:
        print("   > No image provided; skipping description.")
        return {"image_description": None}

    # Prepare the prompt for the model
    messages = [{
        "role": "user",
        "content": [
            # The model needs both the image and a text prompt
            {"type": "image", "image": image_path},
            {"type": "text", "text": 'Describe this image in detail.'}
        ]
    }]

    # Call the existing helper function to get the description
    image_description = generate(messages)
    print("   > Description generated successfully.")

    # Return a dictionary with the state field to update
    return {"image_description": image_description}

In [None]:
### Testing ###
# Minimal state: the description node only reads `image_path`.
test_state = dict(image_path='river.jpg')

# Run the node directly on the sample picture downloaded above.
print("--- 🧪 TESTING Decribe Image ---")
image_result = describe_image_tool(test_state)

# The bare last expression renders the returned state update as cell output.
print("\n--- ✅ TEST COMPLETE ---")
image_result

### AI Report Agent

In [None]:
from langchain_core.messages import AIMessage
def ai_agent_reporter(state: dict) -> dict:
    """Write a news report from whatever source material the state holds.

    Reads ``transcribed_text`` and ``image_description`` from ``state``,
    folds the ones that are present into a single prompt, and asks the model
    for a report.

    Returns:
        ``{"news_report": [AIMessage]}``. The message carries the generated
        report, or a fixed notice when neither source is available (the model
        is not called in that case).
    """
    print("--- ✍️ GENERATING NEWS REPORT ---")

    transcribed_text = state.get('transcribed_text')
    image_description = state.get('image_description')

    # Guard clause: with no source material there is nothing to report on.
    if not (transcribed_text or image_description):
        return {"news_report": [AIMessage(content="No input provided to generate a report.")]}

    # Instructions come first, then one labelled section per available source.
    context_parts = [
        "You are an expert news reporter. Your task is to write a clear, concise, and factual news report based on the following information.",
        "Synthesize all available information into a single, coherent story. Present it as a professional news report.",
    ]
    if transcribed_text:
        context_parts += [f"--- Transcribed Audio ---\n\"{transcribed_text}\""]
    if image_description:
        context_parts += [f"--- Image Description ---\n\"{image_description}\""]

    prompt = "\n\n".join(context_parts)

    # `generate` expects multimodal-style content: a list of typed parts,
    # even when the only part is text.
    text_part = {"type": "text", "text": prompt}
    messages = [{"role": "user", "content": [text_part]}]

    report_content = generate(messages)
    print("   > Report generated successfully.")

    # Hand back only the key this node changes.
    return {"news_report": [AIMessage(content=report_content)]}

In [None]:
### Testing ###
# Dummy state covering both sources the reporter can draw on.
test_state = dict(
    transcribed_text="The city council today approved the new budget for the fiscal year. The measure passed with a 7-2 vote after a lengthy debate.",
    image_description="A photograph shows a group of five officials sitting at a long wooden desk in a formal meeting room. The official in the center is speaking into a microphone.",
    news_report=[],  # required by the AgentState definition
)

# Run the reporter node on the dummy state.
report_result = ai_agent_reporter(test_state)

print("\n--- ✅ TEST COMPLETE ---")

# The report text lives on the first (and only) returned message.
print("\nGenerated Report:")
print(report_result['news_report'][0].content)

### Revise Report

In [None]:
def revise_report_node(state: dict) -> dict:
    """Revise the current report draft according to the latest human feedback.

    Args:
        state: Agent state. ``state['news_report']`` must end with the draft
            to revise followed by the feedback message, i.e. the last two
            entries are ``[..., <AI draft>, <human feedback>]``.
            ``transcribed_text`` and ``image_description`` are optional
            source material.

    Returns:
        ``{"news_report": [AIMessage]}`` holding the revised report.

    Raises:
        IndexError: If ``state['news_report']`` has fewer than two messages.
    """
    print("--- 🔄 REVISING REPORT ---")

    # Use `or` rather than a .get() default: the key may be present with the
    # value None (the field is Optional), which a default would not replace.
    transcribed = state.get("transcribed_text") or "Not available."
    image_description = state.get("image_description")

    # Get the latest human feedback and the last AI report from the message history
    history = state['news_report']
    human_feedback = history[-1].content
    last_ai_report = history[-2].content

    # The initial report may draw on the image description as well, so give
    # the editor the same material; the section is omitted when there is none.
    image_section = ""
    if image_description:
        image_section = f'\n**Image Description:**\n"{image_description}"\n'

    # Construct the prompt for the model
    prompt = f"""You are a professional news editor.
Revise the news report to address the feedback. Ensure clarity, grammar, and style are improved, while staying faithful to the original source material.

**Original Transcription:**
"{transcribed}"
{image_section}
**Current Draft of News Report:**
"{last_ai_report}"

**Latest Human Feedback:**
"{human_feedback}"

Provide only the full, revised news report as your response.
"""

    # Format the messages correctly for our multimodal 'generate' function
    messages = [{"role": "user",
                 "content": [{"type": "text", "text": prompt}]}]

    # Use the 'generate' function for Gemma-3n
    revised_content = generate(messages)
    print("   > Revision complete.")

    # Return the new AI message to be added to the state
    return {"news_report": [AIMessage(content=revised_content)]}

In [None]:
# Testing

# Sample state mimicking a conversation history. The revision node reads the
# last two entries of `news_report`, so the AI draft must come first and the
# human feedback on it second.
test_state_for_revision = dict(
    transcribed_text="The city council today approved the new budget for the fiscal year.",
    news_report=[
        AIMessage(content="The council has approved the new budget for the upcoming fiscal year after a lengthy debate."),
        HumanMessage(content="Revise the report to be in tagalog langguage."),
    ],
)

# Run the revision node on the sample history.
revision_result = revise_report_node(test_state_for_revision)

# The revised text lives on the first (and only) returned message.
print("\nRevised Report Content:")
print(revision_result['news_report'][0].content)
print("\n--- ✅ TEST COMPLETE ---")

# Save

In [None]:
def save_report_node(state: dict) -> dict:
    """Write the newest AI-authored report to ``saved_reports/news_report.txt``.

    Args:
        state: Agent state; ``state["news_report"]`` is searched newest-first
            for an ``AIMessage``.

    Returns:
        ``{"final_message": <status text>}``: a success line naming the file,
        or an error line when the history contains no AI message (nothing is
        written in that case).
    """
    print("--- 💾 SAVING REPORT ---")

    # Walk the history from the end and stop at the first AI message.
    latest_report_msg = None
    for candidate in reversed(state["news_report"]):
        if isinstance(candidate, AIMessage):
            latest_report_msg = candidate
            break

    if not latest_report_msg:
        return {"final_message": "Error: No report available to save."}

    # Make sure the target directory exists before opening the file.
    output_dir = "saved_reports"
    os.makedirs(output_dir, exist_ok=True)
    filename = os.path.join(output_dir, "news_report.txt")

    # "w" mode: an existing report file is overwritten.
    with open(filename, "w", encoding="utf-8") as report_file:
        report_file.write(latest_report_msg.content)

    final_message = f"✅ News report successfully saved to: {filename}"
    print(f"   > {final_message}")

    # The status text is returned for the caller to display.
    return {"final_message": final_message}

# Gradio

In [None]:
# ===== GRADIO APPLICATION LOGIC =====
def run_initial_generation(audio_path, image_path):
    """Run the generation pipeline for the "Generate Initial Report" button.

    Args:
        audio_path: File path of the uploaded audio, or a falsy value if none.
        image_path: File path of the uploaded image, or a falsy value if none.

    Returns:
        A 6-tuple in the order of the button's ``outputs`` list:
        (report text, agent state, visibility update for the revision group,
        status text, transcribed text, image description).
    """
    if not audio_path and not image_path:
        return "Please provide an audio or image file.", None, gr.update(visible=False), None, None, None

    # Run the pipeline
    state = AgentState(audio_path=audio_path, image_path=image_path, news_report=[])
    # Only run the nodes whose input was actually supplied; calling the model
    # with a missing audio/image path would fail on single-input runs.
    if audio_path:
        state.update(transcribe_audio_node(state))
    if image_path:
        state.update(describe_image_tool(state))
    state.update(ai_agent_reporter(state))

    # Extract info for UI
    latest_report = state["news_report"][-1].content
    transcribed_text = state.get('transcribed_text') or "No audio was provided to transcribe."
    image_description = state.get('image_description') or "No image was provided to describe."

    return latest_report, state, gr.update(visible=True), "", transcribed_text, image_description

def run_revision(feedback, current_state):
    """Apply the user's feedback to the current report for the "Revise" button.

    Args:
        feedback: Text typed into the feedback box.
        current_state: Agent state produced by ``run_initial_generation``.

    Returns:
        A 5-tuple in the order of the button's ``outputs`` list:
        (report text, agent state, status text, transcribed text,
        image description). With blank feedback the latest report is returned
        unchanged together with a prompt to provide feedback.
    """
    if not feedback or not feedback.strip():
        latest_report = next((msg.content for msg in reversed(current_state["news_report"]) if isinstance(msg, AIMessage)), "")
        transcribed_text = current_state.get('transcribed_text', "")
        image_description = current_state.get('image_description', "")
        return latest_report, current_state, "Please provide feedback.", transcribed_text, image_description

    # Run revision pipeline
    current_state["news_report"] = add_messages(current_state["news_report"], [HumanMessage(content=feedback)])
    revision_update = dict(revise_report_node(current_state))
    # Merge the revised draft through the add_messages reducer instead of
    # dict.update(), which would replace the whole history with just the
    # newest message.
    revised_messages = revision_update.pop("news_report", [])
    current_state.update(revision_update)
    current_state["news_report"] = add_messages(current_state["news_report"], revised_messages)

    # Extract info for UI
    latest_report = current_state["news_report"][-1].content
    transcribed_text = current_state.get('transcribed_text') or "No audio was provided."
    image_description = current_state.get('image_description') or "No image was provided."

    return latest_report, current_state, "", transcribed_text, image_description

def run_save(current_state):
    """Save the latest report and return the status text for the UI."""
    return save_report_node(current_state)["final_message"]


In [None]:
 # ===== GRADIO UI DEFINITION =====
import gradio as gr
# Build the Gradio interface. Components are created inside nested layout
# contexts (Row / Column / Accordion / Group), so statement order here
# determines the on-screen layout.
with gr.Blocks(theme=gr.themes.Soft(), title="Multimodal News Reporter") as demo:
    # Holds the agent state between button clicks; starts empty.
    agent_state = gr.State(value=None)

    gr.Markdown("# 📰 Multimodal News Reporter AI")
    gr.Markdown("Upload an audio recording and/or a relevant image. The AI will generate a news report that you can then revise and save.")

    with gr.Row():
        # Left column: the two inputs and the generate button.
        with gr.Column(scale=1):
            # type="filepath" so the handlers receive file paths.
            audio_input = gr.Audio(label="Audio Evidence", type="filepath")
            image_input = gr.Image(label="Image Evidence", type="filepath")
            generate_btn = gr.Button("📝 Generate Initial Report", variant="primary")
        # Right column: report, status line, source info and revision controls.
        with gr.Column(scale=2):
            report_output = gr.Textbox(label="Generated News Report", lines=12, interactive=False)
            status_output = gr.Markdown(value="")
            
            # Collapsible section showing the source material behind the report.
            with gr.Accordion("Show Source Information", open=False) as source_info_accordion:
                transcribed_audio_output = gr.Textbox(label="🎤 Transcribed Audio", interactive=False, lines=5)
                image_description_output = gr.Textbox(label="🖼️ Image Description", interactive=False, lines=5)

            # Starts hidden; run_initial_generation returns
            # gr.update(visible=True) for it once a report exists.
            with gr.Group(visible=False) as revision_group:
                gr.Markdown("### ✍️ Provide Feedback for Revision")
                feedback_input = gr.Textbox(label="Your Feedback", placeholder="e.g., 'Make the tone more formal.'")
                with gr.Row():
                    revise_btn = gr.Button("🔄 Revise Report")
                    save_btn = gr.Button("💾 Save Final Report")

    # --- Event handlers ---
    # Each `outputs` list must match, in order and length, the tuple returned
    # by the handler function.
    generate_btn.click(
        fn=run_initial_generation,
        inputs=[audio_input, image_input],
        outputs=[report_output, agent_state, revision_group, status_output, transcribed_audio_output, image_description_output]
    )
    # After the revision handler runs, the chained step sets the feedback box
    # to an empty string.
    revise_btn.click(
        fn=run_revision,
        inputs=[feedback_input, agent_state],
        outputs=[report_output, agent_state, status_output, transcribed_audio_output, image_description_output]
    ).then(fn=lambda: "", outputs=[feedback_input])
    # The save handler returns only the status text.
    save_btn.click(
        fn=run_save,
        inputs=[agent_state],
        outputs=[status_output]
    )

# ===== LAUNCH THE APP =====
# Launch only when this code runs as the main module.
if __name__ == "__main__":
    demo.launch(debug=True)

---

# Write up

📝 Project Title: Newsly – Field Reporting Reinvented
🗣️ Why We Built Newsly
“In every storm, every quake, every conflict—
our reporters are there.
Not for fame.
But to get the truth out.

Sometimes, they miss details. They make mistakes.
Not because they don’t care—
but because they’re doing everything manually.

Often, we forget the reporters.
No sleep. No food.
Calamity strikes, and still—
they stay awake all night and day,
just to deliver the truth.
Writing, editing, transcribing… all by hand.

We see their hard work.
We see the danger they face.

Newsly changes that.

One tap to transcribe.
One photo to describe.
One second to report.

No signal. No delay. Just the truth.”

📌 Problem Statement
Journalists and field reporters often work in extreme conditions — natural disasters, power outages, no signal. Yet they are expected to capture, write, and deliver accurate news, often without sleep, rest, or proper tools.
This manual workload increases errors, delays reporting, and contributes to burnout.

🚀 Solution: Newsly App
Newsly is a mobile-first AI assistant that helps field reporters transcribe, describe, and generate reports on the fly — even offline. Built with a multimodal LLM, it handles audio, images, and text to deliver fast, accurate, editable news drafts anytime, anywhere.

⚙️ Key Features
🎤 Audio Recording + Transcription
Record interviews and transcribe them instantly — offline.

📸 Photo Capture + Auto Description
Snap photos and generate scene-based descriptions.

🧠 AI-Powered Report Generation
Turn interviews, photos, and field notes into editable reports in seconds.

📝 Human Review & Edits
Edit headlines, adjust tone, and finalize reports in-app.

📡 Works Offline
Full functionality without internet access — ideal for disaster zones.

👥 Target Users
Journalists & field reporters

Local news teams

Emergency response media units

NGOs and humanitarian storytellers

💻 Technology Stack
Multimodal LLM for text, image, and voice understanding

Lightweight mobile interface (Android-first)

Local-first processing for offline use

Supports Tagalog and English transcription and generation

🌍 Impact
Newsly doesn’t replace the reporter — it empowers them.
It reduces cognitive load, prevents burnout, and speeds up truth-telling during high-stakes situations.
With Newsly, reporters stay focused on the story — not the typing.

