In [1]:
!pip install gradio

# # 1️⃣ Install kokoro
!pip install -q kokoro>=0.3.4 soundfile

# # 2️⃣ Install espeak, used for English OOD fallback and some non-English languages
!apt-get -qq -y install espeak-ng > /dev/null 2>&1
# # 🇪🇸 'e' => Spanish es
# # 🇫🇷 'f' => French fr-fr
# # 🇮🇳 'h' => Hindi hi
# # 🇮🇹 'i' => Italian it
# # 🇧🇷 'p' => Brazilian Portuguese pt-br


Collecting gradio
  Downloading gradio-5.15.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.8-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.7.0 (from gradio)
  Downloading gradio_client-1.7.0-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.9.6-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.meta

In [2]:
from transformers import pipeline, AutoTokenizer
import torch
import gradio as gr
import requests
from PIL import Image
from io import BytesIO
from kokoro import KPipeline
from IPython.display import display, Audio
import soundfile as sf
import numpy as np
from google.colab import files
import time


In [3]:
def image_captioning(image_source):
    """
    Generates a caption for an image using the Salesforce BLIP image captioning model.

    Args:
      image_source: The path to a local image file, a URL (any string starting
        with "http") to an image, or an already-loaded image object such as a
        PIL.Image, which is passed to the model unchanged.

    Returns:
      A string containing the generated caption. If anything fails (model
      loading, download, decoding, inference) the exception is swallowed and a
      string of the form "An error occurred: <details>" is returned instead.
    """

    try:
        # load BLIP model (built on every call; device placement is left to device_map="auto")
        image_to_text = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large", device_map="auto")

        if isinstance(image_source, str):
            if image_source.startswith("http"):
                # URL: browser-like headers are sent with the request.
                # NOTE(review): the Referer is hard-coded to one site — confirm it is
                # appropriate for images hosted elsewhere.
                headers = {
                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
                    "Referer": "https://www.sinchew.com.my/"
                }
                # A timeout prevents an unresponsive host from blocking the app forever;
                # the with-block releases the connection even when raise_for_status() fails.
                with requests.get(image_source, headers=headers, timeout=30) as response:
                    response.raise_for_status()
                    image = Image.open(BytesIO(response.content))
            else:
                # local path
                image = Image.open(image_source)
        else:
            # already an image object
            image = image_source

        text = image_to_text(image)
        result = text[0]["generated_text"]
        print(f"{result}")

        return result

    except Exception as e:
        # Documented contract: report failures as text rather than raising.
        return f"An error occurred: {e}"




In [4]:
def llm_deepseek(messages, max_output_words=200):
    """
    Runs a chat completion with DeepSeek-R1-Distill-Qwen-1.5B and returns the answer text.

    Args:
      messages: Chat history as a list of {"role": ..., "content": ...} dicts.
      max_output_words: Generation budget. Despite the name it is forwarded
        unchanged as ``max_new_tokens``, i.e. it limits tokens, not words.

    Returns:
      The assistant reply with everything up to and including the first
      "</think>" tag removed and surrounding whitespace stripped. When the reply
      contains no "</think>" tag, the reply is returned unchanged.
    """

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
    # The pipeline is given only the model name; no separately loaded tokenizer is needed.
    pipe = pipeline(
        "text-generation",
        model=model_name,
        device=device,
        batch_size=8,
    )

    # Sampling temperature: 1 for budgets below 1000, 0.7 otherwise.
    temperature = 1 if max_output_words < 1000 else 0.7
    result = pipe(
        messages,
        max_new_tokens=max_output_words,
        return_full_text=True,
        temperature=temperature,
    )
    conversation = result[0]["generated_text"]  # input messages followed by the new reply

    # The generated reply is the last message; a fixed index of 1 is only right
    # when exactly one message was passed in.
    answer = conversation[-1]["content"]

    # Drop the reasoning that precedes the closing </think> tag, if there is one.
    _, closing_tag, after_thinking = answer.partition("</think>")
    if closing_tag:
        return after_thinking.strip()
    return answer



In [5]:
def tts_kokoro(text, download=0, output_format='wav'):
    """
    Converts text to speech with Kokoro and saves the result to a file.

    The text is split into chunks on blank lines (two consecutive newlines),
    each chunk is synthesised with the 'af_heart' voice, and the clips are joined
    into one waveform. The audio is written to "output.<output_format>" in the
    current directory and also shown as an inline notebook player.

    Args:
      text: The text to speak.
      download: When truthy, the written file is handed to ``files.download``.
      output_format: File extension used for the output file name.

    Returns:
      The path of the written audio file.

    Raises:
      ValueError: If no audio clip was produced (for example for empty text).
    """

    sample_rate = 24000  # used for both the written file and the inline player

    # Named tts_pipeline so it does not shadow the module-level transformers `pipeline`.
    tts_pipeline = KPipeline(lang_code='a')
    generator = tts_pipeline(
        text,
        voice='af_heart',
        speed=1,
        split_pattern=r'\n\n'
    )

    # concat tts audio clips
    audio_clips = [audio for (_, _, audio) in generator]
    if not audio_clips:
        # np.concatenate([]) would raise an opaque ValueError; fail with a clear message instead.
        raise ValueError("No audio was generated: the input text produced no speech segments.")
    full_audio = np.concatenate(audio_clips)

    output_file = f"output.{output_format}"
    sf.write(output_file, full_audio, sample_rate)
    display(Audio(data=full_audio, rate=sample_rate))

    if download:
        files.download(output_file)

    return output_file


In [6]:
def process_image_caption(image):
    """Pipeline step 1: return whatever image_captioning produces for the given image."""
    return image_captioning(image)

def process_tts(text):
    """Pipeline step 3: synthesise the text with tts_kokoro (default options) and return its result."""
    return tts_kokoro(text)

def process_text_generation(text, max_words):
    """
    Pipeline step 2: ask the LLM for a story based on ``text``.

    Args:
      text: The prompt for the story (the image caption in the full pipeline).
      max_words: Requested upper bound on the story length in words. Coerced to
        int so a float coming from a UI slider does not leak into the prompt
        ("100.0 words") or into the token budget.

    Returns:
      The text returned by llm_deepseek for the constructed prompt.
    """
    max_words = int(max_words)

    # Word count after which the model is told it may start wrapping up:
    # the larger of 70% of the limit and (limit - 50).
    checkpoint_ending = max(round(max_words*0.7), max_words-50)
    # Generation budget handed to llm_deepseek: at least 800, otherwise three per requested word.
    max_token = max(800, max_words*3)
    messages = [
        {
            "role": "user",
            "content": f"""You are a very professional and creative story-telling assistant.
                Please generate a story based on the following [PROMPT] tag within {max_words} words without any explanations or thinking process.
                Please directly fill the generated story right below the [STORY] tag
                The story must be strictly more than {checkpoint_ending} words, and below {max_words}.
                Once the story has reached the minimum requirements of {checkpoint_ending} words, you can start to end the story in next few sentences.
                [PROMPT]{text}
                [STORY]"""
        }
    ]
    response = llm_deepseek(messages, max_token)
    return response


def full_pipeline(image, max_word):
    """
    Runs caption -> story -> speech and streams progress to the UI.

    This is a generator: each yielded 4-tuple is
    (status text, caption, story, speech file), with None for values that are
    not available yet. Status values are yielded as plain strings, which sets
    the value of the bound status component.

    Args:
      image: The uploaded image, or None when nothing was uploaded.
      max_word: Requested maximum story length in words.
    """

    # Nothing to caption when the button is clicked before an image is uploaded.
    if image is None:
        yield "⚠️ Please upload an image first.", None, None, None
        return

    # image captioning
    yield "🚀 Step 1: Generating Image Caption...", None, None, None
    caption = process_image_caption(image)

    # image_captioning reports failures as an "An error occurred: ..." string;
    # stop here instead of writing and narrating a story about an error message.
    if isinstance(caption, str) and caption.startswith("An error occurred:"):
        yield f"❌ Step 1 failed. {caption}", None, None, None
        return

    # story generation
    yield "🚀 Step 2: Generating Story...", caption, None, None
    story = process_text_generation(caption, max_word)

    # text to audio
    yield "🚀 Step 3: Generating Speech...", caption, story, None
    speech = process_tts(story)

    yield "✅ All steps completed!", caption, story, speech





In [7]:
def main():
    """
    Warms up the models, builds the Gradio UI and launches it.

    The warm-up constructs each pipeline once and discards the instance; the
    worker functions build their own pipelines when they are called.
    """

    # release GPU
    torch.cuda.empty_cache()

    # Warm-up: instantiate every model once at start-up (instances are not kept).
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    warmup_specs = (
        ("image-to-text", "Salesforce/blip-image-captioning-large", {}),
        ("text-generation", "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", {"batch_size": 8}),
    )
    for task, model_id, extra_kwargs in warmup_specs:
        pipeline(task, model=model_id, device=device, **extra_kwargs)
    KPipeline(lang_code='a')

    # Gradio UI: components are created in display order.
    with gr.Blocks() as app:
        gr.Markdown("## AI Story Teller 🤖")

        # inputs
        with gr.Row():
            image_in = gr.Image(type="pil", label="Upload an Image")
            word_limit = gr.Slider(minimum=50, maximum=300, step=1, value=100, label="Word Count")

        # read-only progress line
        status_out = gr.Textbox(label="Status", interactive=False)

        # outputs, one row each
        with gr.Row():
            caption_out = gr.Textbox(label="Generated Caption")

        with gr.Row():
            story_out = gr.Textbox(label="Generated Story")

        with gr.Row():
            speech_out = gr.Audio(label="Generated Speech")

        # full_pipeline is a generator, so each yielded tuple updates the four outputs.
        generate_button = gr.Button("Generate Full Story")
        generate_button.click(
            full_pipeline,
            inputs=[image_in, word_limit],
            outputs=[status_out, caption_out, story_out, speech_out]
        )

    app.launch()



# Entry point: run main() only when this code is executed directly, not when it is imported.
if __name__ == "__main__":
    main()

config.json:   0%|          | 0.00/4.60k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/527 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/445 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Device set to use cuda


config.json:   0%|          | 0.00/679 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.55G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.07k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

Device set to use cuda


config.json:   0%|          | 0.00/2.35k [00:00<?, ?B/s]

  WeightNorm.apply(module, name, dim)


kokoro-v1_0.pth:   0%|          | 0.00/327M [00:00<?, ?B/s]

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://186e61e8d8f67e45e5.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
