<a href="https://colab.research.google.com/github/Dhrisheta/71762233011-DCS/blob/main/APRL28.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# ------------------------------------------------------------
# 🚀 CLEAN INSTALLATION for Whisper + Gemini + TTS + Diffusers
# ------------------------------------------------------------

# Upgrade important core libraries first
!pip install --upgrade numpy pandas networkx --quiet

# Install Whisper (directly from OpenAI GitHub)
!pip install git+https://github.com/openai/whisper.git --quiet

# Install Google's Generative AI (Gemini API)
!pip install -q -U google-generativeai

# Install HuggingFace Diffusers for Stable Diffusion
!pip install -q diffusers transformers accelerate scipy safetensors

# Install TTS (Text to Speech)
!pip install -q TTS

# Install additional helpful libraries
!pip install -q matplotlib nltk pillow

# Show final versions
!pip show numpy pandas networkx whisper google-generativeai diffusers TTS


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.4/16.4 MB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.1/13.1 MB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.2.3 which is incompatible.
numba 0.60.0 requires numpy<2.1,>=1.22, but you have numpy 2.2.5 which is incompatible.
tensorflow 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 2.2.5 which is incompatible.[0m[31m
[0m  Installing build dependencies ... [?25l[?2

In [1]:
!pip install whisper



In [2]:
# ----------------------------------------
# ✅ IMPORT EVERYTHING
# ----------------------------------------

import torch
import whisper
import google.generativeai as genai
from diffusers import StableDiffusionPipeline
from PIL import Image, ImageFilter, ImageOps
import matplotlib.pyplot as plt
from TTS.api import TTS
import nltk
import re
import os
from google.colab import files


In [3]:


# -------------------------------------------
# DOWNLOAD NLTK PUNKT
# -------------------------------------------
nltk.download('punkt')

# -------------------------------------------
# USER INPUT - Upload audio file
# -------------------------------------------
uploaded = files.upload()
audio_file = list(uploaded.keys())[0]
print(f"Uploaded Audio File: {audio_file}")

# -------------------------------------------
# WHISPER - Transcribe audio
# -------------------------------------------
model = whisper.load_model("base")
result = model.transcribe(audio_file)
transcribed_text = result["text"]
print("\nTranscribed Text:\n", transcribed_text)

# -------------------------------------------
# GEMINI - Generate short story
# -------------------------------------------
GEMINI_API_KEY = "AIzaSyAe5iu2CLsGnkDJZQfm8CX8P38UBtThQ8I"  # <<< PUT YOUR OWN KEY HERE
genai.configure(api_key=GEMINI_API_KEY)

prompt = f"Create a short and engaging story based on the following text:\n{transcribed_text}\n\nStory:"
gemini_model = genai.GenerativeModel("gemini-1.5-pro")
response = gemini_model.generate_content(prompt)

story_text = response.text
print("\nGenerated Story:\n", story_text)

# -------------------------------------------
# TEXT SPLIT - Split story into sentences
# -------------------------------------------
def simple_sentence_split(text):
    sentences = re.split(r'(?<=[.!?]) +', text.strip())
    return [s for s in sentences if s]

story_sentences = simple_sentence_split(story_text)

# -------------------------------------------
# STABLE DIFFUSION - Generate comic frames
# -------------------------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
comic_model_id = "ogkalu/Comic-Diffusion"

pipe = StableDiffusionPipeline.from_pretrained(
    comic_model_id,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    safety_checker=None
)
pipe = pipe.to(device)

def generate_comic_frame(prompt, frame_num):
    style_prompt = f"{prompt}, comic book style, cartoon illustration, vibrant colors, bold outlines, pop art"
    image = pipe(style_prompt, height=512, width=512, num_inference_steps=30).images[0]
    image = image.filter(ImageFilter.SMOOTH_MORE)
    image = ImageOps.posterize(image, 4)
    image.save(f"frame_{frame_num}.png")
    return image

def create_comic_strip(frames):
    plt.figure(figsize=(20, 8))
    for i, frame in enumerate(frames):
        plt.subplot(1, len(frames), i+1)
        plt.imshow(frame)
        plt.axis('off')
    plt.tight_layout()
    plt.savefig("comic_strip.png")
    plt.show()

# Generate Comic Frames
if len(story_sentences) > 0:
    print("\nGenerating Comic Frames...")
    frames = [generate_comic_frame(sentence, idx+1) for idx, sentence in enumerate(story_sentences)]
    create_comic_strip(frames)
    print("Comic generated as comic_strip.png!")
else:
    print("No story text to generate comic frames.")

# -------------------------------------------
# TTS - Upload your speaker WAV file
# -------------------------------------------
print("\nPlease upload your speaker WAV file (for voice cloning)...")
uploaded_speaker = files.upload()
speaker_wav = list(uploaded_speaker.keys())[0]
print(f"Speaker WAV Uploaded: {speaker_wav}")

# -------------------------------------------
# TTS - Clone voice and generate final audio
# -------------------------------------------
tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts")

tts.tts_to_file(
    text=story_text,
    speaker_wav=speaker_wav,
    language="en",
    file_path="final_story_audio.wav"
)

print("\n✅ Final Story Audio Generated: final_story_audio.wav")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Saving bavs1_enhanced.wav to bavs1_enhanced.wav
Uploaded Audio File: bavs1_enhanced.wav


100%|████████████████████████████████████████| 139M/139M [00:00<00:00, 175MiB/s]



Transcribed Text:
  It was a cold December morning and small dawn of Elm's murder was slowly waking up under a blanket of snow. The streets were quiet except for the soft crunch of boats and occasionally bark of dog chasing snowflakes. In a tiny break house at the end of the maple streets lived a boy named Arjun who had just turned 12. Arjun was like ors and like other kids. While most children were asking for video games and toys for Christmas, Arjun only wanted one thing, a poor photo of his father. His dad had disappeared when he was a baby leaving behind only stories of his mother whispered to him at night. Stories of brave, kindhearted man who loved music, nature and warm cocoa on rainy days. That morning Arjun sat by the window for his breath fogging up the glass. Maybe this year he whispered, Santa will bring me something different. He didn't believe in magic much anymore but deep down a small part of him still hoped. As a clock struck nine there was a knock at the door. Arjun 

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model_index.json:   0%|          | 0.00/546 [00:00<?, ?B/s]

Fetching 14 files:   0%|          | 0/14 [00:00<?, ?it/s]

(…)omic-Diffusion%20%C2%B7%20Hugging%20Face:   0%|          | 0.00/15.7M [00:00<?, ?B/s]

scheduler_config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/806 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/492M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

diffusion_pytorch_model.bin:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/748 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/581 [00:00<?, ?B/s]

diffusion_pytorch_model.bin:   0%|          | 0.00/335M [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/6 [00:00<?, ?it/s]

An error occurred while trying to fetch /root/.cache/huggingface/hub/models--ogkalu--Comic-Diffusion/snapshots/ff684f581ab24e094e2055d9422e9ee076d139a8/unet: Error no file named diffusion_pytorch_model.safetensors found in directory /root/.cache/huggingface/hub/models--ogkalu--Comic-Diffusion/snapshots/ff684f581ab24e094e2055d9422e9ee076d139a8/unet.
Defaulting to unsafe serialization. Pass `allow_pickle=False` to raise an error instead.
An error occurred while trying to fetch /root/.cache/huggingface/hub/models--ogkalu--Comic-Diffusion/snapshots/ff684f581ab24e094e2055d9422e9ee076d139a8/vae: Error no file named diffusion_pytorch_model.safetensors found in directory /root/.cache/huggingface/hub/models--ogkalu--Comic-Diffusion/snapshots/ff684f581ab24e094e2055d9422e9ee076d139a8/vae.
Defaulting to unsafe serialization. Pass `allow_pickle=False` to raise an error instead.
You have disabled the safety checker for <class 'diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiff


Generating Comic Frames...


  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

Comic generated as comic_strip.png!

Please upload your speaker WAV file (for voice cloning)...


Saving JAYANT_16k.wav to JAYANT_16k.wav
Speaker WAV Uploaded: JAYANT_16k.wav
 > Downloading model to /root/.local/share/tts/tts_models--multilingual--multi-dataset--your_tts


 98%|█████████▊| 416M/425M [00:05<00:00, 79.5MiB/s]

 > Model's license - CC BY-NC-ND 4.0
 > Check https://creativecommons.org/licenses/by-nc-nd/4.0/ for more info.
 > Using model: vits
 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
 > Model fully restored. 
 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:64
 | 

100%|██████████| 425M/425M [00:20<00:00, 79.5MiB/s]

 > Processing time: 39.52234172821045
 > Real-time factor: 0.4472573357197389

✅ Final Story Audio Generated: final_story_audio.wav


In [4]:
# -------------------------------------------
# TTS - Clone voice and generate audio per sentence
# -------------------------------------------
print("\nGenerating audio per sentence using cloned voice...")

tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts")

# Loop over sentences and generate audio files
for idx, sentence in enumerate(story_sentences):
    output_filename = f"story_audio_sentence_{idx+1}.wav"
    print(f"🔊 Generating audio for sentence {idx+1}: {sentence}")
    tts.tts_to_file(
        text=sentence,
        speaker_wav=speaker_wav,
        language="en",
        file_path=output_filename
    )

print("\n✅ Sentence-wise audio generation complete!")



Generating audio per sentence using cloned voice...
 > tts_models/multilingual/multi-dataset/your_tts is already downloaded.
 > Using model: vits
 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
 > Model fully restored. 
 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > n

In [5]:
!pip install gradio

Collecting gradio
  Downloading gradio-5.27.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.9.0 (from gradio)
  Downloading gradio_client-1.9.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.7-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6 (

In [8]:
import gradio as gr
from PIL import Image
import os
import time
import wave


# Load story length
total_frames = len(story_sentences)

# Function to fetch a specific frame, sentence, and audio
def show_comic_frame(index):
    if index < 0 or index >= total_frames:
        return None, "Invalid index", None
    image_path = f"frame_{index+1}.png"
    audio_path = f"story_audio_sentence_{index+1}.wav"
    sentence = story_sentences[index]
    return Image.open(image_path), sentence, audio_path

# Autoplay function to go through all frames
def autoplay(state):
    outputs = []
    for i in range(state, total_frames):
        time.sleep(5)  # Adjust timing as needed (5 seconds per frame)
        image, sentence, audio = show_comic_frame(i)
        outputs.append((i, image, sentence, audio))
    return outputs

# Set up Gradio app
with gr.Blocks() as demo:
    gr.Markdown("## 🎨 Comic Story Viewer with Narration")
    gr.Markdown("Use the slider to manually browse or click 'Play All' to autoplay the comic.")

    with gr.Row():
        index_slider = gr.Slider(minimum=0, maximum=total_frames-1, step=1, value=0, label="Frame Number")
        play_button = gr.Button("▶️ Play All")
        stop_button = gr.Button("⏹️ Stop")

    with gr.Column():
        image_output = gr.Image(label="Comic Frame")
        sentence_output = gr.Textbox(label="Sentence")
        audio_output = gr.Audio(label="Voice", type="filepath")

    # Control manual slider
    index_slider.change(fn=show_comic_frame, inputs=index_slider,
                        outputs=[image_output, sentence_output, audio_output])

    # Set up state for autoplay
    current_frame = gr.State(0)
    # Function to get audio duration
    def get_audio_duration(filepath):
        with wave.open(filepath, 'r') as f:
            frames = f.getnframes()
            rate = f.getframerate()
            return frames / float(rate)

    # Define a helper function for autoplay with button control
    def play_all(current):
        for i in range(current, total_frames):
            image, sentence, audio_path = show_comic_frame(i)
            duration = get_audio_duration(audio_path)

            # Show frame
            yield i, image, sentence, audio_path

            # Wait for audio to finish before next frame
            time.sleep(duration)

    # Hook autoplay to play button
    play_button.click(
        play_all,
        inputs=current_frame,
        outputs=[index_slider, image_output, sentence_output, audio_output],
        concurrency_limit=1
    )

demo.launch()


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://008d1ae431b01f01ea.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:

!pip install moviepy




In [None]:
from moviepy.editor import ImageClip, AudioFileClip, concatenate_videoclips
import warnings

# Optional: Suppress syntax warnings from moviepy
warnings.filterwarnings("ignore", category=SyntaxWarning)

# Function to create video
def create_video_with_audio(image_paths, audio_paths, output_file="comic_video.mp4"):
    clips = []
    for img, audio in zip(image_paths, audio_paths):
        audio_clip = AudioFileClip(audio)
        img_clip = ImageClip(img).set_duration(audio_clip.duration)
        img_clip = img_clip.set_audio(audio_clip)
        clips.append(img_clip)

    final_video = concatenate_videoclips(clips)
    final_video.write_videofile(output_file, fps=24)
    print(f"🎬 Video saved as: {output_file}")

# Define image and audio paths
image_paths = [f"frame_{i+1}.png" for i in range(len(story_sentences))]
audio_paths = [f"story_audio_sentence_{i+1}.wav" for i in range(len(story_sentences))]

# Run it
create_video_with_audio(image_paths, audio_paths)


100%|██████████| 425M/425M [23:14<00:00, 90.5MiB/s]

Moviepy - Building video comic_video.mp4.
MoviePy - Writing audio in comic_videoTEMP_MPY_wvf_snd.mp3



chunk:   0%|          | 0/2437 [00:00<?, ?it/s, now=None][A
chunk:   4%|▍         | 109/2437 [00:00<00:02, 960.68it/s, now=None][A
chunk:   8%|▊         | 206/2437 [00:00<00:02, 955.98it/s, now=None][A
chunk:  12%|█▏        | 302/2437 [00:00<00:02, 874.81it/s, now=None][A
chunk:  16%|█▌        | 391/2437 [00:00<00:02, 786.73it/s, now=None][A
chunk:  19%|█▉        | 471/2437 [00:00<00:02, 699.69it/s, now=None][A
chunk:  22%|██▏       | 543/2437 [00:00<00:02, 673.91it/s, now=None][A
chunk:  25%|██▌       | 612/2437 [00:00<00:03, 594.95it/s, now=None][A
chunk:  28%|██▊       | 673/2437 [00:00<00:03, 580.27it/s, now=None][A
chunk:  30%|███       | 741/2437 [00:01<00:02, 603.14it/s, now=None][A
chunk:  33%|███▎      | 803/2437 [00:01<00:02, 567.66it/s, now=None][A
chunk:  35%|███▌      | 861/2437 [00:01<00:02, 570.50it/s, now=None][A
chunk:  38%|███▊      | 928/2437 [00:01<00:02, 597.31it/s, now=None][A
chunk:  41%|████      | 989/2437 [00:01<00:02, 585.41it/s, now=None][A
ch

MoviePy - Done.
Moviepy - Writing video comic_video.mp4




t:   0%|          | 0/2652 [00:00<?, ?it/s, now=None][A
t:   1%|          | 32/2652 [00:00<00:08, 316.34it/s, now=None][A
t:   2%|▏         | 64/2652 [00:00<00:10, 258.70it/s, now=None][A
t:   3%|▎         | 91/2652 [00:00<00:18, 141.84it/s, now=None][A
t:   4%|▍         | 110/2652 [00:00<00:21, 116.86it/s, now=None][A
t:   5%|▍         | 125/2652 [00:00<00:23, 109.75it/s, now=None][A
t:   5%|▌         | 138/2652 [00:01<00:24, 102.66it/s, now=None][A
t:   6%|▌         | 154/2652 [00:01<00:21, 113.97it/s, now=None][A
t:   6%|▋         | 167/2652 [00:01<00:21, 113.62it/s, now=None][A
t:   7%|▋         | 183/2652 [00:01<00:19, 124.00it/s, now=None][A
t:   8%|▊         | 199/2652 [00:01<00:18, 131.54it/s, now=None][A
t:   8%|▊         | 214/2652 [00:01<00:17, 135.92it/s, now=None][A
t:   9%|▊         | 229/2652 [00:01<00:17, 138.24it/s, now=None][A
t:   9%|▉         | 245/2652 [00:01<00:16, 141.89it/s, now=None][A
t:  10%|▉         | 263/2652 [00:01<00:15, 149.62it/s, now=No

Moviepy - Done !
Moviepy - video ready comic_video.mp4
🎬 Video saved as: comic_video.mp4


In [None]:
from IPython.display import Video

# Just display the video
Video("comic_video.mp4", embed=True, width=720)


In [None]:
from diffusers import StableDiffusionPipeline
import torch

# Choose model based on your preference
model_id = "Lykon/DreamShaper-8"  # For DreamShaper
# model_id = "stabilityai/stable-diffusion-xl-base-1.0"  # For SDXL

pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")

def generate_fairytale_image(prompt):
    image = pipe(prompt).images[0]
    image.show()

# Sample fairytale prompt
prompt = "A young sorceress standing in a glowing enchanted forest, surrounded by floating lights, magical creatures, and ancient trees with sparkling leaves. In the background, an elegant, mysterious castle with soft, pastel hues. The scene is magical, whimsical, with a dreamy, ethereal quality. Fantasy art style, highly detailed, soft lighting, gentle and mysterious mood."

generate_fairytale_image(prompt)


model_index.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

Fetching 15 files:   0%|          | 0/15 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/492M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/520 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/796 [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/724 [00:00<?, ?B/s]

scheduler_config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/335M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/737 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.87k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/756 [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]




  0%|          | 0/50 [00:00<?, ?it/s]

In [None]:
# Generate and display the image
image = pipe(prompt).images[0]
image.show()  # This will display the image in the notebook


In [None]:
from diffusers import StableDiffusionPipeline
import torch
from PIL import Image

# Load DreamShaper model
model_id = "Lykon/DreamShaper-8"
pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")

# Fairytale prompt
prompt = ("A young sorceress with flowing silver hair, wearing a glittering blue gown, standing in the heart of a glowing enchanted forest. "
          "The forest is filled with mystical creatures, such as glowing fairies and magical animals. Floating lights hover around her, casting a soft, warm glow. "
          "In the background, a majestic castle made of shimmering crystal rises into the sky, surrounded by twinkling stars and mist. "
          "The atmosphere is dreamy, magical, and ethereal, with soft lighting and a whimsical, fantastical mood. "
          "Fantasy art style, highly detailed, delicate colors, and magical elements.")

# Generate image
image = pipe(prompt).images[0]

# Display the generated image
image.show()


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]


Token indices sequence length is longer than the specified maximum sequence length for this model (119 > 77). Running this sequence through the model will result in indexing errors
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['twinkling stars and mist. the atmosphere is dreamy, magical, and ethereal, with soft lighting and a whimsical, fantastical mood. fantasy art style, highly detailed, delicate colors, and magical elements.']


  0%|          | 0/50 [00:00<?, ?it/s]

In [None]:
# Generate and display the image
image = pipe(prompt).images[0]
image.show()  # This will display the image in the notebook


The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['twinkling stars and mist. the atmosphere is dreamy, magical, and ethereal, with soft lighting and a whimsical, fantastical mood. fantasy art style, highly detailed, delicate colors, and magical elements.']


  0%|          | 0/50 [00:00<?, ?it/s]

In [None]:
import gradio as gr

def generate_fairytale_image():
    # Generate the image using the same code
    image = pipe(prompt).images[0]
    return image

# Gradio Interface
gr.Interface(fn=generate_fairytale_image, inputs=None, outputs=gr.Image(label="Fairytale Sorceress")).launch()


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://a6ac07ad4e0b17cc43.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
!pip install genai

Collecting genai
  Downloading genai-2.1.0-py3-none-any.whl.metadata (6.5 kB)
Collecting ipython<9.0.0,>=8.10.0 (from genai)
  Downloading ipython-8.36.0-py3-none-any.whl.metadata (5.1 kB)
Collecting openai<0.28.0,>=0.27.0 (from genai)
  Downloading openai-0.27.10-py3-none-any.whl.metadata (13 kB)
Collecting tiktoken<0.4.0,>=0.3.2 (from genai)
  Downloading tiktoken-0.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.2 kB)
Collecting jedi>=0.16 (from ipython<9.0.0,>=8.10.0->genai)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting stack_data (from ipython<9.0.0,>=8.10.0->genai)
  Downloading stack_data-0.6.3-py3-none-any.whl.metadata (18 kB)
Collecting traitlets>=5.13.0 (from ipython<9.0.0,>=8.10.0->genai)
  Downloading traitlets-5.14.3-py3-none-any.whl.metadata (10 kB)
Collecting executing>=1.2.0 (from stack_data->ipython<9.0.0,>=8.10.0->genai)
  Downloading executing-2.2.0-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting asttokens>=2.

In [None]:
import gradio as gr
import nltk
import whisper
import genai
import torch
from PIL import Image, ImageFilter, ImageOps
import re
import matplotlib.pyplot as plt
from TTS.api import TTS
from diffusers import StableDiffusionPipeline

# -------------------------------------------
# DOWNLOAD NLTK PUNKT
# -------------------------------------------
nltk.download('punkt')

# -------------------------------------------
# USER INPUT - Upload audio file
# -------------------------------------------
uploaded = files.upload()
audio_file = list(uploaded.keys())[0]
print(f"Uploaded Audio File: {audio_file}")

# -------------------------------------------
# WHISPER - Transcribe audio
# -------------------------------------------
model = whisper.load_model("base")
result = model.transcribe(audio_file)
transcribed_text = result["text"]
print("\nTranscribed Text:\n", transcribed_text)

# -------------------------------------------
# GEMINI - Generate short story
# -------------------------------------------
GEMINI_API_KEY = "AIzaSyAe5iu2CLsGnkDJZQfm8CX8P38UBtThQ8I"  # <<< PUT YOUR OWN KEY HERE
genai.configure(api_key=GEMINI_API_KEY)

prompt = f"Create a short and engaging story based on the following text:\n{transcribed_text}\n\nStory:"
gemini_model = genai.GenerativeModel("gemini-1.5-pro")
response = gemini_model.generate_content(prompt)

story_text = response.text
print("\nGenerated Story:\n", story_text)

# -------------------------------------------
# TEXT SPLIT - Split story into sentences
# -------------------------------------------
def simple_sentence_split(text):
    sentences = re.split(r'(?<=[.!?]) +', text.strip())
    return [s for s in sentences if s]

story_sentences = simple_sentence_split(story_text)

# -------------------------------------------
# NEW IMAGE GENERATION MODEL - DREAMSHAPER or SDXL
# -------------------------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "Lykon/DreamShaper-8"  # Replace Comic-Diffusion with DreamShaper

pipe = StableDiffusionPipeline.from_pretrained(
    model_id,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    safety_checker=None
)
pipe = pipe.to(device)

# Function to generate images for each sentence
def generate_image_for_sentence(sentence, frame_num):
    style_prompt = f"{sentence}, magical, fairytale, vibrant colors, dreamy atmosphere"
    image = pipe(style_prompt, height=512, width=512, num_inference_steps=30).images[0]
    image = image.filter(ImageFilter.SMOOTH_MORE)
    image = ImageOps.posterize(image, 4)
    image.save(f"frame_{frame_num}.png")
    return image

# -------------------------------------------
# GRADIO INTERFACE - Displaying Generated Images for Each Sentence
# -------------------------------------------

def display_images_for_story():
    frames = []
    for idx, sentence in enumerate(story_sentences):
        image = generate_image_for_sentence(sentence, idx + 1)
        frames.append(image)
    return frames

# Gradio Interface to display images
gr.Interface(fn=display_images_for_story, inputs=None, outputs=gr.Gallery(label="Generated Comic Frames")).launch()

# -------------------------------------------
# TTS - Upload your speaker WAV file
# -------------------------------------------
print("\nPlease upload your speaker WAV file (for voice cloning)...")
uploaded_speaker = files.upload()
speaker_wav = list(uploaded_speaker.keys())[0]
print(f"Speaker WAV Uploaded: {speaker_wav}")

# -------------------------------------------
# TTS - Clone voice and generate final audio
# -------------------------------------------
tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts")

tts.tts_to_file(
    text=story_text,
    speaker_wav=speaker_wav,
    language="en",
    file_path="final_story_audio.wav"
)

print("\n✅ Final Story Audio Generated: final_story_audio.wav")


ModuleNotFoundError: No module named 'genai'