In [10]:
!pip install datasets diffusers google-generativeai torch soundfile moviepy transformers




Prompt and storyline

In [17]:
import google.generativeai as genai


import json , os , ast
def create_prompt(content_type, style, topic_or_question):
  """Build the LLM prompt that asks for a list of story/solution frames.

  Every prompt instructs the model to answer with a list of dictionaries that
  each carry an 'image_prompt' and a 'narrator' entry.

  Args:
    content_type: "Topic Learning" or "Math Problem".
    style: "Fictional" or "Theoretical"; only consulted when ``content_type``
      is "Topic Learning" (it is ignored for "Math Problem").
    topic_or_question: Text interpolated into the prompt (the topic to explain
      or the problem to solve).

  Returns:
    str: The fully formatted prompt.

  Raises:
    ValueError: If ``content_type`` or ``style`` is not one of the supported
      values (previously this surfaced as an UnboundLocalError).
  """
  if content_type == "Topic Learning":
    if style == "Fictional":
      prompt = f"""
              Create an immersive, movie-like story that explains {topic_or_question}. The story should flow like a complete film with clear scene progression and character development.

              Return the response as a list of dictionaries, where each dictionary represents a story frame:
              [
                  {{'image_prompt': 'Detailed cinematic scene description 1',
                  'narrator': 'Rich narrative text combining story and educational elements 1'}},
                  {{'image_prompt': 'Detailed cinematic scene description 2',
                  'narrator': 'Rich narrative text combining story and educational elements 2'}}
              ]

              Important:
                  - Explanations should be in narration and not inside the image
                  - Return ONLY the JSON array, no additional text or explanations.

              Story Requirements:
              1. Opening Scene:
              - Establish the main characters and their world
              - Introduce an intriguing situation related to {topic_or_question}

              2. Character Development:
              - Create memorable characters whose actions and challenges relate to the concept
              - Each character should represent or interact with different aspects of {topic_or_question}

              3. Plot Structure:
              - Develop a clear three-act structure (setup, conflict, resolution)
              - Each scene should naturally flow into the next
              - Maintain suspense while weaving in educational content

              4. Educational Integration:
              - Seamlessly blend explanations of {topic_or_question} into the story
              - Use character dialogue and actions to demonstrate concepts
              - Include accurate scientific/theoretical details without breaking story immersion

              5. Visual Descriptions:
              - Each image_prompt should be cinematically detailed
              - Include setting, character positions, actions, and relevant visual metaphors
              - Ensure visuals help explain the concept

              6. Narration Style:
              - Provide rich, detailed narration that builds the world
              - Include character thoughts, feelings, and motivations
              - Balance story elements with clear conceptual explanations

              7. Conclusion:
              - Resolve both the story and educational elements
              - Make clear connections between the story events and real-world applications
              - Leave readers with a clear understanding of {topic_or_question}

              Return only the list of dictionaries, formatted exactly as shown above. Each scene should advance both the story and understanding of the concept.
              """

    elif style == "Theoretical":
      prompt = f"""
              Create a systematic, technically-detailed explanation of {topic_or_question} structured as a progressive narrative.

              Return the response as a list of dictionaries:
              [
                  {{'image_prompt': 'Technical visualization description 1',
                  'narrator': 'Detailed theoretical explanation 1'}},
                  {{'image_prompt': 'Technical visualization description 2',
                  'narrator': 'Detailed theoretical explanation 2'}}
              ]

              Requirements:
              1. Structure:
              - Begin with foundational concepts
              - Build complexity progressively
              - Each section should connect logically to the next

              2. Technical Detail:
              - Include precise scientific/theoretical explanations
              - Reference relevant equations, principles, and laws
              - Explain underlying mechanisms thoroughly

              3. Visualizations:
              - Describe detailed technical diagrams
              - Include charts, graphs, or models as needed
              - Ensure visuals clearly illustrate concepts

              4. Applications:
              - Connect theory to real-world examples
              - Demonstrate practical implementations
              - Explain industrial or research applications

              5. Narration:
              - Maintain clear, academic tone
              - Use precise technical language
              - Provide thorough explanations of each concept


              Important:
                  - Explanations should be in narration and not inside the image
                  - Return ONLY the JSON array, no additional text or explanations.


              Return only the list of dictionaries with the specified format.
              """

    else:
      # Fail loudly instead of hitting an unbound `prompt` at the return below.
      raise ValueError(
          f"Unsupported style {style!r} for 'Topic Learning'; "
          "expected 'Fictional' or 'Theoretical'."
      )

  elif content_type == "Math Problem":
    # This is a regular (non-raw) f-string, so every literal LaTeX backslash is
    # written doubled: a bare "\text" would be read as TAB + "ext", and "\\"
    # would collapse to a single backslash instead of a LaTeX line break.
    prompt = f"""
  Create a detailed mathematical solution for: {topic_or_question}

  Return the response as a list of dictionaries, with maximum 5 steps per frame:
  [
      {{
          'image_prompt': r'''\\begin{{align*}}
              & \\text{{Step 1:}} \\\\
              & [mathematical step 1] \\\\[1em]
              & \\text{{Step 2:}} \\\\
              & [mathematical step 2] \\\\[1em]
              & \\text{{Step 3:}} \\\\
              & [mathematical step 3]
          \\end{{align*}}''',
          'narrator': 'Comprehensive explanation covering these steps including mathematical reasoning'
      }},
      {{
          'image_prompt': r'''\\begin{{align*}}
              & \\text{{Step 4:}} \\\\
              & [mathematical step 4] \\\\[1em]
              & \\text{{Step 5:}} \\\\
              & [mathematical step 5]
          \\end{{align*}}''',
          'narrator': 'Detailed explanation of these steps'
      }}
  ]

  Requirements:
  1. Mathematical Format:
      - Maximum 5 steps per frame
      - Use proper LaTeX formatting with aligned equations
      - Include step numbers and clear spacing between steps
      - Use \\\\[1em] for spacing between steps

  2. Explanations:
      - Provide clear explanation for each group of steps
      - Include mathematical reasoning
      - Explain the process clearly

  Important:
      - Explanations should be in narration and not inside the image
      - Return ONLY the JSON array, no additional text or explanations.

  Return only the list of dictionaries with 'image_prompt' and 'narrator' keys.
  Use proper LaTeX notation throughout and ensure all steps are clearly numbered.
  """

  else:
    raise ValueError(
        f"Unsupported content_type {content_type!r}; "
        "expected 'Topic Learning' or 'Math Problem'."
    )

  return prompt
# SECURITY: credentials must not be stored in the notebook. The Gemini key is
# read from the GEMINI_API_KEY environment variable, which has to be set before
# this cell runs (e.g. from the Colab secrets panel or the shell that launches
# Jupyter). A key that was ever committed in this cell should be treated as
# leaked and revoked.
api_key = os.environ.get("GEMINI_API_KEY")
if not api_key:
  print("GEMINI_API_KEY is not set; export it before calling generate_content().")

def _strip_code_fence(text):
  """Return ``text`` without a surrounding Markdown code fence, if it has one."""
  stripped = text.strip()
  if not stripped.startswith("```"):
    return stripped
  # Drop the whole opening fence line, including an optional language tag.
  _, _, stripped = stripped.partition("\n")
  stripped = stripped.rstrip()
  if stripped.endswith("```"):
    stripped = stripped[:-3]
  return stripped.strip()


def _parse_frames(raw_text):
  """Parse the model reply into a non-empty list of frame dictionaries."""
  payload = _strip_code_fence(raw_text)
  try:
    frames = json.loads(payload)
  except ValueError:
    # The prompts illustrate the format with Python literals (single quotes,
    # r'''...''' strings), which are not valid JSON. literal_eval accepts only
    # literals, so it does not execute code contained in the reply.
    frames = ast.literal_eval(payload)

  if not isinstance(frames, list) or not frames:
    raise ValueError("Model reply is not a non-empty list of frames.")
  for index, frame in enumerate(frames):
    if not isinstance(frame, dict) or 'image_prompt' not in frame or 'narrator' not in frame:
      raise ValueError(
          f"Frame {index} must be a dict with 'image_prompt' and 'narrator' keys."
      )
  return frames


def generate_content( content_type, style, topic_or_question):
  """Ask Gemini for a storyline and split it into image prompts and narration.

  Args:
    content_type: Passed through to ``create_prompt``.
    style: Passed through to ``create_prompt``.
    topic_or_question: Passed through to ``create_prompt``.

  Returns:
    tuple: ``(image_prompts, narrators)`` — two lists of equal length holding
    the 'image_prompt' and 'narrator' value of each returned frame.

  Raises:
    Exception: Any failure while calling the API or parsing its reply is
      printed and then re-raised unchanged. A missing API key, an empty
      response, or a malformed frame list raises ``ValueError``.
  """
  prompt = create_prompt(content_type, style, topic_or_question)
  try:
    if not api_key:
      raise ValueError("No Gemini API key configured (api_key is empty).")

    # Configure the API key
    genai.configure(api_key=api_key)

    # Initialize the GenerativeModel
    model = genai.GenerativeModel(
        model_name='gemini-pro',  # Specify the model you want to use
        generation_config={
            'temperature': 0.7,       # Controls randomness. Lower value = more deterministic output.
            'top_p': 0.9,            # Controls diversity via cumulative probability (nucleus sampling).
            'max_output_tokens': 2048,  # Upper bound on the number of generated tokens.
        }
    )
    response = model.generate_content(prompt)

    if not response.candidates:
      raise ValueError("No candidates returned in the response.")
    candidate = response.candidates[0]

    # Extract content parts
    parts = candidate.content.parts
    if not parts:
      raise ValueError("No parts returned in the response.")

    # Join all parts so a reply that is split across several parts still parses.
    raw_text = "".join(part.text for part in parts)
    parsed_content = _parse_frames(raw_text)

    # Extract image prompts and narrators
    image_prompts = [frame['image_prompt'] for frame in parsed_content]
    narrators = [frame['narrator'] for frame in parsed_content]

    return image_prompts, narrators

  except Exception as e:
    print(f"Error generating content: {str(e)}")
    # Re-raise so the caller sees the original failure.
    raise

In [4]:
# Print the GPU model and its current memory usage.
# torch.cuda.empty_cache()
!nvidia-smi

Sun Jan  5 07:16:59 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   31C    P0              52W / 400W |  32609MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [3]:
# Cell 1: Initialize the models
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from diffusers import FluxPipeline
import torch

def initialize_models():
    """Load the SpeechT5 text-to-speech components and the FLUX image pipeline.

    Returns:
        tuple: ``(processor, model, vocoder, pipe)`` — the SpeechT5 processor,
        the SpeechT5 text-to-speech model, the SpeechT5 HiFi-GAN vocoder and
        the FLUX.1-schnell pipeline, loaded in bfloat16 and moved to "cuda".
    """
    tts_checkpoint = "microsoft/speecht5_tts"
    vocoder_checkpoint = "microsoft/speecht5_hifigan"
    flux_checkpoint = "black-forest-labs/FLUX.1-schnell"

    # Speech components; this function does not move them to the GPU.
    tts_processor = SpeechT5Processor.from_pretrained(tts_checkpoint)
    tts_model = SpeechT5ForTextToSpeech.from_pretrained(tts_checkpoint)
    tts_vocoder = SpeechT5HifiGan.from_pretrained(vocoder_checkpoint)

    # Image pipeline, placed on the GPU.
    image_pipeline = FluxPipeline.from_pretrained(
        flux_checkpoint,
        torch_dtype=torch.bfloat16,
    )
    image_pipeline.to("cuda")

    return tts_processor, tts_model, tts_vocoder, image_pipeline

# Load all models once and keep the handles as notebook globals; the image and
# audio cells below receive them as arguments.
processor, model, vocoder, pipe = initialize_models()
print("Models initialized.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/232 [00:00<?, ?B/s]

spm_char.model:   0%|          | 0.00/238k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/40.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/585M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/50.7M [00:00<?, ?B/s]

model_index.json:   0%|          | 0.00/536 [00:00<?, ?B/s]

Fetching 23 files:   0%|          | 0/23 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.53G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/246M [00:00<?, ?B/s]

scheduler/scheduler_config.json:   0%|          | 0.00/274 [00:00<?, ?B/s]

text_encoder_2/config.json:   0%|          | 0.00/782 [00:00<?, ?B/s]

text_encoder/config.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

tokenizer/merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

(…)t_encoder_2/model.safetensors.index.json:   0%|          | 0.00/19.9k [00:00<?, ?B/s]

tokenizer/special_tokens_map.json:   0%|          | 0.00/588 [00:00<?, ?B/s]

tokenizer/vocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

tokenizer/tokenizer_config.json:   0%|          | 0.00/705 [00:00<?, ?B/s]

tokenizer_2/special_tokens_map.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer_2/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

(…)pytorch_model-00001-of-00003.safetensors:   0%|          | 0.00/9.96G [00:00<?, ?B/s]

tokenizer_2/tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

transformer/config.json:   0%|          | 0.00/321 [00:00<?, ?B/s]

(…)pytorch_model-00002-of-00003.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

(…)pytorch_model-00003-of-00003.safetensors:   0%|          | 0.00/3.87G [00:00<?, ?B/s]

(…)ion_pytorch_model.safetensors.index.json:   0%|          | 0.00/121k [00:00<?, ?B/s]

vae/config.json:   0%|          | 0.00/774 [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/168M [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Models initialized.


In [26]:
def content_generator(content_type,style,topic):
  """Generate the storyline for ``topic`` and report completion.

  Thin wrapper around ``generate_content`` that prints a status line once the
  frames are available.

  Returns:
    tuple: ``(image_prompts, story)`` exactly as produced by
    ``generate_content`` — the per-frame image prompts and narration texts.
  """
  frames = generate_content(content_type, style, topic)
  image_prompts, story = frames
  print("Generated image prompts and story")
  return image_prompts, story
# Example run: a fictional story about deforestation. `image_prompts` and
# `story` (the narration texts) are consumed by the image and audio cells.
image_prompts ,story = content_generator("Topic Learning","Fictional","Deforestation")

Generated image prompts and story


In [27]:
from datasets import load_dataset



def generate_images(pipe,image_prompts):
  """Render one image per prompt and save it as ``flux-schnell_<index>.png``.

  Args:
    pipe: Callable image pipeline; its result must expose ``.images``.
    image_prompts: Sequence of prompt strings, one per frame.

  The files are written to the current working directory, numbered by the
  position of the prompt in ``image_prompts``. Nothing is returned.
  """
  for index, prompt_text in enumerate(image_prompts):
    # A new CUDA generator seeded with 0 is created for every prompt, so each
    # image starts from the same seed.
    seeded_generator = torch.Generator("cuda").manual_seed(0)
    result = pipe(
        prompt_text,
        guidance_scale=0.9,
        num_inference_steps=4,
        max_sequence_length=128,
        generator=seeded_generator,
    )
    frame_image = result.images[0]
    frame_image.save(f"flux-schnell_{index}.png")

# Writes flux-schnell_<i>.png for every entry of `image_prompts`.
generate_images(pipe,image_prompts)


# if __name__ == "__main__":
#     main("Topic Learning","Theoretical","Gravity")
#     # topic = input("Enter the topic for content generation (e.g., 'Differential calculus'): ")
#     # content_type = input("Enter content type (Topic Learning / Math Problem): ")

#     # # Ask for the style only if content_type is 'Topic Learning'
#     # if content_type.lower() == "topic learning":
#     #     style = input("Enter learning style (Educational / Fictional): ")
#     # else:
#     #     style = "Educational"  # Default for Math Problem
#     # main(content_type,style,topic)

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

In [28]:
import soundfile as sf

def generate_audio(processor, model, vocoder,story):
  """Synthesize one narration WAV per story frame as ``speech<index>.wav``.

  Args:
    processor: Callable that turns ``text=...`` into a mapping containing
      "input_ids".
    model: Object exposing ``generate_speech(input_ids, speaker_embeddings,
      vocoder=...)``.
    vocoder: Vocoder handed to ``model.generate_speech``.
    story: Sequence of narration strings, one per frame.

  The files are written to the current working directory at a 16000 Hz sample
  rate. Nothing is returned; an empty ``story`` is a no-op.
  """
  if not story:
    return

  # The speaker embedding is identical for every frame, so the x-vector dataset
  # is loaded once here instead of once per loop iteration.
  embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
  # Entry 7306 supplies the voice; unsqueeze adds the leading batch dimension.
  speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

  for i, narration in enumerate(story):
    inputs = processor(text=narration, return_tensors="pt")
    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
    sf.write(f"speech{i}.wav", speech.numpy(), samplerate=16000)

# Writes speech<i>.wav for every narration text in `story`.
generate_audio(processor, model, vocoder,story)


In [29]:
from moviepy.editor import ImageClip, AudioFileClip, concatenate_videoclips, TextClip, CompositeVideoClip

def create_video_from_images_and_audio(fps=12, num_frames=None):
    """Combine the generated images and narration into one MP4 file.

    Frame ``i`` shows ``flux-schnell_<i>.png`` (scaled to half size) for the
    duration of ``speech<i>.wav``. The frames are concatenated in index order
    and written to ``final_video_no_subtitles.mp4`` in the working directory.

    Args:
        fps: Frame rate applied to every clip and to the output file.
        num_frames: Number of frames to include. When ``None`` (default), the
            consecutive image/audio pairs found on disk, starting at index 0,
            are used. Pass ``len(story)`` explicitly if files from an earlier,
            longer run may still be present.

    Raises:
        ValueError: If ``num_frames`` is given but is not positive.
        FileNotFoundError: If ``num_frames`` is ``None`` and no pair exists.
    """
    if num_frames is None:
        # Count consecutive (image, audio) pairs instead of assuming a fixed
        # number; the frame count depends on what the language model returned.
        num_frames = 0
        while (os.path.exists(f"flux-schnell_{num_frames}.png")
               and os.path.exists(f"speech{num_frames}.wav")):
            num_frames += 1
        if num_frames == 0:
            raise FileNotFoundError(
                "No 'flux-schnell_<i>.png' / 'speech<i>.wav' pairs found in the working directory."
            )
    elif num_frames <= 0:
        raise ValueError("num_frames must be a positive integer.")

    audio_clips = []
    final_video = None
    try:
        clips = []
        for i in range(num_frames):
            # Open each WAV once; the same clip provides the duration and the sound.
            audio_clip = AudioFileClip(f"speech{i}.wav")
            audio_clips.append(audio_clip)

            image_clip = (
                ImageClip(f"flux-schnell_{i}.png")
                .resize(0.5)
                .set_duration(audio_clip.duration)
                .set_fps(fps)
            )

            # Combine image and audio
            clips.append(image_clip.set_audio(audio_clip))

        # Concatenate all video clips
        final_video = concatenate_videoclips(clips, method="compose")

        # Write the final video
        final_video.write_videofile("final_video_no_subtitles.mp4", fps=fps, codec='libx264', preset="ultrafast")
    finally:
        # Release the audio readers even when building or writing fails.
        if final_video is not None:
            final_video.close()
        for audio_clip in audio_clips:
            audio_clip.close()


# Assemble final_video_no_subtitles.mp4 at 12 fps from the PNG/WAV files
# written by the image and audio cells.
create_video_from_images_and_audio(12)
print("Generated final video")

Moviepy - Building video final_video_no_subtitles.mp4.
MoviePy - Writing audio in final_video_no_subtitlesTEMP_MPY_wvf_snd.mp3




MoviePy - Done.
Moviepy - Writing video final_video_no_subtitles.mp4



                                                                 

Moviepy - Done !
Moviepy - video ready final_video_no_subtitles.mp4
Generated final video


