<a href="https://colab.research.google.com/github/Fawziah7/generated_videos/blob/main/final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Autonomous TikTok-Style Content Generation System

This project implements an autonomous pipeline for generating short-form TikTok-style videos using locally hosted AI models. The system is built with the following major components:
- **Text Generation:** Uses Mistral 7B (`mistralai/Mistral-7B-v0.1`) for script/caption creation.
- **Image Generation:** Uses Stable Diffusion v1.4 (`CompVis/stable-diffusion-v1-4`) for generating visuals.
- **Audio Generation:** Uses Coqui TTS (`tts_models/en/ljspeech/tacotron2-DDC`) for AI-generated narration.
- **Video Production:** Combines images and audio with `ffmpeg` to create a final video, formatted appropriately for social media.

The system is designed to run autonomously on Google Colab Pro+ with an A100 GPU. Use the widget UI at the end of the notebook to enter a prompt and generate a video.

In [1]:
# !pip install transformers diffusers torch torchvision ipywidgets
!pip install TTS

import os
import subprocess
import uuid
import ipywidgets as widgets
from IPython.display import display, Video, clear_output

#For text generation
from transformers import pipeline as hf_pipeline

#For image generation
from diffusers import StableDiffusionPipeline
import torch

# Pick the compute device once: use CUDA when PyTorch can see a GPU, else the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"



In [2]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineG

In [3]:
# Load the heavyweight models once, up front, so every pipeline run reuses them.
print("Loading text generation model (Mistral 7B)...")
# Request half precision on the GPU, mirroring the Stable Diffusion load below.
# Without an explicit dtype the weights are loaded in float32, which doubles the
# GPU memory the text model occupies. On CPU the library default is kept.
text_generator = hf_pipeline(
    "text-generation",
    model="mistralai/Mistral-7B-v0.1",
    torch_dtype=torch.float16 if device == "cuda" else None,
    device=0 if device == "cuda" else -1,  # pipeline convention: GPU index, or -1 for CPU
)

print("Loading image generation model...")
image_generator = StableDiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
    torch_dtype=torch.float16 if device == "cuda" else None
)
image_generator.to(device)

# The TTS model is not loaded here; generate_audio() creates it on demand.
print("Audio generation will use your pre-selected model.")

Loading text generation model (Mistral 7B)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


Loading image generation model...


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

Audio generation will use your pre-selected model.


In [4]:
def generate_script(prompt, max_length=100):
    """Generate a script from a prompt using the Mistral 7B text generation model.

    Args:
        prompt: Text handed to the module-level ``text_generator`` pipeline.
        max_length: Forwarded unchanged as the pipeline's ``max_length`` argument.

    Returns:
        The ``generated_text`` field of the first result, or ``None`` when
        anything goes wrong (the error is printed rather than raised).
    """
    try:
        outputs = text_generator(prompt, do_sample=True, max_length=max_length)
        first_result = outputs[0]
        script_text = first_result['generated_text']
        print("Script generated successfully.")
    except Exception as err:
        print(f"Error in text generation: {err}")
        return None
    return script_text

def generate_images(script, num_images=1):
    """Render ``num_images`` PNG files from the script with ``image_generator``.

    Only the first 200 characters of the script are used as the image prompt.
    Each image is saved in the current directory under a unique
    ``generated_image_<hex>.png`` name.

    Args:
        script: Text whose leading 200 characters become the image prompt.
        num_images: How many images to generate.

    Returns:
        A list with the saved file names, or ``None`` if generation failed.
        On failure, any images already written by this call are deleted so the
        caller is not left with orphaned files it has no names for.
    """
    created = []
    try:
        prompt_for_image = script[:200]
        for _ in range(num_images):
            image = image_generator(prompt_for_image).images[0]
            filename = f"generated_image_{uuid.uuid4().hex}.png"
            # Record the name before saving so a half-written file is cleaned up too.
            created.append(filename)
            image.save(filename)
    except Exception as e:
        print(f"Error in image generation: {e}")
        for leftover in created:
            try:
                os.remove(leftover)
            except OSError:
                # Best effort: the file may never have been created.
                pass
        return None

    print("Image(s) generated successfully.")
    return created

def generate_audio(script):
    """
    Generate audio narration from the script using a TTS model.
    This version uses Coqui TTS with a pre-trained Tacotron2-DDC model.

    Args:
        script: Text to narrate. Must be a non-blank string.

    Returns:
        The name of the generated ``.wav`` file, or ``None`` if the script is
        blank or synthesis failed (the error is printed, not raised).
    """
    if not isinstance(script, str) or not script.strip():
        print("Error in audio generation: script is empty.")
        return None

    audio_filename = f"generated_audio_{uuid.uuid4().hex}.wav"
    try:
        from TTS.api import TTS
        # Follow the notebook-wide `device` choice instead of forcing the GPU,
        # so the function also works on the CPU fallback.
        use_gpu = device == "cuda"
        tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False, gpu=use_gpu)

        tts.tts_to_file(text=script, file_path=audio_filename)
        print("Audio generated successfully with TTS voice over.")
        return audio_filename
    except Exception as e:
        print(f"Error in audio generation: {e}")
        # Do not leave a partially written wav behind; the caller only sees None.
        try:
            os.remove(audio_filename)
        except OSError:
            pass
        return None

def _concat_file_line(path):
    """Return a concat-demuxer ``file`` directive for *path*, safely quoted."""
    # Inside single quotes the concat list syntax needs a literal quote written as '\''.
    escaped = os.path.abspath(path).replace("'", "'\\''")
    return f"file '{escaped}'\n"


def produce_video(image_files, audio_file, output_file="final_video.mp4", duration_per_image=1):
    """
    Combine the generated images and audio into a video using ffmpeg.
    Each image is displayed for `duration_per_image` seconds.

    The work is done in two ffmpeg passes: the images are first turned into a
    silent slideshow via the concat demuxer, then the audio track is muxed in.

    Args:
        image_files: Sequence of image paths, in display order.
        audio_file: Path of the narration audio to mux into the video.
        output_file: Path of the final video file.
        duration_per_image: Seconds each image stays on screen.

    Returns:
        ``output_file`` on success, or ``None`` on failure (the error is
        printed). Intermediate files are removed on success and on failure.
    """
    if not image_files:
        print("Error in video production: no image files were provided.")
        return None

    # Unique names so overlapping or repeated runs cannot clobber each other.
    run_id = uuid.uuid4().hex
    list_filename = f"images_list_{run_id}.txt"
    video_temp = f"temp_video_{run_id}.mp4"

    try:
        with open(list_filename, "w") as f:
            for image in image_files:
                f.write(_concat_file_line(image))
                f.write(f"duration {duration_per_image}\n")

            # The last image is listed again without a duration: the concat
            # demuxer otherwise does not honour the final entry's duration.
            f.write(_concat_file_line(image_files[-1]))

        # Argument lists (no shell) keep paths with spaces or metacharacters intact.
        cmd_images = [
            "ffmpeg", "-y",
            "-f", "concat", "-safe", "0", "-i", list_filename,
            "-vsync", "vfr", "-pix_fmt", "yuv420p",
            video_temp,
        ]
        subprocess.run(cmd_images, check=True)

        cmd_merge = [
            "ffmpeg", "-y",
            "-i", video_temp, "-i", audio_file,
            "-c:v", "copy", "-c:a", "aac", "-strict", "experimental",
            output_file,
        ]
        subprocess.run(cmd_merge, check=True)

        print("Video produced successfully.")
        return output_file
    except Exception as e:
        print(f"Error in video production: {e}")
        return None
    finally:
        # Always remove the intermediates, including on the failure paths.
        for temp_path in (list_filename, video_temp):
            try:
                os.remove(temp_path)
            except FileNotFoundError:
                pass
            except OSError as cleanup_error:
                print(f"Warning: could not remove {temp_path}: {cleanup_error}")

def cleanup_temp_files(file_list):
    """Delete every path in ``file_list`` that currently exists on disk.

    Paths that do not exist are skipped. Nothing is returned.
    """
    # filter() is lazy, so each path is checked immediately before it is removed.
    for existing_path in filter(os.path.exists, file_list):
        os.remove(existing_path)

In [5]:
def generate_content_pipeline(prompt):
    """
    Main function to generate a video from a prompt.
    This ties together text, image, and audio generation with video production.

    The stages run in order (script, images, narration, video) and the
    pipeline stops at the first stage that fails. The finished video is
    displayed inline; nothing is returned.

    Args:
        prompt: User-supplied text used to generate the script.
    """
    clear_output(wait=True)
    print("Starting content generation pipeline...")

    # Intermediate files produced so far. They are removed in `finally`, so an
    # abort at any stage no longer leaves images or audio behind.
    temp_files = []
    try:
        #Generate the script using mistral 7B model.
        script = generate_script(prompt)
        if script is None:
            print("Failed to generate script. Aborting pipeline.")
            return

        #Generate images based on the script.
        images = generate_images(script, num_images=10)
        if not images:
            print("Failed to generate images. Aborting pipeline.")
            return
        temp_files.extend(images)

        #Generate audio narration using TTS.
        audio_file = generate_audio(script)
        if audio_file is None:
            print("Failed to generate audio. Aborting pipeline.")
            return
        temp_files.append(audio_file)

        #Produce the final video, showing each image for 2 seconds.
        output_video = produce_video(images, audio_file, duration_per_image=2)
        if output_video is None:
            print("Failed to produce video. Aborting pipeline.")
            return

        print("Content generation pipeline completed successfully!")
        display(Video(output_video, embed=True))
    finally:
        cleanup_temp_files(temp_files)


# --- Notebook UI widgets ---

# Pane that captures everything the pipeline prints or displays.
output_area = widgets.Output()

# Button that starts a generation run.
generate_button = widgets.Button(description='Generate Video', button_style='success')

# Free-text box for the video prompt.
prompt_input = widgets.Text(
    description='Prompt:',
    placeholder='Type your video prompt here...',
    value='',
    layout=widgets.Layout(width='90%'),
    style={'description_width': 'initial'},
)

def on_generate_button_clicked(b):
    """Button callback: run the pipeline on the current prompt text.

    Output is routed into ``output_area``, which is cleared first. A prompt
    that is blank after stripping whitespace only produces a message.

    Args:
        b: The clicked button, supplied by ipywidgets; not used here.
    """
    with output_area:
        clear_output()
        prompt = prompt_input.value.strip()
        if not prompt:
            print("Please enter a valid prompt.")
            return
        generate_content_pipeline(prompt)

# Hook the callback up to the button.
generate_button.on_click(on_generate_button_clicked)

# Render the prompt box, the button and the output pane stacked vertically.
display(widgets.VBox(children=[prompt_input, generate_button, output_area]))

VBox(children=(Text(value='', description='Prompt:', layout=Layout(width='90%'), placeholder='Type your video …