In [1]:
# Uncomment the lines below to install the required packages.
#!pip install langchain cohere python-dotenv
#!pip install diffusers --upgrade
#!pip install invisible_watermark transformers accelerate safetensors
# Install CUDA 12.1 (the PyTorch wheels below are the cu121 builds).
#!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
# Create a .env file to store the Cohere API key and copy the line below into it:
#COHERE_API_KEY=apikey

In [2]:
from langchain.llms import Cohere
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from dotenv import load_dotenv
import os
from diffusers import DiffusionPipeline
import torch

# Load environment variables from the local .env file; COHERE_API_KEY is read from the
# environment later via os.getenv when the Cohere client is created.
load_dotenv(dotenv_path='.env')

# Number of scene descriptions requested per input.
# Changing n_detail alone is not enough: the prompt template hard-codes "three" scene
# descriptions, so its instructions must be edited to state the new count and its
# few-shot examples extended to contain that many numbered scenes.
# NOTE(review): n_detail is not referenced by the functions defined in this cell - confirm usage.
n_detail = 3

def generate_scene_descriptions(input):
    """
    Generate scene descriptions based on an initial input using a few-shot prompt template.

    The prompt is sent to the Cohere `command` model through a LangChain LLMChain. The
    response is split into lines, the leading "N. " numbering is removed from each line,
    and every non-empty line is tagged with the tool key used downstream.

    Args:
        input (str): The input used to generate scene descriptions.

    Returns:
        list: A list of single-key dictionaries, one per non-empty response line. The key
        is 'sadtalker' when the line contains a <dialogue> tag and 'animatediff' otherwise;
        the value is the scene description text.
    """

    # The third scene of the second example is kept on the same line as its "3. " marker so
    # that both few-shot examples show the exact "N. text" layout the parser below expects.
    template = """Instructions: Create a series of three cohesive scene descriptions that build upon each other, starting from an initial input. When presenting any dialogue, make sure to enclose it within <dialogue></dialogue> tags, with the speaker's name outside these tags. The dialogue should be the only content within these tags. It is crucial to include dialogue only in the second scene; the remaining scenes should not contain any dialogue. After the </dialogue> tag, maintain consistency by specifying the character's gender in brackets, like (male) or (female). In the second scene, it is essential to describe the facial features of a single character along with their dialogue. The first scene should focus on describing the environment rather than the character. THE SECOND SCENE MUST HAVE A <dialogue></dialogue> FOLLOWED BY (male/female). All the scenes must have the name of the character.\n\n\nInput: Samurai\n\n\n1. Begin with a vivid scene, Picture a serene setting with a high-quality view of a cherry blossom tree beside a beautiful pond.\n2. Now, introduce a middle-aged samurai named Jin. He has long and black hair, He is wearing a white kimono and is positioned at the center, holding a sword. He says <dialogue>"I will train to become the best and strongest samurai in the entire world even if i have to train for eternity"</dialogue> (male) .\n3. The samurai is practicing swinging his sword in a captivating display.\n\n\nInput: Coffee Barista\n\n\n1. Imagine a small, cozy coffee bar tucked away in a quiet corner of the city. The aroma of freshly brewed coffee wafts through the air, and the sound of a cappuccino machine hisses in the background.\n2. Meet Lily, the friendly coffee barista behind the counter. She has a warm smile and lively green eyes. She has blond and short hair. As she hands a customer their latte, she says, <dialogue>"Enjoy your coffee! Let me know if you need anything else, have a nice day!"</dialogue> (female).\n3. Lily deftly steams milk, expertly pours latte art, and efficiently manages the bustling coffee bar.\n\n\nInput: {input}
    """
    print("Prompt being sent to the language model")
    print(template)

    prompt = PromptTemplate(template=template, input_variables=["input"])

    # The examples in the template are separated by three newlines, so that sequence is
    # used as the stop marker to end generation after a single set of scenes.
    llm = Cohere(cohere_api_key=os.getenv("COHERE_API_KEY"), stop=['\n\n\n'], temperature=0.5, model='command')

    llm_chain = LLMChain(prompt=prompt, llm=llm)

    response = llm_chain.run(input)
    print("response from language model")
    print(response)

    formatted_llm_lines = []
    for raw_line in response.splitlines():
        line = raw_line.strip()
        if not line:
            # Blank lines (e.g. leading/trailing newlines in the response) carry no scene.
            continue

        # Remove the starting number, but only when the text before the first ". " really
        # is a number; an unnumbered line keeps its first sentence instead of losing it.
        head, separator, remainder = line.partition('. ')
        if separator and head.isdigit():
            line = remainder.strip()
            if not line:
                continue

        if '<dialogue>' in line:
            formatted_llm_lines.append({'sadtalker': line})
        else:
            formatted_llm_lines.append({'animatediff': line})
    print("Extracted response in structured form")
    print(formatted_llm_lines)

    return formatted_llm_lines

def generate_images_and_save(texts, output_dir):
    """
    Generate images from text descriptions and save them to an output directory.

    One image is rendered per entry and written as "<n>.jpg", where n is the 1-based
    position of the entry in `texts`.

    Args:
        texts (list): A list of dictionaries; the first value of each dictionary is used
            as the text description to generate an image from.
        output_dir (str): The directory where generated images will be saved. It is
            created if it does not exist.
    """

    # Create the output directory once, up front, instead of on every loop iteration.
    os.makedirs(output_dir, exist_ok=True)

    if not texts:
        # Nothing to render: skip loading the pipeline onto the GPU.
        return

    # Load the DiffusionPipeline model (fp16 weights) and move it to the GPU
    pipe = DiffusionPipeline.from_pretrained("SG161222/RealVisXL_V2.0", torch_dtype=torch.float16, use_safetensors=True, variant="fp16")
    pipe.to("cuda")

    style_prefix = "4k, photorealistic, realistic"

    for index, text_dict in enumerate(texts, start=1):
        description = next(iter(text_dict.values()))  # Get the text from the dictionary

        # Separate the style keywords from the description; plain concatenation would
        # fuse them into a single word such as "realisticImagine".
        prompt_text = f"{style_prefix}, {description}"

        # Generate a single image from the text
        image = pipe(prompt=prompt_text).images[0]

        # Save the generated image
        image.save(os.path.join(output_dir, f"{index}.jpg"))

def autonomous_video_planning(input):
    """
    Plan a video from an initial input: produce scene descriptions, then render one
    reference image per scene.

    Args:
        input (str): The initial input for video planning. It is also used as the name
            of the sub-directory under "reference_images" that receives the images.

    Returns:
        tuple: A tuple containing the formatted scene descriptions and the output directory.
    """

    # Where the reference images for this input are written.
    image_dir = "reference_images/{}".format(input)

    # Ask the language model for the scenes, then render one image per scene.
    scenes = generate_scene_descriptions(input=input)
    generate_images_and_save(scenes, image_dir)

    return scenes, image_dir

# Example usage: asks the language model for scene descriptions of the given input and
# renders one reference image per scene under reference_images/<input>/.
# Running this requires COHERE_API_KEY to be set (e.g. via .env) and a CUDA-capable GPU,
# because the diffusion pipeline is moved to "cuda".
autonomous_video_planning("John Wick in the streets")

Prompt being sent to the language model
Instructions: Create a series of three cohesive scene descriptions that build upon each other, starting from an initial input. When presenting any dialogue, make sure to enclose it within <dialogue></dialogue> tags, with the speaker's name outside these tags. The dialogue should be the only content within these tags. It is crucial to include dialogue only in the second scene; the remaining scenes should not contain any dialogue. After the </dialogue> tag, maintain consistency by specifying the character's gender in brackets, like (male) or (female). In the second scene, it is essential to describe the facial features of a single character along with their dialogue. The first scene should focus on describing the environment rather than the character. THE SECOND SCENE MUST HAVE A <dialogue></dialogue> FOLLOWED BY (male/female). All the scenes must have the name of the character.


Input: Samurai


1. Begin with a vivid scene, Picture a serene setti

Your text contains a trailing whitespace, which has been trimmed to ensure high quality generations.


response from language model
1. Imagine a dark and gritty street in the heart of a city, the sounds of traffic and people bustling by. The scene is lit by the neon lights of nearby businesses.
2. John Wick, a legendary assassin, stands in the middle of the street, his eyes cold and his face impassive. He is wearing a black suit and holding a gun in his hand. He says, <dialogue>"I'm going to kill them all"</dialogue> (male).
3. John Wick is a deadly assassin, known for his prowess in taking down his targets with precision and efficiency. He is a man on a mission, and he will stop at nothing to get what he wants. With his gun at the ready, he stalks the streets, searching for his next target.
Extracted response in structured form
[{'animatediff': 'Imagine a dark and gritty street in the heart of a city, the sounds of traffic and people bustling by. The scene is lit by the neon lights of nearby businesses.'}, {'sadtalker': 'John Wick, a legendary assassin, stands in the middle of the stre

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

([{'animatediff': 'Imagine a dark and gritty street in the heart of a city, the sounds of traffic and people bustling by. The scene is lit by the neon lights of nearby businesses.'},
  {'sadtalker': 'John Wick, a legendary assassin, stands in the middle of the street, his eyes cold and his face impassive. He is wearing a black suit and holding a gun in his hand. He says, <dialogue>"I\'m going to kill them all"</dialogue> (male).'},
  {'animatediff': 'John Wick is a deadly assassin, known for his prowess in taking down his targets with precision and efficiency. He is a man on a mission, and he will stop at nothing to get what he wants. With his gun at the ready, he stalks the streets, searching for his next target.'}],
 'reference_images/John Wick in the streets')