In [52]:
from moviepy.editor import *

In [53]:
from moviepy.editor import VideoFileClip, ImageClip, concatenate_videoclips, CompositeVideoClip, TextClip, AudioFileClip
from moviepy.video.fx.all import crop, resize

In [54]:
# Load the bottom video and set it to a 9:16 aspect ratio
bottom_clip = VideoFileClip("/Users/arinair/Downloads/subway-surfers.mov") # CHANGE TO BACKGROUND VIDEO PATH

# Resize the video to a 9:16 aspect ratio
bottom_clip = bottom_clip.resize(height=1920).crop(x_center=bottom_clip.w/2, width=1080)  # Set the width to 1080 for a 9:16 aspect ratio

# Get the dimensions of the resized video
width, height = bottom_clip.size

# Crop the bottom half of the video
bottom_half_clip = bottom_clip.crop(y1=height/2 - 150, y2=height - 150)

In [55]:
import requests
import json
import base64

VOICE_ID = "jsCqWAovK2LkecY7zXl4"  # Freya
YOUR_XI_API_KEY = "" # INSERT API KEY

url = f"https://api.elevenlabs.io/v1/text-to-speech/{VOICE_ID}/with-timestamps"

headers = {
  "Content-Type": "application/json",
  "xi-api-key": YOUR_XI_API_KEY
}

# CHANGE TEXT TO SCRIPT FROM OpenAI
data = {
  "text": "Okay, so like, making mac n' cheese is super easy $ First, you boil the pasta until it's, like, perfectly al dente, then drain it $ In a separate pan, melt some butter and whisk in flour to make this cute little roux $ Slowly add milk and stir until it's, like, thick and creamy $ Then, mix in your cheese until it’s all melty and dreamy $ Combine that with your pasta, and if you want it extra fabulous, sprinkle more cheese on top and bake until it’s golden and bubbly $ And, voila! You’ve got, like, the yummiest mac n' cheese ever",
  "model_id": "eleven_multilingual_v2",
  "voice_settings": {
    "stability": 0.5,
    "similarity_boost": 0.75
  }
}


response = requests.post(
    url,
    json=data,
    headers=headers,
)

if response.status_code != 200:
  print(f"Error encountered, status: {response.status_code}, "
          f"content: {response.text}")
  quit()

# convert the response which contains bytes into a JSON string from utf-8 encoding
json_string = response.content.decode("utf-8")

# parse the JSON string and load the data as a dictionary
response_dict = json.loads(json_string)

# the "audio_base64" entry in the dictionary contains the audio as a base64 encoded string, 
# we need to decode it into bytes in order to save the audio as a file
audio_bytes = base64.b64decode(response_dict["audio_base64"])

with open('output.mp3', 'wb') as f:
  f.write(audio_bytes)

words = []

curr_word = ''
characters = response_dict['alignment']['characters']
characters_start_times_seconds = response_dict['alignment']['character_start_times_seconds']
characters_end_times_seconds = response_dict['alignment']['character_end_times_seconds']



In [56]:
for i in range(len(response_dict['alignment']['characters'])):
    if characters[i] == ' ':
        words.append((curr_word, characters_start_times_seconds[i - len(curr_word)], characters_end_times_seconds[i - 1]))
        words.append((" ", characters_start_times_seconds[i], characters_end_times_seconds[i]))
        curr_word = ''
    else:
      curr_word += characters[i]

[('Okay,', 0.0, 0.499), (' ', 0.499, 0.604), ('so', 0.604, 0.894), (' ', 0.894, 1.091), ('like,', 1.091, 1.486), (' ', 1.486, 1.591), ('making', 1.591, 1.892), (' ', 1.892, 1.997), ('mac', 1.997, 2.229), (' ', 2.229, 2.276), ("n'", 2.276, 2.334), (' ', 2.334, 2.368), ('cheese', 2.368, 2.717), (' ', 2.717, 2.868), ('is', 2.868, 3.111), (' ', 3.111, 3.344), ('super', 3.344, 3.68), (' ', 3.68, 3.738), ('easy', 3.738, 4.075), (' ', 4.075, 4.238), ('$', 4.238, 4.714), (' ', 4.714, 4.853), ('First,', 4.853, 5.271), (' ', 5.271, 5.352), ('you', 5.352, 5.457), (' ', 5.457, 5.561), ('boil', 5.561, 5.782), (' ', 5.782, 5.828), ('the', 5.828, 5.886), (' ', 5.886, 5.933), ('pasta', 5.933, 6.42), (' ', 6.42, 6.513), ('until', 6.513, 6.769), (' ', 6.769, 6.815), ("it's,", 6.815, 6.966), (' ', 6.966, 7.001), ('like,', 7.001, 7.303), (' ', 7.303, 7.663), ('perfectly', 7.663, 8.092), (' ', 8.092, 8.197), ('al', 8.197, 8.382), (' ', 8.382, 8.487), ('dente,', 8.487, 9.091), (' ', 9.091, 9.288), ('then', 

In [57]:
# Load the images and create ImageClips, scaling them to fit the top half of the 9:16 frame
image_files = ["1.jpeg", "2.jpeg", "3.jpeg", "4.jpeg", "5.jpeg", "6.jpeg", "7.jpg"]  # REPLACE WITH OPEN SOURCE IMAGES
image_end_times = []

# sentence_start = 0
for i in range(len(words)):
    (word, start, end) = words[i]
    if word == "$":
        image_end_times.append(end + 1)
        # sentence_start = end
    if i == len(words) - 1:
        image_end_times.append(end + 1)

image_clips = []

for i in range(len(image_files)):
    img_clip = ImageClip(image_files[i]).resize(height=1920/2).crop(x_center=bottom_clip.w/2, width=1080)
    img_clip = (img_clip
                    .set_start(0 if i == 0 else image_end_times[i - 1])
                    .set_end(image_end_times[i]))

    # Scale the image to fit the top half of the 9:16 clip
    img_clip = img_clip.resize(width=width)

    # Apply a Ken Burns effect (zoom in)
    ken_burns_clip = img_clip.fx(resize, lambda t: 1 + 0.02*t)

    # Crop the image to fit in the top half of the 9:16 frame
    cropped_image_clip = ken_burns_clip.crop(y2=height/2).set_position(("center", "top"))

    image_clips.append(cropped_image_clip)

# Concatenate the image clips with clean transitions (crossfade)
top_half_clip = concatenate_videoclips(image_clips, method="compose", padding=-0.2)

[5.714, 11.751, 17.637, 23.268, 27.843, 36.794, 42.471]


In [59]:


# Combine the top half with the bottom video in a 9:16 frame
final_clip = CompositeVideoClip([top_half_clip, bottom_half_clip.set_position(("center", "bottom"))], size=(width, height))

# Load your audio file
audio_clip = AudioFileClip("output.mp3")  # Replace with your audio file

# Set the audio to the final video clip
final_clip = final_clip.set_audio(audio_clip)

# Create TextClips for animated word-by-word captions
caption_clips = []
for text, start, end in words:
    if text == '$': # USE A DELIMITER BETWEEN SENTENCES
        continue
    word_duration = (end - start)
    word_clip = TextClip(text, fontsize=100, color='white', stroke_color='black', stroke_width=2, size=(width/3, 500), font='Impact', align='center')
    word_clip = (word_clip
                    .set_position(('center', 'center'))  # Centered text
                    .set_start(start)  # Start time for each word
                    .set_duration(word_duration)  # Duration for each word
                    .crossfadein(0.1))  # Smooth fade-in effect
    caption_clips.append(word_clip)

# Overlay word-by-word captions on the video
final_video_with_captions = CompositeVideoClip([final_clip] + caption_clips)

Okay,
 
so
 
like,
 
making
 
mac
 
n'
 
cheese
 
is
 
super
 
easy
 
$
 
First,
 
you
 
boil
 
the
 
pasta
 
until
 
it's,
 
like,
 
perfectly
 
al
 
dente,
 
then
 
drain
 
it
 
$
 
In
 
a
 
separate
 
pan,
 
melt
 
some
 
butter
 
and
 
whisk
 
in
 
flour
 
to
 
make
 
this
 
cute
 
little
 
roux
 
$
 
Slowly
 
add
 
milk
 
and
 
stir
 
until
 
it's,
 
like,
 
thick
 
and
 
creamy
 
$
 
Then,
 
mix
 
in
 
your
 
cheese
 
until
 
it’s
 
all
 
melty
 
and
 
dreamy
 
$
 
Combine
 
that
 
with
 
your
 
pasta,
 
and
 
if
 
you
 
want
 
it
 
extra
 
fabulous,
 
sprinkle
 
more
 
cheese
 
on
 
top
 
and
 
bake
 
until
 
it’s
 
golden
 
and
 
bubbly
 
$
 
And,
 
voila!
 
You’ve
 
got,
 
like,
 
the
 
yummiest
 
mac
 
n'
 
cheese
 


In [51]:
final_video_with_captions.write_videofile("/Users/arinair/Documents/UMime/Test-Clips/final-clip-08.mp4", audio_codec="aac")

Moviepy - Building video /Users/arinair/Documents/UMime/Test-Clips/final-clip-08.mp4.
MoviePy - Writing audio in final-clip-08TEMP_MPY_wvf_snd.mp4


                                                                    

MoviePy - Done.
Moviepy - Writing video /Users/arinair/Documents/UMime/Test-Clips/final-clip-08.mp4



                                                                

Moviepy - Done !
Moviepy - video ready /Users/arinair/Documents/UMime/Test-Clips/final-clip-08.mp4
