In [None]:
!pip install SpeechRecognition

# Generate transcript from audio

In [None]:
import os

import speech_recognition as sr

# Load the audio file
audio_file = "song.mp3"

# sr.AudioFile can only read WAV, AIFF and FLAC. Any other format (such as the
# MP3 used here) is first converted to a 16-bit PCM WAV next to the original.
audio_root, audio_ext = os.path.splitext(audio_file)
if audio_ext.lower() in {".wav", ".aiff", ".aif", ".flac"}:
    recognizer_input = audio_file
else:
    # Imported locally so this cell works on a fresh kernel without relying on
    # the video-export cell having been run first.
    from moviepy.editor import AudioFileClip

    recognizer_input = audio_root + ".wav"
    conversion_clip = AudioFileClip(audio_file)
    try:
        conversion_clip.write_audiofile(recognizer_input, codec="pcm_s16le")
    finally:
        # Release the ffmpeg reader even if the conversion fails.
        conversion_clip.close()

# Initialize the recognizer
r = sr.Recognizer()

# Transcribe the audio
# NOTE(review): the whole file is sent in a single request; the free Google
# Web Speech endpoint may reject long recordings — confirm for full songs.
with sr.AudioFile(recognizer_input) as source:
    audio = r.record(source)
transcript = r.recognize_google(audio, language="en-US")

# Save the transcript to a text file
with open("transcript.txt", "w", encoding="utf-8") as f:
    f.write(transcript)

# Make stanzas from the transcript

In [None]:
# Load the transcript text produced by the transcription step.
with open("transcript.txt", "r", encoding="utf-8") as transcript_file:
    transcript = transcript_file.read()

# Break the text into individual lines.
lines = transcript.split("\n")

# Collect consecutive non-blank lines into stanzas; a blank (or
# whitespace-only) line closes the stanza currently being built.
stanzas = []
current_stanza = []
for line in lines:
    if line.strip():
        current_stanza.append(line)
        continue
    if current_stanza:
        stanzas.append("\n".join(current_stanza))
        current_stanza = []

# The text may not end with a blank line, so flush whatever is still pending.
if current_stanza:
    stanzas.append("\n".join(current_stanza))

# Write the stanzas out, separated from each other by one empty line.
poem_text = "\n\n".join(stanzas)
with open("poem.txt", "w", encoding="utf-8") as poem_file:
    poem_file.write(poem_text)

# Use the Bing image search API to find and download an image for each line

In [None]:
import os

import requests
from serpapi import BingSearch

# Set up the Bing API parameters.
# The key is read from the SERPAPI_API_KEY environment variable so that a real
# credential never has to be pasted into the notebook; the original
# placeholder is kept as the fallback value.
params = {
    "api_key": os.environ.get("SERPAPI_API_KEY", "your_api_key"),
    "q": "coffee",
    "engine": "bing_images",
    "count": 1,
    "num": 1,
    "start": 1,
    "device": "desktop",
    "safe": "off",
    "imagesize": "MEDIUM",
    "aspect": "SQUARE",
    "color": "COLOR",
    "type": "PHOTO",
    "license": "ANY",
}

# Read the poem from the file (stanzas are separated by an empty line)
with open("poem.txt", "r", encoding="utf-8") as f:
    poem = f.read().split("\n\n")

# Create the output folder once, before the download loop.
os.makedirs("images", exist_ok=True)

# Download one image per lyric line, saved as images/<stanza>_<line>.png
for i, stanza in enumerate(poem):
    lines = stanza.split("\n")
    for j, line in enumerate(lines):
        # A blank line carries no search terms, so there is nothing to look up.
        if not line.strip():
            continue

        # Query with a per-line copy so the shared params dict is not mutated.
        search = BingSearch({**params, "q": line})
        results = search.get_dict()

        # The response may have no "images_results" entry (no hits, or an
        # error payload); skip the line instead of failing on the lookup.
        image_results = results.get("images_results") or []
        if not image_results:
            print(f"No image found for stanza {i+1}, line {j+1}: {line!r}")
            continue
        image_url = image_results[0]["thumbnail"]

        response = requests.get(image_url, timeout=30)
        if not response.ok:
            # Do not write an HTTP error page to disk as if it were an image.
            print(f"Download failed ({response.status_code}) for stanza {i+1}, line {j+1}")
            continue

        image_filename = f"{i+1}_{j+1}.png"
        with open(os.path.join("images", image_filename), "wb") as f:
            f.write(response.content)

# Detect the beats in the audio for the video

In [None]:
import librosa

# Load the audio file
audio_file = "song.mp3"
# The sample rate is deliberately not named `sr`: that name is the
# speech_recognition module alias used by the transcription cell, and
# rebinding it here would break that cell if it were run again afterwards.
y, sample_rate = librosa.load(audio_file)

# Detect the beats
tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sample_rate)
beat_times = librosa.frames_to_time(beat_frames, sr=sample_rate)

# Save the beat times to a file, one per line, rounded to two decimals
with open("beats.txt", "w", encoding="utf-8") as f:
    for beat_time in beat_times:
        f.write(f"{beat_time:.2f}\n")

# Merge the audio and images to export the video

In [None]:
import os

from moviepy.editor import AudioFileClip, ImageClip, concatenate_videoclips


def _image_sort_key(filename):
    """Return a sort key that orders '<stanza>_<line>.png' names numerically.

    A plain string sort would put '10_1.png' before '2_1.png'. Names that do
    not follow the numeric pattern are placed after all numbered images and
    ordered alphabetically among themselves.
    """
    stem = os.path.splitext(filename)[0]
    try:
        return (0, [int(part) for part in stem.split("_")], filename)
    except ValueError:
        return (1, [], filename)


# Load the audio file
audio = AudioFileClip("song.mp3")

# Load the images in lyric order; each one is shown for 2 seconds
image_files = sorted(os.listdir("images"), key=_image_sort_key)
if not image_files:
    raise RuntimeError("No images found in 'images'; run the image download cell first.")
image_clips = [ImageClip(os.path.join("images", image_file)).set_duration(2) for image_file in image_files]

# Add transitions and animations: every clip except the last gets a 1-second
# fade-in and fade-out. The fades are applied to the clips before they are
# concatenated, rather than by editing the concatenated clip's internals.
for i in range(len(image_clips) - 1):
    image_clips[i] = image_clips[i].crossfadein(1).crossfadeout(1)

# Sync the images to the audio
# NOTE(review): the video lasts 2 seconds per image regardless of the song's
# length — confirm whether the audio should be trimmed or the images stretched.
final_clip = concatenate_videoclips(image_clips, method="compose")
final_clip = final_clip.set_audio(audio)

# Export the video
final_clip.write_videofile("output.mp4", fps=30, codec="libx264", bitrate="4000k")