In [None]:
# This notebook is adapted from the OpenAI cookbook. 
from IPython.display import display, Image, Audio

import cv2  # We're using OpenCV to read video
import base64
import time
import openai
import os
import requests

In [None]:
# Path of the clip to narrate; change this to process a different video.
VIDEO_PATH = "data/dam.mp4"

video = cv2.VideoCapture(VIDEO_PATH)

# Fail fast: without this check an unreadable path skips the loop below,
# prints "0 frames read." and the problem only surfaces in later cells.
if not video.isOpened():
    video.release()
    raise RuntimeError(f"Could not open video file: {VIDEO_PATH}")

# Each entry is one frame, JPEG-encoded and then base64-encoded into a UTF-8 string.
base64Frames = []
try:
    while video.isOpened():
        success, frame = video.read()
        if not success:
            # Stop at the first frame that cannot be read (normally end of video).
            break
        _, buffer = cv2.imencode(".jpg", frame)
        base64Frames.append(base64.b64encode(buffer).decode("utf-8"))
finally:
    # Release the capture handle even if encoding raises part-way through.
    video.release()

print(len(base64Frames), "frames read.")

In [None]:
# Create a single updatable output slot, then push each decoded frame into it
# in turn so the frames play back in place.
display_handle = display(None, display_id=True)
for encoded_frame in base64Frames:
    jpeg_bytes = base64.b64decode(encoded_frame.encode("utf-8"))
    display_handle.update(Image(data=jpeg_bytes))
    # Short pause between frames so each update is visible.
    time.sleep(0.025)

In [None]:
# Single-request narration.
# The videos are 30fps, so taking every 150th frame sends one frame per
# 5 seconds of footage, which keeps us under the 10k tokens/min rate limit.
sampled_frames = base64Frames[0::150]
image_parts = [{"image": frame_b64, "resize": 768} for frame_b64 in sampled_frames]

# One user message: the instruction text followed by the sampled frames.
PROMPT_MESSAGES = [
    {
        "role": "user",
        "content": [
            "These are frames of a video. Create a short voiceover script in the style of Mike Breen. Damian Lillard is the player who scored the buzzer beater, series winner, against Paul George. Make output to be readable in 30s. Don't include context, just commentary.",
            *image_parts,
        ],
    },
]

# Keyword arguments forwarded verbatim to the chat-completion call below.
params = dict(
    model="gpt-4-vision-preview",
    messages=PROMPT_MESSAGES,
    api_key=os.environ["OPENAI_API_KEY"],
    headers={"Openai-Version": "2020-11-07"},
    max_tokens=500,
)

result = openai.ChatCompletion.create(**params)
print(result.choices[0].message.content)

In [None]:
# [Optional] If the video is too long for a single request, use this chunking code to split it into smaller pieces.
import time

def chunk_frames(frames, chunk_size):
    """Yield successive chunks of at most ``chunk_size`` items from ``frames``.

    Args:
        frames: A sequence supporting ``len()`` and slicing (e.g. a list).
        chunk_size: Maximum number of items per chunk; must be positive.

    Yields:
        Consecutive slices of ``frames`` in order. The final slice may be
        shorter than ``chunk_size``; an empty ``frames`` yields nothing.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative. Because this is a
            generator, the error is raised when iteration starts.
    """
    # A negative step would make range() empty and silently drop every frame;
    # zero would surface as an opaque range() error. Reject both explicitly.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    for start in range(0, len(frames), chunk_size):
        yield frames[start:start + chunk_size]

# Define the chunk size, 150 is reasonable
chunk_size = 150

# Chunk the frames
chunked_frames = list(chunk_frames(base64Frames, chunk_size))

# The instruction text is the same for every chunk, so build it once
# instead of on every loop iteration.
chunk_prompt_text = "These are frames of a video. Create a short voiceover script in the style of Mike Breen. Damian Lillard is the player who scored the buzzer beater, series winner, against Paul George. Make output to be readable in 30s. Don't include context, just commentary."

# One request per chunk; 'results' collects the generated text in chunk order.
# NOTE: 'result' is rebound on every iteration, so after the loop it holds only
# the final chunk's response. The text-to-speech cell reads 'result', not 'results'.
results = []
last_chunk_index = len(chunked_frames) - 1
for chunk_index, frames in enumerate(chunked_frames):
    PROMPT_MESSAGES = [
        {
            "role": "user",
            "content": [
                chunk_prompt_text,
                # Send every 10th frame of this chunk.
                *map(lambda x: {"image": x, "resize": 768}, frames[0::10]),
            ],
        },
    ]
    params = {
        "model": "gpt-4-vision-preview",
        "messages": PROMPT_MESSAGES,
        "api_key": os.environ["OPENAI_API_KEY"],
        "headers": {"Openai-Version": "2020-11-07"},
        "max_tokens": 500,
    }

    result = openai.ChatCompletion.create(**params)
    results.append(result.choices[0].message.content)

    # Sleep for 60 seconds between requests to avoid hitting the rate limit.
    # No request follows the final chunk, so skip the wait there.
    if chunk_index < last_chunk_index:
        time.sleep(60)

# Now, 'results' contains the results of all the create calls
print(results)

In [None]:
# Turn the generated script into speech via the OpenAI audio/speech endpoint.
response = requests.post(
    "https://api.openai.com/v1/audio/speech",
    headers={
        "Authorization": f"Bearer {os.environ['OPENAI_API_KEY']}",
    },
    json={
        "model": "tts-1",
        "input": result.choices[0].message.content,
        "voice": "onyx",
    },
)

# Fail loudly on an HTTP error. Otherwise the error body would be treated as
# audio bytes below and later written to disk as a broken MP3.
response.raise_for_status()

# The request is not streamed, so the full body is already in memory;
# read it directly instead of concatenating chunks one by one.
audio = response.content

# Last expression of the cell: renders an inline audio player.
Audio(audio)

In [None]:
# Write the synthesized audio bytes to disk so the narration can be reused
# outside the notebook.
with open('output.mp3', mode='wb') as mp3_file:
    mp3_file.write(audio)

print("The MP3 file has been saved locally as 'output.mp3'.")