<h1 align ="center"> REST API Video Samples</h1>
<hr>
   
# Chat Completions

In [12]:
import json
import os
import requests
import time
import re
from moviepy.editor import VideoFileClip

### Setup Parameters


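Here we load the configuration from the _config.json_ file to set up vision_api_endpoint, deployment_name, openai_api_base and openai_api_version. The two secrets, vision_api_key and openai_api_key, are not stored in the file; they are read from the `VISION_API_KEY` and `OPENAI_API_KEY` environment variables.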

In [13]:
# Load the non-secret settings from config.json.
# The encoding is given explicitly so the file is decoded as UTF-8 regardless of
# the platform's default text encoding.
with open(r'config.json', encoding='utf-8') as config_file:
    config_details = json.load(config_file)

# Key for the vision resource, read from the VISION_API_KEY environment variable.
# os.getenv returns None when the variable is not set.
vision_api_key = os.getenv("VISION_API_KEY")

# The base URL for your vision resource endpoint, e.g. "https://<your-resource-name>.cognitiveservices.azure.com"
# You must create your resource in the East US region.
vision_api_endpoint = config_details['VISION_API_ENDPOINT']

# Name of the GPT-4V model deployment; it is inserted into the request URL.
deployment_name = config_details['GPT-4V_MODEL']

# The base URL for your Azure OpenAI resource. e.g. "https://<your resource name>.openai.azure.com"
openai_api_base = config_details['OPENAI_API_BASE']

# The API key for your Azure OpenAI resource, read from the OPENAI_API_KEY
# environment variable (None when the variable is not set).
openai_api_key = os.getenv("OPENAI_API_KEY")

# Value sent as the `api-version` query parameter. Versions follow the YYYY-MM-DD
# date structure.
# NOTE(review): the extensions/chat/completions route used in this notebook
# presumably requires a recent preview version - confirm the value in config.json
# against the Azure OpenAI API reference.
openai_api_version = config_details['OPENAI_API_VERSION']

### Create Video Index


In [None]:
# SAS URL of the video to analyse. Insert your own, e.g.
# https://<your-storage-account-name>.blob.core.windows.net/<your-container-name>/<your-video-name>?<SAS-token>
video_SAS_url = "your_SAS_url_here"
# Name of the video index that is created below. This index name must be unique.
video_index_name = "unique_video_index_name"
# Document ID under which the video is added to the index. This video ID must be unique.
video_id = "unique_video_id"

def create_video_index(vision_api_endpoint, vision_api_key, index_name):
    """Create a video retrieval index with a PUT request and return the response.

    The index is declared with two features: "vision" (domain "surveillance")
    and "speech".

    Args:
        vision_api_endpoint: Base URL of the vision resource.
        vision_api_key: Key sent in the Ocp-Apim-Subscription-Key header.
        index_name: Name of the index, placed in the request URL.

    Returns:
        The object returned by requests.put. The HTTP status is not checked
        here; the caller decides what to do with it.
    """
    endpoint = (
        f"{vision_api_endpoint}/computervision/retrieval/indexes/{index_name}"
        "?api-version=2023-05-01-preview"
    )
    request_headers = {
        "Ocp-Apim-Subscription-Key": vision_api_key,
        "Content-Type": "application/json",
    }

    vision_feature = {"name": "vision", "domain": "surveillance"}
    speech_feature = {"name": "speech"}
    index_definition = json.dumps({"features": [vision_feature, speech_feature]})

    return requests.put(endpoint, headers=request_headers, data=index_definition)

def add_video_to_index(vision_api_endpoint, vision_api_key, index_name, video_url, video_id,
                       ingestion_name="my-ingestion"):
    """Start an ingestion that adds one video to an existing index.

    Args:
        vision_api_endpoint: Base URL of the vision resource.
        vision_api_key: Key sent in the Ocp-Apim-Subscription-Key header.
        index_name: Name of the index that receives the video.
        video_url: URL of the video, sent as ``documentUrl``.
        video_id: Identifier of the video, sent as ``documentId``.
        ingestion_name: Name of the ingestion placed in the request URL.
            Defaults to "my-ingestion", the value that used to be hard-coded,
            so existing calls behave as before. Pass a different name to run
            a further ingestion on the same index.
            NOTE(review): the service presumably rejects a reused ingestion
            name - confirm against the API reference.

    Returns:
        The object returned by requests.put. The HTTP status is not checked here.
    """
    url = (
        f"{vision_api_endpoint}/computervision/retrieval/indexes/{index_name}"
        f"/ingestions/{ingestion_name}?api-version=2023-05-01-preview"
    )
    headers = {"Ocp-Apim-Subscription-Key": vision_api_key, "Content-Type": "application/json"}
    data = {
        'videos': [{'mode': 'add', 'documentId': video_id, 'documentUrl': video_url}]
    }
    response = requests.put(url, headers=headers, data=json.dumps(data))
    return response

def wait_for_ingestion_completion(vision_api_endpoint, vision_api_key, index_name, max_retries=30,
                                  poll_interval=10):
    """Poll the ingestion list of an index until the first ingestion finishes.

    Args:
        vision_api_endpoint: Base URL of the vision resource.
        vision_api_key: Key sent in the Ocp-Apim-Subscription-Key header.
        index_name: Name of the index whose ingestions are listed.
        max_retries: Maximum number of polls before giving up.
        poll_interval: Seconds to sleep before each poll (default 10, the
            value that used to be hard-coded).

    Returns:
        True as soon as the first listed ingestion reports state 'Completed'.
        False if it reports 'Failed' or if max_retries polls pass without
        completion.
    """
    url = f"{vision_api_endpoint}/computervision/retrieval/indexes/{index_name}/ingestions?api-version=2023-05-01-preview"
    headers = {"Ocp-Apim-Subscription-Key": vision_api_key}

    for _ in range(max_retries):
        time.sleep(poll_interval)
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            # Surface the problem (e.g. a wrong key) instead of polling silently.
            print(f"Ingestion status request returned {response.status_code}; retrying.")
            continue

        state_data = response.json()
        ingestions = state_data.get('value') or []
        if not ingestions:
            # Nothing listed yet: treat as "not finished" rather than indexing
            # into an empty list.
            continue

        # Only the first listed ingestion is inspected.
        state = ingestions[0].get('state')
        if state == 'Completed':
            print(state_data)
            print('Ingestion completed.')
            return True
        if state == 'Failed':
            # NOTE(review): 'Failed' is assumed to be the service's terminal
            # failure state - confirm against the API reference.
            print(state_data)
            print('Ingestion failed.')
            return False
    return False


# Step 1: Create an Index
response = create_video_index(
    vision_api_endpoint=vision_api_endpoint,
    vision_api_key=vision_api_key,
    index_name=video_index_name,
)
print(response.status_code, response.text)

# Step 2: Add a video file to the index
response = add_video_to_index(
    vision_api_endpoint=vision_api_endpoint,
    vision_api_key=vision_api_key,
    index_name=video_index_name,
    video_url=video_SAS_url,
    video_id=video_id,
)
print(response.status_code, response.text)

# Step 3: Wait for ingestion to complete; the helper returns False when it gives up
if not wait_for_ingestion_completion(
    vision_api_endpoint=vision_api_endpoint,
    vision_api_key=vision_api_key,
    index_name=video_index_name,
):
    print("Ingestion did not complete within the expected time.")


### Define GPT-4V API Call

In [14]:
# Define GPT-4V API call
def call_GPT4V(vision_api_endpoint, vision_api_key, video_index_name, video_id, user_prompt, sys_message):
    """Send one chat-completions request about an indexed video and return the decoded reply.

    Besides its arguments, the function reads these notebook-level variables:
    openai_api_base, deployment_name, openai_api_version, openai_api_key and
    video_SAS_url.

    Args:
        vision_api_endpoint: Base URL of the vision resource.
        vision_api_key: Key of the vision resource, forwarded in the payload.
        video_index_name: Name of the video index, sent as ``indexName``.
        video_id: Document ID of the video, sent as ``acv-document-id``.
        user_prompt: Text of the user message.
        sys_message: Text of the system message.

    Returns:
        The JSON body of the response, decoded by ``response.json()``.

    Raises:
        SystemExit: If sending the request or the HTTP status check raises a
            requests.RequestException.
    """
    # Target URL and authentication header
    endpoint = (
        f"{openai_api_base}/openai/deployments/{deployment_name}"
        f"/extensions/chat/completions?api-version={openai_api_version}"
    )
    request_headers = {"Content-Type": "application/json", "api-key": openai_api_key}

    # Video index description sent under "dataSources"
    video_data_source = {
        "type": "AzureComputerVisionVideoIndex",
        "parameters": {
            "computerVisionBaseUrl": f"{vision_api_endpoint}/computervision",
            "computerVisionApiKey": vision_api_key,
            "indexName": video_index_name,
            "videoUrls": [video_SAS_url],
        },
    }

    # System message, then the video reference, then the actual question
    conversation = [
        {"role": "system", "content": [sys_message]},
        {"role": "user", "content": [{"acv-document-id": video_id}]},
        {"role": "user", "content": [user_prompt]},
    ]

    request_body = {
        "dataSources": [video_data_source],
        "enhancements": {"video": {"enabled": True}},
        "messages": conversation,
        "temperature": 0.7,
        "top_p": 0.95,
        "max_tokens": 800,
    }

    try:
        reply = requests.post(endpoint, headers=request_headers, json=request_body)
        reply.raise_for_status()  # turn 4xx/5xx statuses into exceptions
        return reply.json()
    except requests.RequestException as e:
        raise SystemExit(f"Failed to make the request. Error: {e}")

### Call GPT-4V On The Entire Video

In [15]:
# System message (role and output expectations for the model) and user prompt
# (the actual question) for a summary of the whole video.
sys_message = """
Your task is to assist in analyzing and optimizing creative assets. 
You will be presented with advertisement videos for products. 
First describe the video in detail paying close attention to Product characteristics highlighted, 
Background images, Lighting, Color Palette and Human characteristics for persons in the video. 
Finally provide a summary of the video and talk about the main message the advertisement video tries to convey to the viewer. 
"""
user_prompt = "Summarize the ad video"

# Call GPT-4V API and print the response, one sentence per line
try:
    response = call_GPT4V(vision_api_endpoint, vision_api_key, video_index_name, video_id, user_prompt, sys_message)
    # The answer text is taken from the first choice of the reply
    text = response['choices'][0]['message']['content']
    # Split at whitespace that follows '.' or '?'. The two negative lookbehinds
    # skip positions preceded by patterns such as "e.g." (\w\.\w.) or a
    # capitalised two-letter abbreviation such as "Mr." ([A-Z][a-z]\.).
    # Note: a numbered-list marker like "1." is also treated as a sentence end.
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    for sentence in sentences:  # Print each sentence on its own line
        print(sentence)
# NOTE(review): call_GPT4V as defined in this notebook already converts
# requests.RequestException into SystemExit, so this handler looks like a
# secondary guard only - confirm before relying on it.
except requests.RequestException as e:
    raise SystemExit(f"Failed to make the request. Error: {e}")

The advertisement video highlights the features of Microsoft's new virtual assistant, Copilot.
Throughout the video, the assistant is shown helping users in various ways, such as creating pictures, exploring variations, reading, and organizing plans.
The ad uses colorful, serene, and visually pleasing backgrounds to convey a sense of calmness and creativity.
It demonstrates how Copilot can assist in enhancing the user’s digital experience, making it more organized, creative, and productive.
The main message is to introduce and promote Copilot as a helpful and reliable assistant that makes the user's digital interaction easier and more enjoyable.


### Call GPT-4V On Each Video Chunk Sequentially

In [18]:
def download_video(sas_url, local_file_path):
    """Stream a video from a URL into a local file.

    Args:
        sas_url: URL to download from.
        local_file_path: Path of the file to write (opened in binary mode).

    Returns:
        True when the server answered with status 200 and the body was written.
        False when the status was anything else or when any exception occurred;
        in both cases a message is printed.
    """
    try:
        # With stream=True the connection stays open until the body is consumed
        # or the response is closed. The context manager closes it on every
        # path, including the non-200 early return and write errors.
        with requests.get(sas_url, stream=True) as response:
            if response.status_code != 200:
                print(f"Download failed with status code: {response.status_code}")
                return False
            with open(local_file_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # skip empty keep-alive chunks
                        file.write(chunk)
        return True
    except Exception as e:
        print(f"An error occurred during download: {e}")
        return False

def get_video_length(file_path):
    """Return the ``duration`` of the video file at ``file_path``.

    The file is opened with VideoFileClip inside a ``with`` block, so the clip
    is closed again before the function returns.

    Args:
        file_path: Path of the video file.

    Returns:
        The clip's ``duration`` attribute, or None if anything raised while
        opening, reading or closing the clip (the error is printed).
    """
    try:
        with VideoFileClip(file_path) as clip:
            seconds = clip.duration
    except Exception as e:
        print(f"Error in getting video length: {e}")
        return None
    return seconds

# Define the number of seconds for each segment
chunk_size = 20
# Download the video; the local copy is only needed to measure its length
local_file_path = "downloaded_video.mp4"
if download_video(video_SAS_url, local_file_path):
    video_length = get_video_length(local_file_path)
    os.remove(local_file_path)

    if video_length is not None:
        print(f"Video Length: {video_length} seconds")
        sys_message = f"""
        The total length of the video is {video_length}s. Your task is to assist in finding all scenes in this video.
        You also need to describe each scene with start and end time. 
        """
        number_of_segments = int(video_length // chunk_size)
        # Running summary: each reply is fed into the next segment's prompt
        updated_response = ""
        for i in range(number_of_segments + 1): # Include the last (partial) segment
            start_time = i * chunk_size
            if start_time >= video_length:
                # The length is an exact multiple of chunk_size, so the extra
                # segment would be empty (e.g. "from 40s to 40s"): skip it
                # instead of spending an API call on it.
                break
            end_time = min((i + 1) * chunk_size, video_length)
            user_prompt = f"How many scenes from {start_time}s to {end_time}s?"
            print(f"Segment {i+1}: {user_prompt}")
            if i > 0:
                # The leading space keeps the question and the follow-up text
                # from being fused into "...s?And here ...".
                user_prompt += f""" And here are scenes in the previous segments: {updated_response}. 
                                You need to combine the scenes in the previous segments with the scenes in this segment and provide a summary.
                                """

            response = call_GPT4V(vision_api_endpoint, vision_api_key, video_index_name, video_id, user_prompt, sys_message)
            updated_response = response['choices'][0]['message']['content']
            time.sleep(2) # Avoid throttling

        # Print the final response, split at whitespace that follows '.' or '?'
        sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', updated_response)
        for sentence in sentences:  # Print the content of the response
            print(sentence)
    else:
        print("Failed to process video length.")
else:
    print("Failed to download video.")

Video Length: 46.13 seconds
Segment 1: How many scenes from 0s to 20s?
Segment 2: How many scenes from 20s to 40s?
Segment 3: How many scenes from 40s to 46.13s?
There are two scenes from 40s to 46.13s.

Here is the updated summary of all scenes:

1.
Scene 1 (00:00 - 00:00.500000): A serene environment with a beautiful house surrounded by nature.
2.
Scene 2 (00:00.500000 - 00:03): A closer look at the house's elegant architecture and interior design.
3.
Scene 3 (00:03 - 00:06): Introduction of the "Hello Copilot" feature on a gradient background.
4.
Scene 4 (00:06 - 00:09): The "Inspire me" feature introduced, suggesting creativity aids.
5.
Scene 5 (00:09 - 00:12): A desktop view showing a picturesque landscape as the wallpaper.
6.
Scene 6 (00:12 - 00:15): Copilot's interface is shown with colorful and interactive designs.
7.
Scene 7 (00:15 - 00:18.500000): A transition with a gradient background.
8.
Scene 8 (00:18.500000 - 00:22): A closer look at the texture and materials of the Copi