In [None]:
import requests
import base64
from PIL import Image
from io import BytesIO
import os

# --- Configuration -----------------------------------------------------------
INFERENCE_ENDPOINT = "https://infer.roboflow.com"

# Never commit credentials to a notebook: the key is read from the environment.
# Export ROBOFLOW_API_KEY before starting the kernel.
API_KEY = os.environ.get("ROBOFLOW_API_KEY", "")
if not API_KEY:
    print("Warning: ROBOFLOW_API_KEY is not set; set it before calling the inference endpoint.")

VIDEO = "./video.mov"

# Candidate labels sent to the CLIP comparison endpoint for every sampled frame.
prompts = [
    "person",
    "something else"
]

# The label whose presence in the video is reported by the summary cell.
ACTIVE_PROMPT = "person"
assert ACTIVE_PROMPT in prompts, "ACTIVE_PROMPT must be one of the candidate prompts"

In [None]:
def classify_image(image) -> str:
    """Return the entry of ``prompts`` that the CLIP endpoint scores highest for a frame.

    The frame is JPEG-encoded, base64-wrapped and posted to the
    ``/clip/compare`` route of ``INFERENCE_ENDPOINT``. The ``"similarity"``
    list in the JSON response is read as one score per entry of ``prompts``.

    Args:
        image: Array accepted by ``PIL.Image.fromarray`` (the notebook passes
            video frames here).

    Returns:
        The element of the module-level ``prompts`` list with the highest
        similarity score (the first one on ties).

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
        ValueError: If the response contains no similarity scores.
    """
    # NOTE(review): frames decoded through OpenCV are usually BGR while PIL
    # assumes RGB -- confirm whether the channels need swapping here.
    buffer = BytesIO()
    Image.fromarray(image).save(buffer, format="JPEG")
    encoded_image = base64.b64encode(buffer.getvalue()).decode("utf-8")

    payload = {
        "api_key": API_KEY,
        "subject": {
            "type": "base64",
            "value": encoded_image
        },
        "prompt": prompts,
    }

    # A timeout keeps a stalled connection from hanging the whole frame loop.
    reply = requests.post(
        INFERENCE_ENDPOINT + "/clip/compare?api_key=" + API_KEY,
        json=payload,
        timeout=30,
    )
    # Surface HTTP errors (bad key, rate limit, ...) instead of failing later
    # with an unrelated KeyError on the response body.
    reply.raise_for_status()

    similarities = reply.json()["similarity"]
    if not similarities:
        raise ValueError("CLIP response contained no similarity scores")

    # True arg-max: similarity scores may all be negative, so the search must
    # not start from a hard-coded 0 baseline.
    best_index = max(range(len(similarities)), key=similarities.__getitem__)

    return prompts[best_index]

In [None]:
import supervision as sv

# Classify every FRAME_STRIDE-th frame of the video.
FRAME_STRIDE = 10
# Used only when the container does not report a frame rate.
FALLBACK_FPS = 24

# Take the frame rate from the file itself rather than assuming one, so the
# timestamps below are right for any input video.
video_fps = sv.VideoInfo.from_video_path(VIDEO).fps or FALLBACK_FPS
# Each classified sample stands for this many seconds of footage.
seconds_per_sample = FRAME_STRIDE / video_fps

results = []

for i, frame in enumerate(sv.get_video_frames_generator(source_path=VIDEO, stride=FRAME_STRIDE)):
    print("Frame", i)
    results.append(classify_image(frame))

# Approximate duration (in seconds) covered by the sampled frames.
video_length = len(results) * seconds_per_sample

prompt_found = ACTIVE_PROMPT in results

print(f"Does this video contain a {ACTIVE_PROMPT}?", "yes" if prompt_found else "no")

if prompt_found:
    first_sample = results.index(ACTIVE_PROMPT)
    print(f"When does the {ACTIVE_PROMPT} first appear?", round(first_sample * seconds_per_sample, 0), "seconds")

print(f"For how long is the {ACTIVE_PROMPT} visible?", round(results.count(ACTIVE_PROMPT) * seconds_per_sample, 0), "seconds")

Frame 0
Frame 1
Frame 2
Frame 3
Frame 4
Frame 5
Frame 6
Frame 7
Frame 8
Frame 9
Frame 10
Frame 11
Frame 12
Frame 13
Frame 14
Frame 15
Frame 16
Frame 17
Frame 18
Frame 19
Frame 20
Frame 21
Frame 22
Frame 23
Frame 24
Frame 25
Frame 26
Frame 27
Frame 28
Frame 29
Frame 30
Frame 31
Frame 32
Frame 33
Frame 34
Frame 35
Frame 36
Frame 37
Frame 38
Frame 39
Frame 40
Frame 41
Frame 42
Frame 43
Frame 44
Does this video contain a person? yes
When does the person first appear? 0.0 seconds
For how long is the person visible? 16.0 seconds


# Video analysis using HF transformers

In [None]:
!pip install opencv-python-headless pytube
!pip install moviepy
!pip install pytube





In [None]:

! pip install transformers -q

In [None]:
import logging
import warnings

# Keep the notebook output clean: mute log records at WARNING level and below,
# and drop every Python warning.
logging.disable(logging.WARNING)
warnings.simplefilter("ignore")

In [None]:

from transformers import pipeline

# Build the image-to-text pipeline; no model is named, so the library's
# default checkpoint for this task is used. Later cells call `caption(path)`.
caption = pipeline(task="image-to-text")


In [None]:
import os
from pytube import YouTube
import imageio
from moviepy.editor import VideoFileClip

# Define the YouTube video URL
youtube_url = "https://www.youtube.com/watch?v=YLslsZuEaNE"

# Define the folder where you want to store the video
output_folder = "/content"

# Create the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Download the YouTube video
yt = YouTube(youtube_url)
stream = yt.streams.get_highest_resolution()

# download() returns the path it actually wrote. Keep that value instead of
# guessing the name: the file is saved under the bare title (no ".mp4"
# suffix), so a hand-built "<title>.mp4" path would not exist on disk.
video_file = stream.download(output_path=output_folder, filename=yt.title)

# Show the saved path as the cell output.
video_file


'/content/1 Minute Video - Doggie'

In [None]:
import os
import shutil
from pytube import YouTube
import imageio
from moviepy.editor import VideoFileClip

# Path of the downloaded video (saved without a file extension).
video_file = "/content/1 Minute Video - Doggie"
# Derive the working folder from the video path so this cell does not depend
# on a variable left behind by another cell.
output_folder = os.path.dirname(video_file)
# Sampling rate used when turning the clip into still images.
EXTRACT_FPS = 30

# Verify if the video file exists after download
if os.path.exists(video_file):
    print(f"Video download successful: {video_file}")

    # Create a folder to store frames
    frames_folder = os.path.join(output_folder, 'video_frames')
    os.makedirs(frames_folder, exist_ok=True)

    # Number frames from 1 so that the file names (frame_0001.jpg, ...) and
    # the reported total both match the number of images actually written.
    frame_count = 0
    clip = VideoFileClip(video_file)
    try:
        for frame_count, frame in enumerate(clip.iter_frames(fps=EXTRACT_FPS), start=1):
            image_file = os.path.join(frames_folder, f"frame_{frame_count:04d}.jpg")
            imageio.imwrite(image_file, frame)
    finally:
        # Release the reader held by the clip even if a write fails.
        clip.close()

    # Print a message when the conversion is complete
    print(f"Video conversion complete. {frame_count} frames saved as images in {frames_folder}")

    # Bundle the images into <frames_folder>.zip for easy download
    shutil.make_archive(frames_folder, 'zip', frames_folder)
else:
    # Make a missing download visible instead of silently doing nothing.
    print(f"Video file not found: {video_file}")


Video download successful: /content/1 Minute Video - Doggie
Video conversion complete. 1802 frames saved as images in /content/video_frames


In [None]:
# Now, process the images with the caption pipeline and save the results
import os
import shutil
from pytube import YouTube
import imageio
from moviepy.editor import VideoFileClip

output_folder = "/content"
frames_folder = os.path.join(output_folder, 'video_frames')
caption_output_folder = os.path.join(output_folder, 'caption_output')
os.makedirs(caption_output_folder, exist_ok=True)

# Maximum number of frames to caption.
frame_count = 20

# Work from the frame files that actually exist (zero-padded names sort in
# frame order) rather than assuming the numbering starts at 1.
frame_files = sorted(
    name for name in os.listdir(frames_folder)
    if name.startswith("frame_") and name.endswith(".jpg")
)[:frame_count]

for frame_file in frame_files:
    input_image = os.path.join(frames_folder, frame_file)
    frame_stem = os.path.splitext(frame_file)[0]
    output_caption_file = os.path.join(caption_output_folder, f"caption_{frame_stem}.txt")

    # Call the caption pipeline to generate captions from images
    caption_result = caption(input_image)
    print(caption_result)

    # The pipeline returns a list of dicts whose text lives under
    # 'generated_text' (see the printed results); no key contains 'caption'.
    caption_text = caption_result[0].get("generated_text", "").strip() if caption_result else ""

    # Save the caption text to a text file
    if caption_text:
        with open(output_caption_file, 'w', encoding="utf-8") as caption_file:
            caption_file.write(caption_text)

# Bundle the caption files into <caption_output_folder>.zip; the returned
# archive path is shown as the cell output and the zip can be downloaded
# using the Colab interface.
shutil.make_archive(caption_output_folder, 'zip', caption_output_folder)



[{'generated_text': 'a dog is playing with a ball in a yard '}]
[{'generated_text': 'a dog is playing with a ball in a yard '}]
[{'generated_text': 'a dog is playing with a frisbee in a yard '}]
[{'generated_text': 'a dog is playing with a frisbee in a yard '}]
[{'generated_text': 'a dog is playing with a frisbee in the yard '}]
[{'generated_text': 'a dog playing with a frisbee in a yard '}]
[{'generated_text': 'a small dog playing with a frisbee in a yard '}]
[{'generated_text': 'a small dog playing with a ball in a yard '}]
[{'generated_text': 'a small dog playing with a frisbee in a yard '}]
[{'generated_text': 'a dog is playing with a ball in the yard '}]
[{'generated_text': 'a dog is playing with a frisbee in the yard '}]
[{'generated_text': 'a dog is playing with a frisbee in the yard '}]
[{'generated_text': 'a dog is playing with a frisbee in the yard '}]
[{'generated_text': 'a dog is playing with a frisbee in the yard '}]
[{'generated_text': 'a small dog standing in the grass w

'/content/caption_output.zip'