In [None]:
!pip install opencv-python torchvision torchaudio ultralytics


Collecting ultralytics
  Downloading ultralytics-8.2.91-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.9/41.9 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.6-py3-none-any.whl.metadata (9.1 kB)
Downloading ultralytics-8.2.91-py3-none-any.whl (871 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m871.8/871.8 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ultralytics_thop-2.0.6-py3-none-any.whl (26 kB)
Installing collected packages: ultralytics-thop, ultralytics
Successfully installed ultralytics-8.2.91 ultralytics-thop-2.0.6


In [None]:
!pip install transformers




In [None]:
import cv2
import torch
from transformers import BartTokenizer, BartForConditionalGeneration
import json
from PIL import Image
import io

# Object detector: YOLOv8, medium checkpoint
from ultralytics import YOLO
yolo_model = YOLO('yolov8m.pt')

# Label lookup table: the entry at position i is the label reported for YOLO class id i,
# so the order below must stay aligned with the class ids of the loaded model.
class_names = [
    # people and vehicles
    'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat',
    # street objects
    'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench',
    # animals
    'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe',
    # accessories
    'backpack', 'umbrella', 'handbag', 'tie', 'suitcase',
    # sports equipment
    'frisbee', 'skis', 'snowboard', 'sports ball', 'kite',
    'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
    # tableware
    'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
    # food
    'banana', 'apple', 'sandwich', 'orange', 'broccoli',
    'carrot', 'hot dog', 'pizza', 'donut', 'cake',
    # furniture
    'chair', 'couch', 'potted plant', 'bed', 'dining table', 'toilet',
    # electronics
    'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
    # appliances
    'microwave', 'oven', 'toaster', 'sink', 'refrigerator',
    # household items
    'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush',
]

# Text generator: BART-large tokenizer and conditional-generation model
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
bart_model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')

def generate_description(detections, image):
    """Generate a short text description of one video frame with BART.

    NOTE: a later cell in this notebook defines another ``generate_description``
    (GPT-2 based, different parameters); once that cell runs it shadows this one.

    Args:
        detections: JSON-serialisable detection records for the frame (the
            detection loop in this notebook builds dicts with 'bbox',
            'confidence' and 'class' keys).
        image: The frame image. Only its ``size`` attribute is read, and only
            when it is a ``(width, height)`` pair (as on a PIL image); any
            other value, including ``None``, is ignored.

    Returns:
        str: The text decoded from the first generated sequence.
    """
    # BART only understands text. The previous implementation pasted the repr of
    # the raw JPEG bytes into the prompt, which filled the 1024-token budget with
    # noise and pushed the detections out through truncation. Only textual facts
    # about the frame are included now.
    prompt_lines = []
    frame_size = getattr(image, "size", None)
    if isinstance(frame_size, tuple) and len(frame_size) == 2:
        prompt_lines.append(f"Frame size: {frame_size[0]}x{frame_size[1]}")
    prompt_lines.append(f"Detections: {json.dumps(detections)}")
    description_input = "\n".join(prompt_lines)

    # Tokenize and generate description
    inputs = tokenizer(description_input, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = bart_model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],  # pass the mask explicitly alongside the ids
        max_length=150,
        min_length=30,
        length_penalty=2.0,
        num_beams=4,
    )
    description = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    return description

# Open video file
video_path = '/content/The Performance-oriented ŠKODA SLAVIA 1.5 L TSI - A Class of its Own (1).mp4'
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    # Fail loudly: otherwise the loop below never runs and an empty
    # detections.json is written as if the video had no objects.
    raise IOError(f"Could not open video file: {video_path}")

# Get video properties
fps = cap.get(cv2.CAP_PROP_FPS)
# Extract ~1 frame per second. Clamp to at least 1: when the reported FPS is 0
# or below 1, int(fps) is 0 and `frame_count % frame_interval` would raise
# ZeroDivisionError.
frame_interval = max(1, int(fps))

print(f"Video FPS: {fps}")
print(f"Frame Interval: {frame_interval}")

# Minimum confidence a detection needs to be kept
CONFIDENCE_THRESHOLD = 0.7

# Lists to store frames and detections
frame_list = []
detections_list = []

# 1-based index of the sampled frames (what ends up in the JSON)
sequential_frame_count = 1

# 0-based index of every decoded frame (used only for sampling)
frame_count = 0

try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            print("End of video or error reading frame.")
            break

        # Extract 1 frame per second
        if frame_count % frame_interval == 0:
            frame_list.append(frame)

            # Run detection on the original frame. Ultralytics resizes/letterboxes
            # to the model input size itself and reports boxes in the coordinates
            # of the image it was given, so a manual cv2.resize to 640x640 only
            # distorts the aspect ratio and yields boxes that no longer match the
            # frames stored in frame_list.
            results = yolo_model(frame)

            # Keep detections at or above the confidence threshold
            frame_detections = []
            boxes = results[0].boxes
            for bbox, confidence, class_id in zip(
                boxes.xyxy.tolist(), boxes.conf.tolist(), boxes.cls.tolist()
            ):
                if confidence < CONFIDENCE_THRESHOLD:
                    continue
                class_id = int(class_id)
                class_name = class_names[class_id] if class_id < len(class_names) else 'unknown'
                frame_detections.append({
                    'bbox': bbox,  # [x1, y1, x2, y2] corner coordinates (xyxy)
                    'confidence': confidence,
                    'class': class_name
                })

            # Per-frame captioning is currently disabled. To re-enable it, convert
            # the frame with Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            # and pass it, together with frame_detections, to generate_description.
            print(f"Frame {sequential_frame_count}")

            # Store detections with sequential frame numbering
            detections_list.append({
                'frame': sequential_frame_count,
                'detections': frame_detections,
            })

            # Increment the sequential frame counter
            sequential_frame_count += 1

        frame_count += 1
finally:
    # Release the capture handle even if detection raises part-way through
    cap.release()

# Save detections to JSON file
with open('detections.json', 'w') as f:
    json.dump(detections_list, f, indent=4)

print("Detections saved to detections.json")


Video FPS: 25.0
Frame Interval: 25

0: 640x640 (no detections), 37.3ms
Speed: 2.2ms preprocess, 37.3ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 640)
Frame 1

0: 640x640 1 scissors, 37.4ms
Speed: 2.8ms preprocess, 37.4ms inference, 1.9ms postprocess per image at shape (1, 3, 640, 640)
Frame 2

0: 640x640 1 scissors, 37.4ms
Speed: 2.3ms preprocess, 37.4ms inference, 1.4ms postprocess per image at shape (1, 3, 640, 640)
Frame 3

0: 640x640 1 car, 37.3ms
Speed: 3.5ms preprocess, 37.3ms inference, 1.4ms postprocess per image at shape (1, 3, 640, 640)
Frame 4

0: 640x640 1 car, 37.3ms
Speed: 3.5ms preprocess, 37.3ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 640)
Frame 5

0: 640x640 1 truck, 37.3ms
Speed: 3.1ms preprocess, 37.3ms inference, 1.4ms postprocess per image at shape (1, 3, 640, 640)
Frame 6

0: 640x640 1 person, 1 car, 37.3ms
Speed: 2.6ms preprocess, 37.3ms inference, 1.7ms postprocess per image at shape (1, 3, 640, 640)
Frame 7

0: 640x640 1 pe

In [None]:
!pip install -U openai-whisper

Collecting openai-whisper
  Downloading openai-whisper-20231117.tar.gz (798 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/798.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m266.2/798.6 kB[0m [31m7.6 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m788.5/798.6 kB[0m [31m14.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m798.6/798.6 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting triton<3,>=2.0.0 (from openai-whisper)
  Downloading triton-2.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Collecting tiktoken (from openai-whisper)
  Downloading tiktoken-0.7.0-cp310-cp

In [None]:
import whisper

def transcribe_audio(audio_path, output_file, model_name="medium"):
    """Transcribe an audio file with Whisper and write timestamped segments to a text file.

    Each segment becomes one line of the form ``[start - end] text`` with both
    timestamps formatted to two decimals.

    Args:
        audio_path: Path of the audio file handed to Whisper.
        output_file: Path of the text file to (over)write.
        model_name: Whisper model name passed to ``whisper.load_model``.
            Defaults to "medium", the size that was previously hard-coded.

    Returns:
        None. The transcript is written to ``output_file``.
    """
    model = whisper.load_model(model_name)
    result = model.transcribe(audio_path, language="en", temperature=0.6, verbose=True)

    # Write as UTF-8 explicitly: transcripts can contain non-ASCII text
    # (e.g. "ŠKODA"), which raises UnicodeEncodeError when the platform's
    # default encoding cannot represent it.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(
            f"[{segment['start']:.2f} - {segment['end']:.2f}] {segment['text']}\n"
            for segment in result['segments']
        )

# Example usage: transcribe the audio track into a timestamped text file
audio_path, output_file = (
    "/content/audio file.m4a",
    "/content/transcription_with_timestamps.txt",
)
transcribe_audio(audio_path, output_file)


100%|█████████████████████████████████████| 1.42G/1.42G [00:20<00:00, 76.3MiB/s]
  checkpoint = torch.load(fp, map_location=device)


[00:00.000 --> 00:04.200]  There are many routes one can take in life.
[00:04.200 --> 00:07.640]  Sometimes we take a beautiful
[00:07.640 --> 00:10.920]  but measured journey.
[00:10.920 --> 00:14.920]  But when life is full of choices...
[00:14.920 --> 00:21.920]  sometimes you just let the heart take over.
[00:30.000 --> 00:45.920]  When exhilaration drives you to aim higher.
[00:45.920 --> 00:51.360]  When emotion overtakes all else.
[00:51.360 --> 00:52.880]  ŠKODA SLAVIA 1.5 TSI


In [None]:
import json
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

# Choose the compute device first: CUDA when available, CPU otherwise
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# GPT-2 tokenizer and language model, with the model moved onto that device.
# NOTE: this rebinds `tokenizer`, which an earlier cell bound to the BART tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt2_model = GPT2LMHeadModel.from_pretrained('gpt2').to(device)

def clean_input(input_text):
    """Reduce text to printable ASCII while keeping its line structure.

    Compared with a plain "keep chr(32)..chr(126)" filter, this:
      * keeps newlines and tabs, so multi-line summaries/transcripts are not
        collapsed into a single run-on line;
      * transliterates accented letters to their base letter instead of
        dropping them (e.g. "ŠKODA" becomes "SKODA" rather than "KODA").

    Args:
        input_text (str): Text to clean.

    Returns:
        str: The text containing only characters from space to tilde (~),
        plus newline and tab.
    """
    import unicodedata  # local import so this cell does not rely on another cell's imports

    # NFKD splits an accented letter into its base letter plus combining marks;
    # the ASCII filter below then keeps the base letter and discards the marks.
    decomposed = unicodedata.normalize('NFKD', input_text)
    return ''.join(
        ch for ch in decomposed
        if ' ' <= ch <= '~' or ch in '\n\t'
    )

def generate_description(detections_summary, transcribed_text):
    """Generate a free-text description of the video with GPT-2.

    Builds one prompt from the per-frame detection summary and the timestamped
    transcript (both passed through ``clean_input``), samples a continuation
    from GPT-2, and returns only the newly generated text.

    NOTE: this definition shadows the BART-based ``generate_description``
    defined in an earlier cell of this notebook.

    Args:
        detections_summary (str): One line per frame listing detected objects
            and their bounding boxes.
        transcribed_text (str): Transcript text with timestamps.

    Returns:
        str: The generated continuation, without the echoed prompt.
    """
    max_new_tokens = 200

    # Each section sits on its own lines; previously the pieces were glued
    # together without any separator, so sentences and data ran into each other.
    description_input = (
        "Detections of the objects in the commercial TV ad video are given along with their coordinates:\n"
        + str(clean_input(detections_summary))
        + "\nAdditionally, this is the transcript of the video with the timestamps:\n"
        + str(clean_input(transcribed_text))
        + "\n### Based on this information, generate a coherent and detailed description of the video content, "
        + "emphasizing the visual elements and how they relate to the spoken content.\n"
    )

    # Debug: Print the final input to check formatting
    print("Description Input:\n", description_input)

    # The prompt and the generated tokens share the model's fixed context window,
    # so the prompt budget is the window size minus the tokens to be generated.
    # Truncating the prompt to the full window (1024) and then asking for 200
    # more tokens would run past the model's position limit.
    # NOTE(review): truncation keeps the beginning of the prompt, so for very long
    # inputs the trailing instruction may still be cut off.
    context_window = getattr(gpt2_model.config, "n_positions", 1024)
    max_prompt_tokens = max(1, context_window - max_new_tokens)
    inputs = tokenizer(
        description_input,
        return_tensors="pt",
        max_length=max_prompt_tokens,
        truncation=True,
    ).to(device)
    prompt_length = inputs["input_ids"].shape[1]

    outputs = gpt2_model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],  # avoids the "attention mask ... not set" warning
        pad_token_id=tokenizer.eos_token_id,      # GPT-2 has no pad token; reuse EOS explicitly
        max_new_tokens=max_new_tokens,            # limits generated tokens only, not the prompt
        min_length=150,                           # counts prompt + generated tokens
        no_repeat_ngram_size=2,
        temperature=0.8,
        do_sample=True,
        top_k=50,
        top_p=0.85,
    )

    # Decode only the tokens after the prompt so the result is the description
    # itself rather than the prompt followed by the description.
    description = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return description.strip()

def process_detections_file(detections_file_path, transcribed_audio_file_path):
    """Build a whole-video description from saved detections and a transcript.

    Reads the detections JSON (a list of ``{'frame': ..., 'detections': [...]}``
    entries) and the transcript text, formats the detections as one
    ``Frame N: class at (a, b, c, d); ...`` line per frame that has at least one
    usable detection, prints both for inspection, and passes them to
    ``generate_description``.

    Args:
        detections_file_path: Path to the detections JSON file.
        transcribed_audio_file_path: Path to the transcript text file.

    Returns:
        Whatever ``generate_description(detections_summary, transcribed_text)`` returns.

    Raises:
        KeyError: If an entry in the detections list has no 'frame' key.
    """
    # Read both inputs as UTF-8 rather than the platform default encoding.
    with open(detections_file_path, 'r', encoding='utf-8') as f:
        detections_list = json.load(f)

    # errors='replace': the transcript may have been written with a different
    # encoding; undecodable bytes become U+FFFD instead of aborting the run.
    with open(transcribed_audio_file_path, 'r', encoding='utf-8', errors='replace') as f:
        transcribed_text = f.read()

    # Sort the detection list by frame number to ensure sequential order
    detections_list = sorted(detections_list, key=lambda entry: entry['frame'])

    # Collect "class at (bbox)" strings per frame. Entries sharing a frame number
    # are merged; assigning per frame would silently discard the earlier ones.
    frame_objects = {}
    for entry in detections_list:
        objects_in_frame = []
        for detection in entry.get('detections', []):
            if 'class' not in detection or 'bbox' not in detection:
                continue
            bbox = detection['bbox']
            if len(bbox) != 4:  # only 4-value boxes can be formatted
                continue
            bbox_str = f"({bbox[0]:.1f}, {bbox[1]:.1f}, {bbox[2]:.1f}, {bbox[3]:.1f})"
            objects_in_frame.append(f"{detection['class']} at {bbox_str}")

        # Frames without any usable detection are left out of the summary
        if objects_in_frame:
            frame_objects.setdefault(entry.get('frame'), []).extend(objects_in_frame)

    # One "Frame N: obj; obj" line per frame, each terminated by a newline
    detections_summary = "".join(
        f"Frame {frame}: {'; '.join(objects)}\n"
        for frame, objects in frame_objects.items()
    )

    # Debug: Print formatted summaries to check if they are correct
    print("Detections Summary:\n", detections_summary)
    print("Transcribed Text:\n", transcribed_text)

    # Generate a single description for the whole video including transcribed audio
    description = generate_description(detections_summary, transcribed_text)

    return description

# Example usage: describe the video from the detections JSON and the transcript file.
# Both paths are relative, so they are resolved against the current working directory.
detections_file_path, transcribed_audio_file_path = (
    'detections.json',
    'transcription_with_timestamps.txt',
)
video_description = process_detections_file(detections_file_path, transcribed_audio_file_path)
print("Video Explanation: {}".format(video_description))


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Detections Summary:
 Frame 4: car at (36.8, 146.4, 157.4, 267.7)
Frame 5: car at (94.7, 150.7, 220.0, 282.7)
Frame 7: car at (287.3, 192.9, 455.1, 430.0)
Frame 8: car at (304.8, 221.8, 444.2, 418.0); person at (377.3, 219.3, 382.7, 234.5)
Frame 9: car at (289.4, 238.6, 423.5, 406.1); car at (169.8, 342.6, 224.5, 405.6)
Frame 10: car at (174.7, 327.7, 256.7, 411.2)
Frame 11: car at (192.3, 296.1, 337.0, 422.1)
Frame 16: car at (55.4, 239.0, 460.2, 479.5); car at (443.8, 334.5, 562.1, 439.4)
Frame 18: vase at (204.5, 83.4, 402.9, 397.4)
Frame 19: person at (156.9, 83.8, 441.7, 551.2)
Frame 24: person at (166.9, 85.7, 637.7, 546.1)
Frame 32: person at (267.2, 139.9, 569.8, 553.1)
Frame 36: car at (159.4, 235.8, 603.2, 500.9)
Frame 38: car at (206.2, 193.8, 526.9, 505.6)
Frame 45: truck at (189.5, 255.7, 251.9, 321.5)
Frame 48: car at (259.1, 135.0, 336.2, 196.0)
Frame 50: car at (18.6, 113.7, 441.9, 370.4)
Frame 51: car at (0.7, 84.5, 160.8, 530.6)
Frame 53: car at (216.6, 279.3, 393.8, 3