In [5]:
import cv2
import numpy as np
import pandas as pd
import torch
import os
# from torchvision.models.detection import maskrcnn_resnet50_fpn
# from torchvision.transforms import functional as F
from ultralytics import YOLO
import sys

In [2]:
# Load the YOLO segmentation weights from "yolov8n-seg.pt" once for the whole notebook.
# Later cells call `model(object_img, conf=0.5)` and read `results[0].masks`, so this
# global must still refer to the YOLO model when those cells run.
model = YOLO("yolov8n-seg.pt")

In [3]:
def extract_objects_from_video(csv_file, video_file, object_id, output_folder):
    """Crop one tracked object out of every frame it appears in and save the crops.

    Parameters
    ----------
    csv_file : str
        Path to a CSV that has the columns ``ID``, ``Frame``, ``x1``, ``y1``,
        ``x2`` and ``y2`` (one row per object per frame).
    video_file : str
        Path of the video the bounding boxes refer to.
    object_id
        Value compared against the ``ID`` column to select the rows to extract.
    output_folder : str
        Directory that receives the crops; it is created when missing.

    Notes
    -----
    Each crop is written to
    ``{output_folder}/object_{object_id}_frame_{frame_number}.jpg`` where
    ``frame_number`` is formatted exactly as pandas yields it from the row.
    Other cells of this notebook load ``..._frame_{n}.0.jpg``, so the format is
    intentionally left untouched here.

    Frames that cannot be read, and boxes that are empty after clamping to the
    frame, are reported with ``print`` and skipped.
    """
    # Read the CSV file and keep only the rows of the requested object
    df = pd.read_csv(csv_file)
    df_object = df[df['ID'] == object_id]

    # Create the output folder if it doesn't exist (no race between check and create)
    os.makedirs(output_folder, exist_ok=True)

    # Open the video file
    cap = cv2.VideoCapture(video_file)
    if not cap.isOpened():
        # Report once instead of once per row, which is what a dead capture would cause
        print(f"Could not open video file {video_file}.")
        cap.release()
        return

    try:
        for index, row in df_object.iterrows():
            frame_number = row['Frame']
            x1, y1, x2, y2 = int(row['x1']), int(row['y1']), int(row['x2']), int(row['y2'])

            # Set the video frame to the correct frame number
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number)
            ret, frame = cap.read()

            if not ret:
                print(f"Could not read frame {frame_number} from the video.")
                continue

            # Clamp the box to the image: a negative start index would otherwise
            # be interpreted by NumPy as "count from the end" and yield a wrong crop
            frame_height, frame_width = frame.shape[:2]
            x1, y1 = max(x1, 0), max(y1, 0)
            x2, y2 = min(x2, frame_width), min(y2, frame_height)

            if x2 <= x1 or y2 <= y1:
                # cv2.imwrite raises on an empty image, so skip degenerate boxes
                print(f"Skipping frame {frame_number} due to invalid bounding box dimensions.")
                continue

            # Crop the object using the bounding box coordinates
            cropped_object = frame[y1:y2, x1:x2]

            # Save the cropped object as an image
            output_filename = f"{output_folder}/object_{object_id}_frame_{frame_number}.jpg"
            if not cv2.imwrite(output_filename, cropped_object):
                print(f"Could not write {output_filename}.")
    finally:
        # Release the video capture even when a frame fails unexpectedly
        cap.release()

In [6]:
# Inputs for one extraction run: the tracking CSV, the source video, the object ID
# to crop, and the directory that receives the crops.
csv_file = 'vid7.csv'
video_file = 'E:/Dataset_project/Crop_videos/vid7_27_7_FaisalTown.mp4'
object_id = 4  # Specify the object ID you want to extract
# This is the same folder the multi-object cell maps to ID 4 in `object_folders`.
output_folder = 'E:/Dataset_project/extracted_objects_4'
# Writes one JPEG per row of the CSV whose ID equals `object_id`.
extract_objects_from_video(csv_file, video_file, object_id, output_folder)

In [None]:
# Rebuild a video of object ID 1 by pasting its saved crops onto the static
# average frame, one output frame per tracked frame number.

# Load the CSV file
data = pd.read_csv("vid7.csv")
# Filter for only ID = 1
object_data = data[data['ID'] == 1]

# Load the average frame; cv2.imread returns None (it does not raise) on a bad path
average_frame = cv2.imread("E:/Dataset_project/Summarization_videos/average_frame.jpg")
if average_frame is None:
    raise FileNotFoundError("Could not read E:/Dataset_project/Summarization_videos/average_frame.jpg")

# Define video properties
frame_height, frame_width, _ = average_frame.shape
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
output_video = cv2.VideoWriter('E:/Dataset_project/Track_on_summarize_video/output_video.mp4', fourcc, 30, (frame_width, frame_height))
if not output_video.isOpened():
    # Without this check every write() is silently dropped and the success
    # message below would be printed for a video that was never created
    raise IOError("Could not open E:/Dataset_project/Track_on_summarize_video/output_video.mp4 for writing")

try:
    # Iterate over each unique frame number for ID = 1
    for frame_number in sorted(object_data['Frame'].unique()):
        # Copy the average frame to start fresh for each frame
        frame = average_frame.copy()

        # Get bounding box data for the specific frame
        bbox = object_data[object_data['Frame'] == frame_number].iloc[0]
        x1, y1, x2, y2 = int(bbox['x1']), int(bbox['y1']), int(bbox['x2']), int(bbox['y2'])

        # Clamp the box to the frame so the pasted patch and the target slice
        # always have the same shape
        x1, y1 = max(x1, 0), max(y1, 0)
        x2, y2 = min(x2, frame_width), min(y2, frame_height)

        # Same validity check as the car-only cell: cv2.resize rejects empty sizes
        if x2 - x1 <= 0 or y2 - y1 <= 0:
            print(f"Skipping frame {frame_number} due to invalid bounding box dimensions.")
            continue

        # Load the extracted object image for this frame
        object_img_path = f"E:/Dataset_project/extracted_objects/object_1_frame_{frame_number}.0.jpg"
        object_img = cv2.imread(object_img_path)
        if object_img is None:
            print(f"Skipping frame {frame_number}: could not read {object_img_path}")
            continue

        # Resize object image to fit the bounding box size
        object_img_resized = cv2.resize(object_img, (x2 - x1, y2 - y1))

        # Paste the resized object image over the bounding box area (the whole
        # area is overwritten, so no separate clearing step is needed)
        frame[y1:y2, x1:x2] = object_img_resized

        # Write the frame to the video
        output_video.write(frame)
finally:
    # Release the video writer even if a frame fails, so the file is finalized
    output_video.release()
print("Video created successfully")


### Applying a YOLO Mask (Average Frame)

In [None]:
# Composite object ID 1 onto the average frame, keeping only the pixels that the
# YOLO segmentation mask marks as object and using the average frame elsewhere.
data = pd.read_csv("vid7.csv")

# Filter for only ID = 1
object_data = data[data['ID'] == 1]

# Load the average frame; cv2.imread returns None (it does not raise) on a bad path
average_frame = cv2.imread("E:/Dataset_project/Summarization_videos/average_frame.jpg")
if average_frame is None:
    raise FileNotFoundError("Could not read E:/Dataset_project/Summarization_videos/average_frame.jpg")

# Define video properties
frame_height, frame_width, _ = average_frame.shape
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
output_video = cv2.VideoWriter('E:/Dataset_project/Track_on_summarize_video/output_mask_video.mp4', fourcc, 30, (frame_width, frame_height))
if not output_video.isOpened():
    # A writer that failed to open drops every frame silently
    raise IOError("Could not open E:/Dataset_project/Track_on_summarize_video/output_mask_video.mp4 for writing")

try:
    # Iterate over each unique frame number for ID = 1
    for frame_number in sorted(object_data['Frame'].unique()):
        # Copy the average frame to start fresh for each frame
        frame = average_frame.copy()

        # Get bounding box data for the specific frame
        bbox = object_data[object_data['Frame'] == frame_number].iloc[0]
        x1, y1, x2, y2 = int(bbox['x1']), int(bbox['y1']), int(bbox['x2']), int(bbox['y2'])

        # Clamp the box to the frame so every patch below matches the target slice
        x1, y1 = max(x1, 0), max(y1, 0)
        x2, y2 = min(x2, frame_width), min(y2, frame_height)

        # Same validity check as the car-only cell: cv2.resize rejects empty sizes
        if x2 - x1 <= 0 or y2 - y1 <= 0:
            print(f"Skipping frame {frame_number} due to invalid bounding box dimensions.")
            continue

        # Load the extracted object image for this frame
        object_img_path = f"E:/Dataset_project/extracted_objects/object_1_frame_{frame_number}.0.jpg"
        object_img = cv2.imread(object_img_path)
        if object_img is None:
            print(f"Skipping frame {frame_number}: could not read {object_img_path}")
            continue

        # Clear the bounding box area in the current frame.
        # NOTE(review): when no mask is detected below, this black box is what
        # ends up in the video - confirm that is intended.
        frame[y1:y2, x1:x2] = 0

        # Run YOLOv8 segmentation on the object image
        results = model(object_img, conf=0.5)

        # Get the segmentation mask
        masks = results[0].masks

        if masks is not None and len(masks) > 0:
            # Take the first mask; .cpu() is required because .numpy() fails on
            # a tensor that lives on a CUDA device
            mask = masks.data[0].cpu().numpy().astype(np.uint8)

            # Resize the binary mask to match the bounding box size
            mask_resized = cv2.resize(mask, (x2 - x1, y2 - y1), interpolation=cv2.INTER_NEAREST)

            # Resize object image to fit the bounding box size
            object_img_resized = cv2.resize(object_img, (x2 - x1, y2 - y1))

            # Scale the mask so it is 0 or 255
            mask_resized = (mask_resized * 255).astype(np.uint8)

            # Region of the average frame that fills the non-object pixels
            background_region = average_frame[y1:y2, x1:x2].copy()

            # Invert the mask to get non-detected areas as 255 (white)
            inverse_mask = cv2.bitwise_not(mask_resized)

            # Keep the background only where the object was not detected
            non_detected_areas = cv2.bitwise_and(background_region, background_region, mask=inverse_mask)

            # Keep the object only where the mask is set
            object_img_masked = cv2.bitwise_and(object_img_resized, object_img_resized, mask=mask_resized)

            # The two parts are disjoint, so adding them merges them without saturation
            object_img_masked = cv2.add(object_img_masked, non_detected_areas)

            # Overlay the composed patch onto the cleared bounding box area
            frame[y1:y2, x1:x2] = object_img_masked

        # Write the frame to the video
        output_video.write(frame)
finally:
    # Release the video writer even if a frame fails, so the file is finalized
    output_video.release()
print("Video with segmentation mask created successfully.")

### Applying Mask (Averaging of previous and next frames)

In [None]:
# Same compositing as the plain YOLO-mask cell, but a frame without a detection
# reuses the last successfully composed patch (or the plain background) instead
# of leaving the cleared bounding box black.
data = pd.read_csv("vid7.csv")

# Filter for only ID = 1
object_data = data[data['ID'] == 1]

# Load the average frame; cv2.imread returns None (it does not raise) on a bad path
average_frame = cv2.imread("E:/Dataset_project/Summarization_videos/average_frame.jpg")
if average_frame is None:
    raise FileNotFoundError("Could not read E:/Dataset_project/Summarization_videos/average_frame.jpg")

# Define video properties
frame_height, frame_width, _ = average_frame.shape
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
output_video = cv2.VideoWriter('E:/Dataset_project/Track_on_summarize_video/output_no_black_video.mp4', fourcc, 30, (frame_width, frame_height))
if not output_video.isOpened():
    # A writer that failed to open drops every frame silently
    raise IOError("Could not open E:/Dataset_project/Track_on_summarize_video/output_no_black_video.mp4 for writing")

# Define threshold for zero pixels
# NOTE(review): this value is never read in this cell - confirm whether a
# zero-pixel check was meant to use it.
zero_pixel_threshold = 50  # Adjust as needed

# Dictionary to store processed frames and buffer for missing content
output_frames = {}
last_valid_frame_content = None

try:
    # Process each frame for the given object ID
    for frame_number in sorted(object_data['Frame'].unique()):
        # Copy the average frame to start fresh for each frame
        frame = average_frame.copy()

        # Get bounding box data for the specific frame
        bbox = object_data[object_data['Frame'] == frame_number].iloc[0]
        x1, y1, x2, y2 = int(bbox['x1']), int(bbox['y1']), int(bbox['x2']), int(bbox['y2'])

        # Clamp the box to the frame so every patch below matches the target slice
        x1, y1 = max(x1, 0), max(y1, 0)
        x2, y2 = min(x2, frame_width), min(y2, frame_height)

        # Same validity check as the car-only cell: cv2.resize rejects empty sizes
        if x2 - x1 <= 0 or y2 - y1 <= 0:
            print(f"Skipping frame {frame_number} due to invalid bounding box dimensions.")
            continue

        # Load the extracted object image for this frame
        object_img_path = f"E:/Dataset_project/extracted_objects/object_1_frame_{frame_number}.0.jpg"
        object_img = cv2.imread(object_img_path)
        if object_img is None:
            print(f"Skipping frame {frame_number}: could not read {object_img_path}")
            continue

        # Clear the bounding box area in the current frame (always overwritten below)
        frame[y1:y2, x1:x2] = 0

        # Run YOLOv8 segmentation on the object image
        results = model(object_img, conf=0.5)

        # Get the segmentation mask
        masks = results[0].masks

        if masks is not None and len(masks) > 0:
            # Take the first mask; .cpu() is required because .numpy() fails on
            # a tensor that lives on a CUDA device
            mask = masks.data[0].cpu().numpy().astype(np.uint8)

            # Resize the binary mask to match the bounding box size
            mask_resized = cv2.resize(mask, (x2 - x1, y2 - y1), interpolation=cv2.INTER_NEAREST)

            # Resize object image to fit the bounding box size
            object_img_resized = cv2.resize(object_img, (x2 - x1, y2 - y1))

            # Scale the mask so it is 0 or 255
            mask_resized = (mask_resized * 255).astype(np.uint8)

            # Region of the average frame that fills the non-object pixels
            background_region = average_frame[y1:y2, x1:x2].copy()

            # Invert the mask to get non-detected areas as 255 (white)
            inverse_mask = cv2.bitwise_not(mask_resized)

            # Keep the background only where the object was not detected
            non_detected_areas = cv2.bitwise_and(background_region, background_region, mask=inverse_mask)

            # Keep the object only where the mask is set
            object_img_masked = cv2.bitwise_and(object_img_resized, object_img_resized, mask=mask_resized)

            # The two parts are disjoint, so adding them merges them without saturation
            object_img_masked = cv2.add(object_img_masked, non_detected_areas)

            # Overlay the composed patch onto the cleared bounding box area
            frame[y1:y2, x1:x2] = object_img_masked

            # Remember this patch so a later frame without a detection can reuse it
            last_valid_frame_content = frame[y1:y2, x1:x2].copy()
        elif last_valid_frame_content is not None:
            # No detection: stretch the last good patch onto the current box
            resized_content = cv2.resize(last_valid_frame_content, (x2 - x1, y2 - y1))
            frame[y1:y2, x1:x2] = resized_content
        else:
            # No detection and nothing to reuse yet: restore the background
            frame[y1:y2, x1:x2] = average_frame[y1:y2, x1:x2].copy()

        # Save the processed frame in output_frames dictionary
        output_frames[frame_number] = frame

    # Write all frames to the video
    for frame_number in sorted(output_frames.keys()):
        output_video.write(output_frames[frame_number])
finally:
    # Release the video writer even if a frame fails, so the file is finalized
    output_video.release()
print("Video with filled gaps created successfully.")

### Mask RCNN

In [None]:
# Composite object ID 1 onto the average frame using a Mask R-CNN mask instead
# of a YOLO mask.

# The notebook's top import cell has these two imports commented out, so bring
# them into scope here; otherwise this cell raises NameError on a fresh kernel.
from torchvision.models.detection import maskrcnn_resnet50_fpn
from torchvision.transforms import functional as F

# Load the CSV data and filter for ID = 1
data = pd.read_csv("vid7.csv")
object_data = data[data['ID'] == 1]

# Load the average frame; cv2.imread returns None (it does not raise) on a bad path
average_frame = cv2.imread("E:/Dataset_project/Summarization_videos/average_frame.jpg")
if average_frame is None:
    raise FileNotFoundError("Could not read E:/Dataset_project/Summarization_videos/average_frame.jpg")

# Define video properties
frame_height, frame_width, _ = average_frame.shape
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
output_video = cv2.VideoWriter('E:/Dataset_project/Track_on_summarize_video/output_maskRCNN_video.mp4', fourcc, 30, (frame_width, frame_height))
if not output_video.isOpened():
    # A writer that failed to open drops every frame silently
    raise IOError("Could not open E:/Dataset_project/Track_on_summarize_video/output_maskRCNN_video.mp4 for writing")

# Initialize Mask R-CNN model.
# It gets its own name: binding it to `model` would replace the YOLO model that
# the other cells call as `model(object_img, conf=0.5)`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
maskrcnn_model = maskrcnn_resnet50_fpn(pretrained=True).to(device)
maskrcnn_model.eval()

# Minimum detection score, and the cut-off that turns the soft mask into a binary one
confidence_threshold = 0.5
mask_binarize_threshold = 0.5

try:
    # Iterate over each unique frame number for ID = 1
    total_frames = len(object_data['Frame'].unique())
    for i, frame_number in enumerate(sorted(object_data['Frame'].unique()), 1):
        # Copy the average frame to start fresh for each frame
        frame = average_frame.copy()

        # Get bounding box data for the specific frame
        bbox = object_data[object_data['Frame'] == frame_number].iloc[0]
        x1, y1, x2, y2 = int(bbox['x1']), int(bbox['y1']), int(bbox['x2']), int(bbox['y2'])

        # Clamp the box to the frame so every patch below matches the target slice
        x1, y1 = max(x1, 0), max(y1, 0)
        x2, y2 = min(x2, frame_width), min(y2, frame_height)

        # Same validity check as the car-only cell: cv2.resize rejects empty sizes
        if x2 - x1 <= 0 or y2 - y1 <= 0:
            print(f"Skipping frame {frame_number} due to invalid bounding box dimensions.")
            continue

        # Load the extracted object image for this frame
        object_img_path = f"E:/Dataset_project/extracted_objects/object_1_frame_{frame_number}.0.jpg"
        object_img = cv2.imread(object_img_path)
        if object_img is None:
            print(f"Skipping frame {frame_number}: could not read {object_img_path}")
            continue

        # Clear the bounding box area in the current frame.
        # NOTE(review): when no mask passes the threshold below, this black box
        # is what ends up in the video - confirm that is intended.
        frame[y1:y2, x1:x2] = 0

        # Prepare the object image for Mask R-CNN input: OpenCV loads BGR while
        # the torchvision detection models expect RGB
        object_img_rgb = cv2.cvtColor(object_img, cv2.COLOR_BGR2RGB)
        img_tensor = F.to_tensor(object_img_rgb).unsqueeze(0).to(device)

        # Run Mask R-CNN on the object image
        with torch.no_grad():
            predictions = maskrcnn_model(img_tensor)

        # Filter predictions with a confidence threshold
        masks = predictions[0]['masks']
        scores = predictions[0]['scores']
        mask = None

        # Use the first mask whose score is above the threshold
        for j in range(len(scores)):
            if scores[j] > confidence_threshold:
                # The model returns per-pixel probabilities; threshold them so
                # the mask is truly binary. Otherwise partially confident pixels
                # are non-zero in both the mask and its inverse, and the object
                # and background values get added together.
                mask = (masks[j, 0] > mask_binarize_threshold).cpu().numpy().astype(np.uint8)
                break

        if mask is not None:
            # Resize the binary mask to match the bounding box size
            mask_resized = cv2.resize(mask, (x2 - x1, y2 - y1), interpolation=cv2.INTER_NEAREST)

            # Resize object image to fit the bounding box size
            object_img_resized = cv2.resize(object_img, (x2 - x1, y2 - y1))

            # Scale the mask so it is 0 or 255
            mask_resized = (mask_resized * 255).astype(np.uint8)

            # Region of the average frame that fills the non-object pixels
            background_region = average_frame[y1:y2, x1:x2].copy()

            # Invert the mask to get non-detected areas as 255 (white)
            inverse_mask = cv2.bitwise_not(mask_resized)

            # Keep the background only where the object was not detected
            non_detected_areas = cv2.bitwise_and(background_region, background_region, mask=inverse_mask)

            # Keep the object only where the mask is set
            object_img_masked = cv2.bitwise_and(object_img_resized, object_img_resized, mask=mask_resized)

            # The two parts are disjoint, so adding them merges them without saturation
            object_img_masked = cv2.add(object_img_masked, non_detected_areas)

            # Overlay the composed patch onto the cleared bounding box area
            frame[y1:y2, x1:x2] = object_img_masked

        # Write the frame to the video
        output_video.write(frame)
        # Print progress feedback
        sys.stdout.write(f"\rProcessing frame {i}/{total_frames}...")
        sys.stdout.flush()
finally:
    # Release the video writer even if a frame fails, so the file is finalized
    output_video.release()
print("\nVideo with Mask R-CNN segmentation mask created successfully.")

### YOLO with Car only

In [None]:
# Composite object ID 1 onto the average frame using only a YOLO mask whose
# predicted class is "car"; masks of other classes are ignored.
data = pd.read_csv("vid7.csv")
object_data = data[data['ID'] == 1]

# Load the average frame; cv2.imread returns None (it does not raise) on a bad path
average_frame = cv2.imread("E:/Dataset_project/Summarization_videos/average_frame.jpg")
if average_frame is None:
    raise FileNotFoundError("Could not read E:/Dataset_project/Summarization_videos/average_frame.jpg")

# Define video properties
frame_height, frame_width, _ = average_frame.shape
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
output_video = cv2.VideoWriter('E:/Dataset_project/Track_on_summarize_video/output_carmask_video.mp4', fourcc, 30, (frame_width, frame_height))
if not output_video.isOpened():
    # A writer that failed to open drops every frame silently
    raise IOError("Could not open E:/Dataset_project/Track_on_summarize_video/output_carmask_video.mp4 for writing")

# Define the class ID for car in YOLOv8
car_class_id = 2  # Update this to the correct ID for "car" in your model if different

try:
    # Iterate over each unique frame number for ID = 1
    for frame_number in sorted(object_data['Frame'].unique()):
        # Copy the average frame to start fresh for each frame
        frame = average_frame.copy()

        # Get bounding box data for the specific frame
        bbox = object_data[object_data['Frame'] == frame_number].iloc[0]
        x1, y1, x2, y2 = int(bbox['x1']), int(bbox['y1']), int(bbox['x2']), int(bbox['y2'])

        # Clamp the box to the frame so every patch below matches the target slice
        x1, y1 = max(x1, 0), max(y1, 0)
        x2, y2 = min(x2, frame_width), min(y2, frame_height)

        # Check if bounding box coordinates are valid
        if x2 - x1 <= 0 or y2 - y1 <= 0:
            print(f"Skipping frame {frame_number} due to invalid bounding box dimensions.")
            continue

        # Load the extracted object image for this frame
        object_img_path = f"E:/Dataset_project/extracted_objects/object_1_frame_{frame_number}.0.jpg"
        object_img = cv2.imread(object_img_path)
        if object_img is None:
            print(f"Skipping frame {frame_number}: could not read {object_img_path}")
            continue

        # Clear the bounding box area in the current frame.
        # NOTE(review): when detections exist but none is a car, this black box
        # is what ends up in the video - confirm that is intended.
        frame[y1:y2, x1:x2] = 0

        # Run YOLOv8 segmentation on the object image
        results = model(object_img, conf=0.5)

        # Get segmentation masks and class IDs from YOLOv8 model output
        masks = results[0].masks
        boxes = results[0].boxes

        # Without any detection the frame is dropped from the video entirely
        if masks is None or boxes is None:
            print(f"Frame {frame_number}: No detections found, skipping this frame.")
            continue

        # Filter to keep only car masks (first match wins)
        car_mask = None
        for mask, box in zip(masks, boxes):
            if int(box.cls) == car_class_id:  # Check if the class ID matches "car"
                # .cpu() is required because .numpy() fails on a CUDA tensor
                car_mask = mask.data.cpu().numpy().astype(np.uint8)
                break

        if car_mask is not None:
            # Ensure car_mask is 2D by taking the first channel if needed
            if car_mask.ndim == 3:
                car_mask = car_mask[0]

            # Debug output: mask size versus target box size
            print(f"Frame {frame_number}: Car mask shape before resizing: {car_mask.shape}")
            print(f"Frame {frame_number}: Target bounding box dimensions: {(y2 - y1, x2 - x1)}")

            # Resize the binary car mask to match the bounding box size
            mask_resized = cv2.resize(car_mask, (x2 - x1, y2 - y1), interpolation=cv2.INTER_NEAREST)

            # Resize object image to fit the bounding box size
            object_img_resized = cv2.resize(object_img, (x2 - x1, y2 - y1))

            # Scale the mask so it is 0 or 255
            mask_resized = (mask_resized * 255).astype(np.uint8)

            # Region of the average frame that fills the non-object pixels
            background_region = average_frame[y1:y2, x1:x2].copy()

            # Invert the mask to get non-detected areas as 255 (white)
            inverse_mask = cv2.bitwise_not(mask_resized)

            # Keep the background only where the car was not detected
            non_detected_areas = cv2.bitwise_and(background_region, background_region, mask=inverse_mask)

            # Keep the object only where the mask is set
            object_img_masked = cv2.bitwise_and(object_img_resized, object_img_resized, mask=mask_resized)

            # The two parts are disjoint, so adding them merges them without saturation
            object_img_masked = cv2.add(object_img_masked, non_detected_areas)

            # Overlay the composed patch onto the cleared bounding box area
            frame[y1:y2, x1:x2] = object_img_masked
        else:
            print(f"No car mask found for frame {frame_number}. Skipping mask application for this frame.")

        # Write the frame to the video
        output_video.write(frame)
        print(f"Processed frame {frame_number}")  # Indicate progress for each processed frame
finally:
    # Release the video writer even if a frame fails, so the file is finalized
    output_video.release()
print("Video with car-only segmentation mask created successfully.")

### Multiple object placing

In [None]:
# Composite several tracked objects (IDs 1 and 4) into the same output frames,
# each cut out with its first YOLO segmentation mask.
data = pd.read_csv("vid7.csv")

# Define IDs to process and corresponding folders
object_ids = [1, 4]  # Update to IDs 1 and 4
object_folders = {
    1: "E:/Dataset_project/extracted_objects/",  # Folder for ID 1
    4: "E:/Dataset_project/extracted_objects_4/",  # Folder for ID 4
}

# Load the average frame; cv2.imread returns None (it does not raise) on a bad path
average_frame = cv2.imread("E:/Dataset_project/Summarization_videos/average_frame.jpg")
if average_frame is None:
    raise FileNotFoundError("Could not read E:/Dataset_project/Summarization_videos/average_frame.jpg")

# Define video properties
frame_height, frame_width, _ = average_frame.shape
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
output_video = cv2.VideoWriter('E:/Dataset_project/Track_on_summarize_video/output_ids_1_and_4.mp4', fourcc, 30, (frame_width, frame_height))
if not output_video.isOpened():
    # A writer that failed to open drops every frame silently
    raise IOError("Could not open E:/Dataset_project/Track_on_summarize_video/output_ids_1_and_4.mp4 for writing")

# Define threshold for zero pixels
# NOTE(review): this value is never read in this cell - confirm whether a
# zero-pixel check was meant to use it.
zero_pixel_threshold = 50  # Adjust as needed

# Dictionary to store processed frames
output_frames = {}

try:
    # Process frames for all IDs
    for frame_number in sorted(data['Frame'].unique()):
        # Start with the average frame for each output frame
        combined_frame = average_frame.copy()

        for obj_id in object_ids:
            # Filter data for the current object ID and frame
            object_data = data[(data['ID'] == obj_id) & (data['Frame'] == frame_number)]

            # Skip if no data for this ID in the current frame
            if object_data.empty:
                continue

            # Get bounding box data
            bbox = object_data.iloc[0]
            x1, y1, x2, y2 = int(bbox['x1']), int(bbox['y1']), int(bbox['x2']), int(bbox['y2'])

            # Clamp the box to the frame so every patch below matches the target slice
            x1, y1 = max(x1, 0), max(y1, 0)
            x2, y2 = min(x2, frame_width), min(y2, frame_height)

            # Same validity check as the car-only cell: cv2.resize rejects empty sizes
            if x2 - x1 <= 0 or y2 - y1 <= 0:
                print(f"Skipping ID {obj_id} in frame {frame_number} due to invalid bounding box dimensions.")
                continue

            # Load the extracted object image for this frame BEFORE touching the
            # output frame, so a missing crop does not leave a black hole behind
            object_img_path = f"{object_folders[obj_id]}object_{obj_id}_frame_{frame_number}.0.jpg"
            object_img = cv2.imread(object_img_path)

            if object_img is None:
                continue

            # Take the fill for non-object pixels from the frame built so far,
            # not from the bare average frame: where two boxes overlap, this
            # keeps the object that was pasted first instead of erasing it
            background_region = combined_frame[y1:y2, x1:x2].copy()

            # Clear the bounding box area.
            # NOTE(review): when no mask is detected below, this black box is
            # what ends up in the video - confirm that is intended.
            combined_frame[y1:y2, x1:x2] = 0

            # Run YOLOv8 segmentation on the object image
            results = model(object_img, conf=0.5)

            # Get the segmentation mask
            masks = results[0].masks

            if masks is not None and len(masks) > 0:
                # Take the first mask; .cpu() is required because .numpy()
                # fails on a tensor that lives on a CUDA device
                mask = masks.data[0].cpu().numpy().astype(np.uint8)

                # Resize the binary mask and object image to fit the bounding box size
                mask_resized = cv2.resize(mask, (x2 - x1, y2 - y1), interpolation=cv2.INTER_NEAREST)
                object_img_resized = cv2.resize(object_img, (x2 - x1, y2 - y1))

                # Scale the mask so it is 0 or 255
                mask_resized = (mask_resized * 255).astype(np.uint8)

                # Invert the mask to get non-detected areas
                inverse_mask = cv2.bitwise_not(mask_resized)
                non_detected_areas = cv2.bitwise_and(background_region, background_region, mask=inverse_mask)

                # Keep the object only where the mask is set
                object_img_masked = cv2.bitwise_and(object_img_resized, object_img_resized, mask=mask_resized)

                # The two parts are disjoint, so adding them merges them without saturation
                combined_content = cv2.add(object_img_masked, non_detected_areas)

                # Overlay the processed object area onto the combined frame
                combined_frame[y1:y2, x1:x2] = combined_content

        # Save the processed frame in the output_frames dictionary
        output_frames[frame_number] = combined_frame

    # Write all frames to the video
    for frame_number in sorted(output_frames.keys()):
        output_video.write(output_frames[frame_number])
finally:
    # Release the video writer even if a frame fails, so the file is finalized
    output_video.release()
print("Video with IDs 1 and 4 created successfully.")


0: 448x640 1 car, 1821.0ms
Speed: 68.0ms preprocess, 1821.0ms inference, 165.0ms postprocess per image at shape (1, 3, 448, 640)

0: 608x640 (no detections), 1275.0ms
Speed: 31.0ms preprocess, 1275.0ms inference, 5.0ms postprocess per image at shape (1, 3, 608, 640)

0: 448x640 1 car, 767.0ms
Speed: 11.0ms preprocess, 767.0ms inference, 13.0ms postprocess per image at shape (1, 3, 448, 640)

0: 608x640 (no detections), 932.0ms
Speed: 14.0ms preprocess, 932.0ms inference, 5.0ms postprocess per image at shape (1, 3, 608, 640)

0: 448x640 1 car, 677.1ms
Speed: 13.0ms preprocess, 677.1ms inference, 13.9ms postprocess per image at shape (1, 3, 448, 640)

0: 640x640 (no detections), 968.0ms
Speed: 15.0ms preprocess, 968.0ms inference, 8.0ms postprocess per image at shape (1, 3, 640, 640)

0: 448x640 1 car, 739.0ms
Speed: 11.0ms preprocess, 739.0ms inference, 12.0ms postprocess per image at shape (1, 3, 448, 640)

0: 640x640 (no detections), 933.0ms
Speed: 14.0ms preprocess, 933.0ms inferenc