In [None]:
# !pip install opencv-python
# !pip install torch torchvision
# !pip install torch torchvision torchaudio
# !pip install Segmentor
# !pip install grounded-segment-anything
# !pip install gaze_transformer
# !pip install opencv-python Pillow torch matplotlib

In [None]:
import cv2
import numpy as np
from PIL import Image
import torch
from torchvision import transforms
import matplotlib.pyplot as plt

In [None]:
def preprocess_video(video_path):
    """Decode every frame of a video file into an in-memory list.

    Args:
        video_path: Path of the video, passed straight to ``cv2.VideoCapture``.

    Returns:
        list: The frames in decode order, exactly as returned by ``cap.read()``.
        Empty when the capture cannot be opened or yields no frame.
    """
    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                # read() reports failure once no further frame can be decoded.
                break
            frames.append(frame)
    finally:
        # The capture handle was never released before; free it even when
        # decoding raises part-way through.
        cap.release()
    return frames

In [None]:
# Decode the whole input video into an in-memory list of frames.
# NOTE(review): absolute, machine-specific path — this cell only runs on a
# machine where that exact file exists.
video_path = "C:/Users/Amit/Desktop/Project/ABA Therapy_ Daniel - Communication.mp4"
preprocessed_frames = preprocess_video(video_path)

In [None]:
# Show only how many frames were decoded; echoing the list itself would dump
# every frame array into the cell output.
len(preprocessed_frames)

In [None]:
def track_gaze(frame):
    """Run the gaze model on a single frame.

    The frame is converted from BGR to RGB, resized to 224x224, turned into a
    tensor with a leading batch axis and passed to ``gaze_transformer`` under
    ``torch.no_grad()``.

    Args:
        frame: Image array in OpenCV BGR channel order.

    Returns:
        tuple: The first two entries of the model output,
        ``(output[0], output[1])``.

    NOTE(review): ``gaze_transformer`` is not defined in any cell before this
    one; it has to exist in the kernel already — confirm where it comes from.
    NOTE(review): callers unpack the result as ``(child_gaze, therapist_gaze)``
    and index each with ``[0]``/``[1]``; confirm the model output has that layout.
    """
    rgb_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    to_model_input = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
    ])
    batch = to_model_input(rgb_image).unsqueeze(0)  # prepend a batch axis

    with torch.no_grad():
        prediction = gaze_transformer(batch)

    return prediction[0], prediction[1]

In [None]:
def detect_object_interaction(frame):
    """Run the segmentation model on one frame and return its output.

    Args:
        frame: Image array in OpenCV BGR channel order.

    Returns:
        Whatever ``segmentor`` returns for the RGB ``PIL.Image`` built from
        ``frame``.

    NOTE(review): the callers in this notebook iterate the result with
    ``enumerate()``/``len()`` and draw each item as text — confirm that the
    segmentor output is a sequence of labels, or convert it here.
    NOTE(review): ``segmentor`` is not defined in any cell before this one.
    """
    image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    # The model output used to be computed and then dropped, so every caller
    # received None and failed as soon as it iterated the "interactions".
    return segmentor(image)

In [None]:
class EmotionPredictor:
    """Stub emotion model that reports the same pair of labels for every frame."""

    def __call__(self, frame):
        """Return ``(child_emotion, therapist_emotion)``; ``frame`` is not inspected."""
        child_label = "Happy"
        therapist_label = "Neutral"
        return child_label, therapist_label


# Shared instance that the pipeline functions call once per frame.
emotion_predictor = EmotionPredictor()

In [None]:
def visualize_predictions(frame, child_gaze, therapist_gaze, interactions, child_emotion, therapist_emotion):
    """Draw gaze points, interaction labels and emotions on a frame and save it.

    The figure is written to ``temp_frame.jpg`` in the current working
    directory, so each call overwrites the previous image.

    Args:
        frame: Image array in OpenCV BGR channel order.
        child_gaze: Indexable pair; item 0 is multiplied by the frame width and
            item 1 by the frame height before plotting (red marker).
            Presumably normalised coordinates — confirm against the gaze model.
        therapist_gaze: Same layout as ``child_gaze`` (blue marker).
        interactions: Sized iterable of text labels, drawn one per row.
        child_emotion: Value shown after "Child Emotion: ".
        therapist_emotion: Value shown after "Therapist Emotion: ".
    """
    frame_height, frame_width = frame.shape[:2]
    label_style = dict(color='white', fontsize=10, bbox=dict(facecolor='black', alpha=0.5))

    # Use a dedicated figure rather than pyplot's implicit "current" one, so
    # nothing left over from another cell is drawn on or saved.
    fig, ax = plt.subplots()
    try:
        ax.imshow(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        ax.axis('off')

        ax.scatter(child_gaze[0] * frame_width, child_gaze[1] * frame_height, c='r', marker='o', label='Child Gaze')
        ax.scatter(therapist_gaze[0] * frame_width, therapist_gaze[1] * frame_height, c='b', marker='o', label='Therapist Gaze')

        for i, interaction in enumerate(interactions):
            ax.text(10, 20 + i * 20, interaction, **label_style)

        # The emotion rows go below the last interaction row.
        emotion_row = len(interactions) * 20
        ax.text(10, 40 + emotion_row, f"Child Emotion: {child_emotion}", **label_style)
        ax.text(10, 60 + emotion_row, f"Therapist Emotion: {therapist_emotion}", **label_style)

        fig.savefig('temp_frame.jpg')
    finally:
        # This runs once per video frame: close the figure even when drawing
        # or saving fails, otherwise open figures pile up in pyplot.
        plt.close(fig)

In [None]:
def main(video_path):
    """Run gaze, interaction and emotion prediction on every frame of a video.

    Each frame is also rendered through ``visualize_predictions``.

    Args:
        video_path: Path of the video handed to ``preprocess_video``.

    Returns:
        list[dict]: One record per frame with the keys ``child_gaze``,
        ``therapist_gaze``, ``interactions``, ``child_emotion`` and
        ``therapist_emotion``.
    """
    results = []
    for frame in preprocess_video(video_path):
        child_gaze, therapist_gaze = track_gaze(frame)
        interactions = detect_object_interaction(frame)
        child_emotion, therapist_emotion = emotion_predictor(frame)

        visualize_predictions(
            frame, child_gaze, therapist_gaze, interactions, child_emotion, therapist_emotion
        )

        record = dict(
            child_gaze=child_gaze,
            therapist_gaze=therapist_gaze,
            interactions=interactions,
            child_emotion=child_emotion,
            therapist_emotion=therapist_emotion,
        )
        results.append(record)
    return results

In [None]:
def compile_video(frames, output_video_path):
    """Write a sequence of frames to an ``mp4v``-encoded video at 25 fps.

    Args:
        frames: Sequence of frames. A ``str`` item is treated as an image file
            path and loaded with ``cv2.imread``; any other item is used as an
            already-decoded image array.
        output_video_path: Destination file for the video.

    Raises:
        ValueError: If a frame path cannot be read as an image.
    """
    if len(frames) == 0:
        print("No frames to compile into a video.")
        return

    def _as_image(item):
        # Load paths from disk; pass decoded arrays through untouched.
        if not isinstance(item, str):
            return item
        image = cv2.imread(item)
        if image is None:  # imread signals an unreadable file by returning None
            raise ValueError(f"Could not read frame image: {item!r}")
        return image

    # Keep the first image in its own variable. The input list used to be
    # overwritten by it, so the write loop iterated over image rows instead of
    # over the frames.
    first_image = _as_image(frames[0])
    height, width = first_image.shape[:2]

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    video = cv2.VideoWriter(output_video_path, fourcc, 25.0, (width, height))
    try:
        for item in frames:
            video.write(_as_image(item))
    finally:
        # Finalise the file even if a frame fails part-way through.
        # cv2.destroyAllWindows() was dropped: this function opens no windows,
        # and the call raises on OpenCV builds without GUI support.
        video.release()

In [None]:
def main(video_path):
    """Run the per-frame predictions and return the frames with the results.

    Args:
        video_path: Path of the video handed to ``preprocess_video``.

    Returns:
        tuple: ``(frames, results)``. ``frames`` is the list returned by
        ``preprocess_video``; ``results`` holds one dict per frame with the keys
        ``child_gaze``, ``therapist_gaze``, ``interactions``, ``child_emotion``
        and ``therapist_emotion``.
    """
    frames = preprocess_video(video_path)
    results = []
    for frame in frames:
        child_gaze, therapist_gaze = track_gaze(frame)
        interactions = detect_object_interaction(frame)
        # visualize_predictions requires both emotion arguments; the earlier
        # four-argument call raised TypeError on the first frame.
        child_emotion, therapist_emotion = emotion_predictor(frame)
        visualize_predictions(frame, child_gaze, therapist_gaze, interactions, child_emotion, therapist_emotion)
        results.append({
            "child_gaze": child_gaze,
            "therapist_gaze": therapist_gaze,
            "interactions": interactions,
            "child_emotion": child_emotion,
            "therapist_emotion": therapist_emotion
        })
    return frames, results

# Run the pipeline, then assemble an output video from the returned frames.
# NOTE(review): absolute, machine-specific input path.
# NOTE(review): `frames` holds the decoded frames returned by preprocess_video,
# not the annotated images written by visualize_predictions (which always saves
# to temp_frame.jpg) — confirm this is the intended input for the output video.
video_path = "C:/Users/Amit/Desktop/Project/ABA Therapy_ Daniel - Communication.mp4"
frames, results = main(video_path)
compile_video(frames, "output_video.mp4")
print("Output video saved successfully.")

In [None]:
import os

In [None]:
def visualize_and_save_predictions(frame, frame_number, child_gaze, therapist_gaze, interactions, child_emotion, therapist_emotion, output_dir):
    """Render one annotated frame and save it as ``frame_<frame_number>.jpg``.

    Args:
        frame: Image array in OpenCV BGR channel order.
        frame_number: Value interpolated into the output file name.
        child_gaze: Indexable pair; item 0 is multiplied by the frame width and
            item 1 by the frame height before plotting (red marker).
            Presumably normalised coordinates — confirm against the gaze model.
        therapist_gaze: Same layout as ``child_gaze`` (blue marker).
        interactions: Sized iterable of text labels, drawn one per row.
        child_emotion: Value shown after "Child Emotion: ".
        therapist_emotion: Value shown after "Therapist Emotion: ".
        output_dir: Directory the JPEG is written into; it is not created here.
    """
    frame_height, frame_width = frame.shape[:2]
    label_style = dict(color='white', fontsize=10, bbox=dict(facecolor='black', alpha=0.5))

    # Use a dedicated figure rather than pyplot's implicit "current" one, so
    # nothing left over from another cell is drawn on or saved.
    fig, ax = plt.subplots()
    try:
        ax.imshow(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        ax.axis('off')

        ax.scatter(child_gaze[0] * frame_width, child_gaze[1] * frame_height, c='r', marker='o', label='Child Gaze')
        ax.scatter(therapist_gaze[0] * frame_width, therapist_gaze[1] * frame_height, c='b', marker='o', label='Therapist Gaze')

        for i, interaction in enumerate(interactions):
            ax.text(10, 20 + i * 20, interaction, **label_style)

        # The emotion rows go below the last interaction row.
        emotion_row = len(interactions) * 20
        ax.text(10, 40 + emotion_row, f"Child Emotion: {child_emotion}", **label_style)
        ax.text(10, 60 + emotion_row, f"Therapist Emotion: {therapist_emotion}", **label_style)

        output_path = os.path.join(output_dir, f"frame_{frame_number}.jpg")
        fig.savefig(output_path)
    finally:
        # This runs once per video frame: close the figure even when drawing
        # or saving fails, otherwise open figures pile up in pyplot.
        plt.close(fig)

In [None]:
def main_with_frame_saving(video_path, output_dir):
    """Run the per-frame predictions and save one annotated JPEG per frame.

    Args:
        video_path: Path of the video handed to ``preprocess_video``.
        output_dir: Directory for the annotated frames; created if missing.

    Returns:
        list[dict]: One record per frame with the keys ``frame_number``,
        ``child_gaze``, ``therapist_gaze``, ``interactions``, ``child_emotion``
        and ``therapist_emotion``.
    """
    # Nothing else in the run cell creates this directory, and saving into a
    # missing one fails on the first frame.
    os.makedirs(output_dir, exist_ok=True)

    frames = preprocess_video(video_path)
    results = []
    for i, frame in enumerate(frames):
        child_gaze, therapist_gaze = track_gaze(frame)
        interactions = detect_object_interaction(frame)
        child_emotion, therapist_emotion = emotion_predictor(frame)  # Emotion prediction
        visualize_and_save_predictions(frame, i, child_gaze, therapist_gaze, interactions, child_emotion, therapist_emotion, output_dir)
        results.append({
            "frame_number": i,
            "child_gaze": child_gaze,
            "therapist_gaze": therapist_gaze,
            "interactions": interactions,
            "child_emotion": child_emotion,
            "therapist_emotion": therapist_emotion
        })
    return results

In [None]:
# Function to compile frames into a video
def compile_frames_to_video(frames_dir, output_video_path):
    """Assemble the ``.jpg`` files of a directory into a 25 fps ``mp4v`` video.

    Files are ordered by the numbers embedded in their names, so
    ``frame_2.jpg`` comes before ``frame_10.jpg``.

    Args:
        frames_dir: Directory containing the frame images.
        output_video_path: Destination file for the video.

    Raises:
        ValueError: If a frame file cannot be read as an image.
    """
    import re

    def _natural_key(path):
        # re.split with a capture group alternates text, digits, text, ...
        # Odd positions are the digit runs; compare those as integers.
        parts = re.split(r'(\d+)', os.path.basename(path))
        return [int(part) if index % 2 else part for index, part in enumerate(parts)]

    def _read_frame(path):
        image = cv2.imread(path)
        if image is None:  # imread signals an unreadable file by returning None
            raise ValueError(f"Could not read frame image: {path!r}")
        return image

    frame_files = [os.path.join(frames_dir, f) for f in os.listdir(frames_dir) if f.endswith('.jpg')]
    # A plain string sort would put frame_10.jpg before frame_2.jpg and
    # scramble the video.
    frame_files.sort(key=_natural_key)

    if not frame_files:
        print("No frame files found in the output frames directory.")
        return

    height, width = _read_frame(frame_files[0]).shape[:2]
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    video = cv2.VideoWriter(output_video_path, fourcc, 25.0, (width, height))
    try:
        for frame_file in frame_files:
            video.write(_read_frame(frame_file))
    finally:
        # Finalise the file even if a frame fails part-way through.
        # cv2.destroyAllWindows() was dropped: this function opens no windows,
        # and the call raises on OpenCV builds without GUI support.
        video.release()

In [None]:
def create_output_directory(output_dir):
    """Create ``output_dir`` with any missing parents; do nothing if it exists.

    Args:
        output_dir: Directory path to create.
    """
    # exist_ok avoids the gap between checking for the directory and creating it.
    os.makedirs(output_dir, exist_ok=True)

In [None]:
import cv2
import os

def compile_frames_to_video(frames_dir, output_video_path):
    """Assemble the ``.jpg`` files of a directory into a 25 fps ``mp4v`` video.

    Files are ordered by the numbers embedded in their names, so
    ``frame_2.jpg`` comes before ``frame_10.jpg``. A missing ``frames_dir`` is
    created, in which case there is nothing to compile. When no ``.jpg`` file
    is found, a message is printed and the function returns.

    Args:
        frames_dir: Directory containing the frame images.
        output_video_path: Destination file for the video.

    Raises:
        ValueError: If a frame file cannot be read as an image.
    """
    import re

    def _natural_key(path):
        # re.split with a capture group alternates text, digits, text, ...
        # Odd positions are the digit runs; compare those as integers.
        parts = re.split(r'(\d+)', os.path.basename(path))
        return [int(part) if index % 2 else part for index, part in enumerate(parts)]

    def _read_frame(path):
        image = cv2.imread(path)
        if image is None:  # imread signals an unreadable file by returning None
            raise ValueError(f"Could not read frame image: {path!r}")
        return image

    if not os.path.exists(frames_dir):
        print(f"Output directory '{frames_dir}' does not exist. Creating it...")
        os.makedirs(frames_dir)

    frame_files = [os.path.join(frames_dir, f) for f in os.listdir(frames_dir) if f.endswith('.jpg')]
    # A plain string sort would put frame_10.jpg before frame_2.jpg and
    # scramble the video.
    frame_files.sort(key=_natural_key)

    if not frame_files:
        print("No frame files found in the output frames directory.")
        return

    height, width = _read_frame(frame_files[0]).shape[:2]

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    video = cv2.VideoWriter(output_video_path, fourcc, 25.0, (width, height))

    try:
        for frame_file in frame_files:
            video.write(_read_frame(frame_file))
    finally:
        # Finalise the file even if a frame fails part-way through.
        # cv2.destroyAllWindows() was dropped: this function opens no windows,
        # and the call raises on OpenCV builds without GUI support.
        video.release()

# NOTE(review): this print runs whenever the cell is executed; the cell only
# defines compile_frames_to_video and never calls it, so no video has been
# written when this message appears.
print("Output video saved successfully.")

In [None]:
import os


def visualize_and_save_predictions(frame, frame_number, child_gaze, therapist_gaze, interactions, child_emotion, therapist_emotion, output_dir):
    """Render one annotated frame and save it as ``frame_<frame_number>.jpg``.

    Args:
        frame: Image array in OpenCV BGR channel order.
        frame_number: Value interpolated into the output file name.
        child_gaze: Indexable pair; item 0 is multiplied by the frame width and
            item 1 by the frame height before plotting (red marker).
            Presumably normalised coordinates — confirm against the gaze model.
        therapist_gaze: Same layout as ``child_gaze`` (blue marker).
        interactions: Sized iterable of text labels, drawn one per row.
        child_emotion: Value shown after "Child Emotion: ".
        therapist_emotion: Value shown after "Therapist Emotion: ".
        output_dir: Directory the JPEG is written into; it is not created here.
    """
    frame_height, frame_width = frame.shape[:2]
    label_style = dict(color='white', fontsize=10, bbox=dict(facecolor='black', alpha=0.5))

    # Use a dedicated figure rather than pyplot's implicit "current" one, so
    # nothing left over from another cell is drawn on or saved.
    fig, ax = plt.subplots()
    try:
        ax.imshow(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        ax.axis('off')

        ax.scatter(child_gaze[0] * frame_width, child_gaze[1] * frame_height, c='r', marker='o', label='Child Gaze')
        ax.scatter(therapist_gaze[0] * frame_width, therapist_gaze[1] * frame_height, c='b', marker='o', label='Therapist Gaze')

        for i, interaction in enumerate(interactions):
            ax.text(10, 20 + i * 20, interaction, **label_style)

        # The emotion rows go below the last interaction row.
        emotion_row = len(interactions) * 20
        ax.text(10, 40 + emotion_row, f"Child Emotion: {child_emotion}", **label_style)
        ax.text(10, 60 + emotion_row, f"Therapist Emotion: {therapist_emotion}", **label_style)

        output_path = os.path.join(output_dir, f"frame_{frame_number}.jpg")
        fig.savefig(output_path)
    finally:
        # This runs once per video frame: close the figure even when drawing
        # or saving fails, otherwise open figures pile up in pyplot.
        plt.close(fig)

In [None]:
def main_with_frame_saving(video_path, output_dir):
    """Run the per-frame predictions and save one annotated JPEG per frame.

    Args:
        video_path: Path of the video handed to ``preprocess_video``.
        output_dir: Directory for the annotated frames; created if missing.

    Returns:
        list[dict]: One record per frame with the keys ``frame_number``,
        ``child_gaze``, ``therapist_gaze``, ``interactions``, ``child_emotion``
        and ``therapist_emotion``.
    """
    # The run cell passes "output_frames" without creating it first, and saving
    # into a missing directory fails on the first frame.
    os.makedirs(output_dir, exist_ok=True)

    frames = preprocess_video(video_path)
    results = []
    for i, frame in enumerate(frames):
        child_gaze, therapist_gaze = track_gaze(frame)
        interactions = detect_object_interaction(frame)
        child_emotion, therapist_emotion = emotion_predictor(frame)  # Emotion prediction
        visualize_and_save_predictions(frame, i, child_gaze, therapist_gaze, interactions, child_emotion, therapist_emotion, output_dir)
        results.append({
            "frame_number": i,
            "child_gaze": child_gaze,
            "therapist_gaze": therapist_gaze,
            "interactions": interactions,
            "child_emotion": child_emotion,
            "therapist_emotion": therapist_emotion
        })
    return results

In [None]:
# Run the frame-saving pipeline on the input video.
# NOTE(review): absolute, machine-specific input path; "output_frames" is
# resolved relative to the kernel's working directory.
video_path = "C:/Users/Amit/Desktop/Project/ABA Therapy_ Daniel - Communication.mp4"
output_dir = "output_frames"
results = main_with_frame_saving(video_path, output_dir)

In [None]:
import cv2
import numpy as np
from PIL import Image
import torch
from torchvision import transforms
import matplotlib.pyplot as plt
import os

# Import Emotion Predictor, Gaze Transformer, and Object Interaction Detector models
from emotion_predictor import EmotionPredictor
from gaze_transformer.model import GazePredictionTransformer
from object_interaction_detector import ObjectInteractionDetector

# Function to preprocess video
def preprocess_video(video_path):
    """Decode every frame of a video file into an in-memory list.

    Args:
        video_path: Path of the video, passed straight to ``cv2.VideoCapture``.

    Returns:
        list: The frames in decode order, exactly as returned by ``cap.read()``.
        Empty when the capture cannot be opened or yields no frame.
    """
    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                # read() reports failure once no further frame can be decoded.
                break
            frames.append(frame)
    finally:
        # The capture handle was never released before; free it even when
        # decoding raises part-way through.
        cap.release()
    return frames

# Function to track gaze using Gaze Transformer model
def track_gaze(frame):
    """Placeholder gaze tracker: ignores ``frame`` and returns fixed values.

    Returns:
        tuple: ``(0.5, 0.5)`` until real Gaze Transformer inference is added.
    """
    placeholder_gaze = (0.5, 0.5)  # stand-in for the model output
    return placeholder_gaze

# Function to predict emotions using Emotion Predictor model
def predict_emotions(frame):
    """Placeholder emotion predictor: ignores ``frame``.

    Returns:
        tuple: ``(child_emotion, therapist_emotion)``, currently always
        ``("Happy", "Neutral")`` until real model inference is added.
    """
    placeholder_emotions = ("Happy", "Neutral")  # stand-in for the model output
    return placeholder_emotions

# Function to detect object interactions
def detect_object_interactions(frame):
    """Placeholder interaction detector: ignores ``frame``.

    Returns:
        list[str]: A new list holding two fixed descriptions, until real
        Object Interaction Detector inference is added.
    """
    # Stand-in for the model output.
    return [
        "Child playing with toy",
        "Therapist showing object",
    ]

# Function to visualize predictions and overlay on frame
def visualize_and_overlay_predictions(frame, child_gaze, therapist_gaze, child_emotion, therapist_emotion, interactions, output_path=None):
    """Display a frame and optionally save the rendered figure.

    The gaze, emotion and interaction arguments are accepted, but nothing is
    drawn from them yet; the overlay code is still a placeholder.

    Args:
        frame: Image array in OpenCV BGR channel order.
        child_gaze, therapist_gaze, child_emotion, therapist_emotion,
        interactions: Predictions for this frame (currently unused).
        output_path: When given, the figure is saved to this path before it is
            shown. Defaults to ``None`` (display only).
    """
    # Overlay predictions on frame
    # Code to overlay gaze points, emotions, and interactions on frame

    # Display frame with predictions overlaid
    plt.imshow(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    plt.axis('off')
    if output_path is not None:
        # Save before show(): once the figure has been shown, a later
        # savefig() can end up writing a new, empty figure (as happens with
        # the notebook inline backend).
        plt.savefig(output_path)
    plt.show()

# Function to save frames with predictions overlaid
def save_frames_with_predictions(frames, output_dir, child_gazes, therapist_gazes, child_emotions, therapist_emotions, interactions):
    """Render every frame and write it to ``output_dir`` as ``frame_<i>.jpg``.

    Args:
        frames: Sequence of image arrays in OpenCV BGR channel order.
        output_dir: Destination directory; created if missing.
        child_gazes, therapist_gazes, child_emotions, therapist_emotions,
        interactions: Per-frame predictions, each indexed by frame position.
    """
    os.makedirs(output_dir, exist_ok=True)

    for i, frame in enumerate(frames):
        output_path = os.path.join(output_dir, f"frame_{i}.jpg")
        # Let the drawing function save the figure while it is still current.
        visualize_and_overlay_predictions(
            frame, child_gazes[i], therapist_gazes[i], child_emotions[i],
            therapist_emotions[i], interactions[i], output_path=output_path,
        )
        plt.close()

# Function to compile frames into video
def compile_frames_into_video(frames_dir, output_video_path):
    """Assemble the ``.jpg`` files of a directory into a 25 fps ``mp4v`` video.

    Files are ordered by the numbers embedded in their names, so
    ``frame_2.jpg`` comes before ``frame_10.jpg``. When the directory holds no
    ``.jpg`` file, a message is printed and nothing is written.

    Args:
        frames_dir: Directory containing the frame images.
        output_video_path: Destination file for the video.

    Raises:
        ValueError: If a frame file cannot be read as an image.
    """
    import re

    def _natural_key(path):
        # re.split with a capture group alternates text, digits, text, ...
        # Odd positions are the digit runs; compare those as integers.
        parts = re.split(r'(\d+)', os.path.basename(path))
        return [int(part) if index % 2 else part for index, part in enumerate(parts)]

    def _read_frame(path):
        image = cv2.imread(path)
        if image is None:  # imread signals an unreadable file by returning None
            raise ValueError(f"Could not read frame image: {path!r}")
        return image

    frame_files = [os.path.join(frames_dir, f) for f in os.listdir(frames_dir) if f.endswith('.jpg')]
    # A plain string sort would put frame_10.jpg before frame_2.jpg and
    # scramble the video.
    frame_files.sort(key=_natural_key)

    if not frame_files:
        print("No frame files found in the output frames directory.")
        return

    # Take the video size from the first frame. The hard-coded index 1000
    # raised IndexError for any clip with 1000 frames or fewer.
    height, width = _read_frame(frame_files[0]).shape[:2]

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    video = cv2.VideoWriter(output_video_path, fourcc, 25.0, (width, height))

    try:
        for frame_file in frame_files:
            video.write(_read_frame(frame_file))
    finally:
        # Finalise the file even if a frame fails part-way through.
        # cv2.destroyAllWindows() was dropped: this function opens no windows,
        # and the call raises on OpenCV builds without GUI support.
        video.release()

# Main function
def main(video_path, output_dir, output_video_path):
    """Run the pipeline: decode, predict, save annotated frames, build video.

    Args:
        video_path: Input video handed to ``preprocess_video``.
        output_dir: Directory that receives the per-frame images.
        output_video_path: Destination of the compiled video.
    """
    # Step 1: decode the video.
    frames = preprocess_video(video_path)

    # Step 2: collect one prediction row per frame.
    rows = []
    for frame in frames:
        child_gaze, therapist_gaze = track_gaze(frame)
        child_emotion, therapist_emotion = predict_emotions(frame)
        interaction = detect_object_interactions(frame)
        rows.append((child_gaze, therapist_gaze, child_emotion, therapist_emotion, interaction))

    # Transpose the rows into the five parallel lists the saver expects.
    if rows:
        columns = [list(column) for column in zip(*rows)]
    else:
        columns = [[], [], [], [], []]
    child_gazes, therapist_gazes, child_emotions, therapist_emotions, interactions = columns

    # Step 3: save the frames with their predictions.
    save_frames_with_predictions(
        frames, output_dir, child_gazes, therapist_gazes,
        child_emotions, therapist_emotions, interactions,
    )

    # Step 4: compile the saved frames into a video.
    compile_frames_into_video(output_dir, output_video_path)

# Example usage
# Inputs and outputs are relative paths, resolved against the kernel's working
# directory; executing this cell runs the whole pipeline immediately.
video_path = "example_video.mp4"
output_dir = "output_frames"
output_video_path = "output_video.mp4"
main(video_path, output_dir, output_video_path)