In [2]:
import numpy as np
import pandas as pd
import csv, os, re, time, logging, cv2, torch
from pathlib import Path
from importlib.metadata import version
from facenet_pytorch import InceptionResnetV1, MTCNN
from PIL import Image
from moviepy.editor import VideoFileClip
import pytz
import datetime
from win32com.propsys import propsys, pscon
import subprocess
import json
from datetime import datetime

In [3]:
# Set logging level to DEBUG
logging.basicConfig(level=logging.DEBUG)

# Replace 'video.mp4' with the path to your media file
#probe = ffmpeg.probe('//153.19.52.107/emboa/IO3-sessions/NEW STRUCTURE/Camera/GUT/S01/C02/Untitled 140.mp4')
#print(probe)

In [4]:
PYCHARM_DEBUG=True

In [5]:
def print_table(table):
    for row in table:
        print(row)

def create_table(tab1, tab2, tab3):
    result_tab = []

    emotions = ['Unknown', 'Happy', 'Sad', 'Scared', 'Disgusted', 'Surprised', 'Angry']

    for row1, row2, row3 in zip(tab1, tab2, tab3):
        sum_rows = [int(a) + int(b) + int(c) for a, b, c in zip(row1[1:], row2[1:], row3[1:])]

        total_sum = sum(sum_rows)
        if total_sum > 0:
            # Calculate the percentage distribution
            percentages = [round((x / total_sum) * 100, 2) for x in sum_rows]
        else:
            percentages = [0] * len(sum_rows)

        result_tab.append([row1[0]] + percentages)

    return result_tab

# Makes table equal
def fill_table(table, csvreader):
    seconds = 0
    stop = 0
    for row in csvreader:
        row[0] = int(row[0].split('.')[0])
        if seconds <= row[0] and stop == 0:  #fill table with missing seconds
            for i in range(0, row[0]):
                table.append([i, '0', '0', '0', '0', '0', '0', '0'])
                seconds += 1
            stop = 1
            table.append(row)
        else:
            table.append(row)


# Create percentages for every second
def csv_files_reader(base_path):
    index = ['I', 'II', 'III']
    tab1 = []
    tab2 = []
    tab3 = []

    if os.path.exists(base_path):

        for i in index:
            file = os.path.join(base_path, i)
            for file_name in os.listdir(file):
                file_path = os.path.join(file, file_name)

                with open(file_path, 'r') as file:
                    csvreader = csv.reader(file)
                    header = next(csvreader)

                    if i == 'I':
                        fill_table(tab1, csvreader)
                    elif i == 'II':
                        fill_table(tab2, csvreader)
                    elif i == 'III':
                        fill_table(tab3, csvreader)
    else:
        print(f"File path {base_path} doesn't exist.")

    large_table = max([tab1, tab2, tab3], key=len)

    for tab in [tab1, tab2, tab3]:
        if len(tab) < len(large_table):
            diff = len(large_table) - len(tab)

            if len(tab) == 0:
                sec = 0
                for sec in range(len(large_table)):
                    tab.append([sec, '0', '0', '0', '0', '0', '0', '0'])
            else:
                sec = tab[-1][0]  #latest second in table
                for sec in range(tab[-1][0] + 1, len(large_table)):
                    tab.append([sec, '0', '0', '0', '0', '0', '0', '0'])

    return create_table(tab1, tab2, tab3)


def get_boris_vector(path):
    table = csv_files_reader(path)
    df = pd.DataFrame(table)
    return df

In [6]:
def get_video_metadata(file_path):
    try:
        # Get the file status
        file_stat = os.stat(file_path)

        # Get the last modified time
        last_modified_time_str = time.ctime(file_stat.st_mtime)
        last_modified_timestamp = int(file_stat.st_mtime)

        # Get the duration of the video
        with VideoFileClip(file_path) as video:
            duration = video.duration

        metadata = {
            'last_modified_time': last_modified_time_str,
            'last_modified_timestamp': last_modified_timestamp,
            'duration': int(duration),  # Duration in seconds
            'initial_timestamp' : last_modified_timestamp - int(duration)
        }
        return metadata

    except FileNotFoundError:
        return f"The file {file_path} does not exist."
    except Exception as e:
        return f"An error occurred: {str(e)}"


In [7]:
def gut_timestamp(video_path):
    return get_video_metadata(video_path)['initial_timestamp']

In [19]:
def ituyu_timestamp(video_path):
    try:
        properties = propsys.SHGetPropertyStoreFromParsingName(video_path)
        timestamp = int(properties.GetValue(pscon.PKEY_Media_DateEncoded).GetValue().timestamp())
        return timestamp
    
    except Exception as e:
        print(f"Error retrieving video metadata: {e}")
        return None

In [9]:
def maap_timestamp(video_path):
    try:
        # Use ffprobe to extract video metadata
        result = subprocess.run(
            ['ffprobe', '-v', 'quiet', '-print_format', 'json', '-show_entries', 'format_tags=creation_time', video_path],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True
        )
        metadata = json.loads(result.stdout)
        
        # Extract the creation time if available
        creation_time = metadata.get('format', {}).get('tags', {}).get('creation_time', None)
        
        if creation_time:
            creation_datetime = datetime.fromisoformat(creation_time.replace('Z', '+00:00'))
            unix_timestamp = int(creation_datetime.timestamp())
            return unix_timestamp
        else:
            return None
    
    except Exception as e:
        print(f"Error retrieving video metadata: {e}")
        return None

In [10]:
def get_video_recording_timestamp(video_path, research_center):
    timestamp = 0
    if research_center == 'GUT':
        timestamp = gut_timestamp(video_path)
    elif research_center == 'ITU-YU':
        timestamp = ituyu_timestamp(video_path)
    elif research_center == 'MAAP':
        timestamp = maap_timestamp(video_path)
    return timestamp

In [11]:
# Get the moment of the start of the vector and its frequency
def get_unix_and_hz(file_path):
    with open(file_path, newline='') as csvfile:
        reader = csv.reader(csvfile)
        unix = int(float(next(reader)[0]))  # Convert the first cell to integer
        hz = int(float(next(reader)[0]))  # Convert the second cell to integer
    return unix, hz

In [12]:
# Trim the vector to be the multiple of its frequency
def trim_vector(vector, rate):
    length = len(vector)
    if length % rate != 0:
        # Calculate how many elements need to be removed
        excess_elements = length % rate
        # Trim the vector
        vector = vector[:-excess_elements]
    return vector

In [13]:
# Change the frequency by averaging the values
def mean_of_values(vector, rate):
    # Ensure the length of the vector is a multiple of n
    if len(vector) % rate != 0:
        raise ValueError("Length of the vector must be a multiple of frequency")

    # Reshape the vector into a 2D array where each row is a group of n elements
    reshaped_vector = np.reshape(vector, (-1, rate))

    # Calculate the mean along the rows
    mean_values = np.mean(reshaped_vector, axis=1)

    return mean_values

In [46]:
def extract_frame_embeddings(path):
    # Step 1: Initialize FaceNet model and MTCNN detector
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    mtcnn = MTCNN(keep_all=True, device=device)  # MTCNN for face detection
    facenet = InceptionResnetV1(pretrained='vggface2').eval().to(device)  # Pre-trained FaceNet model
    
    # Step 2: Load video file and get frame rate
    cap = cv2.VideoCapture(path)

    if not cap.isOpened():
        print("Error: Could not open video.")
    else:
        print("Video opened successfully!")

    # Calculate frames per second (fps) and total frames
    frame_interval = cap.get(cv2.CAP_PROP_FPS)  # Frames per second of the video

    # Step 3: Process video at 1 second intervals
    frame_count = 0
    frame_embeddings = []

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Check if the frame is at the 1-second interval
        if frame_count % frame_interval < 1:
            # Convert frame to RGB (OpenCV uses BGR by default)
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame_pil = Image.fromarray(frame_rgb)

            # Step 4: Detect faces in the frame
            boxes, _ = mtcnn.detect(frame_pil)
            if boxes is None:
                num_of_features = 512

                frame_embeddings.append(np.zeros(num_of_features, dtype=np.float32))
                frame_count += 1
                continue  # Skip frames with no detected faces

            # Step 5: Crop and align each detected face
            faces = mtcnn(frame_pil)  # This will return aligned faces

            # Step 6: Generate embeddings for each face
            if faces is not None:
                faces = faces.to(device)
                embeddings = facenet(faces)  # Generate embeddings
                frame_embeddings.append(embeddings.cpu().detach().numpy())  # Store embeddings 

        frame_count += 1

    # Step 7: Release video capture
    cap.release()
    frame_embeddings = [frame_embeddings[0][0] for a in frame_embeddings]
    return pd.DataFrame(frame_embeddings)

extract_frame_embeddings("resources/s01c01.mp4")

Video opened successfully!


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
0,-0.045057,-0.013996,0.012406,0.093562,0.045286,0.00366,-0.077224,0.058703,-0.03814,0.086408,...,0.014924,-0.035806,0.002744,0.006572,-0.022534,-0.029811,-0.018938,-0.010612,0.008832,-0.014404
1,-0.045057,-0.013996,0.012406,0.093562,0.045286,0.00366,-0.077224,0.058703,-0.03814,0.086408,...,0.014924,-0.035806,0.002744,0.006572,-0.022534,-0.029811,-0.018938,-0.010612,0.008832,-0.014404
2,-0.045057,-0.013996,0.012406,0.093562,0.045286,0.00366,-0.077224,0.058703,-0.03814,0.086408,...,0.014924,-0.035806,0.002744,0.006572,-0.022534,-0.029811,-0.018938,-0.010612,0.008832,-0.014404
3,-0.045057,-0.013996,0.012406,0.093562,0.045286,0.00366,-0.077224,0.058703,-0.03814,0.086408,...,0.014924,-0.035806,0.002744,0.006572,-0.022534,-0.029811,-0.018938,-0.010612,0.008832,-0.014404
4,-0.045057,-0.013996,0.012406,0.093562,0.045286,0.00366,-0.077224,0.058703,-0.03814,0.086408,...,0.014924,-0.035806,0.002744,0.006572,-0.022534,-0.029811,-0.018938,-0.010612,0.008832,-0.014404
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,-0.045057,-0.013996,0.012406,0.093562,0.045286,0.00366,-0.077224,0.058703,-0.03814,0.086408,...,0.014924,-0.035806,0.002744,0.006572,-0.022534,-0.029811,-0.018938,-0.010612,0.008832,-0.014404
606,-0.045057,-0.013996,0.012406,0.093562,0.045286,0.00366,-0.077224,0.058703,-0.03814,0.086408,...,0.014924,-0.035806,0.002744,0.006572,-0.022534,-0.029811,-0.018938,-0.010612,0.008832,-0.014404
607,-0.045057,-0.013996,0.012406,0.093562,0.045286,0.00366,-0.077224,0.058703,-0.03814,0.086408,...,0.014924,-0.035806,0.002744,0.006572,-0.022534,-0.029811,-0.018938,-0.010612,0.008832,-0.014404
608,-0.045057,-0.013996,0.012406,0.093562,0.045286,0.00366,-0.077224,0.058703,-0.03814,0.086408,...,0.014924,-0.035806,0.002744,0.006572,-0.022534,-0.029811,-0.018938,-0.010612,0.008832,-0.014404


In [64]:

# Step 1: Initialize FaceNet model and MTCNN detector
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
mtcnn = MTCNN(keep_all=True, device=device)  # MTCNN for face detection
facenet = InceptionResnetV1(pretrained='vggface2').eval().to(device)  # Pre-trained FaceNet model

# Step 2: Load video file and get frame rate
cap = cv2.VideoCapture("resources/s01c01.mp4")

if not cap.isOpened():
    print("Error: Could not open video.")
else:
    print("Video opened successfully!")

# Calculate frames per second (fps) and total frames
frame_interval = cap.get(cv2.CAP_PROP_FPS)  # Frames per second of the video

# Step 3: Process video at 1 second intervals
frame_count = 0
frame_embeddings = []

while (cap.isOpened() & frame_count!=3000):
    ret, frame = cap.read()
    if not ret:
        break

    # Check if the frame is at the 1-second interval
    if frame_count % frame_interval < 1:
        # Convert frame to RGB (OpenCV uses BGR by default)
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frame_pil = Image.fromarray(frame_rgb)

        # Step 4: Detect faces in the frame
        boxes, _ = mtcnn.detect(frame_pil)
        if boxes is None:
            num_of_features = 512

            frame_embeddings.append(np.zeros(num_of_features, dtype=np.float32))
            frame_count += 1
            continue  # Skip frames with no detected faces

        # Step 5: Crop and align each detected face
        faces = mtcnn(frame_pil)  # This will return aligned faces

        # Step 6: Generate embeddings for each face
        if faces is not None:
            faces = faces.to(device)
            embeddings = facenet(faces)  # Generate embeddings
            frame_embeddings.append(embeddings.detach().cpu().numpy())  # Store embeddings 

    frame_count += 1
    # exit loop
        

# Step 7: Release video capture
cap.release()
frame_embeddings
for embeddings in frame_embeddings:
    for embedding in embeddings:
        

Video opened successfully!
[-4.50573452e-02 -1.39959268e-02  1.24060949e-02  9.35618803e-02
  4.52863537e-02  3.65984510e-03 -7.72244483e-02  5.87028712e-02
 -3.81398238e-02  8.64075199e-02  7.16491044e-02  3.54125947e-02
 -2.43654903e-02  5.77803366e-02  8.65022279e-03 -3.54348235e-02
 -6.88755959e-02  4.36435901e-02  1.49891863e-03 -9.48281158e-05
 -2.04342622e-02  4.93200570e-02  4.02648225e-02 -6.74299672e-02
 -5.29341958e-02  7.48428283e-03  3.30501460e-02  4.46259156e-02
 -3.56597565e-02  7.40405247e-02 -1.49120754e-02 -5.18053062e-02
 -4.60103014e-03  2.05756078e-04 -1.17956903e-02 -3.26935798e-02
 -5.39383069e-02  2.08088551e-02 -3.01821679e-02 -2.39969566e-02
 -4.69697416e-02  6.39413893e-02  3.71190608e-02  6.64382726e-02
  2.95960009e-02  1.72213726e-02 -3.08081713e-02 -5.23753390e-02
 -2.08923072e-02 -3.16424333e-02 -3.56063321e-02 -6.38226643e-02
  8.27484727e-02 -3.04119242e-03 -1.10143855e-01 -3.33103873e-02
 -2.17515621e-02  2.38631517e-02 -9.51526091e-02 -3.75189818e-0

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [68]:
def flatten_to_512(L):
    """
    This function flattens a nested structure of arrays, ensuring that
    each element in the final result is an array of exactly 512 elements.
    
    It also reshapes arrays of shape (1, 512) to (512,).
    
    :param L: List of arrays or nested arrays.
    :return: A list of arrays where each array has exactly 512 elements.
    """

    result = []

    def flatten_recursive(item):
        """
        Helper function to recursively flatten nested arrays.
        """
        # If item is a numpy array
        if isinstance(item, np.ndarray):
            # Check if the array has shape (512,)
            if item.shape == (512,):
                result.append(item)
            # Check if the array has shape (1, 512), reshape it to (512,)
            elif item.shape == (1, 512):
                result.append(item.reshape(512,))
            else:
                raise ValueError(f"Array with incorrect size found: {item.shape}. Expected shapes: (512,) or (1, 512).")

        elif isinstance(item, list):
            # If it's a list, recurse into its elements
            for sub_item in item:
                flatten_recursive(sub_item)
        else:
            raise TypeError(f"Unsupported type encountered: {type(item)}. Expected list or numpy array.")

    # Start the recursive flattening process
    flatten_recursive(L)

    return result

In [69]:
copy_frame_embeddings = flatten_to_512(frame_embeddings)

ValueError: Array with incorrect size found: (2, 512). Expected shapes: (512,) or (1, 512).

In [33]:
def slice_vectors(video_path, biosignal_path, input_storage_path, boris, boris_storage_path, research_center):
    # Get the path for EDA, TEMP and HR
    EDA_path = os.path.join(biosignal_path, 'EDA.csv')
    TEMP_path = os.path.join(biosignal_path, 'TEMP.csv')
    HR_path = os.path.join(biosignal_path, 'HR.csv')
    
    # Get only the data
    EDA = pd.read_csv(EDA_path, skiprows = 2, header = None)
    TEMP = pd.read_csv(TEMP_path, skiprows = 2, header = None)
    HR = pd.read_csv(HR_path, skiprows = 2, header = None)
    frame_embeddings = extract_frame_embeddings(video_path)
    
    # Get the starting time and frequency
    unix_EDA, hz_EDA = get_unix_and_hz(EDA_path)
    unix_TEMP, hz_TEMP = get_unix_and_hz(TEMP_path)
    unix_HR, hz_HR = get_unix_and_hz(HR_path)
    unix_video = get_video_recording_timestamp(video_path, research_center)

    # Unificate the frequencies  
    EDA_mean = mean_of_values(trim_vector(EDA, hz_EDA), hz_EDA)
    TEMP_mean = mean_of_values(trim_vector(TEMP, hz_TEMP), hz_TEMP)
    HR_mean = mean_of_values(trim_vector(HR, hz_HR), hz_HR)

    # Get the lengths
    length_EDA = len(EDA_mean)
    length_TEMP = len(TEMP_mean)
    length_HR = len(HR_mean)
    length_video = get_video_metadata(video_path)['duration']
    
    # Get the vectors of starts and ends for biosignals and video
    starts = [unix_HR, unix_TEMP, unix_EDA, unix_video]
    ends = [unix_HR + length_HR, unix_TEMP + length_TEMP, unix_EDA + length_EDA, unix_video + length_video]

    # Get the latest start of any vector
    last_start = max(starts)
    
    # Get the earliest end of any vector
    first_end = min(ends)

    # Get matching indexes for start and end for every vector
    EDA_first_index = last_start - unix_EDA
    EDA_last_index = first_end - unix_EDA
    TEMP_first_index = last_start - unix_TEMP
    TEMP_last_index = first_end - unix_TEMP
    HR_first_index = last_start - unix_HR
    HR_last_index = first_end - unix_HR
    video_first_index = last_start - unix_video
    video_last_index = first_end - unix_video

    print('EDA: ', EDA_last_index - EDA_first_index)
    print('TEMP: ', TEMP_last_index - TEMP_first_index)
    print('HR: ', HR_last_index - HR_first_index)
    print('video: ', video_last_index - video_first_index)
    
    # Slice the biosignals based on the index
    sliced_EDA = EDA[EDA_first_index:EDA_last_index]
    sliced_TEMP = TEMP[TEMP_first_index:TEMP_last_index]
    sliced_HR = HR[HR_first_index:HR_last_index]
    sliced_boris = boris[video_first_index:video_last_index].copy() # to surpass the warning of working on a view and not a copy
    sliced_frame_embeddings = frame_embeddings[video_first_index:video_last_index]
    
    # Create one df with all biosignals
    input = pd.concat([sliced_EDA.reset_index(drop=True), sliced_TEMP.reset_index(drop=True), sliced_HR.reset_index(drop=True), sliced_frame_embeddings.reset_index()], axis=1)
    
    # Rename column names
    new_columns = ['EDA', 'TEMP', 'HR'] + input.columns[3:].tolist()
    input.columns = new_columns
    input = input.drop(input.columns[3], axis=1)
    # Drop index and contempt from BORIS
    sliced_boris.drop([sliced_boris.columns[0], sliced_boris.columns[1]], axis=1, inplace=True)
    sliced_boris.columns = ['Happy', 'Sad', 'Scared', 'Disgusted', 'Surprised', 'Angry']
    # Add the biosignals to file
    input.to_csv(input_storage_path, index=False, sep=',')
    sliced_boris.to_csv(boris_storage_path, index=False, sep=',')

In [32]:
def traverse():
    root_dir = '//153.19.52.107/emboa/IO3-sessions/NEW STRUCTURE'
    target_root_dir = '//153.19.52.107/emboa/IO3-sessions/NEW STRUCTURE/de-earlyfusionthesis'
    boris_ext = 'Analysis/BORIS/'
    type = ['Camera', 'Wristband']
    research_centers = ['GUT']
    #research_centers = ['GUT', 'ITU-YU', 'MAAP']
    s_values = ["S01"]
    #s_values = ["S01", "S02", "S03", "S04", "S05", "S06", "S07", "S08", "S09", "S10", "S11" ]
    c_values = ["C02"]
    #c_values = ["C01", "C02", "C03", "C04", "C05", "C06", "C07", "C08", "C09", "C10", "C11", "C12", "C13" ]
    for research_center in research_centers:
        for session in s_values:
            # Create the session path for biosignal and camera + BORIS
            camera_path = Path(root_dir).joinpath(type[0], research_center, session)
            signals_path = Path(root_dir).joinpath(type[1], research_center, session)
            if camera_path.is_dir() and signals_path.is_dir():
                # Create the meeting path for biosignal and camera + BORIS
                for camera in c_values:
                    exact_camera_path = Path(camera_path).joinpath(camera)
                    exact_signals_path = Path(signals_path).joinpath(camera)
                    if exact_camera_path.is_dir() and exact_signals_path.is_dir():
                        boris_path = Path(exact_camera_path).joinpath(boris_ext)
                        if boris_path.is_dir():
                            # Quick fix of two MAAP sessions being divided TODO: Fix this
                            if not(research_center == 'MAAP' and ((session == 'S01' and camera == 'C05') or (session == 'S03' and camera == 'C05'))):
                                mp4_pattern = []
                                if research_center == 'GUT':
                                    mp4_pattern.append(re.compile(r'^Untitled \d+\.mp4$'))
                                elif research_center == 'ITU-YU':
                                    mp4_pattern.append(re.compile(r'^ITU-C\d{2}-S\d{2}-\d{8}-Camera\.mp4$'))
                                elif research_center == 'MAAP':
                                    mp4_pattern.append(re.compile(r'^MAAP-C\d{2}-S\d{2}-\d{8}-Camera\.AVI$'))
                                    mp4_pattern.append(re.compile(r'^MAAP-C\d{2}-S\d{2}-\d{8}-Camera\.MOV$'))
                                for files_in_camera_dir in os.listdir(exact_camera_path):
                                    if any(pattern.match(files_in_camera_dir) for pattern in mp4_pattern):
                                        video_path = os.path.join(exact_camera_path, files_in_camera_dir)
                                # TODO: TRY EXCEPT FOR not matching pattern, skip iteration
                                filename = research_center + '_' + session + '_' + camera
                                boris_filename = filename + '_BORIS.csv'
                                boris_target_dir = Path(target_root_dir).joinpath(research_center, boris_filename)
                                
                                input_filename = filename + '_input.csv'
                                input_target_dir = Path(target_root_dir).joinpath(research_center, input_filename)
                                # Create combined BORIS vector
                                boris = get_boris_vector(boris_path)
                                # Create one vector of biosignals, sliced BORIS and sliced video
                                slice_vectors(video_path, exact_signals_path, input_target_dir, boris, boris_target_dir, research_center)
traverse()

Video opened successfully!
EDA:  462
TEMP:  462
HR:  462
video:  462
