In [1]:
# import cv2
from datetime import datetime

now = datetime.now()

print(f'Notebook executed: {now}')

Notebook executed: 2025-12-11 17:44:31.212036


### **Dataframe Creation**
____

#### **File List**
____

In [2]:
from pathlib import Path

files_path = "/mnt/data/StressID_Dataset/Videos"

In [3]:
def find_mp4_files(start_directory):
    """
    Recursively finds all files ending with '.mp4' (case-insensitive) 
    in the specified directory and its subdirectories.

    Args:
        start_directory (str): The folder path to start searching from.

    Returns:
        list: A list of strings, where each string is the full path 
              to an MP4 file.
    """
    # Create a 'Path' object from the input string
    base_path = Path(start_directory)
    
    # Use rglob (recursive glob) to search for a pattern
    # We use a generator expression for efficiency
    mp4_files_generator = (str(p.resolve()) for p in base_path.rglob('*.mp4'))
    
    # We create a separate generator for the uppercase extension
    MP4_files_generator = (str(p.resolve()) for p in base_path.rglob('*.MP4'))

    # Combine and return the results as a list
    all_mp4_files = list(mp4_files_generator) + list(MP4_files_generator)

    return all_mp4_files

video_files_list = find_mp4_files(files_path)

# Print the results
print(f"Found {len(video_files_list)} MP4 files:")
for file_path in video_files_list[:5]:
    print(file_path)


Found 660 MP4 files:
/mnt/data/StressID_Dataset/Videos/iqyg/iqyg_Math.mp4
/mnt/data/StressID_Dataset/Videos/iqyg/iqyg_Counting1.mp4
/mnt/data/StressID_Dataset/Videos/iqyg/iqyg_Reading.mp4
/mnt/data/StressID_Dataset/Videos/iqyg/iqyg_Breathing.mp4
/mnt/data/StressID_Dataset/Videos/iqyg/iqyg_Counting3.mp4


In [4]:
stress_situation = [
                        "counting1", "stroop", "speaking", 
                        "math", "reading", "counting2",
                        "counting3"
                    ]

#### **Paths Dataframe**
___

In [5]:
import pandas as pd

df = pd.DataFrame(data=video_files_list, columns=["path"])

df.sample(3)

Unnamed: 0,path
3,/mnt/data/StressID_Dataset/Videos/iqyg/iqyg_Br...
308,/mnt/data/StressID_Dataset/Videos/cxj0/cxj0_Vi...
294,/mnt/data/StressID_Dataset/Videos/2z7d/2z7d_Co...


In [6]:
N = 2  # number of levels you want

df_levels = df["path"].apply(
    lambda p: pd.Series(Path(p).parts[-N:])
)

df_levels.columns = [f"level_{i}" for i in range(1, N+1)]

df = df.join(df_levels)

df.sample(3)

Unnamed: 0,path,level_1,level_2
170,/mnt/data/StressID_Dataset/Videos/g9j5/._g9j5_...,g9j5,._g9j5_Counting1.mp4
638,/mnt/data/StressID_Dataset/Videos/tmvd/tmvd_Co...,tmvd,tmvd_Counting3.mp4
224,/mnt/data/StressID_Dataset/Videos/b2l8/b2l8_Re...,b2l8,b2l8_Relax.mp4


In [7]:
df_rename = df
df["level_2"] = df["level_2"].apply(lambda x: Path(x).stem)
df_rename.columns = ["path", "subject", "subject/task"]
df_rename.sample(3)

Unnamed: 0,path,subject,subject/task
211,/mnt/data/StressID_Dataset/Videos/h8r2/h8r2_St...,h8r2,h8r2_Stroop
562,/mnt/data/StressID_Dataset/Videos/wssm/wssm_Ma...,wssm,wssm_Math
410,/mnt/data/StressID_Dataset/Videos/9j3o/9j3o_Vi...,9j3o,9j3o_Video2


#### **Labels Dataframe**
______

In [8]:
df_labels = pd.read_csv("/mnt/data/StressID_Dataset/labels.csv", sep=",")

df_labels.sample(3)

Unnamed: 0,subject/task,binary-stress,affect3-class
354,h7j3_Counting3,0,0
610,uymz_Stroop,1,1
17,2hpu_Relax,0,0


#### **Full Dataframe**
_____

In [9]:
print("Number of duplicate filenames in df1:", df_rename["subject/task"].duplicated().sum())
print("Number of duplicate filenames in df2:", df_labels["subject/task"].duplicated().sum())


Number of duplicate filenames in df1: 0
Number of duplicate filenames in df2: 0


In [10]:
df_complete = df_rename.merge(
                                df_labels[["subject/task", "binary-stress"]],   # ← this slice keeps ONLY filename + label
                                on="subject/task",
                                how="left"
                                )

print(f"df_complete shape: {df_complete.shape}")
df_complete = df_complete.dropna(subset=["binary-stress"])

print(f"df_complete without NaN shape: {df_complete.shape}")
print("NaN val binary-stress:", df_complete["binary-stress"].isna().sum())
df_complete.sample(3)

df_complete shape: (660, 4)
df_complete without NaN shape: (578, 4)
NaN val binary-stress: 0


Unnamed: 0,path,subject,subject/task,binary-stress
657,/mnt/data/StressID_Dataset/Videos/v8mh/v8mh_Co...,v8mh,v8mh_Counting2,1.0
55,/mnt/data/StressID_Dataset/Videos/5f7t/5f7t_St...,5f7t,5f7t_Stroop,1.0
272,/mnt/data/StressID_Dataset/Videos/g7r2/g7r2_Vi...,g7r2,g7r2_Video1,0.0


In [11]:
df_complete["task"] = df_complete["subject/task"].str.split("_").str[-1].str.lower()

print(f"df_complete shape: {df_complete.shape}")
df_complete.sample(3)

df_complete shape: (578, 5)


Unnamed: 0,path,subject,subject/task,binary-stress,task
169,/mnt/data/StressID_Dataset/Videos/g9j5/g9j5_Co...,g9j5,g9j5_Counting1,1.0,counting1
289,/mnt/data/StressID_Dataset/Videos/y9z6/y9z6_Vi...,y9z6,y9z6_Video2,0.0,video2
500,/mnt/data/StressID_Dataset/Videos/b9w0/b9w0_Br...,b9w0,b9w0_Breathing,0.0,breathing


In [12]:
df_complete['label'] = df_complete['binary-stress'].map({0.0: 'no-stress', 1.0: 'stress'})
print(f"df_complete shape: {df_complete.shape}")
df_complete.sample(3)

df_complete shape: (578, 6)


Unnamed: 0,path,subject,subject/task,binary-stress,task,label
58,/mnt/data/StressID_Dataset/Videos/5f7t/5f7t_Co...,5f7t,5f7t_Counting1,0.0,counting1,no-stress
288,/mnt/data/StressID_Dataset/Videos/y9z6/y9z6_Co...,y9z6,y9z6_Counting3,0.0,counting3,no-stress
355,/mnt/data/StressID_Dataset/Videos/e5p4/e5p4_Re...,e5p4,e5p4_Reading,0.0,reading,no-stress


#### **Vid-to-Frame extraction**
____

In [13]:
import cv2
from pathlib import Path
import pandas as pd
from tqdm import tqdm

# Base output directory
output_base = Path("/mnt/data/StressID-img-data")
output_base.mkdir(exist_ok=True, parents=True)

TARGET_FPS = 5
TARGET_INTERVAL = 1.0 / TARGET_FPS   # 0.2 seconds

# Wrap the main for loop with tqdm
for idx, row in tqdm(df_complete.iterrows(), total=len(df_complete), desc="Processing videos"):
    video_path = Path(row["path"])
    label = row["label"]
    subject = row["subject"]
    task = row["task"]

    # Output folder structure
    output_folder = output_base / label / subject / task
    output_folder.mkdir(parents=True, exist_ok=True)

    # Open video
    cap = cv2.VideoCapture(str(video_path))
    if not cap.isOpened():
        print(f"Error opening video: {video_path}")
        continue

    next_timestamp = 0.0  # next frame time to save (seconds)
    saved_idx = 0

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Timestamp (in seconds)
        timestamp = cap.get(cv2.CAP_PROP_POS_MSEC) / 1000.0

        # Save the next scheduled frame
        if timestamp >= next_timestamp:
            # ----- Modified line: Save as .png instead of .jpg -----
            frame_file = output_folder / f"frame_{saved_idx:04d}.png"
            cv2.imwrite(str(frame_file), frame)
            # --------------------------------------------------------

            saved_idx += 1
            next_timestamp += TARGET_INTERVAL

    cap.release()


Processing videos: 100%|████████████████████████████████████████████████████████████████████████████████████████| 578/578 [1:46:14<00:00, 11.03s/it]


In [14]:
df_complete["binary-stress"].value_counts()

binary-stress
1.0    312
0.0    266
Name: count, dtype: int64

#### **Image dataframe**
___

In [22]:
from pathlib import Path

def find_files(start_directory, extension=None):
    base_path = Path(start_directory)
    pattern = f"*{extension}" if extension else "*"
    files = [str(p.resolve()) for p in base_path.rglob(pattern) 
             if not extension or p.suffix.lower() == extension.lower()]
    return sorted(files)

# Set your path and extension
files_path = "/mnt/data/StressID-img-data/"  # Change this
video_files_list = find_files(files_path, ".png")

print(f"Found {len(video_files_list)} png files:")
for file in video_files_list[:5]:
    print(file)   

Found 271597 png files:
/mnt/data/StressID-img-data/no-stress/2ea4/breathing/frame_0000.png
/mnt/data/StressID-img-data/no-stress/2ea4/breathing/frame_0001.png
/mnt/data/StressID-img-data/no-stress/2ea4/breathing/frame_0002.png
/mnt/data/StressID-img-data/no-stress/2ea4/breathing/frame_0003.png
/mnt/data/StressID-img-data/no-stress/2ea4/breathing/frame_0004.png


In [29]:
img_dataset_df = pd.DataFrame(video_files_list, columns=["path"])
img_dataset_df.sample(3)

Unnamed: 0,path
163568,/mnt/data/StressID-img-data/stress/4woj/counti...
241784,/mnt/data/StressID-img-data/stress/kkf5/video2...
264020,/mnt/data/StressID-img-data/stress/w2t5/relax/...


In [30]:
img_dataset_df["str-label"] = img_dataset_df["path"].str.split("/").str[-4]
img_dataset_df.sample(3)

Unnamed: 0,path,str-label
75608,/mnt/data/StressID-img-data/no-stress/e5p4/spe...,no-stress
22982,/mnt/data/StressID-img-data/no-stress/5f7t/cou...,no-stress
246602,/mnt/data/StressID-img-data/stress/p9i3/speaki...,stress


In [31]:
img_dataset_df['label'] = img_dataset_df['str-label'].map({'no-stress': 0.0, 'stress': 1.0})
img_dataset_df.sample(3)

Unnamed: 0,path,str-label,label
22618,/mnt/data/StressID-img-data/no-stress/5f7t/bre...,no-stress,0.0
133342,/mnt/data/StressID-img-data/no-stress/uymz/spe...,no-stress,0.0
176057,/mnt/data/StressID-img-data/stress/7h5u/speaki...,stress,1.0


In [33]:
train_df = img_dataset_df.sample(frac=0.9, random_state=42441991)
test_df = img_dataset_df.drop(train_df.index)

print(f"shape train df: {train_df.shape}")
print(f"shape test df: {test_df.shape}")

shape train df: (244437, 3)
shape test df: (27160, 3)


In [34]:
train_df.sample(3)

Unnamed: 0,path,str-label,label
39328,/mnt/data/StressID-img-data/no-stress/8i4i/bre...,no-stress,0.0
136633,/mnt/data/StressID-img-data/no-stress/v8mh/rel...,no-stress,0.0
162245,/mnt/data/StressID-img-data/stress/45lx/counti...,stress,1.0


In [35]:
test_df.sample(3)

Unnamed: 0,path,str-label,label
53192,/mnt/data/StressID-img-data/no-stress/b2l8/vid...,no-stress,0.0
10161,/mnt/data/StressID-img-data/no-stress/45lx/mat...,no-stress,0.0
9818,/mnt/data/StressID-img-data/no-stress/45lx/bre...,no-stress,0.0


In [36]:
train_df.to_csv("/mnt/data/StressID-img-data/train.csv", sep=";", index=False)
test_df.to_csv("/mnt/data/StressID-img-data/test.csv", sep=";", index=False)
img_dataset_df.to_csv("/mnt/data/StressID-img-data/dataframe.csv", sep=";", index=False)