In [1]:
import sys

sys.path.append("../src")

In [2]:
import datetime
import json
import os
import subprocess

import matplotlib.pyplot as plt
from joblib import Parallel, delayed
from moviepy.editor import VideoFileClip
from tqdm.notebook import tqdm

from data_processing.survey import *
from data_processing.video import *
from utils import *

In [3]:
# Project path registry built from ../configs/paths.json. The cells below read
# its "SurveyDB", "DataRoot" and "HeartDataRoot" entries.
paths = Paths("../configs/paths.json")

## Questionnaire

In [4]:
# SQLAlchemy-style connection string for the survey database. Single quotes
# are used for the key so the f-string does not reuse its own delimiter, which
# is a SyntaxError on Python versions older than 3.12.
db_con = f"sqlite:///{paths['SurveyDB']}"

# Participants: one row per registered user, with study-friendly column names.
user_df = pd.read_sql("SELECT * FROM user", db_con).rename(
    columns={"id": "participant_id", "game1_exp": "fifa_exp", "game2_exp": "sf_exp"}
)

# Only participants that own a folder under DataRoot are kept. Folder names
# start with the numeric participant id followed by "_".
valid_user_ids = {
    int(folder.stem.split("_")[0]) for folder in paths["DataRoot"].glob("*/")
}
user_df = user_df[user_df.participant_id.isin(valid_user_ids)].reset_index(drop=True)

# Survey submissions. The stored timestamps are parsed as ISO 8601 and then
# interpreted as US/Eastern wall-clock time (tz_localize requires naive input).
answer_df = pd.read_sql("SELECT * FROM answer", db_con).rename(
    columns={"id": "submission_id", "user_id": "participant_id"}
)
answer_df = answer_df.assign(
    start_ts=pd.to_datetime(answer_df.start_ts, format="ISO8601").dt.tz_localize(
        "US/Eastern"
    ),
    end_ts=pd.to_datetime(answer_df.end_ts, format="ISO8601").dt.tz_localize(
        "US/Eastern"
    ),
)

# Lookup tables used to turn game / level ids into readable labels.
game_df = pd.read_sql("SELECT * FROM game", db_con).rename(
    columns={"id": "game_id", "title": "game", "year": "game_release_year"}
)
level_df = pd.read_sql("SELECT * FROM level", db_con).rename(
    columns={"id": "level_id", "name": "difficulty"}
)

In [5]:
# Demographic / experience columns published with the dataset.
participant_columns = ["participant_id", "age", "sex", "fifa_exp", "sf_exp"]
df_participants = user_df[participant_columns]
df_participants.to_csv("../data/Dataset/Questionnaire/participants.csv", index=False)
df_participants

Unnamed: 0,participant_id,age,sex,fifa_exp,sf_exp
0,118,26,M,1,1
1,120,21,M,0,0
2,138,31,M,1,0
3,146,26,F,1,1
4,166,20,M,0,2
5,183,26,M,1,0
6,267,22,M,0,1
7,291,22,M,0,1
8,297,22,M,2,0
9,314,22,M,0,0


In [7]:
# Columns published for every survey submission.
submission_columns = [
    "submission_id",
    "participant_id",
    "game",
    "difficulty",
    "session_no",
    "start_ts",
    "end_ts",
    "engagement",
    "interest",
    "stress",
    "excitement",
]

# Attach the game title and difficulty name to each answer (the merges join on
# the columns the frames have in common), then keep only participants that
# have recorded data.
submissions_labelled = answer_df.merge(game_df).merge(level_df)
df_submissions = submissions_labelled[submission_columns]
df_submissions = df_submissions.loc[df_submissions.participant_id.isin(valid_user_ids)]
df_submissions.to_csv(
    "../data/Dataset/Questionnaire/submissions.csv",
    index=False,
    date_format='%Y-%m-%d %H:%M:%S%z',
)
df_submissions

Unnamed: 0,submission_id,participant_id,game,difficulty,session_no,start_ts,end_ts,engagement,interest,stress,excitement
37,39,183,FIFA23,World Class,1,2023-10-23 11:34:04-04:00,2023-10-23 11:37:29-04:00,4,2,2,1
38,40,183,FIFA23,World Class,2,2023-10-23 11:38:12-04:00,2023-10-23 11:39:25-04:00,4,2,2,1
39,41,183,FIFA23,Semi-Pro,1,2023-10-23 11:45:34-04:00,2023-10-23 11:46:31-04:00,3,2,1,0
40,42,183,FIFA23,Semi-Pro,2,2023-10-23 11:46:55-04:00,2023-10-23 11:47:59-04:00,3,3,1,1
41,43,183,FIFA23,Legendary,1,2023-10-23 11:49:29-04:00,2023-10-23 11:50:25-04:00,3,2,1,1
...,...,...,...,...,...,...,...,...,...,...,...
970,974,458,Street Fighter V,(1),2,2023-11-13 15:24:16-05:00,2023-11-13 15:24:59-05:00,3,3,0,2
971,975,458,Street Fighter V,(1),3,2023-11-13 15:25:18-05:00,2023-11-13 15:25:47-05:00,3,2,0,2
972,976,458,Street Fighter V,(2),1,2023-11-13 15:26:37-05:00,2023-11-13 15:27:19-05:00,2,3,1,1
973,977,458,Street Fighter V,(2),2,2023-11-13 15:27:40-05:00,2023-11-13 15:28:20-05:00,3,3,0,1


## OBS (Webcam)

In [8]:
def extract_clip(video, session_info, creation_datetime, trim_start=0, trim_end=0):
    """Cut the portion of ``video`` that covers one gaming session.

    Args:
        video: Clip object exposing ``subclip(start, end)``.
        session_info: Record with ``start_ts`` and ``end_ts`` attributes.
        creation_datetime: Reference moment passed to
            ``timestamps_to_time_offsets`` together with the two timestamps.
        trim_start: Amount added to the start offset (default 0).
        trim_end: Amount subtracted from the end offset (default 0).

    Returns:
        The sub-clip between the trimmed start and end offsets.
    """
    offsets = timestamps_to_time_offsets(
        session_info.start_ts, session_info.end_ts, creation_datetime
    )
    clip_start, clip_end = offsets
    # Trimming lets the caller compensate for delays in the logged timestamps;
    # with the defaults the clip spans the full logged session.
    return video.subclip(clip_start + trim_start, clip_end - trim_end)


def process_game_sessions(video_path, sessions_info, creation_datetime=None):
    """Build one sub-clip plus its survey labels for every session of a recording.

    Args:
        video_path: Location of the participant's recording.
        sessions_info: DataFrame with one row per session; each row must provide
            ``id``, ``user_id``, ``start_ts``, ``end_ts``, ``engagement``,
            ``interest``, ``stress`` and ``excitement``.
        creation_datetime: Moment the recording started. When ``None`` it is
            obtained from ``get_video_creation_datetime(video_path)``.

    Returns:
        A list of dicts with the keys ``clip``, ``ans_id``, ``user_id``,
        ``engagement``, ``interest``, ``stress`` and ``excitement``.

    Note:
        The returned clips are cut from the video opened here, so the video is
        deliberately left open on success; the caller should close the clips
        once it has finished with them.
    """
    video = VideoFileClip(str(video_path))
    try:
        if creation_datetime is None:
            creation_datetime = get_video_creation_datetime(video_path)

        data = []
        # iterrows() is a generator, so the total is passed explicitly to give
        # the progress bar a length.
        rows = tqdm(
            sessions_info.iterrows(),
            total=len(sessions_info),
            desc="Gaming Session",
            leave=False,
        )
        for _, row in rows:
            clip = extract_clip(video, row, creation_datetime, trim_start=0, trim_end=0)
            data.append(
                {
                    "clip": clip,
                    "ans_id": row.id,
                    "user_id": row.user_id,
                    "engagement": row.engagement,
                    "interest": row.interest,
                    "stress": row.stress,
                    "excitement": row.excitement,
                }
            )
    except Exception:
        # Nothing is handed back on failure, so release the reader here rather
        # than leaking it.
        video.close()
        raise
    return data


def process_participant_recording(path, out_dir):
    """Split one participant's recording into per-session video files.

    The participant id is the part of the folder name before the first ``_``.
    Each session is written to
    ``<out_dir>/<user_id>/<user_id>_<answer id>_<EN>_<IN>_<ST>_<EX>.mp4`` where
    the last four fields are the engagement, interest, stress and excitement
    answers of that session.

    Args:
        path: Participant folder; it must contain exactly one ``.mp4`` file.
        out_dir: Root folder for the exported clips.

    Returns:
        ``None``. A warning is issued and nothing is written when the folder
        does not contain exactly one ``.mp4`` file.
    """
    # Imported locally so the function does not depend on a star import
    # happening to expose the module.
    import warnings

    user_id = int(os.path.split(path)[-1].split("_")[0])
    sessions_info = load_participant_answers(user_id, db_path=paths["SurveyDB"])

    mp4_files = [file for file in os.listdir(path) if file.endswith(".mp4")]
    if len(mp4_files) != 1:
        warnings.warn(f"Incorrect number of videos for user: {user_id}")
        return None

    video_path = os.path.join(path, mp4_files[0])
    data = process_game_sessions(video_path, sessions_info)
    if not data:
        return None

    # The destination is the same for every session of this participant.
    output_folder = os.path.join(out_dir, str(user_id))
    os.makedirs(output_folder, exist_ok=True)

    try:
        for session in tqdm(data, desc="Writing videos", leave=False):
            ID = session["ans_id"]
            EN = session["engagement"]
            IN = session["interest"]
            ST = session["stress"]
            EX = session["excitement"]
            target_name = f"{user_id}_{ID}_{EN}_{IN}_{ST}_{EX}.mp4"

            session["clip"].write_videofile(
                os.path.join(output_folder, target_name),
                verbose=False,
                audio=True,
                logger=None,
            )
    finally:
        # Release the underlying readers. This is done only after every clip
        # has been written because the clips are all cut from the same source
        # video (presumably sharing its reader — closing earlier could break
        # the remaining writes).
        for session in data:
            session["clip"].close()

In [8]:
# Export the per-session webcam clips for every entry of DataRoot, running the
# participants concurrently on joblib's threading backend.
root = paths["DataRoot"]
recording_jobs = (
    delayed(process_participant_recording)(
        os.path.join(root, p), "../data/Dataset/Samples/OBS"
    )
    for p in os.listdir(root)
)
Parallel(n_jobs=-1, backend="threading")(recording_jobs);

## EEG

In [11]:
def load_eeg_df(path):
    """Read an EEG CSV export and make its timestamps timezone-aware.

    The first line of the file is skipped; the header is taken from the second
    line. The ``Timestamp`` column is read as seconds since the Unix epoch
    (UTC) and converted to US/Eastern.

    Args:
        path: Location of the CSV file.

    Returns:
        The loaded DataFrame with ``Timestamp`` as tz-aware datetimes.
    """
    eeg = pd.read_csv(path, skiprows=1)
    utc_times = pd.to_datetime(eeg["Timestamp"], unit="s").dt.tz_localize("utc")
    eeg["Timestamp"] = utc_times.dt.tz_convert("US/Eastern")
    return eeg


def extract_and_save_sessions(eeg_df, sessions_info, output_dir, subfolder=""):
    """Write one CSV per session containing the rows recorded during it.

    For every row of ``sessions_info`` the rows of ``eeg_df`` whose
    ``Timestamp`` lies between ``start_ts`` and ``end_ts`` (both inclusive) are
    saved to ``<output_dir>/<user_id>[/<SUBFOLDER>]/<name>.csv``, where the
    name is built from the session's user id, id and its engagement, interest,
    stress and excitement values joined by ``_``.

    Args:
        eeg_df: DataFrame with a ``Timestamp`` column.
        sessions_info: DataFrame with ``user_id``, ``id``, ``start_ts``,
            ``end_ts``, ``engagement``, ``interest``, ``stress`` and
            ``excitement`` columns.
        output_dir: Root folder for the exported files.
        subfolder: Optional folder name placed below the user folder; it is
            upper-cased before use.
    """
    name_fields = ("user_id", "id", "engagement", "interest", "stress", "excitement")

    for _, session in sessions_info.iterrows():
        # Rows recorded while the session was running (bounds included).
        in_session = eeg_df["Timestamp"].between(session["start_ts"], session["end_ts"])

        # Destination folder, created on demand.
        target_dir = os.path.join(output_dir, str(session["user_id"]))
        if subfolder:
            target_dir = os.path.join(target_dir, subfolder.upper())
        os.makedirs(target_dir, exist_ok=True)

        # File name carrying the ids and the four survey answers.
        stem = "_".join(f"{session[field]}" for field in name_fields)
        eeg_df[in_session].to_csv(os.path.join(target_dir, f"{stem}.csv"), index=False)


def process_participant_dfs(path, output_dir, prefix, loading_func):
    """Load one modality for a participant and export it session by session.

    Args:
        path: Participant folder; the id is the part of its name before the
            first ``_``.
        output_dir: Root folder handed to ``extract_and_save_sessions``.
        prefix: Text that must occur in the recording file names; also used as
            the sub-folder name for the exported files.
        loading_func: Callable turning one recording file into a DataFrame with
            a ``Timestamp`` column.

    Raises:
        ValueError: From ``pd.concat`` when no file in the folder matches
            ``*<prefix>*.*sv``.
    """
    participant_dir = Path(path)
    user_id = int(participant_dir.stem.split("_")[0])
    sessions_info = load_participant_answers(user_id, db_path=paths["SurveyDB"])

    # Every matching recording is loaded and stacked into a single frame.
    frames = [
        loading_func(recording)
        for recording in participant_dir.glob(f"*{prefix}*.*sv")
    ]
    eeg_df = pd.concat(frames)

    extract_and_save_sessions(eeg_df, sessions_info, output_dir, subfolder=prefix)

In [6]:
# Export the EEG recordings of every entry of DataRoot, one CSV per session.
root = paths["DataRoot"]
samples_dir = "../data/Dataset/Samples/"
for p in tqdm(os.listdir(root), leave=False):
    participant_path = os.path.join(root, p)
    process_participant_dfs(participant_path, samples_dir, "EEG", load_eeg_df)

  0%|          | 0/38 [00:00<?, ?it/s]

## Eye Tracking

In [20]:
def load_gazepoint_df(path):
    """Read an eye-tracker CSV export and attach absolute timestamps.

    The name of the fourth column embeds the recording start (the text between
    its first five characters and its final character), while its values are
    added to that start as seconds. The resulting wall-clock times are
    labelled as US/Eastern.

    Args:
        path: Location of the CSV file.

    Returns:
        A DataFrame holding ``Timestamp`` followed by the selected gaze, pupil,
        blink and saccade columns.
    """
    kept_columns = [
        "Timestamp",
        "VID_FRAME",
        # fixation point of gaze
        "FPOGX", "FPOGY", "FPOGS", "FPOGD", "FPOGID", "FPOGV",
        # best point of gaze
        "BPOGX", "BPOGY", "BPOGV",
        # left pupil
        "LPCX", "LPCY", "LPD", "LPS", "LPV",
        # right pupil
        "RPCX", "RPCY", "RPD", "RPS", "RPV",
        # blinks
        "BKID", "BKDUR", "BKPMIN",
        # pupil size in millimetres
        "LPMM", "LPMMV", "RPMM", "RPMMV",
        "PIXS", "PIXV",
        # saccades
        "SACCADE_MAG", "SACCADE_DIR",
    ]

    raw = pd.read_csv(path)
    time_col = raw.columns[3]
    # Recording start taken from the column name, as epoch seconds.
    start_epoch = pd.to_datetime(time_col[5:-1]).timestamp()
    epoch_seconds = start_epoch + raw[time_col]
    timestamps = pd.to_datetime(epoch_seconds, unit="s").dt.tz_localize("US/Eastern")
    return raw.assign(Timestamp=timestamps)[kept_columns]

In [23]:
# Export the eye-tracking recordings of every entry of DataRoot, one CSV per
# session. The output root is "../data/Dataset/Samples/" so the files land next
# to the other modalities exported by this notebook instead of in a separate
# "data" folder.
root = paths["DataRoot"]
for p in tqdm(os.listdir(root), leave=False):
    process_participant_dfs(
        os.path.join(root, p), "../data/Dataset/Samples/", "eye", load_gazepoint_df
    )

  0%|          | 0/38 [00:00<?, ?it/s]

## Gamepad inputs

In [13]:
def load_controller_input_df(path):
    """Read a tab-separated, header-less gamepad log.

    The four columns are named ``Timestamp``, ``Event``, ``EventType`` and
    ``State``; the timestamps are parsed and labelled as US/Eastern.

    Args:
        path: Location of the log file.

    Returns:
        The log as a DataFrame with tz-aware ``Timestamp`` values.

    Raises:
        ValueError: When the file does not have exactly four columns.
    """
    column_names = ["Timestamp", "Event", "EventType", "State"]
    events = pd.read_csv(path, sep="\t", header=None).set_axis(column_names, axis=1)
    parsed = pd.to_datetime(events["Timestamp"])
    events["Timestamp"] = parsed.dt.tz_localize("US/Eastern")
    return events

In [76]:
# Export the gamepad logs of every entry of DataRoot, one CSV per session.
# A ValueError (for instance pd.concat finding no matching log file) is
# reported as missing controller data and the loop moves on.
root = paths["DataRoot"]
samples_dir = "../data/Dataset/Samples/"
for p in tqdm(os.listdir(root), leave=False):
    participant_path = os.path.join(root, p)
    try:
        process_participant_dfs(
            participant_path, samples_dir, "xbox", load_controller_input_df
        )
    except ValueError:
        print(f"User: {p.split('_')[0]} missing controller data")

  0%|          | 0/38 [00:00<?, ?it/s]

User: 120 missing controller data
User: 166 missing controller data
User: 462 missing controller data
User: 539 missing controller data
User: 623 missing controller data
User: 703 missing controller data
User: 754 missing controller data
User: 507 missing controller data
User: 514 missing controller data
User: 744 missing controller data


## Heart rate

In [10]:
# NOTE(review): `json` is already imported in the notebook's import cell.
# `from datetime import datetime` rebinds the name `datetime`, which that cell
# bound to the module via `import datetime`; any cell executed after this one
# sees the class instead of the module.
import json
from datetime import datetime


def _load_hr_day(json_path):
    """Read one daily heart-rate JSON export into a DataFrame.

    Each record must provide ``dateTime`` (``%m/%d/%y %H:%M:%S``, taken as UTC)
    and a ``value`` mapping with ``bpm`` and ``confidence``. The result has the
    columns ``Timestamp`` (converted to US/Eastern), ``BPM`` and ``Confidence``;
    an empty export yields an empty frame that still has those columns.
    """
    with open(json_path) as f:
        hr_data = json.load(f)

    hr_df = pd.DataFrame(
        {
            "Timestamp": [d["dateTime"] for d in hr_data],
            "BPM": [d["value"]["bpm"] for d in hr_data],
            "Confidence": [d["value"]["confidence"] for d in hr_data],
        }
    )
    # Parse the whole column at once instead of one record at a time.
    hr_df["Timestamp"] = (
        pd.to_datetime(hr_df["Timestamp"], format="%m/%d/%y %H:%M:%S")
        .dt.tz_localize("UTC")
        .dt.tz_convert("US/Eastern")
    )
    return hr_df


def extract_and_save_hr_sessions(sessions_info, hr_folder, output_dir):
    """Write one heart-rate CSV per session.

    For every row of ``sessions_info`` the file
    ``heart_rate-<YYYY-MM-DD>.json`` matching the date of ``start_ts`` is read
    from ``hr_folder`` and the samples between ``start_ts`` and ``end_ts``
    (both inclusive) are saved to
    ``<output_dir>/<user_id>/HR/<user_id>_<id>_<engagement>_<interest>_<stress>_<excitement>.csv``.

    Args:
        sessions_info: DataFrame with ``user_id``, ``id``, ``start_ts``,
            ``end_ts``, ``engagement``, ``interest``, ``stress`` and
            ``excitement`` columns.
        hr_folder: Folder holding the daily JSON exports.
        output_dir: Root folder for the exported files.

    Raises:
        FileNotFoundError: When the daily export for a session is missing.
    """
    # Several sessions usually fall on the same day; each daily file is read
    # and parsed only once per call.
    day_cache = {}

    for _, session in sessions_info.iterrows():
        # NOTE(review): the file is picked by the date of start_ts as stored in
        # sessions_info, while the samples are UTC-stamped — confirm how the
        # exports are split into days.
        session_date = session["start_ts"].strftime("%Y-%m-%d")
        json_file = f"heart_rate-{session_date}.json"
        json_path = os.path.join(hr_folder, json_file)

        if json_path not in day_cache:
            day_cache[json_path] = _load_hr_day(json_path)
        hr_df = day_cache[json_path]

        mask = (hr_df["Timestamp"] >= session["start_ts"]) & (
            hr_df["Timestamp"] <= session["end_ts"]
        )
        segment = hr_df[mask]

        user_dir = os.path.join(output_dir, str(session["user_id"]), "HR")
        os.makedirs(user_dir, exist_ok=True)

        filename = f"{session['user_id']}_{session['id']}_{session['engagement']}_{session['interest']}_{session['stress']}_{session['excitement']}.csv"
        filepath = os.path.join(user_dir, filename)
        segment.to_csv(filepath, index=False)


def process_participant_hr(path, output_dir):
    """Export the heart-rate samples of one participant, session by session.

    Args:
        path: Participant folder; only its name is used, the id being the part
            before the first ``_``.
        output_dir: Root folder handed to ``extract_and_save_hr_sessions``.
    """
    participant_id = int(Path(path).stem.split("_")[0])
    answers = load_participant_answers(participant_id, db_path=paths["SurveyDB"])

    # The heart-rate exports are read from the shared "HeartDataRoot" folder,
    # not from the participant folder.
    extract_and_save_hr_sessions(answers, paths["HeartDataRoot"], output_dir)

In [12]:
# Export the heart-rate samples for every entry of DataRoot, one CSV per
# session. A missing daily export surfaces as FileNotFoundError from open(),
# so it is caught alongside ValueError; otherwise the first participant without
# heart-rate data would abort the whole loop instead of being reported.
root = paths["DataRoot"]
for p in tqdm(os.listdir(root), leave=False):
    try:
        process_participant_hr(os.path.join(root, p), "../data/Dataset/Samples/")
    except (ValueError, FileNotFoundError):
        print(f"User: {p.split('_')[0]} missing Heart Rate data")

  0%|          | 0/39 [00:00<?, ?it/s]