In [None]:
# Colab-only: opens a browser file picker and saves the chosen file(s) into the
# runtime's current working directory.
from google.colab import files
files.upload()  # pick colab_Dataset.zip -- the archive name the following unzip cell expects


Saving colab_Dataset.zip to colab_Dataset.zip
Buffered data was truncated after reaching the output size limit.

In [None]:
!unzip -q "colab_Dataset.zip"

In [None]:
# Presumably the extra dependencies of colab_Dataset/script.py, which a later cell
# launches with `!python` (hence the shell `pip` here) -- TODO confirm and pin versions.
!pip install seaborn scikit-learn tqdm




In [None]:
# run this entire cell in Colab
import glob
import ntpath
import os
import re
import shutil
from collections import defaultdict

import pandas as pd

# Remap the `metadata` column of the subject CSV (absolute Windows paths recorded
# on the original machine) to JSON files that actually exist in this workspace,
# copy the matched JSONs into TARGET_DIR and write a new CSV pointing at them.

CSV_PATH = "colab_Dataset/group_01_experiment_03_subject_01.csv"  # adjust if needed
OUT_CSV = "group_01_experiment_03_subject_01_colab_paths.csv"
TARGET_DIR = "metadata_matched"
os.makedirs(TARGET_DIR, exist_ok=True)


def _csv_basename(value):
    r"""Return the file-name component of a metadata path taken from the CSV.

    The CSV stores Windows-style paths (``C:\...\10_56_17_027710.json``). On a
    POSIX host ``os.path.basename`` does not treat ``\`` as a separator and
    returns the whole string, so ``ntpath.basename`` is used: it splits on both
    ``\`` and ``/`` regardless of the host OS.
    """
    return ntpath.basename(str(value).strip()).strip()


def _digit_token(name):
    """Join every run of digits in *name*: "10_56_41_666940.json" -> "105641666940"."""
    return "".join(re.findall(r"\d+", name))


# 1) load csv and inspect metadata column
df = pd.read_csv(CSV_PATH)
meta_vals = df['metadata'].dropna().astype(str).tolist()
# unique values in first-seen order, so matching and reporting are reproducible
unique_meta = list(dict.fromkeys(meta_vals))
print("Total rows in CSV:", len(df))
print("Unique metadata entries in CSV:", len(unique_meta))
print("\nSample CSV metadata values (first 10):")
for s in meta_vals[:10]:
    print("  ", s)

# 2) list JSONs actually present in workspace (sorted: glob order is arbitrary
#    and the first candidate wins below)
all_jsons = sorted(glob.glob("**/*.json", recursive=True))
# exclude files inside target dir to avoid confusion
all_jsons = [p for p in all_jsons if TARGET_DIR not in p.split(os.sep)]
print("\nTotal JSON files found in workspace:", len(all_jsons))
print("Example JSON files (first 20):")
for p in all_jsons[:20]:
    print("  ", p)

# build handy indices
basename_map = defaultdict(list)   # json basename -> workspace paths
numtoken_map = defaultdict(list)   # digit token   -> workspace paths
json_names = []                    # (basename, path), computed once for the substring fallback
for p in all_jsons:
    b = os.path.basename(p)
    json_names.append((b, p))
    basename_map[b].append(p)
    tok = _digit_token(b)
    if tok:
        numtoken_map[tok].append(p)

# 3) match every distinct CSV metadata value to a workspace JSON
mapped = {}   # original csv metadata string -> matched path (Colab)
unmatched = []

for orig in unique_meta:
    b = _csv_basename(orig)
    if not b:
        # an empty name would be a substring of every file and match arbitrarily
        unmatched.append(orig)
        continue
    # strategy 1: exact basename
    if b in basename_map:
        mapped[orig] = basename_map[b][0]
        continue
    # strategy 2: numeric token match
    tok = _digit_token(b)
    if tok and tok in numtoken_map:
        mapped[orig] = numtoken_map[tok][0]
        continue
    # strategy 3: substring containment (json basename contains the CSV name or vice versa)
    candidates = [p for name, p in json_names if b in name or name in b]
    if candidates:
        mapped[orig] = candidates[0]
        continue
    # no match
    unmatched.append(orig)

# copy matched files into TARGET_DIR with the CSV basename (so preprocessing can use metadata_matched/<basename>)
# NOTE: two CSV entries sharing a basename would share one destination file; the first copy wins.
copied = 0            # files actually written during this run
already_present = 0   # destinations left untouched because they already existed
for orig, src in mapped.items():
    dest = os.path.join(TARGET_DIR, _csv_basename(orig))
    if os.path.exists(dest):
        already_present += 1
        continue
    try:
        shutil.copyfile(src, dest)
        copied += 1
    except OSError as e:
        # best effort: report and keep going with the remaining files
        print("Failed to copy:", src, "->", dest, e)

print(f"\nMatched: {len(mapped)} entries. Copied {copied} files into {TARGET_DIR}/ "
      f"({already_present} already present).")
print(f"Unmatched: {len(unmatched)} entries.")

if unmatched:
    print("\nFirst 20 unmatched examples (from CSV):")
    for u in unmatched[:20]:
        print("  ", u)

# 4) create new CSV with updated metadata paths (pointing to metadata_matched/<basename> when matched)
df2 = df.copy()
def map_to_colab_path(orig):
    """Translate one `metadata` cell to its path under TARGET_DIR.

    Returns the value unchanged when it is missing (NaN) or when no workspace
    JSON was matched for it, so unresolved rows stay visible in the output CSV.
    """
    if pd.isna(orig):
        return orig
    # keys of `mapped` were built from str() values, so look up the same way
    if str(orig) in mapped:
        return os.path.join(TARGET_DIR, _csv_basename(orig))
    return orig  # leave original (so you can see unresolved ones)

df2['metadata'] = df2['metadata'].apply(map_to_colab_path)
df2.to_csv(OUT_CSV, index=False)
print("\nWrote updated CSV with Colab metadata paths:", OUT_CSV)


Total rows in CSV: 2597
Unique metadata entries in CSV: 2597

Sample CSV metadata values (first 10):
   C:\Users\azans\OneDrive\Desktop\DS_D1\group_01\experiment_03\subject_01\metadata\10_56_17_027710.json
   C:\Users\azans\OneDrive\Desktop\DS_D1\group_01\experiment_03\subject_01\metadata\10_56_17_175583.json
   C:\Users\azans\OneDrive\Desktop\DS_D1\group_01\experiment_03\subject_01\metadata\10_56_17_275580.json
   C:\Users\azans\OneDrive\Desktop\DS_D1\group_01\experiment_03\subject_01\metadata\10_56_17_375644.json
   C:\Users\azans\OneDrive\Desktop\DS_D1\group_01\experiment_03\subject_01\metadata\10_56_17_476041.json
   C:\Users\azans\OneDrive\Desktop\DS_D1\group_01\experiment_03\subject_01\metadata\10_56_17_576104.json
   C:\Users\azans\OneDrive\Desktop\DS_D1\group_01\experiment_03\subject_01\metadata\10_56_17_675687.json
   C:\Users\azans\OneDrive\Desktop\DS_D1\group_01\experiment_03\subject_01\metadata\10_56_17_775627.json
   C:\Users\azans\OneDrive\Desktop\DS_D1\group_01\experimen

In [None]:
# Run the preprocessing script shipped inside the unzipped dataset folder. Per the
# recorded log it loads the *_colab_paths.csv, samples 15% of the rows, reads the
# metadata JSONs, writes cleaned_subject01.csv and saves plots to preproc_visuals/.
# NOTE(review): the log shows the CSV being read from /content/project/... --
# confirm that this is where the previous cell wrote OUT_CSV.
!python colab_Dataset/script.py


Using CSV: /content/project/group_01_experiment_03_subject_01_colab_paths.csv
Loading CSV...
CSV shape: (2597, 15)
Using metadata_dir: colab_Dataset/metadata (contains 2597 json files)
Sampling 15.0% -> new shape (390, 15)
Processing rows and metadata JSONs...
100% 390/390 [00:01<00:00, 267.24it/s]
Cleaned dataframe shape: (390, 23)
After dropping rows missing both maj_emotion & maj_attention: (390, 23)
Saved cleaned CSV -> cleaned_subject01.csv
Creating visualizations...
Saved: preproc_visuals/maj_emotion_distribution.png
Saved: preproc_visuals/maj_attention_pie.png
Saved: preproc_visuals/headpose_yaw_vs_pitch.png
Saved: preproc_visuals/agreement_emotion_hist.png
Saved: preproc_visuals/confusion_majority_vs_self_emotion.png
Saved visuals to preproc_visuals
Preprocessing complete.
