### Converting mp4 to wav files

In [19]:
# import os
# import subprocess

# def convert_m4a_to_wav(input_folder, output_folder=None, sample_rate=16000):
#     """
#     Convert all .m4a files in input_folder to .wav using ffmpeg.
    
#     Args:
#         input_folder (str): Path containing .m4a files.
#         output_folder (str): Destination folder for .wav files (defaults to input folder).
#         sample_rate (int): Desired sample rate for output wav files (default 16000 Hz).
#     """
#     if output_folder is None:
#         output_folder = input_folder

#     if not os.path.exists(output_folder):
#         os.makedirs(output_folder)

#     for file_name in os.listdir(input_folder):
#         if file_name.lower().endswith(".m4a"):
#             input_path = os.path.join(input_folder, file_name)
#             output_name = os.path.splitext(file_name)[0] + ".wav"
#             output_path = os.path.join(output_folder, output_name)

#             command = [
#                 "ffmpeg",
#                 "-y",                 # overwrite without asking
#                 "-i", input_path,     # input file
#                 "-ar", str(sample_rate),  # resample
#                 output_path
#             ]

#             print(f"Converting: {input_path} -> {output_path}")
#             subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

# # Example usage
# if __name__ == "__main__":
#     convert_m4a_to_wav(r"O:\\MyThesis\\DataSets\\Michal\\Video\\",
#                        r"O:\\MyThesis\\DataSets\\Michal\\Audio")


import os
import subprocess
from pathlib import Path

# -------------------------------------------------
# CONFIGURATION
# -------------------------------------------------
INPUT_ROOT  = r"O:\\MyThesis\\DataSets\\Michal\\Video\\"   # folder with mp4 files
OUTPUT_ROOT = r"O:\\MyThesis\\DataSets\\Michal\\Audio\\"     # where wav files will be saved

SAMPLE_RATE = 16000   # Hz (good for speech / ML)
CHANNELS    = 1       # 1 = mono, 2 = stereo
OVERWRITE   = False   # True = overwrite existing wav files
# -------------------------------------------------


def check_ffmpeg():
    """Check that ffmpeg is installed and available."""
    try:
        subprocess.run(
            ["ffmpeg", "-version"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=True
        )
    except Exception:
        raise RuntimeError(
            "FFmpeg is not installed or not in PATH.\n"
            "Install FFmpeg and restart the terminal / VS Code."
        )


def convert_tree_mp4_to_wav(input_root, output_root):
    input_root = Path(input_root).resolve()
    output_root = Path(output_root).resolve()

    mp4_files = list(input_root.rglob("*.mp4"))

    print(f"Found {len(mp4_files)} mp4 files")

    for mp4_path in mp4_files:
        # Keep folder structure
        relative_path = mp4_path.relative_to(input_root)

        # Change extension to .wav
        wav_path = output_root / relative_path.with_suffix(".wav")

        # Create output folders if needed
        wav_path.parent.mkdir(parents=True, exist_ok=True)

        if wav_path.exists() and not OVERWRITE:
            print(f"SKIP: {wav_path}")
            continue

        cmd = [
            "ffmpeg",
            "-y" if OVERWRITE else "-n",
            "-i", str(mp4_path),
            "-vn",                       # no video
            "-ac", str(CHANNELS),
            "-ar", str(SAMPLE_RATE),
            "-c:a", "pcm_s16le",         # WAV PCM 16-bit
            str(wav_path)
        ]

        try:
            subprocess.run(cmd, check=True)
            print(f"OK: {mp4_path} -> {wav_path}")
        except subprocess.CalledProcessError:
            print(f"ERROR converting: {mp4_path}")

    print("Conversion finished.")


if __name__ == "__main__":
    check_ffmpeg()
    convert_tree_mp4_to_wav(INPUT_ROOT, OUTPUT_ROOT)


Found 217 mp4 files
OK: O:\MyThesis\DataSets\Michal\Video\100\Convers_100.mp4 -> O:\MyThesis\DataSets\Michal\Audio\100\Convers_100.wav
OK: O:\MyThesis\DataSets\Michal\Video\100\Inten_100.mp4 -> O:\MyThesis\DataSets\Michal\Audio\100\Inten_100.wav
OK: O:\MyThesis\DataSets\Michal\Video\100\Spon_100.mp4 -> O:\MyThesis\DataSets\Michal\Audio\100\Spon_100.wav
OK: O:\MyThesis\DataSets\Michal\Video\101\Convers_101.mp4 -> O:\MyThesis\DataSets\Michal\Audio\101\Convers_101.wav
OK: O:\MyThesis\DataSets\Michal\Video\101\Inten_101.mp4 -> O:\MyThesis\DataSets\Michal\Audio\101\Inten_101.wav
OK: O:\MyThesis\DataSets\Michal\Video\101\Spon_101.mp4 -> O:\MyThesis\DataSets\Michal\Audio\101\Spon_101.wav
OK: O:\MyThesis\DataSets\Michal\Video\102\Convers_102.mp4 -> O:\MyThesis\DataSets\Michal\Audio\102\Convers_102.wav
OK: O:\MyThesis\DataSets\Michal\Video\102\Inten_102.mp4 -> O:\MyThesis\DataSets\Michal\Audio\102\Inten_102.wav
OK: O:\MyThesis\DataSets\Michal\Video\102\Spon_102.mp4 -> O:\MyThesis\DataSets\Micha

### Loading wav file and creating csv file results

In [21]:
import os, math, csv, traceback
from pathlib import Path
import numpy as np
import librosa, soundfile as sf
import torch
from speechbrain.pretrained import SpeakerRecognition
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

# ======================= Config =======================
INPUT_DIR  = r"O:\\MyThesis\\DataSets\\Michal\\Audio\\"     # where wav files will be saved
OUTPUT_DIR = r"O:\\MyThesis\\DataSets\\Michal\\outputsResults" # where CSVs will be written
RECURSIVE = True                                           # search subfolders
SKIP_IF_EXISTS = True                                      # skip files with an existing CSV
FORCE_NUM_SPEAKERS = 2                                     # set to int (e.g., 2) or None to auto-estimate (2..5)

# VAD & windowing
VAD_TOP_DB = 25            # lower → more sensitive VAD (20–35 sensible range)
MIN_SPEECH_SEC = 0.30      # drop micro-blips
WIN_SEC = 1.5              # embedding window length (s)
HOP_SEC = 0.75             # hop (s) → 50% overlap
MERGE_GAP_SEC = 0.20       # merge adjacent same-speaker segments if gap < this
# ======================================================

def fmt_time_ss_msec(t: float) -> str:
    t = max(0.0, float(t))
    s = int(t)
    ms = int(round((t - s) * 1000))
    if ms == 1000:
        s += 1; ms = 0
    return f"{s}.{ms:03d}"

def get_speech_segments(y, sr, top_db=25, min_len=0.3, frame_length=2048, hop_length=512):
    """Return non-silent (start_s, end_s) intervals via librosa.effects.split."""
    intervals = librosa.effects.split(y, top_db=top_db, frame_length=frame_length, hop_length=hop_length)
    segs = []
    for a, b in intervals:
        s, e = a / sr, b / sr
        if e - s >= min_len:
            segs.append((s, e))
    return segs

def sliding_windows(seg_start, seg_end, win, hop):
    t = seg_start
    out = []
    while t < seg_end:
        tend = min(t + win, seg_end)
        if tend - t >= 0.5:  # need at least 0.5s for stable embedding
            out.append((t, tend))
        t += hop
    return out

def merge_same_label(segments, gap=0.2):
    """segments: list of (start, end, label), sorted by time; merge contiguous/nearby same labels."""
    if not segments:
        return []
    merged = []
    cur_s, cur_e, cur_l = segments[0]
    for s, e, l in segments[1:]:
        if l == cur_l and 0 <= s - cur_e <= gap:
            cur_e = max(cur_e, e)
        else:
            merged.append((cur_s, cur_e, cur_l))
            cur_s, cur_e, cur_l = s, e, l
    merged.append((cur_s, cur_e, cur_l))
    return merged

def pick_top_two_by_talktime(segments):
    totals = {}
    for s, e, l in segments:
        totals[l] = totals.get(l, 0.0) + (e - s)
    items = sorted(totals.items(), key=lambda kv: kv[1], reverse=True)
    top = [lab for lab, _ in items[:2]]
    while len(top) < 2:
        top.append(f"UNK{len(top)+1}")
    return top[0], top[1]

def diarize_one_file(audio_path: Path, out_csv_path: Path, recog: SpeakerRecognition):
    # Load audio mono 16 kHz
    y, sr = librosa.load(str(audio_path), sr=16000, mono=True)
    # VAD
    speech_regions = get_speech_segments(y, sr, top_db=VAD_TOP_DB, min_len=MIN_SPEECH_SEC)
    # Prepare output dir
    out_csv_path.parent.mkdir(parents=True, exist_ok=True)

    # If no speech, write header only
    if not speech_regions:
        with open(out_csv_path, "w", newline="", encoding="utf-8") as f:
            csv.writer(f).writerow(["Begin Time - ss.msec","End Time - ss.msec","Duration - ss.msec","speaker1","speaker2","other"])
        return {"segments": 0, "speakers": 0}

    # Windows for embeddings
    windows = []
    for s, e in speech_regions:
        windows.extend(sliding_windows(s, e, WIN_SEC, HOP_SEC))
    if not windows:
        with open(out_csv_path, "w", newline="", encoding="utf-8") as f:
            csv.writer(f).writerow(["Begin Time - ss.msec","End Time - ss.msec","Duration - ss.msec","speaker1","speaker2","other"])
        return {"segments": 0, "speakers": 0}

    # Embeddings
    embs = []
    for s, e in windows:
        chunk = y[int(s*sr):int(e*sr)]
        wav = torch.tensor(chunk, dtype=torch.float32).unsqueeze(0)  # [1, T]
        with torch.no_grad():
            emb = recog.encode_batch(wav)  # [1, D]
        embs.append(emb.squeeze().cpu().numpy().reshape(-1))
    X = np.vstack(embs)  # [N, D]

    # Number of speakers
    if FORCE_NUM_SPEAKERS and FORCE_NUM_SPEAKERS >= 1:
        k = int(FORCE_NUM_SPEAKERS)
    else:
        best_k, best_score = 2, -1.0
        for cand in range(2, 6):
            labels_c = AgglomerativeClustering(n_clusters=cand, linkage="ward").fit_predict(X)
            if 1 < len(set(labels_c)) < len(labels_c):
                score = silhouette_score(X, labels_c)
                if score > best_score:
                    best_score, best_k = score, cand
        k = best_k
    clustering = AgglomerativeClustering(n_clusters=k, linkage="ward")
    win_labels = clustering.fit_predict(X)

    # Windows -> merged segments
    labeled_windows = sorted([(windows[i][0], windows[i][1], int(win_labels[i])) for i in range(len(windows))],
                             key=lambda x: (x[0], x[1]))
    merged = merge_same_label(labeled_windows, gap=MERGE_GAP_SEC)

    # Map to speaker1/2/other
    spk1, spk2 = pick_top_two_by_talktime(merged)

    # Write CSV
    with open(out_csv_path, "w", newline="", encoding="utf-8") as f:
        w = csv.writer(f)
        w.writerow(["Begin Time - ss.msec","End Time - ss.msec","Duration - ss.msec","Mother","Child","Others"])
        for s, e, lab in merged:
            w.writerow([
                fmt_time_ss_msec(s),
                fmt_time_ss_msec(e),
                fmt_time_ss_msec(e - s),
                1 if lab == spk1 else 0,
                1 if lab == spk2 else 0,
                1 if lab not in (spk1, spk2) else 0
            ])
    return {"segments": len(merged), "speakers": k}

def main():
    in_dir = Path(INPUT_DIR)
    out_dir = Path(OUTPUT_DIR)
    pattern = "**/*.wav" if RECURSIVE else "*.wav"
    files = sorted(in_dir.glob(pattern))
    if not files:
        print(f"No WAVs found in: {INPUT_DIR}")
        return

    # Load model once
    recog = SpeakerRecognition.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb")

    total_files = 0
    processed = 0
    for wav_path in files:
        total_files += 1
        out_name = wav_path.stem + "_diarization.csv"
        out_csv_path = out_dir / out_name
        if SKIP_IF_EXISTS and out_csv_path.exists():
            print(f"[skip] {wav_path.name} -> {out_csv_path.name} (exists)")
            continue
        try:
            stats = diarize_one_file(wav_path, out_csv_path, recog)
            print(f"[ok] {wav_path.name} -> {out_csv_path.name} | segs={stats['segments']} | speakers={stats['speakers']}")
            processed += 1
        except Exception as e:
            print(f"[error] {wav_path} :: {e}")
            traceback.print_exc()

    print(f"\nDone. Found {total_files} WAV(s), processed {processed}, output dir: {OUTPUT_DIR}")

if __name__ == "__main__":
    main()


[ok] Convers_100.wav -> Convers_100_diarization.csv | segs=59 | speakers=2
[ok] Inten_100.wav -> Inten_100_diarization.csv | segs=60 | speakers=2
[ok] Spon_100.wav -> Spon_100_diarization.csv | segs=57 | speakers=2
[ok] Convers_101.wav -> Convers_101_diarization.csv | segs=163 | speakers=2
[ok] Inten_101.wav -> Inten_101_diarization.csv | segs=328 | speakers=2
[ok] Spon_101.wav -> Spon_101_diarization.csv | segs=304 | speakers=2
[ok] Convers_102.wav -> Convers_102_diarization.csv | segs=165 | speakers=2
[ok] Inten_102.wav -> Inten_102_diarization.csv | segs=64 | speakers=2
[ok] Spon_102.wav -> Spon_102_diarization.csv | segs=61 | speakers=2
[ok] Convers_103.wav -> Convers_103_diarization.csv | segs=133 | speakers=2
[ok] Inten_103.wav -> Inten_103_diarization.csv | segs=286 | speakers=2
[ok] Spon_103.wav -> Spon_103_diarization.csv | segs=338 | speakers=2
[ok] Convers_104.wav -> Convers_104_diarization.csv | segs=55 | speakers=2
[ok] Inten_104.wav -> Inten_104_diarization.csv | segs=40 

### Adding CIBs (From CIBs CSV File)

In [26]:
import os
import re
from pathlib import Path
import pandas as pd


# -----------------------------
# CONFIG (edit these)
# -----------------------------
SUBJECTS_CSV = r"O:\\MyThesis\\DataSets\\Michal\\CIB_code\\Michal_Subjects.csv"         # (1)
SEGMENTS_FOLDER = r"O:\\MyThesis\\DataSets\\Michal\\outputsResults"    # (2)
OUTPUT_FOLDER = r"O:\\MyThesis\\DataSets\\Michal\\outputsResultsWithCIBs" # output
# -----------------------------

SUBJECT_COLS_EXPECTED = [
    "Sub", "AqScore",
    "SENSIT", "INTRUS", "LIMITS", "INVOLVE",
    "WITHDRAW", "COMPLY", "SYNCH", "DYADNEG"
]

SEGMENT_COLS_EXPECTED = [
    "Begin Time - ss.msec",
    "End Time - ss.msec",
    "Duration - ss.msec",
    "Mother",
    "Child"
    # "Others" intentionally excluded
]



def extract_sub_from_filename(filename: str):
    """
    Tries to extract Sub as an integer from the filename.
    Works for common patterns like:
      - 'Sub12.csv'
      - 'subject_12_segments.csv'
      - '12_anything.csv'
    Returns int or None if not found.
    """
    m = re.search(r"(?:sub|subject)\s*[_-]?\s*(\d+)", filename, flags=re.IGNORECASE)
    if m:
        return int(m.group(1))

    m = re.search(r"(\d+)", filename)
    if m:
        return int(m.group(1))

    return None


def main():
    subjects_path = Path(SUBJECTS_CSV)
    seg_folder = Path(SEGMENTS_FOLDER)
    out_folder = Path(OUTPUT_FOLDER)
    out_folder.mkdir(parents=True, exist_ok=True)

    # -----------------------------
    # Load subjects table (1)
    # -----------------------------
    subj_df = pd.read_csv(subjects_path)

    missing = [c for c in SUBJECT_COLS_EXPECTED if c not in subj_df.columns]
    if missing:
        raise ValueError(
            "Subjects CSV is missing required columns:\n"
            + ", ".join(missing)
            + f"\nFound columns: {list(subj_df.columns)}"
        )

    subj_df["Sub"] = pd.to_numeric(subj_df["Sub"], errors="raise").astype(int)
    subj_lookup = subj_df.set_index("Sub")[SUBJECT_COLS_EXPECTED[1:]].to_dict(orient="index")

    # -----------------------------
    # Process segment CSVs (2)
    # -----------------------------
    segment_files = sorted(seg_folder.glob("*.csv"))
    if not segment_files:
        print(f"No CSV files found in: {seg_folder}")
        return

    print(f"Found {len(segment_files)} segment CSV files.")

    not_found_sub = []     # list of dicts: filename, extracted_sub
    no_sub_in_name = []    # list of filenames
    read_errors = []       # list of dicts: filename, error
    processed = 0

    for fpath in segment_files:
        sub_id = extract_sub_from_filename(fpath.name)
        if sub_id is None:
            no_sub_in_name.append(fpath.name)
            continue

        if sub_id not in subj_lookup:
            not_found_sub.append({"filename": fpath.name, "extracted_sub": sub_id})
            continue

        try:
            seg_df = pd.read_csv(fpath)
        except Exception as e:
            read_errors.append({"filename": fpath.name, "error": str(e)})
            continue

        missing_seg_cols = [c for c in SEGMENT_COLS_EXPECTED if c not in seg_df.columns]
        if missing_seg_cols:
            print(f"WARNING: {fpath.name} missing columns: {missing_seg_cols}")

        # Add subject columns to every row
        meta = subj_lookup[sub_id]
        seg_df.insert(0, "Sub", sub_id)
        for col, val in meta.items():
            seg_df[col] = val

        # -----------------------------
        # EXCLUDE "Others" COLUMN
        # -----------------------------
        if "Others" in seg_df.columns:
            seg_df = seg_df.drop(columns=["Others"])

        # Save
        out_path = out_folder / fpath.name
        seg_df.to_csv(out_path, index=False)
        processed += 1

    # -----------------------------
    # Summary + show skipped filenames
    # -----------------------------
    print("\nDONE")
    print(f"Processed: {processed}")
    print(f"Skipped (no Sub in filename): {len(no_sub_in_name)}")
    print(f"Skipped (Sub not found in subjects table): {len(not_found_sub)}")
    print(f"Skipped (read/parse errors): {len(read_errors)}")
    print(f"Output folder: {out_folder}")

    if no_sub_in_name:
        print("\n--- Skipped: NO Sub number detected in filename ---")
        for fname in no_sub_in_name:
            print("  -", fname)

    if not_found_sub:
        print("\n--- Skipped: Sub NOT FOUND in subjects table ---")
        for item in not_found_sub:
            print(f"  - {item['filename']}  (extracted Sub={item['extracted_sub']})")

    if read_errors:
        print("\n--- Skipped: CSV read/parse errors ---")
        for item in read_errors:
            print(f"  - {item['filename']}  (error={item['error']})")

    # # -----------------------------
    # # Save skipped report CSV
    # # -----------------------------
    # skipped_rows = []

    # for fname in no_sub_in_name:
    #     skipped_rows.append({"filename": fname, "reason": "no_sub_in_filename", "extracted_sub": None, "error": None})

    # for item in not_found_sub:
    #     skipped_rows.append({"filename": item["filename"], "reason": "sub_not_in_subjects_table",
    #                          "extracted_sub": item["extracted_sub"], "error": None})

    # for item in read_errors:
    #     skipped_rows.append({"filename": item["filename"], "reason": "csv_read_error",
    #                          "extracted_sub": extract_sub_from_filename(item["filename"]), "error": item["error"]})

    # if skipped_rows:
    #     report_path = out_folder / "skipped_report.csv"
    #     pd.DataFrame(skipped_rows).to_csv(report_path, index=False)
    #     print(f"\nSaved skipped report: {report_path}")


if __name__ == "__main__":
    main()


Found 213 segment CSV files.

DONE
Processed: 213
Skipped (no Sub in filename): 0
Skipped (Sub not found in subjects table): 0
Skipped (read/parse errors): 0
Output folder: O:\MyThesis\DataSets\Michal\outputsResultsWithCIBs


### Creating Direct / Stand Alone Features (Vocal and interaction)

In [12]:
# ============================================================
# FULL PIPELINE: deterministic WAV<->CSV pairing
# WAV:  Convers_100.wav
# CSV:  Convers_100_diarization.csv
# Output: Convers_100_features.csv
# ============================================================

# ----------------------------
# NO WARNINGS (must be first)
# ----------------------------
import os, warnings
os.environ["NUMBA_DISABLE_JIT"] = "1"
warnings.filterwarnings("ignore")

# ----------------------------
# Imports
# ----------------------------
import glob
import numpy as np
import pandas as pd
import librosa
from pathlib import Path
from tqdm import tqdm

# ----------------------------
# Columns that already exist in each diarization CSV
# ----------------------------
LABEL_COLS = [
    "Sub","AqScore", "SENSIT", "INTRUS", "LIMITS",
    "INVOLVE", "WITHDRAW", "COMPLY", "SYNCH", "DYADNEG"
]

# ============================================================
# Speaker helper
# ============================================================
def detect_speaker(row):
    m = int(row.get("Mother", 0)) if not pd.isna(row.get("Mother", 0)) else 0
    c = int(row.get("Child", 0)) if not pd.isna(row.get("Child", 0)) else 0
    if m == 1 and c == 0:
        return "Mother"
    if c == 1 and m == 0:
        return "Child"
    if m == 1 and c == 1:
        return "Both"
    return "None"

# ============================================================
# CSV loader with AUTO sec/ms + label extraction
# ============================================================
def load_segments_csv(csv_path: str, audio_duration_sec: float):
    df = pd.read_csv(csv_path)
    df.columns = [c.strip() for c in df.columns]

    # ignore Others if present
    if "Others" in df.columns:
        df = df.drop(columns=["Others"])

    required = ["Begin Time - ss.msec", "End Time - ss.msec", "Duration - ss.msec", "Mother", "Child"]
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise ValueError(f"Missing required columns in {csv_path}: {missing}")

    # Extract label values once (assumed constant per file)
    labels = {c: (df[c].iloc[0] if c in df.columns and len(df) else np.nan) for c in LABEL_COLS}

    begin_raw = df["Begin Time - ss.msec"].astype(float)
    end_raw   = df["End Time - ss.msec"].astype(float)
    dur_raw   = df["Duration - ss.msec"].astype(float)

    # auto-detect milliseconds vs seconds
    is_ms = end_raw.max() > audio_duration_sec * 3.0
    if is_ms:
        begin_raw /= 1000.0
        end_raw   /= 1000.0
        dur_raw   /= 1000.0

    df["segment_begin_sec"] = begin_raw
    df["segment_end_sec"]   = end_raw
    df["duration"]          = dur_raw
    df["speaker"] = df.apply(detect_speaker, axis=1)
    df = df.reset_index(drop=True)
    df["segment_index"] = np.arange(len(df), dtype=int)

    return df, labels

# ============================================================
# Pitch (robust)
# ============================================================
def compute_pitch(y_seg, sr):
    if y_seg.size < int(0.06 * sr):
        return np.nan, np.nan

    rms = librosa.feature.rms(y=y_seg)[0]
    if np.mean(rms) < 1e-4:
        return np.nan, np.nan

    try:
        f0, _, _ = librosa.pyin(
            y_seg, sr=sr,
            fmin=50, fmax=3000,
            frame_length=1024, hop_length=256
        )
        if f0 is None:
            return np.nan, np.nan
        f0v = f0[~np.isnan(f0)]
        if f0v.size < 3:
            return np.nan, np.nan
        return float(np.mean(f0v)), float(np.std(f0v))
    except Exception:
        return np.nan, np.nan

# ============================================================
# Extract audio features for a segment (returns NaNs if empty)
# ============================================================
def extract_features(y_seg, sr, n_mfcc=20):
    if y_seg is None or len(y_seg) == 0:
        feats = {
            "spectral_centroid": np.nan,
            "spectral_bandwidth": np.nan,
            "spectral_rolloff": np.nan,
            "zcr": np.nan,
            "rms": np.nan,
            "pitch_level": np.nan,
            "pitch_std": np.nan,
            "intensity_std": np.nan,
            "tonal_centroid": np.nan,
            "mel_spec": np.nan,
            "log_mel_spec": np.nan,
            "chroma": np.nan,
        }
        for i in range(1, n_mfcc + 1):
            feats[f"mfcc_{i}"] = np.nan
        return feats

    sc  = librosa.feature.spectral_centroid(y=y_seg, sr=sr)
    sb  = librosa.feature.spectral_bandwidth(y=y_seg, sr=sr)
    sro = librosa.feature.spectral_rolloff(y=y_seg, sr=sr, roll_percent=0.85)
    zcr = librosa.feature.zero_crossing_rate(y_seg)
    rms = librosa.feature.rms(y=y_seg)

    mfcc = librosa.feature.mfcc(y=y_seg, sr=sr, n_mfcc=n_mfcc)

    pitch_level, pitch_std = compute_pitch(y_seg, sr)

    mel = librosa.feature.melspectrogram(y=y_seg, sr=sr, n_mels=64)
    log_mel = librosa.power_to_db(mel, ref=np.max) if mel.size else np.array([])

    chroma = librosa.feature.chroma_stft(y=y_seg, sr=sr)

    try:
        y_h = librosa.effects.harmonic(y_seg)
        tonnetz = librosa.feature.tonnetz(y=y_h, sr=sr)
        tonal_centroid = float(np.mean(tonnetz)) if tonnetz.size else np.nan
    except Exception:
        tonal_centroid = np.nan

    feats = {
        "spectral_centroid": float(np.mean(sc)) if sc.size else np.nan,
        "spectral_bandwidth": float(np.mean(sb)) if sb.size else np.nan,
        "spectral_rolloff": float(np.mean(sro)) if sro.size else np.nan,
        "zcr": float(np.mean(zcr)) if zcr.size else np.nan,
        "rms": float(np.mean(rms)) if rms.size else np.nan,
        "intensity_std": float(np.std(rms)) if rms.size else np.nan,
        "pitch_level": pitch_level,
        "pitch_std": pitch_std,
        "tonal_centroid": tonal_centroid,
        "mel_spec": float(np.mean(mel)) if mel.size else np.nan,
        "log_mel_spec": float(np.mean(log_mel)) if log_mel.size else np.nan,
        "chroma": float(np.mean(chroma)) if chroma.size else np.nan,
    }

    for i in range(1, n_mfcc + 1):
        feats[f"mfcc_{i}"] = float(np.mean(mfcc[i - 1])) if mfcc.size else np.nan

    return feats

# ============================================================
# Deterministic pairing: wav_stem -> wav_stem + "_diarization.csv"
# ============================================================
def build_pairs_diarization(wav_root, csv_dir):
    wavs = sorted(glob.glob(os.path.join(wav_root, "**", "*.wav"), recursive=True))
    csvs = sorted(glob.glob(os.path.join(csv_dir, "*.csv")))

    csv_by_stem = {Path(c).stem: c for c in csvs}

    pairs = []
    unmatched = []
    for w in wavs:
        wav_stem = Path(w).stem
        expected_csv_stem = f"{wav_stem}_diarization"
        c = csv_by_stem.get(expected_csv_stem)
        if c is None:
            unmatched.append(w)
        else:
            pairs.append((w, c))
    return pairs, unmatched

# ============================================================
# Process one pair: output rows == input rows
# ============================================================
def process_pair(wav_path, csv_path, out_csv_path, target_sr=16000):
    y, sr = librosa.load(wav_path, sr=target_sr, mono=False)

    # force mono
    if isinstance(y, np.ndarray) and y.ndim == 2:
        y = np.mean(y, axis=0)

    audio_duration_sec = len(y) / sr
    seg_df, labels = load_segments_csv(csv_path, audio_duration_sec)

    rows = []
    audio_len = len(y)

    for i, r in seg_df.iterrows():
        b = float(r["segment_begin_sec"])
        e = float(r["segment_end_sec"])

        # Clamp and use floor/ceil to avoid empty due to rounding
        start = int(np.floor(b * sr))
        end   = int(np.ceil(e * sr))
        start = max(0, min(start, audio_len))
        end   = max(0, min(end, audio_len))

        y_seg = y[start:end] if end > start else np.array([], dtype=np.float32)

        feats = extract_features(y_seg, sr, n_mfcc=20)

        # next speaker flags
        speaker_now = r["speaker"]
        speaker_next = seg_df.loc[i + 1, "speaker"] if i + 1 < len(seg_df) else "None"
        mother_to_child = int(speaker_now == "Mother" and speaker_next == "Child")
        child_to_mother = int(speaker_now == "Child" and speaker_next == "Mother")

        row = {}
        row.update(labels)
        row.update(feats)
        row.update({
            "speaker": speaker_now,
            "segment_index": int(r["segment_index"]),
            "segment_begin_sec": b,
            "segment_end_sec": e,
            "duration": float(r["duration"]),
            "mother_to_child": mother_to_child,
            "child_to_mother": child_to_mother,
            "mother_pitch": feats["pitch_level"] if speaker_now == "Mother" else np.nan,
            "child_pitch": feats["pitch_level"] if speaker_now == "Child" else np.nan,
        })

        rows.append(row)

    out_df = pd.DataFrame(rows)

    ordered = (
        LABEL_COLS
        + ["spectral_centroid", "spectral_bandwidth", "spectral_rolloff", "zcr", "rms"]
        + [f"mfcc_{i}" for i in range(1, 21)]
        + ["pitch_level", "pitch_std", "intensity_std", "duration",
           "tonal_centroid", "mel_spec", "log_mel_spec", "chroma",
           "speaker", "segment_index", "segment_begin_sec", "segment_end_sec",
           "mother_to_child", "child_to_mother", "mother_pitch", "child_pitch"]
    )
    ordered = [c for c in ordered if c in out_df.columns]
    out_df = out_df[ordered]

    os.makedirs(Path(out_csv_path).parent, exist_ok=True)
    out_df.to_csv(out_csv_path, index=False)



# ============================================================
# MAIN
# ============================================================
if __name__ == "__main__":
    # Example usage:
    #   python extract_vocal_interaction_features.py
    #
    # Edit these paths (Windows example)
    WAV_ROOT = r"O:\\MyThesis\\DataSets\\Michal\\Audio"     # folder that contains subfolders of wav files
    SEGMENTS_CSV_DIR = r"O:\\MyThesis\\DataSets\\Michal\\outputsResultsWithCIBs"
    OUTPUT_FOLDER = r"O:\\MyThesis\\DataSets\\Michal\\outputsResultsWithStandAloneFeatures"

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    pairs, unmatched = build_pairs_diarization(WAV_ROOT, SEGMENTS_CSV_DIR)

    print(f"PAIRS FOUND: {len(pairs)}")
    print(f"UNMATCHED WAVS: {len(unmatched)}")

    for w, c in pairs[:15]:
        print("PAIR:", Path(w).name, "<->", Path(c).name)

    for wav_path, csv_path in tqdm(pairs, desc="Processing pairs"):
        out_csv = os.path.join(OUTPUT_DIR, f"{Path(wav_path).stem}_features.csv")
        process_pair(wav_path, csv_path, out_csv)

    print("DONE")



PAIRS FOUND: 213
UNMATCHED WAVS: 0
PAIR: Convers_100.wav <-> Convers_100_diarization.csv
PAIR: Inten_100.wav <-> Inten_100_diarization.csv
PAIR: Spon_100.wav <-> Spon_100_diarization.csv
PAIR: Convers_101.wav <-> Convers_101_diarization.csv
PAIR: Inten_101.wav <-> Inten_101_diarization.csv
PAIR: Spon_101.wav <-> Spon_101_diarization.csv
PAIR: Convers_102.wav <-> Convers_102_diarization.csv
PAIR: Inten_102.wav <-> Inten_102_diarization.csv
PAIR: Spon_102.wav <-> Spon_102_diarization.csv
PAIR: Convers_103.wav <-> Convers_103_diarization.csv
PAIR: Inten_103.wav <-> Inten_103_diarization.csv
PAIR: Spon_103.wav <-> Spon_103_diarization.csv
PAIR: Convers_104.wav <-> Convers_104_diarization.csv
PAIR: Inten_104.wav <-> Inten_104_diarization.csv
PAIR: Spon_104.wav <-> Spon_104_diarization.csv


Processing pairs: 100%|██████████| 213/213 [3:53:37<00:00, 65.81s/it]   

DONE





### Add CIBs Prediction and save folder with all csv file that each file include all features

In [4]:
import os
import glob
import warnings
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

warnings.filterwarnings("ignore")

# ============================================================
# CONFIG: features + targets (as you provided)
# ============================================================
INPUT_FEATURES = [
    "spectral_centroid", "spectral_bandwidth", "spectral_rolloff",
    "zcr", "rms",
    *[f"mfcc_{i}" for i in range(1, 21)],
    "pitch_level", "pitch_std", "intensity_std", "duration",
    "tonal_centroid",
    "mel_spec", "log_mel_spec", "chroma",
    "mother_to_child", "child_to_mother",
    "mother_pitch", "child_pitch"
]

TARGETS = [
    "SENSIT", "INTRUS", "LIMITS",
    "INVOLVE", "WITHDRAW", "COMPLY",
    "SYNCH", "DYADNEG"
]

# ============================================================
# Helper utils
# ============================================================
def ensure_dir(folder: str):
    Path(folder).mkdir(parents=True, exist_ok=True)

def safe_numeric_df(df: pd.DataFrame, cols: list[str]) -> pd.DataFrame:
    out = df.copy()
    for c in cols:
        out[c] = pd.to_numeric(out[c], errors="coerce")
    return out

def fit_predict_one_target(
    train_df: pd.DataFrame,
    pred_df: pd.DataFrame,
    feature_cols: list[str],
    target_col: str,
    n_splits: int = 5,
    random_state: int = 42,
) -> tuple[np.ndarray, np.ndarray]:
    """
    Train model on train_df for target_col, then predict on pred_df.
    Returns:
      - oof_pred (cross-validated predictions for train_df rows)
      - test_pred (predictions for pred_df rows)
    """
    # Keep only rows with non-null target in training
    train_use = train_df.dropna(subset=[target_col]).copy()
    if train_use.empty:
        raise ValueError(f"No labeled rows found for target '{target_col}' (all NaN).")

    X_train = train_use[feature_cols].values
    y_train = train_use[target_col].values
    X_pred = pred_df[feature_cols].values

    # A solid default regressor; tune if you want later
    model = XGBRegressor(
        n_estimators=600,
        learning_rate=0.05,
        max_depth=5,
        subsample=0.85,
        colsample_bytree=0.85,
        reg_lambda=1.0,
        objective="reg:squarederror",
        random_state=random_state,
        n_jobs=-1,
    )

    pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("xgb", model),
    ])

    # If too few samples, reduce splits
    k = min(n_splits, len(train_use))
    if k < 2:
        # Fit once (no CV possible)
        pipe.fit(X_train, y_train)
        oof = pipe.predict(X_train)
        test_pred = pipe.predict(X_pred)
        return oof, test_pred

    cv = KFold(n_splits=k, shuffle=True, random_state=random_state)

    # OOF predictions for sanity checks / debugging
    oof = cross_val_predict(pipe, X_train, y_train, cv=cv, n_jobs=-1)

    # Fit on full labeled set and predict on pred_df
    pipe.fit(X_train, y_train)
    test_pred = pipe.predict(X_pred)
    return oof, test_pred

# ============================================================
# Main pipeline: folder -> folder
# ============================================================
def predict_targets_for_csv_folder(
    input_folder: str,
    output_folder: str,
    feature_cols: list[str] = INPUT_FEATURES,
    targets: list[str] = TARGETS,
    file_pattern: str = "*.csv",
    n_splits: int = 5,
    random_state: int = 42,
    output_suffix: str = "_PRED",
):
    """
    Reads ALL CSVs in input_folder, trains per-target XGBRegressor using rows
    where target exists, predicts missing targets (and also writes predicted columns)
    for every file, and saves to output_folder.

    Output:
      - Same CSV structure + added columns: <TARGET>_pred
      - Also fills missing original TARGET values with predictions (optional behavior below)
    """
    ensure_dir(output_folder)

    paths = sorted(glob.glob(os.path.join(input_folder, file_pattern)))
    if not paths:
        raise FileNotFoundError(f"No CSV files found in: {input_folder}")

    # Load all, keep origin path for saving back
    dfs = []
    for p in paths:
        df = pd.read_csv(p)
        df["__source_file__"] = os.path.basename(p)
        dfs.append(df)

    all_df = pd.concat(dfs, ignore_index=True)

    # Validate required feature columns
    missing_feats = [c for c in feature_cols if c not in all_df.columns]
    if missing_feats:
        raise ValueError(
            "Missing required INPUT_FEATURES columns:\n"
            + "\n".join(missing_feats)
        )

    # Make numeric where relevant
    all_df = safe_numeric_df(all_df, feature_cols + targets)

    # If features have NaNs, XGBoost can handle some NaNs, but scaler can't.
    # So: fill feature NaNs with 0 (or you can use median impute).
    all_df[feature_cols] = all_df[feature_cols].fillna(0)

    # Create prediction columns
    for t in targets:
        all_df[f"{t}_pred"] = np.nan

    # Train/predict per target (global model across all files)
    for t in targets:
        if t not in all_df.columns:
            # If the target column doesn't exist at all, create it as NaN
            all_df[t] = np.nan

        # Predict for all rows in all_df (same frame used as pred_df)
        _, pred = fit_predict_one_target(
            train_df=all_df,
            pred_df=all_df,
            feature_cols=feature_cols,
            target_col=t,
            n_splits=n_splits,
            random_state=random_state,
        )
        all_df[f"{t}_pred"] = pred

        # OPTIONAL: fill missing true labels with predictions
        all_df[t] = all_df[t].where(~all_df[t].isna(), all_df[f"{t}_pred"])

    # Save back per original file
    for fname, sub_df in all_df.groupby("__source_file__", sort=False):
        out_df = sub_df.drop(columns=["__source_file__"])

        stem = Path(fname).stem
        out_name = f"{stem}{output_suffix}.csv"
        out_path = os.path.join(output_folder, out_name)
        out_df.to_csv(out_path, index=False)

    print(f"Done. Wrote {len(paths)} files to: {output_folder}")

# ============================================================
# Example run
# ============================================================
if __name__ == "__main__":

    INPUT_FOLDER = r"O:\\MyThesis\\DataSets\\Michal\\outputsResultsWithStandAloneFeatures"
    OUTPUT_FOLDER = r"O:\\MyThesis\\DataSets\\Michal\\outputsResultsWithAllFeatures"

    predict_targets_for_csv_folder(
        input_folder=INPUT_FOLDER,
        output_folder=OUTPUT_FOLDER,
        n_splits=5,          # CV folds for OOF; final model fits on all labeled rows
        random_state=42,
        output_suffix="_AllFeatures"
    )


Done. Wrote 213 files to: O:\\MyThesis\\DataSets\\Michal\\outputsResultsWithAllFeatures
