In [53]:
import sys
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from typing import Literal, Optional
from pathlib import Path

In [2]:
# Put the parent of the working directory at the front of sys.path so that
# imports rooted at that directory resolve from this notebook.
current_dir = Path.cwd()
project_root = current_dir.parent
if sys.path.count(str(project_root)) == 0:
    # Only reached the first time the entry is added.
    sys.path.insert(0, str(project_root))
    print('done')

done


In [3]:
from src.utils.save_data import save_df_parquet_safe, load_df_parquet_safe
from src.utils.plot_graphs import plot_beautiful, plot_dual_beautiful

In [11]:
# Load the merged parquet file and work on a copy so the loaded frame stays untouched.
path = project_root.joinpath("data", "processed", "merge", "avw_merged_2.parquet")
avw_data = load_df_parquet_safe(path=str(path))
avw_data_copy = avw_data.copy()
avw_data_copy.head()

Unnamed: 0,Time,words,text_concat,h_ratio,v_ratio,_neutral,browDownLeft,browDownRight,browInnerUp,browOuterUpLeft,...,mouthUpperUpRight,noseSneerLeft,noseSneerRight,audio_rms(volumn),audio_pitch_avg,audio_pitch_var(expressiveness),is_silent,speaker,filler_percentage,pause_percent_pr
0,0.0,"[We're, starting]",We're starting,0.463844,0.012997,3.112116e-06,0.015592,0.025971,0.02329,0.123947,...,2.2e-05,4.540787e-07,4.185384e-07,0.0373,0.0,0.0,False,,0.064433,0.079177
1,0.5,"[now., [*]]",now. [*],0.418074,-0.048988,2.55837e-06,0.017408,0.037184,0.007811,0.1378,...,3e-05,2.511999e-07,3.643879e-07,0.0533,169.25,11.45,False,A,0.064433,0.079177
2,1.0,,,0.465044,-0.030748,3.958608e-07,0.008864,0.010406,0.016732,0.17933,...,0.000269,2.899701e-06,1.13486e-07,0.0006,0.0,0.0,True,A,0.064433,0.079177
3,1.5,"[So, welcome]",So welcome,0.505119,-0.055033,3.260218e-07,0.012983,0.016629,0.008106,0.186212,...,0.000103,2.350217e-06,9.529118e-08,0.0593,179.94,14.03,False,A,0.064433,0.079177
4,2.0,"[to, the]",to the,0.590369,-0.050514,5.068692e-07,0.016363,0.028436,0.006705,0.117622,...,0.000476,4.304491e-06,2.317813e-07,0.0646,237.84,56.17,False,A,0.064433,0.079177


In [122]:
def secs_mins(secs):
    """Format a duration in seconds as a ``M:SS`` string.

    Args:
        secs: Duration in seconds (int or float).

    Returns:
        For durations below one minute the input is returned unchanged as a
        number (kept for backward compatibility with existing callers).
        Otherwise a string ``"M:SS"`` with the seconds zero-padded to two
        digits, e.g. ``188`` -> ``"3:08"``.
    """
    if secs < 60:
        return secs

    # Round the total first and split afterwards: rounding only the remainder
    # could produce "M:60" (e.g. 179.6 s) instead of rolling over to the next minute.
    total_seconds = int(round(float(secs)))
    minutes, seconds = divmod(total_seconds, 60)
    return f"{minutes}:{seconds:02d}"

#### I tested 2–3 models, and RRCF (Robust Random Cut Forest) has worked well so far. I wanted an unsupervised anomaly-detection algorithm (since I don't have any labels 😅) that also takes the temporal characteristics of the data into account, i.e. a model suited to time-series anomaly detection, and this one looks promising.

In [5]:
from pysad.models import RobustRandomCutForest
from pysad.utils import ArrayStreamer
from sklearn.preprocessing import StandardScaler

In [86]:
def run_rrcf(features: np.ndarray, num_trees: int = 40, tree_size: int = 256, shingle: int = 1):
    """Score every row of ``features`` with a streaming Robust Random Cut Forest.

    The rows are fed to the model one at a time, in their original order; each
    row updates the model and is scored in the same step.

    Args:
        features (np.ndarray): feature matrix of shape (num_samples, num_features).
        num_trees (int, optional): forwarded to the model as ``num_trees``. Defaults to 40.
        tree_size (int, optional): forwarded to the model as ``tree_size``. Defaults to 256.
            NOTE(review): presumably the number of points each tree retains rather
            than a depth limit - confirm against the pysad documentation.
        shingle (int, optional): forwarded to the model as ``shingle_size``. Defaults to 1.

    Returns:
        list: one anomaly score per row yielded by the streamer.
    """
    forest = RobustRandomCutForest(num_trees=num_trees, tree_size=tree_size, shingle_size=shingle)

    # shuffle=False keeps the rows in their given order while replaying the
    # static array as a stream.
    stream = ArrayStreamer(shuffle=False)

    # fit_score_partial updates the model with the row and returns its score.
    anomaly_scores = [forest.fit_score_partial(row) for row in stream.iter(features)]

    print(f"Processed {len(anomaly_scores)} points")

    return anomaly_scores

In [87]:
def get_threshold_mad(scores, n_sigma=3):
    """
    Anomaly threshold from a robust z-score (median and MAD).

    A classic z-score is (x - mean) / std; the robust variant replaces the
    mean with the median and the std with 1.4826 * MAD. The returned value is
    the score lying ``n_sigma`` robust standard deviations above the median:
        threshold = median + n_sigma * 1.4826 * MAD
    """
    values = np.array(scores)
    center = np.median(values)

    # MAD: median of the absolute deviations from the median.
    abs_deviation = np.abs(values - center)

    # 1.4826 rescales MAD so it is comparable to a standard deviation for normal data.
    robust_sigma = 1.4826 * np.median(abs_deviation)

    return center + (n_sigma * robust_sigma)

In [88]:
def get_data_ready(data: pd.DataFrame, features: list[str], type: Literal["ui", "ud"], standardize: bool, scalar: Optional["StandardScaler"] = None, speaker: str = "B"):
    """Select, forward-fill and optionally standardize the feature columns.

    Args:
        data (pd.DataFrame): the complete data frame. It must contain every column
            named in ``features`` and, in user-dependent mode, a ``speaker`` column.
        features (list[str]): names of the feature columns to fit the model on.
        type (Literal["ui", "ud"]): ``'ui'`` (user independent) keeps every row.
            Any other value is treated as ``'ud'`` (user dependent) and keeps only
            the rows whose ``speaker`` column equals ``speaker``.
        standardize (bool): if True, the selected features are passed through
            ``scalar.fit_transform``.
        scalar (Optional[StandardScaler]): scaler used when ``standardize`` is True.
            If omitted, a fresh ``StandardScaler()`` is created.
        speaker (str, optional): speaker label kept in user-dependent mode.
            Defaults to ``'B'``.

    Returns:
        tuple: ``(f_id, values)``. ``f_id`` is the index of the selected rows.
        ``values`` is the output of ``scalar.fit_transform`` when ``standardize``
        is True, otherwise the selected frame converted with ``to_numpy()``.
    """
    # Row selection is the only step that differs between the two modes.
    if type == 'ui':
        selected = data[features]
    else:
        selected = data.loc[data['speaker'] == speaker, features]

    # Forward-filling a frame without NaNs leaves it unchanged, so one code path
    # covers both cases. ffill cannot fill NaNs at the very start of a column.
    selected = selected.ffill()
    f_id = selected.index

    if not standardize:
        return f_id, selected.to_numpy()

    # The default scalar=None used to fail with AttributeError on fit_transform.
    if scalar is None:
        scalar = StandardScaler()
    return f_id, scalar.fit_transform(selected)
        

In [89]:
# Columns of the working frame; feature names passed to get_data_ready are chosen from this list
avw_data_copy.columns

Index(['Time', 'words', 'text_concat', 'h_ratio', 'v_ratio', '_neutral',
       'browDownLeft', 'browDownRight', 'browInnerUp', 'browOuterUpLeft',
       'browOuterUpRight', 'cheekPuff', 'cheekSquintLeft', 'cheekSquintRight',
       'eyeBlinkLeft', 'eyeBlinkRight', 'eyeLookDownLeft', 'eyeLookDownRight',
       'eyeLookInLeft', 'eyeLookInRight', 'eyeLookOutLeft', 'eyeLookOutRight',
       'eyeLookUpLeft', 'eyeLookUpRight', 'eyeSquintLeft', 'eyeSquintRight',
       'eyeWideLeft', 'eyeWideRight', 'jawForward', 'jawLeft', 'jawOpen',
       'jawRight', 'mouthClose', 'mouthDimpleLeft', 'mouthDimpleRight',
       'mouthFrownLeft', 'mouthFrownRight', 'mouthFunnel', 'mouthLeft',
       'mouthLowerDownLeft', 'mouthLowerDownRight', 'mouthPressLeft',
       'mouthPressRight', 'mouthPucker', 'mouthRight', 'mouthRollLower',
       'mouthRollUpper', 'mouthShrugLower', 'mouthShrugUpper',
       'mouthSmileLeft', 'mouthSmileRight', 'mouthStretchLeft',
       'mouthStretchRight', 'mouthUpperUpLeft', 'mo

In [136]:
# Build the model input: the row index (f_id) and the standardized values of
# 'pause_percent_pr' in user-dependent mode.
f_id, features = get_data_ready(
    data=avw_data_copy,
    features=['pause_percent_pr'],
    type='ud',
    standardize=True,
    scalar=StandardScaler(),
)

In [137]:
# Index labels of the rows that were selected as model input
f_id

Index([  23,   24,   25,   26,   27,   28,   29,   30,   31,   32,
       ...
       1185, 1186, 1187, 1188, 1189, 1190, 1191, 1192, 1193, 1194],
      dtype='int64', length=1053)

In [138]:
# Feature values returned by get_data_ready, one row per entry in f_id
features

array([[-1.60936424],
       [-1.60936424],
       [-1.60936424],
       ...,
       [ 1.8023605 ],
       [ 1.8023605 ],
       [ 1.83079154]], shape=(1053, 1))

In [139]:
# Score every row with RRCF using a shingle of 10; the other parameters keep their defaults.
# NOTE(review): no random seed is set in this cell - if the forest draws random cuts,
# scores (and everything derived from them) may differ between runs. Confirm and seed if so.
anomaly_scores = run_rrcf(features=features, shingle=10)
# Peek at the first few scores only
anomaly_scores[:5]

Processed 1053 points


[0.0, 0.0, 0.0, 0.0, 0.0]

In [140]:
# Cut-off above which a score counts as anomalous (median + n_sigma * 1.4826 * MAD, default n_sigma)
threshold = get_threshold_mad(scores=anomaly_scores)
threshold

np.float64(25.63273678975102)

In [141]:
# Re-attach the original row labels so scores can be joined back to the source frame.
# NOTE: this rebinds `anomaly_scores` from the list returned by run_rrcf to a DataFrame;
# cells below expect the DataFrame form.
anomaly_scores = pd.DataFrame(anomaly_scores, index=f_id, columns=['scores'])

In [142]:
# Sanity check: scores are now indexed by the original row labels
anomaly_scores.head()

Unnamed: 0,scores
23,0.0
24,0.0
25,0.0
26,0.0
27,0.0


In [143]:
# Pull the full source rows whose score reaches the threshold.
anomaly_obs = avw_data_copy.loc[
    anomaly_scores.loc[anomaly_scores['scores'].ge(threshold)].index
]
anomaly_obs.shape

(116, 64)

In [145]:
# Timestamps of the 21st-40th flagged rows, converted with secs_mins for easier lookup
anomaly_obs.iloc[20:40]['Time'].apply(secs_mins)

336    2:48
358    2:59
376     3:8
377     3:8
378     3:9
386    3:13
387    3:14
399    3:20
405    3:22
407    3:23
423    3:31
433    3:36
435    3:38
436    3:38
437    3:38
446    3:43
452    3:46
454    3:47
477    3:59
504    4:12
Name: Time, dtype: object