### Here I run RRCF in parallel and create the final dataset that the agent will see

In [1]:
import sys
import pandas as pd
import numpy as np
from pathlib import Path
from joblib import Parallel, delayed

In [2]:
# Put the parent of the working directory on sys.path so that the `src.*`
# imports in the next cell resolve; 'Done' is printed only when it was added.
current_dir = Path.cwd()
project_root = current_dir.parent
if sys.path.count(str(project_root)) == 0:
    sys.path.insert(0, str(project_root))
    print('Done')

Done


In [3]:
from src.sync.RRCF_TS_anomaly_detection import *
from src.utils.save_data import *
from src.sync.Feature_Transformation import feature_engineering

In [4]:
# Location of the merged parquet file, relative to the project root.
path = project_root.joinpath("data", "processed", "merge", "final_smooth_rz_data.parquet")
# load_df_parquet_safe is a project helper; it is handed the path as a string.
df = load_df_parquet_safe(path=str(path))

In [5]:
# Preview the first rows of the loaded frame to check the columns and values.
df.head()

Unnamed: 0,Time,blink_intensity_smooth_rz,gaze_magnitude_smooth_rz,jaw_magnitude_smooth_rz,smile_intensity_smooth_rz,loudness_db_smooth_rz,pitch_relative_st_smooth_rz,pitch_expressiveness_st_smooth_rz,wps_smooth_rz,words,text_concat,speaker,filler_percentage,pause_percent_pr
0,0.0,3.571464,1.55596,-0.351862,1.418536,,,,,"[We're, starting]",We're starting,,0.064433,0.079177
1,0.5,1.74083,1.323856,-0.425948,1.438164,,,,,"[now., [*]]",now. [*],A,0.064433,0.079177
2,1.0,1.297544,1.195807,-0.627946,1.056126,,,,,,,A,0.064433,0.079177
3,1.5,1.004625,0.428483,-0.742188,0.60234,,,,,"[So, welcome]",So welcome,A,0.064433,0.079177
4,2.0,1.322258,1.213327,-0.828038,0.355089,,,,,"[to, the]",to the,A,0.064433,0.079177


In [6]:
# (column name, type tag) pairs. The tag is forwarded unchanged to
# get_data_ready through its `type` keyword.
# NOTE(review): the meaning of 'ui' / 'ud' is defined by get_data_ready — confirm there.
feature_cols = [
    ('blink_intensity_smooth_rz', 'ui'),
    ('gaze_magnitude_smooth_rz', 'ui'),
    ('jaw_magnitude_smooth_rz', 'ui'),
    ('smile_intensity_smooth_rz', 'ui'),
    ('loudness_db_smooth_rz', 'ud'),
    ('pitch_relative_st_smooth_rz', 'ud'),
    ('pitch_expressiveness_st_smooth_rz', 'ud'),
    ('wps_smooth_rz', 'ud'),
    ('filler_percentage', 'ud'),
    ('pause_percent_pr', 'ud'),
]

# Per-feature outputs of get_data_ready, keyed by column name:
#   features[name] -> the prepared vector handed to RRCF
#   feat_id[name]  -> the identifiers later used as the index of the score frame
features = {}
feat_id = {}
# The loop variable is `feat_type`, not `type`: at notebook top level a variable
# named `type` would shadow the builtin for every cell executed afterwards.
for name, feat_type in feature_cols:
    f_id, vector = get_data_ready(data=df, features=[name], type=feat_type)
    features[name] = vector
    feat_id[name] = f_id

In [7]:
# Eyeball every prepared feature array; they print in feature_cols order
# because `features` was filled in that order.
for value in features.values():
    print(value)

[[3.57146417]
 [1.74083022]
 [1.29754446]
 ...
 [1.6549074 ]
 [1.252199  ]
 [1.04397284]]
[[1.55595991]
 [1.3238559 ]
 [1.19580709]
 ...
 [1.07019098]
 [0.47603773]
 [0.01574653]]
[[-0.35186165]
 [-0.42594774]
 [-0.62794575]
 ...
 [-0.90239459]
 [-0.90098368]
 [-0.89959585]]
[[ 1.41853557e+00]
 [ 1.43816411e+00]
 [ 1.05612625e+00]
 ...
 [ 2.40739112e-01]
 [ 8.45211635e-04]
 [-1.89532529e-01]]
[[-0.39149598]
 [-1.81300585]
 [-1.78566912]
 ...
 [ 0.50776887]
 [ 0.0872767 ]
 [-1.10199763]]
[[-0.67413378]
 [-0.69538963]
 [-0.62424157]
 ...
 [-0.66407274]
 [-1.09142559]
 [-0.93226954]]
[[ 0.3390234 ]
 [-0.34631684]
 [-1.44226786]
 ...
 [-0.53677324]
 [-0.70190335]
 [-1.69625822]]
[[-3.21992348]
 [-3.21992348]
 [-2.49393394]
 ...
 [ 1.21875057]
 [ 2.12852661]
 [ 0.60039801]]
[[ 0.06443299]
 [ 0.06443299]
 [ 0.06443299]
 ...
 [13.46649485]
 [13.46649485]
 [13.46649485]]
[[0.07917656]
 [0.07917656]
 [0.07917656]
 ...
 [9.58036421]
 [9.58036421]
 [9.65954078]]


In [8]:
# Score each feature vector with run_rrcf as its own joblib task (n_jobs=-1),
# every task using shingle=10. Results come back in the iteration order of `features`.
results = Parallel(n_jobs=-1)(
    delayed(run_rrcf)(features=vector, shingle=10) for vector in features.values()
)

In [9]:
# `results` was produced by iterating `features`, so label each score array with
# the matching key of `features` itself instead of re-deriving the names from
# `feature_cols`; the two only agree while they have the same length and order.
assert len(results) == len(features), "expected exactly one RRCF result per feature"

# One single-column frame per feature, indexed by the identifiers that
# get_data_ready returned for that feature.
anomaly_scores = {
    key: pd.DataFrame(value, index=feat_id[key], columns=['anomaly_scores'])
    for key, value in zip(features, results)
}

In [10]:
# Keep a handle on the source frame's index; the thresholding cell below uses it
# to translate index labels into positions.
indexes = df.index

In [11]:
# Index label of the row recorded at Time == 2.5; .item() raises unless exactly
# one row matches.
df.loc[df['Time'] == 2.5].index.item()

5

In [12]:
# Inspect the raw score column for one feature.
anomaly_scores['blink_intensity_smooth_rz']['anomaly_scores']

0       0.000000
1       1.000000
2       1.150000
3       1.750000
4       1.000000
          ...   
1259    5.462380
1260    4.086178
1261    2.953284
1262    4.557526
1263    3.796845
Name: anomaly_scores, Length: 1264, dtype: float64

In [13]:
# For every feature: derive n_sigma from its scores, turn that into a threshold
# via get_threshold_mad, flag the rows above it, and record each row's position
# in the source frame's index.
for score_frame in anomaly_scores.values():
    scores = score_frame['anomaly_scores']
    threshold = get_threshold_mad(
        scores=scores,
        n_sigma=adaptive_n_sigma(anomaly_scores=scores),
    )
    score_frame['is_anomaly'] = scores > threshold

    # Labels missing from the source index map to NaN instead of raising.
    score_frame['anomaly_index'] = score_frame.index.map(
        lambda label: indexes.get_loc(label) if label in indexes else np.nan
    )

In [14]:
# Display the per-feature frames, which now carry the is_anomaly and
# anomaly_index columns added by the previous cell.
anomaly_scores

{'blink_intensity_smooth_rz':       anomaly_scores  is_anomaly  anomaly_index
 0           0.000000       False              0
 1           1.000000       False              1
 2           1.150000       False              2
 3           1.750000       False              3
 4           1.000000       False              4
 ...              ...         ...            ...
 1259        5.462380       False           1259
 1260        4.086178       False           1260
 1261        2.953284       False           1261
 1262        4.557526       False           1262
 1263        3.796845       False           1263
 
 [1264 rows x 3 columns],
 'gaze_magnitude_smooth_rz':       anomaly_scores  is_anomaly  anomaly_index
 0           0.000000       False              0
 1           1.000000       False              1
 2           1.400000       False              2
 3           2.650000       False              3
 4           1.075000       False              4
 ...              ...         ...

In [15]:
# Boolean anomaly flags for one feature (the column is already of bool dtype).
anomaly_scores['blink_intensity_smooth_rz']['is_anomaly']

0       False
1       False
2       False
3       False
4       False
        ...  
1259    False
1260    False
1261    False
1262    False
1263    False
Name: is_anomaly, Length: 1264, dtype: bool

In [16]:
# Per feature: collect the flagged rows (their Time and anomaly_index) and let
# get_anomalous_time_ranges group them; the result is stored under the feature name.
c_anomalous = {}
for key, value in anomaly_scores.items():
    # Index labels of the rows flagged as anomalous for this feature.
    flagged = value.index[value['is_anomaly'].to_numpy()]

    pass_df = df.loc[flagged, 'Time'].to_frame()
    pass_df['anomaly_index'] = value.loc[flagged, 'anomaly_index']

    c_anomalous[key] = get_anomalous_time_ranges(anomalies_time=pass_df)

In [17]:
# Display the grouped anomalous ranges computed for each feature.
c_anomalous

{'blink_intensity_smooth_rz': [[78, 79],
  [241, 242, 244, 245],
  [292, 295, 296, 298],
  [329, 330, 331, 332, 333, 336],
  [586, 588, 590],
  [602, 604],
  [620, 624],
  [729, 732],
  [815, 816, 817, 818, 819, 820, 821, 822],
  [875, 876, 878, 879, 880, 881],
  [1062, 1063],
  [1115, 1116],
  [1145, 1146],
  [1152, 1153, 1154, 1155, 1158],
  [1196, 1197, 1198, 1199, 1203]],
 'gaze_magnitude_smooth_rz': [[148, 149, 150, 151],
  [258, 259, 260, 262, 264],
  [330, 331, 332, 333],
  [386, 387, 388],
  [424, 425, 426, 427, 428, 429],
  [471, 472, 473, 474, 475, 476],
  [515, 516],
  [569, 570, 571, 572, 573],
  [628, 631, 634],
  [793, 796, 800, 801, 802],
  [957, 958, 959],
  [983, 987],
  [1061, 1062, 1063],
  [1073, 1074, 1075, 1076],
  [1164, 1165],
  [1202, 1206, 1208, 1209, 1210, 1211, 1212, 1213, 1214, 1215, 1216, 1217]],
 'jaw_magnitude_smooth_rz': [[27, 28],
  [70, 71, 72, 73],
  [106, 108, 110],
  [210, 211, 212],
  [218, 219],
  [238, 240],
  [278, 279, 280, 281],
  [294, 295, 

In [None]:
# Hand the per-feature anomalous ranges to feature_engineering in "evaluation" mode.
# NOTE(review): this cell has no recorded execution; what feature_engineering
# returns or writes is not visible from this notebook — confirm in its module.
feature_engineering(mode="evaluation", c_anomalies=c_anomalous)