### Run RRCF in parallel for each feature and build the final dataset that the agent will see

In [1]:
import sys
import pandas as pd
import numpy as np
from pathlib import Path
from joblib import Parallel, delayed

In [2]:
# Locate the directory one level above the notebook's working directory and
# put it at the front of sys.path (once), so the `src.*` imports below resolve.
current_dir = Path.cwd()
project_root = current_dir.parent
root_entry = str(project_root)
if root_entry not in sys.path:
    sys.path.insert(0, root_entry)
    print('Done')

Done


In [3]:
from src.sync.RRCF_TS_anomaly_detection import *
from src.utils.save_data import *
from src.sync.Feature_Transformation import feature_engineering, compute_speaker_median_pitch, get_speaker_timings

In [4]:
# Both input tables live in the same processed/merge folder.
merge_dir = project_root / "data" / "processed" / "merge"

# Smoothed, normalised feature table (columns carry the `_smooth_rz` suffix).
path = merge_dir / "final_smooth_rz_data.parquet"
df = load_df_parquet_safe(path=str(path))

# Merged table without that normalisation; used later for the raw face/audio values.
path = merge_dir / "avw_merged_2.parquet"
non_normfe_df = load_df_parquet_safe(path=str(path))

In [5]:
# Preview the first five rows of the normalised feature table.
df.head(n=5)

Unnamed: 0,Time,blink_intensity_smooth_rz,gaze_magnitude_smooth_rz,jaw_magnitude_smooth_rz,smile_intensity_smooth_rz,loudness_db_smooth_rz,pitch_relative_st_smooth_rz,pitch_expressiveness_st_smooth_rz,wps_smooth_rz,words,text_concat,speaker,filler_percentage,pause_percent_pr
0,0.0,3.571464,1.55596,-0.351862,1.418536,,,,,"[We're, starting]",We're starting,,0.064433,0.079177
1,0.5,1.74083,1.323856,-0.425948,1.438164,,,,,"[now., [*]]",now. [*],A,0.064433,0.079177
2,1.0,1.297544,1.195807,-0.627946,1.056126,,,,,,,A,0.064433,0.079177
3,1.5,1.004625,0.428483,-0.742188,0.60234,,,,,"[So, welcome]",So welcome,A,0.064433,0.079177
4,2.0,1.322258,1.213327,-0.828038,0.355089,,,,,"[to, the]",to the,A,0.064433,0.079177


In [6]:
# (column name, kind code) pairs. The kind code is forwarded unchanged as the
# `type` argument of get_data_ready.
# NOTE(review): the meaning of 'ui' / 'ud' is defined inside get_data_ready and
# is not visible in this notebook — confirm before adding new features.
feature_cols = [
    ('blink_intensity_smooth_rz', 'ui'),
    ('gaze_magnitude_smooth_rz', 'ui'),
    ('jaw_magnitude_smooth_rz', 'ui'),
    ('smile_intensity_smooth_rz', 'ui'),
    ('loudness_db_smooth_rz', 'ud'),
    ('pitch_relative_st_smooth_rz', 'ud'),
    ('pitch_expressiveness_st_smooth_rz', 'ud'),
    ('wps_smooth_rz', 'ud'),
    ('filler_percentage', 'ud'),
    ('pause_percent_pr', 'ud'),
]

# features: column name -> prepared vector handed to RRCF.
# feat_id:  column name -> row ids returned alongside that vector (used later
#           as the index of the per-feature score frames).
features = {}
feat_id = {}
# The loop variable is deliberately NOT called `type`: at notebook top level
# that would rebind the builtin `type` for every later cell in the kernel.
for name, feature_kind in feature_cols:
    f_id, vector = get_data_ready(data=df, features=[name], type=feature_kind)
    features[name] = vector
    feat_id[name] = f_id

In [7]:
# Eyeball each prepared vector, in the order the features were registered.
for vector in features.values():
    print(vector)

[[3.57146417]
 [1.74083022]
 [1.29754446]
 ...
 [1.6549074 ]
 [1.252199  ]
 [1.04397284]]
[[1.55595991]
 [1.3238559 ]
 [1.19580709]
 ...
 [1.07019098]
 [0.47603773]
 [0.01574653]]
[[-0.35186165]
 [-0.42594774]
 [-0.62794575]
 ...
 [-0.90239459]
 [-0.90098368]
 [-0.89959585]]
[[ 1.41853557e+00]
 [ 1.43816411e+00]
 [ 1.05612625e+00]
 ...
 [ 2.40739112e-01]
 [ 8.45211635e-04]
 [-1.89532529e-01]]
[[-0.39149598]
 [-1.81300585]
 [-1.78566912]
 ...
 [ 0.50776887]
 [ 0.0872767 ]
 [-1.10199763]]
[[-0.67413378]
 [-0.69538963]
 [-0.62424157]
 ...
 [-0.66407274]
 [-1.09142559]
 [-0.93226954]]
[[ 0.3390234 ]
 [-0.34631684]
 [-1.44226786]
 ...
 [-0.53677324]
 [-0.70190335]
 [-1.69625822]]
[[-3.21992348]
 [-3.21992348]
 [-2.49393394]
 ...
 [ 1.21875057]
 [ 2.12852661]
 [ 0.60039801]]
[[ 0.06443299]
 [ 0.06443299]
 [ 0.06443299]
 ...
 [13.46649485]
 [13.46649485]
 [13.46649485]]
[[0.07917656]
 [0.07917656]
 [0.07917656]
 ...
 [9.58036421]
 [9.58036421]
 [9.65954078]]


In [8]:
# Score every feature independently; n_jobs=-1 lets joblib use all available cores.
# Results come back in the same order as `features`, which later cells rely on.
# NOTE(review): no seed is passed to run_rrcf here — if the forest is randomised,
# scores may differ between runs; confirm.
rrcf_jobs = (
    delayed(run_rrcf)(features=vector, shingle=10)
    for vector in features.values()
)
results = Parallel(n_jobs=-1)(rrcf_jobs)

In [9]:
# Wrap each raw score array in a one-column frame indexed by the row ids that
# get_data_ready returned for that feature. Names are paired with results by
# position, so this relies on `results` following the order of `feature_cols`.
feature_names = [name for name, _ in feature_cols]
anomaly_scores = {
    name: pd.DataFrame(scores, index=feat_id[name], columns=['anomaly_scores'])
    for name, scores in zip(feature_names, results)
}

In [10]:
# Keep df's index at hand so that anomaly row labels can be translated to
# row positions with `indexes.get_loc(...)` in a later cell.
indexes = df.index

In [11]:
# Sanity check: index label of the row at Time == 2.5.
# `.item()` raises unless exactly one row matches.
df.loc[df['Time'] == 2.5].index.item()

5

In [12]:
# Inspect the raw RRCF scores for one feature.
anomaly_scores['blink_intensity_smooth_rz'].loc[:, 'anomaly_scores']

0       0.000000
1       1.000000
2       1.175000
3       1.450000
4       1.025000
          ...   
1259    4.135224
1260    5.072007
1261    2.844137
1262    3.410219
1263    3.707985
Name: anomaly_scores, Length: 1264, dtype: float64

In [13]:
for feature_name, score_frame in anomaly_scores.items():
    # Derive a per-feature cut-off from that feature's own score distribution.
    scores = score_frame['anomaly_scores']
    sigma_multiplier = adaptive_n_sigma(anomaly_scores=scores)
    cutoff = get_threshold_mad(scores=scores, n_sigma=sigma_multiplier)
    score_frame['is_anomaly'] = scores > cutoff

    # For flagged rows only, record the row's position within df's index.
    # Unflagged rows stay NaN, which is why the column ends up float64.
    flagged_labels = score_frame[score_frame['is_anomaly']].index
    score_frame.loc[flagged_labels, ['anomaly_index']] = flagged_labels.map(
        lambda label: int(indexes.get_loc(label))
    )

In [14]:
# Show every per-feature frame (score, is_anomaly flag, anomaly_index).
anomaly_scores

{'blink_intensity_smooth_rz':       anomaly_scores  is_anomaly  anomaly_index
 0           0.000000       False            NaN
 1           1.000000       False            NaN
 2           1.175000       False            NaN
 3           1.450000       False            NaN
 4           1.025000       False            NaN
 ...              ...         ...            ...
 1259        4.135224       False            NaN
 1260        5.072007       False            NaN
 1261        2.844137       False            NaN
 1262        3.410219       False            NaN
 1263        3.707985       False            NaN
 
 [1264 rows x 3 columns],
 'gaze_magnitude_smooth_rz':       anomaly_scores  is_anomaly  anomaly_index
 0           0.000000       False            NaN
 1           1.000000       False            NaN
 2           1.475000       False            NaN
 3           2.575000       False            NaN
 4           1.050000       False            NaN
 ...              ...         ...

In [15]:
# Boolean mask of flagged rows for one feature (the column is already bool,
# so comparing against True leaves it unchanged).
anomaly_scores['blink_intensity_smooth_rz']['is_anomaly'].eq(True)

0       False
1       False
2       False
3       False
4       False
        ...  
1259    False
1260    False
1261    False
1262    False
1263    False
Name: is_anomaly, Length: 1264, dtype: bool

In [16]:
# feature name -> groups of anomalous positions, as produced by
# get_anomalous_time_ranges from the flagged rows' Time and anomaly_index.
c_anomalous = {}
for feature_name, score_frame in anomaly_scores.items():
    flagged_labels = score_frame[score_frame['is_anomaly']].index

    # Timestamps of the flagged rows, with their row positions alongside.
    flagged_times = pd.DataFrame(df.loc[flagged_labels, 'Time'], columns=['Time'])
    flagged_times['anomaly_index'] = score_frame.loc[flagged_labels, 'anomaly_index']

    c_anomalous[feature_name] = get_anomalous_time_ranges(anomalies_time=flagged_times)

In [17]:
# Show the grouped anomalous positions per feature.
c_anomalous

{'blink_intensity_smooth_rz': [[78, 79, 80, 81],
  [292, 295, 296, 298],
  [330, 331, 332, 333],
  [586, 588, 590],
  [602, 603, 604],
  [620, 622, 624],
  [815, 816, 817, 818, 819, 820, 821, 822],
  [852, 853],
  [875, 878, 879, 880, 881],
  [1062, 1063],
  [1114, 1115, 1116],
  [1145, 1146, 1147, 1148, 1152, 1153, 1154, 1155, 1158, 1160],
  [1169, 1171],
  [1196, 1197, 1198, 1199, 1200, 1204, 1206]],
 'gaze_magnitude_smooth_rz': [[148, 149, 150, 151],
  [258, 259, 260, 264],
  [328, 330, 331, 332, 333],
  [387, 388],
  [424, 425, 426, 427],
  [471, 472, 473, 474, 475, 476],
  [513, 515, 516],
  [569, 570, 571, 572, 573],
  [793, 795, 796, 799, 800, 801, 802],
  [957, 958, 959],
  [966, 967],
  [983, 987],
  [1061, 1062, 1063],
  [1073, 1074, 1075, 1076],
  [1162, 1164, 1165],
  [1208,
   1209,
   1210,
   1211,
   1212,
   1213,
   1214,
   1215,
   1216,
   1217,
   1218,
   1219,
   1220,
   1221]],
 'jaw_magnitude_smooth_rz': [[27, 28, 29],
  [71, 72],
  [106, 108, 110],
  [124, 1

In [75]:
# Inspect the anomaly_index column for one feature (NaN where the row is not flagged).
# NOTE(review): this cell's execution count is out of sequence with its
# neighbours — re-run the notebook top to bottom before trusting its output.
anomaly_scores['blink_intensity_smooth_rz'].loc[:, 'anomaly_index']

0      NaN
1      NaN
2      NaN
3      NaN
4      NaN
        ..
1259   NaN
1260   NaN
1261   NaN
1262   NaN
1263   NaN
Name: anomaly_index, Length: 1264, dtype: float64

In [18]:
# feature name -> flat list of flagged row positions.
# anomaly_index is NaN-padded and therefore float64; after dropping the NaNs the
# remaining values are whole numbers, so cast them back to plain Python ints
# (positions are integers, and ints compare equal to the former float values).
anomalies = {
    key: value['anomaly_index'].dropna().astype(int).tolist()
    for key, value in anomaly_scores.items()
}

In [None]:
# Show the flagged positions collected per feature.
# NOTE(review): the disabled line below was an attempt to cast the positions to
# int; as written it iterates the dict directly and would need `.items()` to
# unpack key/value pairs.
# anomalies = {key: list(map(int, value)) for key, value in anomalies}
anomalies

In [26]:
# List the columns of the non-normalised merged frame (source for `cols` below).
non_normfe_df.columns

Index(['Time', 'words', 'text_concat', 'h_ratio', 'v_ratio', '_neutral',
       'browDownLeft', 'browDownRight', 'browInnerUp', 'browOuterUpLeft',
       'browOuterUpRight', 'cheekPuff', 'cheekSquintLeft', 'cheekSquintRight',
       'eyeBlinkLeft', 'eyeBlinkRight', 'eyeLookDownLeft', 'eyeLookDownRight',
       'eyeLookInLeft', 'eyeLookInRight', 'eyeLookOutLeft', 'eyeLookOutRight',
       'eyeLookUpLeft', 'eyeLookUpRight', 'eyeSquintLeft', 'eyeSquintRight',
       'eyeWideLeft', 'eyeWideRight', 'jawForward', 'jawLeft', 'jawOpen',
       'jawRight', 'mouthClose', 'mouthDimpleLeft', 'mouthDimpleRight',
       'mouthFrownLeft', 'mouthFrownRight', 'mouthFunnel', 'mouthLeft',
       'mouthLowerDownLeft', 'mouthLowerDownRight', 'mouthPressLeft',
       'mouthPressRight', 'mouthPucker', 'mouthRight', 'mouthRollLower',
       'mouthRollUpper', 'mouthShrugLower', 'mouthShrugUpper',
       'mouthSmileLeft', 'mouthSmileRight', 'mouthStretchLeft',
       'mouthStretchRight', 'mouthUpperUpLeft', 'mo

In [31]:
# Columns of non_normfe_df that get forward-filled in the next cell:
# the two gaze ratios plus the face blendshape-style columns.
# Hand-copied from `non_normfe_df.columns`; keep in sync if the schema changes.
cols = ['h_ratio', 'v_ratio', '_neutral',
       'browDownLeft', 'browDownRight', 'browInnerUp', 'browOuterUpLeft',
       'browOuterUpRight', 'cheekPuff', 'cheekSquintLeft', 'cheekSquintRight',
       'eyeBlinkLeft', 'eyeBlinkRight', 'eyeLookDownLeft', 'eyeLookDownRight',
       'eyeLookInLeft', 'eyeLookInRight', 'eyeLookOutLeft', 'eyeLookOutRight',
       'eyeLookUpLeft', 'eyeLookUpRight', 'eyeSquintLeft', 'eyeSquintRight',
       'eyeWideLeft', 'eyeWideRight', 'jawForward', 'jawLeft', 'jawOpen',
       'jawRight', 'mouthClose', 'mouthDimpleLeft', 'mouthDimpleRight',
       'mouthFrownLeft', 'mouthFrownRight', 'mouthFunnel', 'mouthLeft',
       'mouthLowerDownLeft', 'mouthLowerDownRight', 'mouthPressLeft',
       'mouthPressRight', 'mouthPucker', 'mouthRight', 'mouthRollLower',
       'mouthRollUpper', 'mouthShrugLower', 'mouthShrugUpper',
       'mouthSmileLeft', 'mouthSmileRight', 'mouthStretchLeft',
       'mouthStretchRight', 'mouthUpperUpLeft', 'mouthUpperUpRight',
       'noseSneerLeft', 'noseSneerRight']

In [32]:
# Forward-fill gaps in the selected columns so each row carries the most recent
# observed value. This overwrites the columns on non_normfe_df itself.
non_normfe_df[cols] = non_normfe_df[cols].ffill()

In [33]:
# Check that no jawOpen gaps remain after the forward fill; an empty frame is
# the expected result (ffill cannot fill NaNs that precede the first value).
non_normfe_df[non_normfe_df['jawOpen'].isna()]

Unnamed: 0,Time,words,text_concat,h_ratio,v_ratio,_neutral,browDownLeft,browDownRight,browInnerUp,browOuterUpLeft,...,mouthUpperUpRight,noseSneerLeft,noseSneerRight,audio_rms(volumn),audio_pitch_avg,audio_pitch_var(expressiveness),is_silent,speaker,filler_percentage,pause_percent_pr


In [20]:
# Time ranges for speaker 'B', derived from the Time/speaker columns.
speaker_times = df.loc[:, ['Time', 'speaker']]
speaker_timmings = get_speaker_timings(speaker_times=speaker_times, speaker='B')

In [20]:
# Median pitch of the speaker, computed from the raw recording over the
# speaker's own segments only.
audio_path = project_root.joinpath("data", "raw", "Interview_2.wav")
median = compute_speaker_median_pitch(
    audio_path=str(audio_path),
    speaker_segments=speaker_timmings,
)

In [21]:
# NOTE(review): this hard-coded value replaces whatever
# compute_speaker_median_pitch returned in the previous cell — presumably a
# cached result to skip the audio pass. Confirm it matches the computed median,
# or drop one of the two assignments.
median = 158.74

In [34]:
# Build the per-row feature payloads from the raw frame, the normalised frame
# and the anomaly information gathered above.
final_dataframe = feature_engineering(
    mode="evaluation",
    c_anomalies=c_anomalous,
    anomalies=anomalies,
    df=non_normfe_df,
    norm_rz_df=df,
    speaker_median_pitch=median,
    speaker='B',
)

In [39]:
# Spot-check rows 20-39 (positional slice) of the engineered frame.
final_dataframe.iloc[20:40]

Unnamed: 0,blinking_data,gaze_data,jaw_movement_data,smile_data,loudness_data,average_pitch_data,pitch_standard_deviation,words_per_sec,filler_words_usage,pauses_taken
20,"{'intensity': None, 'asymmetry': None, 'is_bli...","{'horizontal_deviation': None, 'vertical_devia...","{'open': 0.0003300155804026872, 'lateral': Non...","{'intensity': 1.970937121775762e-05, 'asymmetr...",,,,,,
21,"{'intensity': None, 'asymmetry': None, 'is_bli...","{'horizontal_deviation': None, 'vertical_devia...","{'open': 9.470622899243608e-05, 'lateral': Non...","{'intensity': 0.0026545619226879145, 'asymmetr...",,,,,,
22,"{'intensity': None, 'asymmetry': None, 'is_bli...","{'horizontal_deviation': None, 'vertical_devia...","{'open': 0.06632930040359497, 'lateral': None,...","{'intensity': 5.0152379290580026e-05, 'asymmet...",,,,,,
23,"{'intensity': None, 'asymmetry': None, 'is_bli...","{'horizontal_deviation': None, 'vertical_devia...","{'open': 0.0005459068925119936, 'lateral': Non...","{'intensity': 2.1747304715802328e-05, 'asymmet...","{'level': 'normal', 'rz_score': -0.39149597783...","{'relative_level': 'normal', 'rz_score': -0.67...","{'expressiveness': 'expressive', 'rz_score': 0...","{'speaking_rate': 'very_slow', 'rz_score': -3....","{'filler_percentage_level': 'normal', 'is_anom...","{'pause_percentage_level': 'normal', 'is_anoma..."
24,"{'intensity': None, 'asymmetry': None, 'is_bli...","{'horizontal_deviation': None, 'vertical_devia...","{'open': 0.0005173762328922749, 'lateral': Non...","{'intensity': 1.3380005162844098e-05, 'asymmet...","{'level': 'normal', 'rz_score': -1.81300585287...","{'relative_level': 'normal', 'rz_score': -0.69...","{'expressiveness': 'expressive', 'rz_score': -...","{'speaking_rate': 'very_slow', 'rz_score': -3....","{'filler_percentage_level': 'normal', 'is_anom...","{'pause_percentage_level': 'normal', 'is_anoma..."
25,"{'intensity': None, 'asymmetry': None, 'is_bli...","{'horizontal_deviation': None, 'vertical_devia...","{'open': 0.018747227266430855, 'lateral': None...","{'intensity': 1.3724788826152689e-05, 'asymmet...","{'level': 'normal', 'rz_score': -1.78566912450...","{'relative_level': 'normal', 'rz_score': -0.62...","{'expressiveness': 'slightly_expressive', 'rz_...","{'speaking_rate': 'slow', 'rz_score': -2.49393...","{'filler_percentage_level': 'normal', 'is_anom...","{'pause_percentage_level': 'normal', 'is_anoma..."
26,"{'intensity': None, 'asymmetry': None, 'is_bli...","{'horizontal_deviation': None, 'vertical_devia...","{'open': 0.041224658489227295, 'lateral': None...","{'intensity': 1.0467591879148584e-05, 'asymmet...","{'level': 'normal', 'rz_score': -1.13186570441...","{'relative_level': 'normal', 'rz_score': -0.46...","{'expressiveness': 'slightly_expressive', 'rz_...","{'speaking_rate': 'normal', 'rz_score': -1.249...","{'filler_percentage_level': 'normal', 'is_anom...","{'pause_percentage_level': 'normal', 'is_anoma..."
27,"{'intensity': None, 'asymmetry': None, 'is_bli...","{'horizontal_deviation': None, 'vertical_devia...","{'open': 0.09769998490810394, 'lateral': -0.00...","{'intensity': 6.6711254877560576e-06, 'asymmet...","{'level': 'normal', 'rz_score': -1.11288186527...","{'relative_level': 'normal', 'rz_score': -0.57...","{'expressiveness': 'slightly_expressive', 'rz_...","{'speaking_rate': 'normal', 'rz_score': -0.360...","{'filler_percentage_level': 'normal', 'is_anom...","{'pause_percentage_level': 'normal', 'is_anoma..."
28,"{'intensity': None, 'asymmetry': None, 'is_bli...","{'horizontal_deviation': None, 'vertical_devia...","{'open': 0.14945782721042633, 'lateral': -0.06...","{'intensity': 6.139134125859868e-06, 'asymmetr...","{'level': 'normal', 'rz_score': -0.80066099083...","{'relative_level': 'normal', 'rz_score': -0.69...","{'expressiveness': 'slightly_expressive', 'rz_...","{'speaking_rate': 'normal', 'rz_score': 0.2745...","{'filler_percentage_level': 'normal', 'is_anom...","{'pause_percentage_level': 'normal', 'is_anoma..."
29,"{'intensity': None, 'asymmetry': None, 'is_bli...","{'horizontal_deviation': None, 'vertical_devia...","{'open': 0.09816937148571014, 'lateral': -0.01...","{'intensity': 9.85508647630695e-06, 'asymmetry...","{'level': 'normal', 'rz_score': -1.15405570305...","{'relative_level': 'normal', 'rz_score': -0.92...","{'expressiveness': 'slightly_expressive', 'rz_...","{'speaking_rate': 'normal', 'rz_score': 0.7281...","{'filler_percentage_level': 'normal', 'is_anom...","{'pause_percentage_level': 'normal', 'is_anoma..."


In [42]:
# Translate the first gaze anomaly group back to timestamps. The group holds
# row positions, hence the positional `.iloc` lookup.
non_normfe_df['Time'].iloc[c_anomalous['gaze_magnitude_smooth_rz'][0]]

148    74.0
149    74.5
150    75.0
151    75.5
Name: Time, dtype: float64

In [53]:
# Column listing of the non-normalised frame, repeated here before copying the
# Time/words/text_concat columns across.
non_normfe_df.columns

Index(['Time', 'words', 'text_concat', 'h_ratio', 'v_ratio', '_neutral',
       'browDownLeft', 'browDownRight', 'browInnerUp', 'browOuterUpLeft',
       'browOuterUpRight', 'cheekPuff', 'cheekSquintLeft', 'cheekSquintRight',
       'eyeBlinkLeft', 'eyeBlinkRight', 'eyeLookDownLeft', 'eyeLookDownRight',
       'eyeLookInLeft', 'eyeLookInRight', 'eyeLookOutLeft', 'eyeLookOutRight',
       'eyeLookUpLeft', 'eyeLookUpRight', 'eyeSquintLeft', 'eyeSquintRight',
       'eyeWideLeft', 'eyeWideRight', 'jawForward', 'jawLeft', 'jawOpen',
       'jawRight', 'mouthClose', 'mouthDimpleLeft', 'mouthDimpleRight',
       'mouthFrownLeft', 'mouthFrownRight', 'mouthFunnel', 'mouthLeft',
       'mouthLowerDownLeft', 'mouthLowerDownRight', 'mouthPressLeft',
       'mouthPressRight', 'mouthPucker', 'mouthRight', 'mouthRollLower',
       'mouthRollUpper', 'mouthShrugLower', 'mouthShrugUpper',
       'mouthSmileLeft', 'mouthSmileRight', 'mouthStretchLeft',
       'mouthStretchRight', 'mouthUpperUpLeft', 'mo

In [54]:
# Carry the timestamp and transcript columns over to the engineered frame.
# Assignment aligns on the index, so both frames must share the same row labels.
text_cols = ['Time', 'words', 'text_concat']
final_dataframe[text_cols] = non_normfe_df[text_cols]

In [55]:
# Final look at the dataset before it is written to disk.
final_dataframe

Unnamed: 0,blinking_data,gaze_data,jaw_movement_data,smile_data,loudness_data,average_pitch_data,pitch_standard_deviation,words_per_sec,filler_words_usage,pauses_taken,Time,words,text_concat
0,"{'intensity': None, 'asymmetry': None, 'is_bli...","{'horizontal_deviation': None, 'vertical_devia...","{'open': 0.023826630786061287, 'lateral': None...","{'intensity': 0.00020072988331669704, 'asymmet...",,,,,,,0.0,"[We're, starting]",We're starting
1,"{'intensity': None, 'asymmetry': None, 'is_bli...","{'horizontal_deviation': None, 'vertical_devia...","{'open': 0.01953260414302349, 'lateral': None,...","{'intensity': 0.00020793307342685806, 'asymmet...",,,,,,,0.5,"[now., [*]]",now. [*]
2,"{'intensity': None, 'asymmetry': None, 'is_bli...","{'horizontal_deviation': None, 'vertical_devia...","{'open': 0.0008870773017406464, 'lateral': Non...","{'intensity': 6.213216343535776e-05, 'asymmetr...",,,,,,,1.0,,
3,"{'intensity': None, 'asymmetry': None, 'is_bli...","{'horizontal_deviation': None, 'vertical_devia...","{'open': 0.0015012087533250451, 'lateral': Non...","{'intensity': 4.647151249770331e-06, 'asymmetr...",,,,,,,1.5,"[So, welcome]",So welcome
4,"{'intensity': None, 'asymmetry': None, 'is_bli...","{'horizontal_deviation': None, 'vertical_devia...","{'open': 0.00043307070154696703, 'lateral': No...","{'intensity': 4.343396951611567e-05, 'asymmetr...",,,,,,,2.0,"[to, the]",to the
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1259,"{'intensity': None, 'asymmetry': None, 'is_bli...","{'horizontal_deviation': None, 'vertical_devia...","{'open': 0.0018719969084486365, 'lateral': Non...","{'intensity': 2.4302891581129413e-05, 'asymmet...",,,,,,,629.5,[So],So
1260,"{'intensity': None, 'asymmetry': None, 'is_bli...","{'horizontal_deviation': None, 'vertical_devia...","{'open': 0.0018371319165453315, 'lateral': Non...","{'intensity': 2.613913233808773e-05, 'asymmetr...",,,,,,,630.0,"[yeah,]","yeah,"
1261,"{'intensity': None, 'asymmetry': None, 'is_bli...","{'horizontal_deviation': None, 'vertical_devia...","{'open': 0.001750032533891499, 'lateral': None...","{'intensity': 2.566660534313314e-05, 'asymmetr...",,,,,,,630.5,"[that's, a]",that's a
1262,"{'intensity': None, 'asymmetry': None, 'is_bli...","{'horizontal_deviation': None, 'vertical_devia...","{'open': 0.0019191708415746689, 'lateral': Non...","{'intensity': 1.664540692782879e-05, 'asymmetr...",,,,,,,631.0,"[really, solid, approach.]",really solid approach.


In [56]:
# Persist the engineered dataset next to the other merged artefacts.
path = project_root.joinpath("data", "processed", "merge", "final_dataframe.parquet")
save_df_parquet_safe(df=final_dataframe, path=str(path))

Saved Parquet: c:\Users\BIT\Desktop\mmr\data\processed\merge\final_dataframe.parquet
Saved schema: c:\Users\BIT\Desktop\mmr\data\processed\merge\final_dataframe.parquet.schema.json
