In [97]:
import pandas as pd
import opensmile
from tqdm import tqdm 
import argparse
from pydub import AudioSegment
import os 


import cv2
import glob
from pathlib import Path
from PIL import Image

import numpy as np

# mediapipe 
import mediapipe as mp
from mediapipe.tasks import python
from mediapipe.tasks.python import vision


In [98]:
# Load the example index table; later cells read its `txt_img_path` and
# `audio_path` columns.
ex_df = pd.read_csv(filepath_or_buffer='./ex_data.csv')

# 1. Extract Audio Features

In [101]:
# openSMILE extractor shared by the audio cell below: eGeMAPSv02 feature set,
# extracted at the Functionals feature level.
smile = opensmile.Smile(
    feature_level=opensmile.FeatureLevel.Functionals,
    feature_set=opensmile.FeatureSet.eGeMAPSv02,
)

In [105]:
# For every row of `ex_df`, run openSMILE once per token of the transcript and
# save the per-token feature rows as JSON in a parallel 'opensmile_bind' tree.
# `opensmile_file_list` collects the written paths in `ex_df` row order.
opensmile_file_list = []
for row1 in tqdm(ex_df.itertuples(), total=len(ex_df)):
    new_file_name = row1.txt_img_path.replace('text_bind', 'opensmile_bind')

    # Use os.path.dirname rather than str.replace(basename, ''): replace() would
    # also strip the basename wherever else it occurs in the path.
    new_path = os.path.dirname(new_file_name)
    if new_path:  # os.makedirs('') raises, so skip when there is no directory part
        os.makedirs(new_path, exist_ok=True)

    txt_df = pd.read_json(row1.txt_img_path)

    # Token times are shifted so the first token starts at 0 before they are
    # passed to openSMILE.
    # NOTE(review): presumably the audio clip begins at the first token -- confirm.
    zero_point = txt_df.start.iloc[0]

    # Collect the per-token frames and concatenate once; growing a DataFrame
    # with pd.concat inside the loop copies all earlier rows on every iteration.
    token_features = [
        smile.process_file(
            row1.audio_path,
            start=row2.start - zero_point,
            end=row2.end - zero_point,
        )
        for row2 in txt_df.itertuples()
    ]
    aud_df = pd.concat(token_features, axis=0) if token_features else pd.DataFrame()

    aud_df.reset_index(drop=True).to_json(new_file_name)
    opensmile_file_list.append(new_file_name)

100%|███████████████████████████████████████████| 10/10 [00:19<00:00,  1.94s/it]


# 2. Extract Gesture Features

In [107]:
# Short alias for the MediaPipe Holistic solution module; the gesture cell uses
# its `Holistic` context manager and iterates its `PoseLandmark` members.
mp_holistic = mp.solutions.holistic

In [108]:
# For every transcript in `ex_df`, run MediaPipe Holistic on each token image
# and save one flat record per image (token_img_path + <LANDMARK>_{x,y,z}) as
# JSON in a parallel 'pose_bind' tree. `pose_file_list` collects the written
# paths in `ex_df` row order.
pose_file_list = []
pose_landmark_names = [mark.name for mark in mp_holistic.PoseLandmark]

# The model is created once and reused: with static_image_mode=True it is
# configured for unrelated still images, so building it again for each image
# only adds start-up cost.
with mp_holistic.Holistic(static_image_mode=True, min_detection_confidence=0.5, model_complexity=2) as holistic:
    for txt_video_path in tqdm(ex_df['txt_img_path'].tolist()):

        txt_df = pd.read_json(txt_video_path)

        new_file_name = txt_video_path.replace('text_bind', 'pose_bind')
        # os.path.dirname instead of str.replace(basename, ''), which would also
        # strip the basename wherever else it occurs in the path.
        new_path = os.path.dirname(new_file_name)
        if new_path:  # os.makedirs('') raises, so skip when there is no directory part
            os.makedirs(new_path, exist_ok=True)

        # One dict per image, turned into a DataFrame once at the end
        # (DataFrame.append no longer exists in pandas >= 2.0).
        records = []
        for token_img_path in txt_df['token_img_path'].tolist():
            # Start from an all-NaN row so every record has the same columns
            # whether or not a pose is found.
            dic = {'token_img_path': token_img_path}
            for keypoint in pose_landmark_names:
                for coord in ['x', 'y', 'z']:
                    dic[f'{keypoint}_{coord}'] = np.nan

            image = cv2.imread(token_img_path)
            if image is None:
                # cv2.imread returns None for a missing/unreadable file; keep the
                # NaN row so records stay aligned with the rows of txt_df.
                tqdm.write(f'Could not read image, pose left as NaN: {token_img_path}')
            else:
                results = holistic.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
                if results.pose_landmarks:
                    for keypoint, data_point in zip(pose_landmark_names, results.pose_landmarks.landmark):
                        dic[f'{keypoint}_x'] = data_point.x
                        dic[f'{keypoint}_y'] = data_point.y
                        dic[f'{keypoint}_z'] = data_point.z

            records.append(dic)

        pd.DataFrame(records).to_json(new_file_name, orient='records')
        pose_file_list.append(new_file_name)

100%|███████████████████████████████████████████| 10/10 [02:05<00:00, 12.55s/it]


In [111]:
# Attach the feature-file paths written by the two extraction cells; both lists
# were filled while iterating `ex_df` in row order.
ex_df = ex_df.assign(
    pose_path=pose_file_list,
    opensmile_path=opensmile_file_list,
)

In [112]:
# Pose columns kept for the merge: x, y, z for each of the 23 landmarks listed
# below (69 names), ordered landmark by landmark.
video_col = [
    f'{landmark}_{axis}'
    for landmark in (
        'NOSE',
        'LEFT_EYE_INNER', 'LEFT_EYE', 'LEFT_EYE_OUTER',
        'RIGHT_EYE_INNER', 'RIGHT_EYE', 'RIGHT_EYE_OUTER',
        'LEFT_EAR', 'RIGHT_EAR',
        'MOUTH_LEFT', 'MOUTH_RIGHT',
        'LEFT_SHOULDER', 'RIGHT_SHOULDER',
        'LEFT_ELBOW', 'RIGHT_ELBOW',
        'LEFT_WRIST', 'RIGHT_WRIST',
        'LEFT_PINKY', 'RIGHT_PINKY',
        'LEFT_INDEX', 'RIGHT_INDEX',
        'LEFT_THUMB', 'RIGHT_THUMB',
    )
    for axis in ('x', 'y', 'z')
]

# openSMILE columns kept for the merge (25 names, all ending in `_amean`).
audio_col = [
    'F0semitoneFrom27.5Hz_sma3nz_amean',
    'F1amplitudeLogRelF0_sma3nz_amean',
    'F1bandwidth_sma3nz_amean',
    'F1frequency_sma3nz_amean',
    'F2amplitudeLogRelF0_sma3nz_amean',
    'F2bandwidth_sma3nz_amean',
    'F2frequency_sma3nz_amean',
    'F3amplitudeLogRelF0_sma3nz_amean',
    'F3bandwidth_sma3nz_amean',
    'F3frequency_sma3nz_amean',
    'HNRdBACF_sma3nz_amean',
    'alphaRatioV_sma3nz_amean',
    'hammarbergIndexV_sma3nz_amean',
    'jitterLocal_sma3nz_amean',
    'logRelF0-H1-A3_sma3nz_amean',
    'logRelF0-H1-H2_sma3nz_amean',
    'loudness_sma3_amean',
    'mfcc1_sma3_amean',
    'mfcc2_sma3_amean',
    'mfcc3_sma3_amean',
    'mfcc4_sma3_amean',
    'shimmerLocaldB_sma3nz_amean',
    'slopeV0-500_sma3nz_amean',
    'slopeV500-1500_sma3nz_amean',
    'spectralFlux_sma3_amean',
]

In [119]:
# Build, per row of `ex_df`, one frame with the transcript columns followed by
# the selected audio and pose columns (aligned on the row index).
# interpolate() fills NaN gaps in the feature columns before merging.
# The merged frame is NOT written back (the to_json line is disabled), so only
# the `df_txt` of the last row is still available after the loop.
# total= gives tqdm a known length, matching the audio extraction loop.
for row in tqdm(ex_df.itertuples(), total=len(ex_df)):
    df_aud = pd.read_json(row.opensmile_path)[audio_col].interpolate()
    df_vid = pd.read_json(row.pose_path)[video_col].interpolate()

    df_txt = pd.concat([pd.read_json(row.txt_img_path), df_aud, df_vid], axis=1)
#     df_txt.to_json(row.txt_img_path)

10it [00:00, 15.60it/s]


In [121]:
# Show the columns of `df_txt`, i.e. the merged frame left over from the last
# row processed by the merge loop.
df_txt.columns

Index(['user_name', 'status', 'chunk_id', 'token', 'start', 'end',
       'token_img_path', 'F0semitoneFrom27.5Hz_sma3nz_amean',
       'F1amplitudeLogRelF0_sma3nz_amean', 'F1bandwidth_sma3nz_amean',
       ...
       'LEFT_INDEX_z', 'RIGHT_INDEX_x', 'RIGHT_INDEX_y', 'RIGHT_INDEX_z',
       'LEFT_THUMB_x', 'LEFT_THUMB_y', 'LEFT_THUMB_z', 'RIGHT_THUMB_x',
       'RIGHT_THUMB_y', 'RIGHT_THUMB_z'],
      dtype='object', length=101)

In [None]:
[]