# Data pre-processing

The main pre-processing step saves the frames, facial landmarks, and audio in the HDF5 file format. This notebook has three sections, described below. Each section outputs one HDF5 file with a separate group for each person label. Within each group, a separate dataset is created for each video file. The shape of each dataset is defined below.

1) Extract raw grayscale frames: Each dataset has shape (number of frames in video, frame_sz_y, frame_sz_x). The frame size and the number of frames are given by frame_sz and out_fps*T respectively, where T is the length of the video in seconds.

2) Extract landmarks, 3D head rotation, and action-unit intensities: Each dataset has shape (number of frames in video, 156), where 156 = 68 landmark x + 68 landmark y + 17 action units + 3 head rotation angles. The number of frames is given by out_fps*T, where T is the length of the video in seconds. The landmarks are normalized and frontalized. Note: the landmark cell below currently stores the 34 shape parameters p_0..p_33 in place of the x/y landmarks, so the datasets it writes have 54 columns (34 + 17 + 3).

3) Spectrograms of audio: Each dataset has shape (number of frames, n_fft/2+1). The number of frames is approximately (T-sfft_wl)/sfft_hop, where T is the length of the audio in seconds.

In [5]:
#start by importing the libraries
from IPython.display import clear_output
%matplotlib inline
import h5py
import numpy as np
import os
import pandas as pd
import librosa
import matplotlib.pyplot as plt

In [11]:
# Notebook-wide settings read by the cells below.
bsfldr = '/Users/shrutiagarwal/Documents/MATLAB/cs282/data/fakes/imposter/'# base folder of the dataset: one sub-folder per subject
# NOTE: seq_len is not referenced by any cell shown in this notebook
seq_len = 5 # length of an input sequence in seconds; meant to be the same for the audio spectrogram

# video settings
in_fps = 30 # frame rate of the input videos (frames per second)
out_fps = 10 # frame rate after temporal subsampling; in_fps/out_fps input frames are averaged into one output frame
frame_sz = (128, 128) # size of the stored face image

# audio features; these parameters are picked from the VoxCeleb paper
sfft_wl = 0.025 # spectrogram window length in seconds
sfft_hop = 0.010 # hop length in seconds
n_fft = 1024 # FFT size; each stored spectrogram is t x (n_fft/2+1)

In [12]:
def get_all_files(basefldr):
    """Index the ``.mp4`` videos of every subject folder under ``basefldr``.

    Every immediate sub-directory of ``basefldr`` is treated as one subject.
    Subjects are sorted by name and labelled ``0..N-1`` in that order. The
    subject / label / file-count table is printed and also written to
    ``subject_label.csv`` in the current working directory.

    Parameters
    ----------
    basefldr : str
        Folder holding one sub-directory per subject.

    Returns
    -------
    dict
        Maps each integer label to a sorted numpy array of video paths with
        the ``.mp4`` extension stripped (callers append ``.mp4``, ``.csv`` or
        ``.wav`` as needed).
    """
    # all the subject names, in a deterministic (sorted) order so that the
    # label assigned to a subject is stable between runs
    dirnames = sorted(f for f in os.listdir(basefldr)
                      if os.path.isdir(os.path.join(basefldr, f)))

    listOfFiles = {}
    for label, d in enumerate(dirnames):
        subject_dir = os.path.join(basefldr, d)
        listOfFiles[label] = np.sort(np.array(
            [os.path.join(subject_dir, os.path.splitext(f)[0])
             for f in os.listdir(subject_dir) if f.endswith('.mp4')]))

    # Build the label table in one go. The previous chained assignment
    # (df['file_count'][mask] = n) triggered pandas' SettingWithCopy warning
    # and does not write through under copy-on-write.
    subject_lbl = pd.DataFrame({
        'subjects': dirnames,
        'label': np.arange(len(dirnames)),
        'file_count': [len(listOfFiles[label]) for label in range(len(dirnames))],
    })

    print(subject_lbl)
    subject_lbl.to_csv('subject_label.csv')

    return listOfFiles
    
# {label: array of video paths without extension} for the dataset under bsfldr;
# every export cell below iterates over this mapping
file_paths = get_all_files(bsfldr)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


    subjects  label  file_count
0     bernie      0          12
1      biden      1           0
2       diff      2           0
3    hillary      3          28
4     justin      4           5
5        may      5           8
6   michelle      6           0
7       modi      7          19
8      obama      8          21
9     pelosi      9           0
10     putin     10           0
11     trump     11          24
12    warren     12          10


## Raw Frames 

In [None]:
import cv2

# Face detector used to locate the face in each frame. The Caffe network
# definition (.prototxt) and weights (.caffemodel) are read from the
# 'face_detection_model' folder, resolved relative to the working directory.
protoPath = os.path.sep.join(['face_detection_model', "deploy.prototxt"])
modelPath = os.path.sep.join(['face_detection_model', "res10_300x300_ssd_iter_140000.caffemodel"])
# module-level so the network is loaded once and shared by every get_face() call
DETECTOR = cv2.dnn.readNetFromCaffe(protoPath, modelPath)

#code to get the face in an RGB image
def get_face(image, verbose=False):
    """Crop the highest-scoring face detection out of a colour frame.

    The frame is resized to 300x300, converted to a blob and passed through
    the module-level ``DETECTOR``. The candidate with the largest score is
    scaled back to the frame size, clamped to the frame and cropped.

    Parameters
    ----------
    image : numpy.ndarray
        Colour frame; its first two axes are (height, width).
    verbose : bool
        When True the crop is shown in an OpenCV window named "Frame".

    Returns
    -------
    numpy.ndarray or None
        The cropped face region, or None when the detector returned no
        candidate or the clamped box is empty.
    """
    face = None
    (h, w) = image.shape[:2]
    # construct a blob from the image
    imageBlob = cv2.dnn.blobFromImage(cv2.resize(image, (300, 300)), 1.0, (300, 300), (104.0, 177.0, 123.0),
                                      swapRB=False, crop=False)
    # apply OpenCV's deep learning-based face detector to localize faces in the input image
    DETECTOR.setInput(imageBlob)
    detections = DETECTOR.forward()
    # Candidates live on axis 2 (they are indexed as detections[0, 0, i, :]),
    # so that is the axis to test; len() only looks at the leading axis and
    # would let argmax run on an empty candidate list.
    if detections.shape[2] > 0:
        # we're making the assumption that each image has only ONE
        # face, so find the bounding box with the largest probability
        i = np.argmax(detections[0, 0, :, 2])
        box = detections[0, 0, i, 3:7] * np.array([w, h, w, h])
        (startX, startY, endX, endY) = box.astype("int")

        # Clamp the box to the frame: a negative start would be read as a
        # from-the-end index and silently produce a wrong or empty crop.
        startX, endX = max(startX, 0), min(endX, w)
        startY, endY = max(startY, 0), min(endY, h)

        if endX > startX and endY > startY:
            # extract the face ROI
            face = image[startY:endY, startX:endX]
            if verbose:
                cv2.imshow("Frame", face)
                cv2.waitKey(1)  # lets the window refresh

    return face

#get all the frames in the video, preprocess as the following:
#1 extract the rgb frame
#2 convert to grayscale
#3 reduce the size
#4 reduce the temporal frequency
def get_all_frames_video(in_file, in_fps, out_fps, frame_sz, verbose=False):
    """Extract normalized grayscale face frames from ``in_file + '.mp4'``.

    For every frame in which ``get_face`` finds a face, the crop is converted
    to grayscale (channel mean), resized to ``frame_sz`` and rescaled to the
    0-1 range. Every ``int(in_fps/out_fps)`` such crops are averaged into one
    output frame, which subsamples the video in time. Frames without a face
    are skipped, so an averaging window may span non-adjacent frames.

    Parameters
    ----------
    in_file : str
        Path of the video without the ``.mp4`` extension.
    in_fps, out_fps : number
        Input frame rate and desired output frame rate.
    frame_sz : tuple
        Output frame size as (rows, cols).
    verbose : bool
        Forwarded to ``get_face``.

    Returns
    -------
    numpy.ndarray
        Array of shape (number of output frames, frame_sz[0], frame_sz[1]).
    """
    # at least one frame per window, even if out_fps > in_fps
    avg_win = max(1, int(in_fps/out_fps))

    vidcap = cv2.VideoCapture(in_file + '.mp4') # read the mp4
    try:
        #the number of frames we will get from this video
        in_frame_cnt = vidcap.get(cv2.CAP_PROP_FRAME_COUNT)
        out_frame_cnt = int(in_frame_cnt*out_fps/in_fps)
        out_faces = np.zeros((out_frame_cnt, frame_sz[0], frame_sz[1]))
        count = 0

        #this is used to do the running average
        avg_win_faces = np.zeros((avg_win, frame_sz[0], frame_sz[1]))
        avg_win_cnt = 0

        success, image = vidcap.read() # extract the frames
        # stop once the buffer is full: the frame count reported by the
        # container can be lower than the number of frames actually decoded
        while success and count < out_frame_cnt:

            face = get_face(image, verbose) # get the face rectangle
            if face is not None and face.size > 0:

                face = np.mean(face, axis=2) # convert the face to grayscale
                # cv2.resize takes dsize as (width, height) while frame_sz
                # is used as (rows, cols) everywhere else
                face = cv2.resize(face, dsize=(frame_sz[1], frame_sz[0]), interpolation=cv2.INTER_CUBIC)
                face = face - np.min(face)
                peak = np.max(face)
                if peak > 0: # a constant crop stays all zeros instead of becoming NaN
                    face = face/peak # the intensity is in 0-1

                avg_win_faces[avg_win_cnt] = face
                avg_win_cnt += 1

                if avg_win_cnt == avg_win:
                    # average over the window axis (axis 0) to get one frame
                    out_faces[count] = np.mean(avg_win_faces, axis=0)
                    count += 1
                    # every slot is overwritten before the next mean, so the
                    # window buffer does not need to be cleared
                    avg_win_cnt = 0

            success, image = vidcap.read() # extract the frames
    finally:
        vidcap.release()

    if count < out_frame_cnt:
        print('{} count: {}/{}'.format(in_file, count, out_frame_cnt))
        print('max:{} min:{}'.format(np.max(out_faces), np.min(out_faces)))

    # keep only the frames that were actually filled (frames are on axis 0)
    return out_faces[:count]

In [None]:
#frame hdf5
# Writes one dataset per video into rawframes.hdf5, grouped by subject label.
# Videos whose dataset already exists are skipped, so the cell can be re-run
# to resume an interrupted export.
lbls = list(file_paths.keys()) #hdf5 file will have a separate group for each label
avg_win = int(in_fps/out_fps)

# the context manager closes the file even when a video fails part-way, so a
# re-run does not hit a stale open handle
with h5py.File('rawframes.hdf5', 'a') as h5file:
    for i in lbls:

        group = h5file.require_group(str(i)) #open or create the label specific group
        vid_names = file_paths[i]

        for v in vid_names:

            db_name = os.path.basename(v) # name of dataset
            if db_name in group: #do only if the file has not been processed
                continue

            out_faces = get_all_frames_video(v, in_fps, out_fps, frame_sz, verbose=False)

            # display one frame (debugging aid, disabled); frames are on axis 0
            if False:
                plt.imshow(out_faces[10], cmap='gray')
                plt.show()

            # create and fill the dataset in one call, so a failed write
            # cannot leave an all-zero dataset that later runs would skip
            data = group.create_dataset(db_name, data=out_faces, dtype='f4',
                                        compression="gzip", compression_opts=4)
            h5file.flush() #write out the file

## Facial Landmarks

In [8]:
#code to frontalize and normalize the 3D facial landmarks read from a per-video csv
def get_landmark(csv_file, verbose=False):
    """Frontalize and normalize the 68 3-D landmarks in every row of ``csv_file``.

    For each row (frame) the landmarks are:

    1. rotated with the head-pose angles ``pose_Rx/Ry/Rz`` and projected to 2-D,
       with the y axis flipped;
    2. translated so the left-eye centre (mean of landmarks 36-41) is the origin;
    3. rotated in-plane so the right-eye centre (mean of landmarks 42-47) lies
       on the positive x axis;
    4. scaled so the right-eye centre is at x = 1.

    Parameters
    ----------
    csv_file : pandas.DataFrame
        Must contain the contiguous column ranges ' X_0'..' X_67',
        ' Y_0'..' Y_67', ' Z_0'..' Z_67' and the columns ' pose_Rx',
        ' pose_Ry', ' pose_Rz' (names include the leading space; they look
        like OpenFace output -- TODO confirm the producer).
    verbose : bool
        When True, plot the normalized landmarks of every row.

    Returns
    -------
    dict
        ``{'x': ndarray, 'y': ndarray}``, each of shape (rows, 68). Rows whose
        two eye centres coincide produce non-finite values.
    """
    x = np.array(csv_file.loc[:, ' X_0':' X_67'])
    y = np.array(csv_file.loc[:, ' Y_0':' Y_67'])
    z = np.array(csv_file.loc[:, ' Z_0':' Z_67'])

    r_x = np.array(csv_file.loc[:, ' pose_Rx'])
    r_y = np.array(csv_file.loc[:, ' pose_Ry'])
    r_z = np.array(csv_file.loc[:, ' pose_Rz'])

    # apply the head rotation per row and keep only the x/y components
    x_new = x * (np.cos(r_z)*np.cos(r_y))[:, np.newaxis] \
            + y * (np.cos(r_z)*np.sin(r_y)*np.sin(r_x) + np.sin(r_z)*np.cos(r_x))[:, np.newaxis] \
            + z * (np.sin(r_z)*np.sin(r_x) - np.cos(r_z)*np.sin(r_y)*np.cos(r_x))[:, np.newaxis]
    y_new = -x * (np.sin(r_z)*np.cos(r_y))[:, np.newaxis] \
            + y * (np.cos(r_z)*np.cos(r_x) - np.sin(r_z)*np.sin(r_y)*np.sin(r_x))[:, np.newaxis] \
            + z * (np.sin(r_z)*np.sin(r_y)*np.cos(r_x) + np.cos(r_z)*np.sin(r_x))[:, np.newaxis]

    y_new = -y_new  # flip the y axis

    # eye centres of every row
    l_e_x = np.mean(x_new[:, 36:42], axis=1)
    l_e_y = np.mean(y_new[:, 36:42], axis=1)
    r_e_x = np.mean(x_new[:, 42:48], axis=1)
    r_e_y = np.mean(y_new[:, 42:48], axis=1)

    #translate: left eye centre becomes the origin
    x = x_new - l_e_x[:, np.newaxis]
    y = y_new - l_e_y[:, np.newaxis]
    r_e_x = r_e_x - l_e_x
    r_e_y = r_e_y - l_e_y

    #rotate by the angle of the left-eye -> right-eye vector. Both ratios are
    #taken directly from the vector: deriving the sine as sqrt(1 - cos**2)
    #cancels catastrophically for small tilts and drops the tilt entirely.
    eye_dist = np.hypot(r_e_x, r_e_y)
    cos_theta = r_e_x / eye_dist
    sin_theta = r_e_y / eye_dist

    x_new = x * cos_theta[:, np.newaxis] + y * sin_theta[:, np.newaxis]
    y_new = y * cos_theta[:, np.newaxis] - x * sin_theta[:, np.newaxis]
    x = x_new
    y = y_new

    # eye centres after the in-plane rotation
    l_e_x = np.mean(x_new[:, 36:42], axis=1)
    l_e_y = np.mean(y_new[:, 36:42], axis=1)
    r_e_x = np.mean(x_new[:, 42:48], axis=1)
    r_e_y = np.mean(y_new[:, 42:48], axis=1)

    #scale: right eye centre ends up at x = 1
    x = x / r_e_x[:, np.newaxis]
    y = y / r_e_x[:, np.newaxis]
    # the scaled eye centres are only needed for the verbose plot
    l_e_x = l_e_x / r_e_x
    l_e_y = l_e_y / r_e_x
    r_e_y = r_e_y / r_e_x
    r_e_x = r_e_x / r_e_x

    if verbose:
        fig = plt.figure()
        for i in range(len(l_e_y)):
            fig.clf()
            plt.scatter(x[i, :], y[i, :], c='b', marker='.')
            plt.scatter(l_e_x[i], l_e_y[i], c='r', marker='.')
            plt.scatter(r_e_x[i], r_e_y[i], c='r', marker='.')
            fig.canvas.draw()
            plt.show()
            plt.pause(0.001)

        plt.close(fig)

    out_ar = dict()
    out_ar['x'] = x
    out_ar['y'] = y

    return out_ar

In [14]:
#landmark hdf5
# Writes one dataset per video into pdm_imposter.hdf5, grouped by subject
# label. Each row is one output frame: the per-frame csv features averaged
# over avg_win consecutive input frames. Videos whose dataset already exists
# are skipped, so the cell can be re-run to resume an interrupted export.
lbls = list(file_paths.keys()) #hdf5 file will have a separate group for each label
avg_win = int(in_fps/out_fps)
out_lndmrk = None # stays None when every video was already exported

# the context manager closes the file even when a video fails part-way
with h5py.File('pdm_imposter.hdf5', 'a') as h5file:
    for i in lbls:

        print('{}/{}'.format(i, len(lbls)))
        group = h5file.require_group(str(i)) #open or create the label specific group
        vid_names = file_paths[i]

        for v in vid_names:

            db_name = os.path.basename(v) # name of dataset
            if db_name in group: #do only if the file has not been processed
                continue

            full_csv = pd.read_csv(v + '.csv') # read the csv

            # normalized x/y landmarks; only used by the alternative feature
            # set noted below
            lndmrk = get_landmark(full_csv, verbose=False)
            x = lndmrk['x']
            y = lndmrk['y']

            p = np.array(full_csv.loc[:, ' p_0':' p_33'])
            aus = np.array(full_csv.loc[:, ' AU01_r':' AU45_r'])
            r_xyz = np.array(full_csv.loc[:, ' pose_Rx':' pose_Rz'])

            # alternative feature set: np.concatenate((x, y, aus, r_xyz), axis=1)
            out_lndmrk = np.concatenate((p, aus, r_xyz), axis=1)

            #the number of frames we will get from this video
            out_frame_cnt = int(np.floor(len(out_lndmrk)*out_fps/in_fps))
            out_lndmrk = out_lndmrk[:(out_frame_cnt*avg_win), :]

            # average every avg_win consecutive rows. No squeeze here: it
            # would collapse a one-frame video to a 1-D array and break the
            # (frames, features) layout.
            out_lndmrk = np.mean(np.reshape(out_lndmrk, (out_frame_cnt, avg_win, out_lndmrk.shape[1])),
                                 axis=1)

            # debugging aid, disabled; the column ranges assume the
            # alternative x/y feature set
            if 0:#np.random.choice(range(100000), 1)[0] == 0:
                clear_output(wait=True)
                plt.figure()
                plt.plot(out_lndmrk[:, 0:68], out_lndmrk[:, 68:136])
                plt.title(i)
                plt.axis('equal')
                plt.show();
                print('\t {} {}/{}'.format(v, len(out_lndmrk), len(aus)))
                plt.pause(0.5)

            # create and fill the dataset in one call, so a failed write
            # cannot leave an all-zero dataset that later runs would skip
            data = group.create_dataset(db_name, data=out_lndmrk, dtype='f4',
                                        compression="gzip", compression_opts=4)
            h5file.flush() #write out the file

# shape of the last dataset written in this run, if any
if out_lndmrk is not None:
    print(out_lndmrk.shape)

0/13
1/13
2/13
3/13
4/13
5/13
6/13
7/13
8/13
9/13
10/13
11/13
12/13
(119, 54)


## Spectrograms

In [None]:
#audio hdf5
# Writes one magnitude spectrogram per video into audio.hdf5, grouped by
# subject label. Videos whose dataset already exists are skipped, so the cell
# can be re-run to resume an interrupted export.
lbls = list(file_paths.keys()) #hdf5 file will have a separate group for each label
avg_win = int(in_fps/out_fps) # not used in this cell; same preamble as the other export cells

# the context manager closes the file even when a file fails part-way
with h5py.File('audio.hdf5', 'a') as h5file:
    for i in lbls:

        group = h5file.require_group(str(i)) #open or create the label specific group
        vid_names = file_paths[i]

        for v in vid_names:

            db_name = os.path.basename(v) # name of dataset
            if db_name in group: #do only if the file has not been processed
                continue

            audio, sample_rate = librosa.load(v + '.wav') # read the audio file
            # window and hop are given in seconds and converted to samples
            # here; the transpose puts time on axis 0, giving the documented
            # (frames, n_fft/2+1) layout
            spectrum = np.abs(librosa.stft(audio, n_fft=n_fft,
                                           hop_length=int(sample_rate*sfft_hop),
                                           win_length=int(sample_rate*sfft_wl),
                                           center=False).T)

            if True:
                # imshow takes the image as its first argument; 'sfft' is the
                # title. Shown as frequency (rows) x time (columns).
                plt.imshow(spectrum.T, origin='lower', aspect='auto')
                plt.title('sfft')
                plt.show()

            # create and fill the dataset in one call, so a failure between
            # creation and writing cannot leave an all-zero dataset that
            # later runs would skip
            data = group.create_dataset(db_name, data=spectrum, dtype='f4',
                                        compression="gzip", compression_opts=4)

            h5file.flush() #write out the file