In [2]:
import pickle
import pandas as pd
import numpy as np
from pathlib import Path  
import os
from torchvision import datasets, transforms
import torch
import csv
from tqdm import tqdm
from sklearn import utils

In [3]:
DATA_DIR = r'C:\Users\User\Downloads\RealLifeDeceptionDetection.2016\Real-life_Deception_Detection_2016\Frames\Openface_features'

In [4]:
def create_openface_dataset(data_dir):
    x = []
    y = []
    filenames = []
    
    for filename in tqdm(os.listdir(data_dir)):
        fn=os.path.splitext(filename)[0] # goes through files names without extension
        if 'of_details' not in fn:
            # filenames.append(fn)
            openface_df = pd.read_csv(os.path.join(data_dir,filename))
            # fill zeroes with mean values where openface failed to detect faces 
            openface_df.loc[openface_df[' success'] == 0] = openface_df.loc[openface_df[' success'] == 0].replace(0, openface_df.loc[openface_df[' success'] == 1].mean())
            # remove some irrelevant columns
            openface_df = openface_df.loc[:, ~openface_df.columns.isin(['frame', ' face_id', ' timestamp', ' confidence', ' success'])]

            upper_half_df = openface_df.iloc[:len(openface_df) // 2]
            lower_half_df = openface_df.iloc[len(openface_df) // 2:len(openface_df)]
            halves_df = [upper_half_df, lower_half_df]

            for i in halves_df:
                total_features=None
                mean_features = (np.mean(i, axis=0))
                std_features = (np.std(i, axis=0))
                max_features = (np.max(i, axis=0))
                min_features = (np.min(i, axis=0))

                # join several features together
                feature = np.concatenate((mean_features, std_features, min_features, max_features), axis=None)                    
                #feature = np.concatenate((mean_features, std_features, min_features), axis=None)
                #feature = np.concatenate((mean_features, min_features, max_features), axis=None)
                #feature = np.concatenate((max_features, std_features), axis=None)
                #feature=max_features

                total_features=feature
                
                is_nan = False
                for j in range(len(total_features)):
                    if np.isnan(total_features[j]):
                        is_nan = True
                
                if is_nan == False and total_features is not None:
                    x.append(total_features)
                    if filename[6:11] == 'truth':
                        y.append(1)
                    else:
                        y.append(0)
                    # print(filename + '_' + i)
                    filenames.append(fn)

            if is_nan == True:
                print(filename)

    x=np.array(x)
    y=np.array(y)
    
    print(x.shape,y.shape)
    return x,y,filenames

X, y, filenames = create_openface_dataset(os.path.join(DATA_DIR))
# x_test, y_test = create_openface_dataset(os.path.join(DATA_DIR, 'Val_AFEW'))

 78%|███████▊  | 185/238 [00:26<00:08,  6.25it/s]

trial_truth_033.csv


 79%|███████▊  | 187/238 [00:27<00:08,  5.92it/s]

trial_truth_034.csv


 81%|████████  | 193/238 [00:28<00:06,  6.52it/s]

trial_truth_037.csv


100%|██████████| 238/238 [00:33<00:00,  7.15it/s]

(228, 1316) (228,)





trial_truth_033, trial_truth_034, trial_truth_037 - files with NaN values

### Openface Train/Test Split with data leak check

In [5]:
all_doubles = True
for i in range(len(filenames)):
    if i % 2 == 1:
        if filenames[i] != filenames[i-1]:
            all_doubles = False
            print(filenames[i-1])

if all_doubles == True:
    print('All names have a pair')

All names have a pair


In [6]:
len(X) == len(y) == len(filenames)

True

In [7]:
X_united = []
y_united = []
filenames_united = []
counter = 0
for i in range(len(X)):
    if i % 2 != 0:
        X_united.append(np.array([X[i-1], X[i]]))
        y_united.append(np.array([y[i-1], y[i]]))
        filenames_united.append(np.array([filenames[i-1], filenames[i]]))
        counter += 1

In [8]:
print('X', X[:4])
print('X_united', X_united[:2])

X [[-0.33519394  0.24029782 -0.90844565 ...  1.          0.
   1.        ]
 [-0.33873423  0.22622251 -0.91095667 ...  1.          0.
   1.        ]
 [-0.30352965  0.30097358 -0.89948478 ...  1.          0.
   1.        ]
 [-0.30048499  0.29519414 -0.90244767 ...  1.          0.
   1.        ]]
X_united [array([[-0.33519394,  0.24029782, -0.90844565, ...,  1.        ,
         0.        ,  1.        ],
       [-0.33873423,  0.22622251, -0.91095667, ...,  1.        ,
         0.        ,  1.        ]]), array([[-0.30352965,  0.30097358, -0.89948478, ...,  1.        ,
         0.        ,  1.        ],
       [-0.30048499,  0.29519414, -0.90244767, ...,  1.        ,
         0.        ,  1.        ]])]


In [9]:
print('y', y[:4])
print('y_united', y_united[:2])

y [0 0 0 0]
y_united [array([0, 0]), array([0, 0])]


In [11]:
X_united_shuffled, y_united_shuffled, filenames_united_shuffled = utils.shuffle(X_united, y_united, filenames_united)

In [12]:
y_united_shuffled[:10]

[array([0, 0]),
 array([1, 1]),
 array([0, 0]),
 array([1, 1]),
 array([1, 1]),
 array([1, 1]),
 array([1, 1]),
 array([0, 0]),
 array([0, 0]),
 array([0, 0])]

In [13]:
filenames_united_shuffled[:10]

[array(['trial_lie_019', 'trial_lie_019'], dtype='<U13'),
 array(['trial_truth_020', 'trial_truth_020'], dtype='<U15'),
 array(['trial_lie_025', 'trial_lie_025'], dtype='<U13'),
 array(['trial_truth_044', 'trial_truth_044'], dtype='<U15'),
 array(['trial_truth_032', 'trial_truth_032'], dtype='<U15'),
 array(['trial_truth_012', 'trial_truth_012'], dtype='<U15'),
 array(['trial_truth_050', 'trial_truth_050'], dtype='<U15'),
 array(['trial_lie_014', 'trial_lie_014'], dtype='<U13'),
 array(['trial_lie_028', 'trial_lie_028'], dtype='<U13'),
 array(['trial_lie_018', 'trial_lie_018'], dtype='<U13')]

In [14]:
x_train_united = X_united_shuffled[:82]
y_train_united = y_united_shuffled[:82]
train_filenames_united = filenames_united_shuffled[:82]
x_test_united = X_united_shuffled[len(x_train_united):]
y_test_united = y_united_shuffled[len(y_train_united):]
test_filenames_united = filenames_united_shuffled[len(train_filenames_united):]

In [15]:
x_train = []
y_train = []
train_filenames = []

for i in range(len(x_train_united)):
    for j in range(len(x_train_united[i])):
        x_train.append(x_train_united[i][j])
        y_train.append(y_train_united[i][j])
        train_filenames.append(train_filenames_united[i][j])
        

In [16]:
x_test = []
y_test = []
test_filenames = []

for i in range(len(x_test_united)):
    for j in range(len(x_test_united[i])):
        x_test.append(x_test_united[i][j])
        y_test.append(y_test_united[i][j])
        test_filenames.append(test_filenames_united[i][j])

In [17]:
x_train_united[:1]

[array([[-0.35708296,  0.28857529, -0.88456584, ...,  1.        ,
          0.        ,  1.        ],
        [-0.36232963,  0.3030747 , -0.87831   , ...,  1.        ,
          0.        ,  1.        ]])]

In [18]:
x_train[:2]

[array([-0.35708296,  0.28857529, -0.88456584, ...,  1.        ,
         0.        ,  1.        ]),
 array([-0.36232963,  0.3030747 , -0.87831   , ...,  1.        ,
         0.        ,  1.        ])]

In [19]:
x_train, y_train, x_test, y_test = np.array(x_train), np.array(y_train), np.array(x_test), np.array(y_test)

In [20]:
c = list(set(train_filenames) & set(test_filenames))
print(c)

[]


Сommon train and test elements were not found. The data leak is missing

### Classification

In [21]:
import xgboost as xgb
import sklearn.metrics as metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

### Cross_val_score

In [33]:
xgb_clf = xgb.XGBClassifier(use_label_encoder=False)
scores = cross_val_score(xgb_clf, X, y, cv=StratifiedKFold(10))
print(scores)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

[1.         1.         0.91304348 0.86956522 1.         1.
 0.95652174 1.         0.95454545 1.        ]
0.97 accuracy with a standard deviation of 0.04


In [38]:
rfc_clf = RandomForestClassifier(n_estimators = 1000)
scores = cross_val_score(rfc_clf, X, y, cv=10)
print(scores)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

[1.         1.         0.91304348 1.         1.         1.
 0.95652174 1.         0.90909091 1.        ]
0.98 accuracy with a standard deviation of 0.04
