In [2]:
import pathlib
import os
import pandas as pd
import cv2
import numpy as np
from tqdm import tqdm

In [25]:
def list_files(path, format = ".jpg"):
    txt_files = set()  # create an empty set to store unique txt file names
    for root, dir, files in os.walk(path):
        for file in files:
            if file.endswith(format):
                file_path = os.path.join(root, file)
                txt_files.add(file_path)  # add the file path to the set
    return list(txt_files)  # return a list of unique txt file names

In [3]:
EmotioNet_emotion_images = pathlib.Path("C:/Users/cdr03/Documents/Thesis/dataset/EmotioNet/emotioNet_Emotions")
EmotioNet_EmotioNet_labels = pathlib.Path("C:/Users/cdr03/Documents/Thesis/dataset/EmotioNet/EmotioNet_Emotions.xlsx")

In [4]:
emotioNet_emotion_files = list_files(EmotioNet_emotion_images)

In [26]:
def image_to_array(img):
    array = np.array(img , dtype="float32")/255
    return array

In [27]:
def detect_and_crop_face(image_path, resize=None):
    # Load the input image
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)

    # Load the Haar Cascade classifier for face detection
    face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

    # Detect faces in the image
    faces = face_cascade.detectMultiScale(image, scaleFactor=1.1, minNeighbors=5)

    # Crop the image to contain the detected face (if any)
    if len(faces) > 0:
        (x, y, w, h) = faces[0]
        face_image = image[y:y+h, x:x+w]
        if resize is not None:
            face_image = cv2.resize(face_image, resize)
            face_image = image_to_array(face_image)
            not_detected_faces = None
    else:
        face_image = None
        not_detected_faces = image
        print("face wasn't detected")
    return face_image, not_detected_faces

In [28]:
size = (224,224)
def readImages(file_list):
    df = pd.DataFrame(columns=['Name','Data'])
    df_face_undetected = pd.DataFrame(columns=['Name','Data'])
    for file_path in tqdm(file_list, desc="Processing Images"):
        data, non_faces = detect_and_crop_face(file_path, size)
        file_name = os.path.basename(file_path)
        temp_df = pd.DataFrame({'Name': [file_name], 'Data':[data]})
        df = pd.concat([df, temp_df], ignore_index=True)
        temp_df_undetected = pd.DataFrame({'Name': [file_name], 'Data':[non_faces]})
        df_face_undetected = pd.concat([df_face_undetected, temp_df_undetected], ignore_index=True)
    return df, df_face_undetected

In [8]:
EmotioNet_emotion, undetected = readImages(emotioNet_emotion_files)

Processing Images:   0%|          | 5/2471 [00:00<04:00, 10.27it/s]

face wasn't detected


Processing Images:   1%|▏         | 35/2471 [00:03<04:34,  8.86it/s]

face wasn't detected


Processing Images:   3%|▎         | 68/2471 [00:07<03:18, 12.13it/s]

face wasn't detected


Processing Images:   3%|▎         | 70/2471 [00:07<03:54, 10.22it/s]

face wasn't detected


Processing Images:   8%|▊         | 201/2471 [00:35<04:54,  7.71it/s]  

face wasn't detected
face wasn't detected


Processing Images:   8%|▊         | 208/2471 [00:41<24:30,  1.54it/s]

face wasn't detected


Processing Images:  10%|▉         | 236/2471 [00:47<04:39,  7.99it/s]

face wasn't detected


Processing Images:  10%|▉         | 238/2471 [00:47<04:14,  8.79it/s]

face wasn't detected


Processing Images:  10%|█         | 249/2471 [00:53<25:34,  1.45it/s]

face wasn't detected


Processing Images:  11%|█▏        | 281/2471 [00:58<05:33,  6.56it/s]

face wasn't detected


Processing Images:  14%|█▍        | 344/2471 [01:09<10:07,  3.50it/s]

face wasn't detected


Processing Images:  15%|█▍        | 363/2471 [01:14<11:01,  3.19it/s]

face wasn't detected


Processing Images:  16%|█▌        | 396/2471 [01:19<05:07,  6.74it/s]

face wasn't detected
face wasn't detected


Processing Images:  16%|█▋        | 402/2471 [01:20<03:36,  9.58it/s]

face wasn't detected


Processing Images:  16%|█▋        | 406/2471 [01:20<03:29,  9.86it/s]

face wasn't detected


Processing Images:  17%|█▋        | 425/2471 [01:22<02:58, 11.46it/s]

face wasn't detected


Processing Images:  19%|█▊        | 463/2471 [01:28<05:54,  5.67it/s]

face wasn't detected


Processing Images:  19%|█▉        | 469/2471 [01:29<04:41,  7.11it/s]

face wasn't detected


Processing Images:  20%|██        | 502/2471 [01:34<04:05,  8.01it/s]

face wasn't detected


Processing Images:  21%|██        | 523/2471 [01:36<03:57,  8.21it/s]

face wasn't detected


Processing Images:  22%|██▏       | 532/2471 [01:37<04:21,  7.42it/s]

face wasn't detected


Processing Images:  23%|██▎       | 562/2471 [01:41<03:19,  9.57it/s]

face wasn't detected


Processing Images:  24%|██▍       | 603/2471 [01:46<03:07,  9.96it/s]

face wasn't detected


Processing Images:  25%|██▌       | 627/2471 [01:49<02:43, 11.26it/s]

face wasn't detected


Processing Images:  25%|██▌       | 629/2471 [01:49<02:44, 11.18it/s]

face wasn't detected


Processing Images:  27%|██▋       | 664/2471 [01:54<03:54,  7.72it/s]

face wasn't detected


Processing Images:  27%|██▋       | 671/2471 [01:55<03:30,  8.56it/s]

face wasn't detected


Processing Images:  28%|██▊       | 690/2471 [01:57<03:09,  9.40it/s]

face wasn't detected


Processing Images:  30%|██▉       | 730/2471 [02:07<09:03,  3.20it/s]

face wasn't detected


Processing Images:  30%|██▉       | 734/2471 [02:08<10:34,  2.74it/s]

face wasn't detected


Processing Images:  32%|███▏      | 782/2471 [02:17<03:27,  8.14it/s]

face wasn't detected


Processing Images:  32%|███▏      | 799/2471 [02:21<04:20,  6.41it/s]

face wasn't detected


Processing Images:  33%|███▎      | 826/2471 [02:28<04:16,  6.42it/s]

face wasn't detected


Processing Images:  34%|███▎      | 830/2471 [02:29<03:50,  7.11it/s]

face wasn't detected


Processing Images:  35%|███▍      | 860/2471 [02:34<04:19,  6.22it/s]

face wasn't detected


Processing Images:  35%|███▌      | 866/2471 [02:35<04:08,  6.47it/s]

face wasn't detected


Processing Images:  36%|███▌      | 887/2471 [02:38<04:26,  5.95it/s]

face wasn't detected


Processing Images:  37%|███▋      | 913/2471 [02:42<04:15,  6.09it/s]

face wasn't detected


Processing Images:  37%|███▋      | 918/2471 [02:42<04:46,  5.43it/s]

face wasn't detected


Processing Images:  38%|███▊      | 950/2471 [02:50<06:46,  3.74it/s]

face wasn't detected


Processing Images:  40%|███▉      | 982/2471 [02:55<02:51,  8.68it/s]

face wasn't detected


Processing Images:  41%|████      | 1002/2471 [02:57<02:32,  9.64it/s]

face wasn't detected


Processing Images:  41%|████▏     | 1025/2471 [03:01<03:03,  7.88it/s]

face wasn't detected


Processing Images:  42%|████▏     | 1033/2471 [03:02<03:56,  6.07it/s]

face wasn't detected


Processing Images:  42%|████▏     | 1049/2471 [03:04<02:18, 10.24it/s]

face wasn't detected


Processing Images:  44%|████▍     | 1089/2471 [03:12<04:02,  5.70it/s]

face wasn't detected


Processing Images:  45%|████▌     | 1117/2471 [03:17<03:58,  5.68it/s]

face wasn't detected


Processing Images:  46%|████▌     | 1127/2471 [03:24<06:38,  3.37it/s]

face wasn't detected


Processing Images:  46%|████▌     | 1136/2471 [03:26<03:26,  6.47it/s]

face wasn't detected


Processing Images:  47%|████▋     | 1166/2471 [03:32<03:22,  6.43it/s]

face wasn't detected


Processing Images:  48%|████▊     | 1187/2471 [03:34<03:35,  5.95it/s]

face wasn't detected


Processing Images:  49%|████▊     | 1204/2471 [03:36<02:47,  7.58it/s]

face wasn't detected


Processing Images:  51%|█████     | 1255/2471 [03:45<02:02,  9.90it/s]

face wasn't detected
face wasn't detected


Processing Images:  51%|█████     | 1260/2471 [03:45<02:31,  8.00it/s]

face wasn't detected


Processing Images:  52%|█████▏    | 1281/2471 [03:48<01:53, 10.49it/s]

face wasn't detected


Processing Images:  54%|█████▍    | 1333/2471 [03:55<03:58,  4.77it/s]

face wasn't detected


Processing Images:  58%|█████▊    | 1443/2471 [04:12<02:09,  7.95it/s]

face wasn't detected


Processing Images:  60%|█████▉    | 1474/2471 [04:15<01:13, 13.51it/s]

face wasn't detected


Processing Images:  60%|██████    | 1487/2471 [04:17<02:08,  7.67it/s]

face wasn't detected


Processing Images:  60%|██████    | 1489/2471 [04:17<02:29,  6.58it/s]

face wasn't detected


Processing Images:  61%|██████    | 1512/2471 [04:20<01:40,  9.55it/s]

face wasn't detected


Processing Images:  61%|██████▏   | 1518/2471 [04:20<01:37,  9.77it/s]

face wasn't detected


Processing Images:  62%|██████▏   | 1532/2471 [04:23<02:47,  5.61it/s]

face wasn't detected


Processing Images:  62%|██████▏   | 1536/2471 [04:24<02:46,  5.61it/s]

face wasn't detected


Processing Images:  65%|██████▌   | 1616/2471 [04:35<01:27,  9.76it/s]

face wasn't detected
face wasn't detected


Processing Images:  65%|██████▌   | 1618/2471 [04:35<01:20, 10.55it/s]

face wasn't detected


Processing Images:  67%|██████▋   | 1657/2471 [04:42<02:07,  6.41it/s]

face wasn't detected


Processing Images:  68%|██████▊   | 1681/2471 [04:45<01:05, 12.00it/s]

face wasn't detected


Processing Images:  69%|██████▊   | 1696/2471 [04:48<01:58,  6.54it/s]

face wasn't detected


Processing Images:  73%|███████▎  | 1794/2471 [05:08<01:26,  7.86it/s]

face wasn't detected


Processing Images:  73%|███████▎  | 1800/2471 [05:08<01:07, 10.01it/s]

face wasn't detected


Processing Images:  73%|███████▎  | 1804/2471 [05:09<01:07,  9.95it/s]

face wasn't detected


Processing Images:  73%|███████▎  | 1815/2471 [05:10<01:11,  9.20it/s]

face wasn't detected
face wasn't detected


Processing Images:  75%|███████▍  | 1849/2471 [05:16<01:25,  7.27it/s]

face wasn't detected


Processing Images:  76%|███████▌  | 1875/2471 [05:19<01:20,  7.42it/s]

face wasn't detected


Processing Images:  76%|███████▌  | 1879/2471 [05:20<01:06,  8.91it/s]

face wasn't detected


Processing Images:  77%|███████▋  | 1909/2471 [05:24<01:07,  8.30it/s]

face wasn't detected


Processing Images:  80%|███████▉  | 1968/2471 [05:40<01:29,  5.64it/s]

face wasn't detected


Processing Images:  80%|███████▉  | 1973/2471 [05:40<01:12,  6.87it/s]

face wasn't detected


Processing Images:  80%|████████  | 1979/2471 [05:41<01:04,  7.60it/s]

face wasn't detected


Processing Images:  81%|████████  | 1998/2471 [05:44<00:57,  8.16it/s]

face wasn't detected


Processing Images:  82%|████████▏ | 2020/2471 [05:47<01:02,  7.21it/s]

face wasn't detected


Processing Images:  82%|████████▏ | 2022/2471 [05:47<00:51,  8.77it/s]

face wasn't detected


Processing Images:  82%|████████▏ | 2028/2471 [05:48<01:27,  5.09it/s]

face wasn't detected


Processing Images:  82%|████████▏ | 2031/2471 [05:49<01:12,  6.05it/s]

face wasn't detected


Processing Images:  82%|████████▏ | 2037/2471 [05:49<00:41, 10.46it/s]

face wasn't detected


Processing Images:  85%|████████▌ | 2112/2471 [06:00<00:46,  7.68it/s]

face wasn't detected


Processing Images:  87%|████████▋ | 2155/2471 [06:07<00:33,  9.37it/s]

face wasn't detected


Processing Images:  92%|█████████▏| 2268/2471 [06:26<00:27,  7.44it/s]

face wasn't detected


Processing Images:  93%|█████████▎| 2287/2471 [06:28<00:16, 11.35it/s]

face wasn't detected


Processing Images:  94%|█████████▍| 2326/2471 [06:34<00:14,  9.72it/s]

face wasn't detected


Processing Images:  94%|█████████▍| 2330/2471 [06:35<00:14,  9.96it/s]

face wasn't detected


Processing Images:  95%|█████████▌| 2351/2471 [06:38<00:16,  7.17it/s]

face wasn't detected


Processing Images:  95%|█████████▌| 2355/2471 [06:38<00:12,  9.54it/s]

face wasn't detected


Processing Images:  96%|█████████▋| 2384/2471 [06:42<00:10,  8.51it/s]

face wasn't detected


Processing Images:  97%|█████████▋| 2395/2471 [06:43<00:06, 11.11it/s]

face wasn't detected


Processing Images: 100%|█████████▉| 2463/2471 [06:52<00:00, 10.31it/s]

face wasn't detected


Processing Images: 100%|██████████| 2471/2471 [06:54<00:00,  5.95it/s]


In [9]:
EmotioNet_emotion.size

4942

In [10]:
cv2.imshow("test",EmotioNet_emotion.Data[0])
cv2.waitKey(0)
cv2.destroyAllWindows()


In [11]:
df = pd.read_excel(EmotioNet_EmotioNet_labels)

In [12]:
emotion_labels_number = {"angry": 1, "surprised": 2, "disgusted": 3, "fearful": 4, "happy": 5, "sad": 6}
emotions = list(emotion_labels_number.keys())
emotions.extend(['ID'])
df_clean = df[emotions]
df_clean["emotion"] = df.apply(
    lambda x: next((int(value) for emotion, value in emotion_labels_number.items() if x[emotion] == 1), None), 
    axis=1
)
EmotioNet_labels = df_clean[['ID', 'emotion']].dropna()
EmotioNet_labels.rename(columns={"ID": "Name"}, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean["emotion"] = df.apply(


In [13]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(EmotioNet_labels)


          Name  emotion
3     0004.jpg      1.0
4     0005.jpg      5.0
5     0006.jpg      5.0
14    0015.jpg      1.0
17    0018.jpg      1.0
19    0020.jpg      1.0
21    0022.jpg      5.0
26    0027.jpg      5.0
31    0032.jpg      6.0
33    0034.jpg      5.0
34    0035.jpg      5.0
35    0036.jpg      6.0
36    0037.jpg      6.0
38    0039.jpg      1.0
39    0040.jpg      5.0
42    0043.jpg      5.0
43    0044.jpg      4.0
45    0046.jpg      5.0
50    0051.jpg      5.0
53    0054.jpg      5.0
54    0055.jpg      5.0
57    0058.jpg      6.0
58    0059.jpg      6.0
59    0060.jpg      3.0
62    0063.jpg      5.0
64    0065.jpg      5.0
69    0070.jpg      5.0
70    0071.jpg      1.0
72    0073.jpg      5.0
75    0076.jpg      6.0
78    0079.jpg      1.0
79    0080.jpg      4.0
80    0081.jpg      5.0
81    0082.jpg      5.0
82    0083.jpg      5.0
84    0085.jpg      4.0
87    0088.jpg      1.0
91    0092.jpg      5.0
92    0093.jpg      5.0
94    0095.jpg      5.0
101   0102.jpg  

In [14]:
EmotioNet_emotion = pd.merge(EmotioNet_labels,EmotioNet_emotion, how="inner", on=['Name'])

In [16]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(undetected.size)

4942


In [17]:
EmotioNet_emotion_clean =EmotioNet_emotion.dropna()

In [91]:
EmotioNet_emotion_clean.Data.iloc[0].shape

(224, 224)

In [37]:
EmotioNet_emotion_clean.to_pickle(f"./EmotioNet_Data_{size[0]}_Emotions")

In [21]:
undetected.dropna(inplace=True)

In [23]:
EmotioNet_emotion_clean.to_pickle(f"./EmotioNet_undetected_{size[0]}_Emotions")

In [14]:
cv2.imshow("test",undetected.Data.iloc[3])
cv2.waitKey(0)
cv2.destroyAllWindows()


In [92]:
from sklearn.model_selection import train_test_split

def create_stratified_datasets(X, y, test_size=0.10, val_size=0.10, random_state=None):
    """
    Creates three stratified datasets - train, validation, and test - from the input data X and target y.
    
    Parameters:
        X (array-like): The input data.
        y (array-like): The target variable.
        test_size (float): The proportion of the data to include in the test set.
        val_size (float): The proportion of the remaining data to include in the validation set.
        random_state (int): Seed for the random number generator.
    
    Returns:
        A tuple containing the train, validation, and test datasets, each as a tuple of input and target variables.
    """
    
    # split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state, stratify=y)
    
    # split remaining data into validation and train sets
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=val_size/(1-test_size), random_state=random_state, stratify=y_train)
    
    return (X_train, y_train), (X_val, y_val), (X_test, y_test)

In [93]:
EmotioNet_emotion_clean = pd.read_pickle("./EmotioNet_Data_224_Emotions")

In [94]:
EmotioNet_emotion_clean['emotion'].isnull().values.any()

False

In [95]:
EmotioNet_emotion_target = EmotioNet_emotion_clean['emotion']
EmotioNet_emotion_data = EmotioNet_emotion_clean['Data']
(X_train, y_train), (X_val, y_val), (X_test, y_test) = create_stratified_datasets(EmotioNet_emotion_data,EmotioNet_emotion_target)



In [96]:
print(f'Distribution in training set ({len(y_train)}): \n{y_train.value_counts().sort_index() / len(y_train)}\n\n'+
      f'Distribution in validation set ({len(y_val)}): \n{y_val.value_counts().sort_index() / len(y_val)}\n\n'+
      f'Distribution in testing set ({len(y_test)}): \n{y_test.value_counts().sort_index() / len(y_test)}')

Distribution in training set (877): 
1.0    0.055872
2.0    0.059293
3.0    0.050171
4.0    0.036488
5.0    0.681870
6.0    0.116306
Name: emotion, dtype: float64

Distribution in validation set (110): 
1.0    0.054545
2.0    0.054545
3.0    0.054545
4.0    0.036364
5.0    0.681818
6.0    0.118182
Name: emotion, dtype: float64

Distribution in testing set (110): 
1.0    0.054545
2.0    0.063636
3.0    0.045455
4.0    0.036364
5.0    0.681818
6.0    0.118182
Name: emotion, dtype: float64


In [97]:
X_train.to_pickle('./Clean_datasets/EmotioNet/EmotioNet_Emotion_X_train')
y_train.to_pickle('./Clean_datasets/EmotioNet/EmotioNet_Emotion_Y_train')
X_val.to_pickle('./Clean_datasets/EmotioNet/EmotioNet_Emotion_X_val')
y_val.to_pickle('./Clean_datasets/EmotioNet/EmotioNet_Emotion_Y_val')
X_test.to_pickle('./Clean_datasets/EmotioNet/EmotioNet_Emotion_X_test')
y_test.to_pickle('./Clean_datasets/EmotioNet/EmotioNet_Emotion_Y_test')

FACS

In [5]:
EmotioNet_facs_images = pathlib.Path("C:/Users/cdr03/Documents/Thesis/dataset/EmotioNet/emotioNet_FACS")
EmotioNet_facs_labels = pathlib.Path("C:/Users/cdr03/Documents/Thesis/dataset/EmotioNet/EmotioNet_FACS_clean_v3.xlsx")
#emotioNet_emotion_files = list_files(EmotioNet_emotion_images)

In [None]:
EmotioNet_facs, undetected_facs = readImages(emotioNet_emotion_files)

In [36]:
EmotioNet_facs.to_pickle(f"./EmontioNet_FACS_Data_{size[0]}")
undetected_facs.to_pickle(f"./EmontioNet_FACS_undetected_{size[0]}")

In [3]:
EmotioNet_facs = pd.read_pickle('EmontioNet_FACS_Data_224')

In [6]:
EmotioNet_facs_labels_df = pd.read_excel(EmotioNet_facs_labels)

In [7]:
EmotioNet_facs_labels_df.head(10)

Unnamed: 0,Name,AU 1,AU 2,AU 4,AU 5,AU 6,AU 9,AU 10,AU 12,AU 15,...,AU 24,AU 25,AU 26,AU 28,AU 51,AU 52,AU 53,AU 54,AU 55,AU 56
0,N_0000000001_00001.jpg,0,0,0,1,0,0,1,1,1,...,0,1,0,0,0,0,1,0,1,0
1,N_0000000001_00003.jpg,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,1
2,N_0000000001_00004.jpg,1,0,1,1,0,0,0,1,0,...,1,0,0,0,1,0,0,0,1,0
3,N_0000000001_00005.jpg,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,1,0,0
4,N_0000000001_00006.jpg,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
5,N_0000000001_00007.jpg,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
6,N_0000000001_00008.jpg,0,0,0,0,0,0,1,1,0,...,0,1,0,0,0,0,0,0,1,0
7,N_0000000001_00009.jpg,0,0,0,1,0,0,1,0,1,...,1,0,0,0,1,0,0,0,0,0
8,N_0000000001_00010.jpg,0,0,0,0,1,0,1,1,0,...,0,1,0,0,1,0,0,0,0,0
9,N_0000000001_00011.jpg,0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,1,1,0,0,1


In [8]:
EmotioNet_facs_images = pd.read_pickle('./EmontioNet_FACS_Data_224')

In [9]:
EmotioNet_facs_images.dropna(inplace=True)

In [29]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(EmotioNet_facs_images.head(10))


                      Name                                               Data
0   N_0000000023_00258.jpg  [[0.1764706, 0.1764706, 0.1764706, 0.1764706, ...
1   N_0000000042_00292.jpg  [[0.10980392, 0.10980392, 0.10980392, 0.109803...
2   N_0000000010_00667.jpg  [[0.8509804, 0.85490197, 0.85490197, 0.8509804...
3   N_0000000033_00872.jpg  [[0.07058824, 0.078431375, 0.078431375, 0.0784...
5   N_0000000028_00799.jpg  [[0.25882354, 0.27058825, 0.3019608, 0.3333333...
6   N_0000000005_00703.jpg  [[0.2901961, 0.2901961, 0.29411766, 0.29411766...
7   N_0000000028_00525.jpg  [[0.8980392, 0.84313726, 0.64705884, 0.4156862...
8   N_0000000001_00719.jpg  [[0.47843137, 0.47843137, 0.48235294, 0.482352...
9   N_0000000030_00499.jpg  [[0.39607844, 0.39607844, 0.39607844, 0.396078...
10  N_0000000007_00812.jpg  [[0.7529412, 0.75686276, 0.75686276, 0.7490196...


In [30]:
EmotioNet_facs_labels_df.columns
au_cols = EmotioNet_facs_labels_df.columns[1:]

# use apply method to generate new column containing array of column names with value of 1
EmotioNet_facs_labels_df['AU'] = EmotioNet_facs_labels_df[au_cols].apply(lambda x: [int(col[3:]) for col in x.index[x == 1].tolist()], axis=1)

In [31]:
EmotioNet_facs_clean_labels = EmotioNet_facs_labels_df[['Name', 'AU']]

In [32]:
EmotioNet_facs_data = pd.merge(EmotioNet_facs_clean_labels,EmotioNet_facs_images, how="inner", on='Name')

In [33]:
EmotioNet_facs_data.shape

(15895, 3)

In [34]:
EmotioNet_facs_data.columns

Index(['Name', 'AU', 'Data'], dtype='object')

In [35]:
EmotioNet_facs_data.to_pickle('./EmontioNet_FACS_Data_clean_224')

In [10]:
EmotioNet_facs_data = pd.read_pickle('./EmontioNet_FACS_Data_clean_224')

In [11]:
EmotioNet_facs_data['Data'].isnull().values.any()

False

In [20]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()

# fit the binarizer on the 'AU' column
mlb.fit(EmotioNet_facs_data['AU'])
# transform the 'AU' column into a one-hot encoded matrix
one_hot_encoded_au = mlb.transform(EmotioNet_facs_data['AU'])
# convert the matrix into a Pandas dataframe
one_hot_encoded_au_df = pd.DataFrame(one_hot_encoded_au, columns=mlb.classes_)
one_hot_encoded_au_df['Data'] = EmotioNet_facs_data['Data']

In [21]:
AU_columns = [1, 2, 4, 5, 6, 9, 10, 12, 15, 17, 18, 20, 24, 25, 26, 28, 'Data']

In [22]:
pandas_df = one_hot_encoded_au_df[one_hot_encoded_au_df.columns.intersection(AU_columns)]

In [23]:
EmotioNet_facs_X = pandas_df['Data']

In [24]:
Au_labels_df = pandas_df.drop(columns = ['Data'])

In [26]:
all_zeros = Au_labels_df.eq(0).all(axis=1)
print(all_zeros.loc[all_zeros==True])
# display rows with only zeros
empty_rows = all_zeros.loc[all_zeros==True].index
print(empty_rows)

12       True
15       True
25       True
27       True
93       True
         ... 
14248    True
14561    True
15385    True
15386    True
15872    True
Length: 101, dtype: bool
Int64Index([   12,    15,    25,    27,    93,   110,   113,   132,   134,
              136,
            ...
            12482, 13398, 13861, 13868, 14216, 14248, 14561, 15385, 15386,
            15872],
           dtype='int64', length=101)


In [27]:
EmotioNet_facs_X_clean = EmotioNet_facs_X.drop(empty_rows)
Au_labels_df_clean  = Au_labels_df.drop(empty_rows)

In [61]:
EmotioNet_facs_X = EmotioNet_facs_data['Data']

In [28]:
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
import numpy as np
def multilable_split(X, y, size=0.1, random_state=12):
   msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=size, random_state=random_state)
   for train_index, test_index in msss.split(X, y):
      X_train, X_test = X.iloc[train_index], X.iloc[test_index]
      y_train, y_test = y.iloc[train_index], y.iloc[test_index]
      return X_train, X_test, y_train, y_test

In [29]:
def create_balanced_datasets_multilabel(X, y, test_size=0.10, val_size=0.10):
    """
    Creates three stratified datasets - train, validation, and test - from the input data X and target y.
    
    Parameters:
        X (array-like): The input data.
        y (array-like): The target variable.
        test_size (float): The proportion of the data to include in the test set.
        val_size (float): The proportion of the remaining data to include in the validation set.
        random_state (int): Seed for the random number generator.
    
    Returns:
        A tuple containing the train, validation, and test datasets, each as a tuple of input and target variables.
    """
    # split data into train and test sets
    X_train, X_test, y_train, y_test = multilable_split(X, y, size = test_size)
    
    # split remaining data into validation and train sets
    X_train, X_val, y_train, y_val = multilable_split(X_train, y_train, size=val_size/(1-test_size))
    
    return (X_train, y_train), (X_val, y_val), (X_test, y_test)

In [30]:
(X_train, y_train), (X_val, y_val), (X_test, y_test) = create_balanced_datasets_multilabel(EmotioNet_facs_X_clean,Au_labels_df_clean)

In [31]:
X_train.to_pickle('./Clean_datasets/EmotioNet/EmotioNet_FACS_X_train')
y_train.to_pickle('./Clean_datasets/EmotioNet/EmotioNet_FACS_Y_train')
X_val.to_pickle('./Clean_datasets/EmotioNet/EmotioNet_FACS_X_val')
y_val.to_pickle('./Clean_datasets/EmotioNet/EmotioNet_FACS_Y_val')
X_test.to_pickle('./Clean_datasets/EmotioNet/EmotioNet_FACS_X_test')
y_test.to_pickle('./Clean_datasets/EmotioNet/EmotioNet_FACS_Y_test')

In [32]:
print(f'Distribution in training set ({len(y_train)}): \n{y_train.value_counts().sort_index() / len(y_train)}\n\n'+
      f'Distribution in validation set ({len(y_val)}): \n{y_val.value_counts().sort_index() / len(y_val)}\n\n'+
      f'Distribution in testing set ({len(y_test)}): \n{y_test.value_counts().sort_index() / len(y_test)}')

Distribution in training set (12629): 
1  2  4  5  6  9  10  12  15  17  18  20  24  25  26  28
0  0  0  0  0  0  0   0   0   0   0   0   0   0   0   1     0.000396
                                                  1   0     0.000158
                                              1   0   0     0.013778
                                                      1     0.000396
                                                  1   0     0.002455
                                                              ...   
1  1  1  0  0  0  1   0   0   0   1   0   1   0   0   0     0.000079
                          1   0   0   0   0   1   0   0     0.000079
                                          1   0   0   0     0.000079
                                      1   0   1   0   0     0.000079
                      1   0   0   0   0   0   1   1   0     0.000079
Length: 755, dtype: float64

Distribution in validation set (1578): 
1  2  4  5  6  9  10  12  15  17  18  20  24  25  26  28
0  0  0  0  0  0  0