### In this notebook, we will:
- Load the CK+ dataset
- Save the standard mesh structure
- Load the standard mesh graph structure
- Split the data into train, validation and test sets
- Verify that the label distribution in the original data is maintained in the splits
- Visualize the 3D face mesh
- Save the data splits to disk
- Load the data splits back from disk

In [1]:
import pickle
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from torch_geometric.data import Data
import networkx as nx
import plotly.graph_objects as go

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



In [2]:
# Load the CK+ landmarks DataFrame from its pickle file.
# NOTE: pickle.load can execute arbitrary code; only open files from a trusted source.
ck_landmarks_path = 'ck_data/ck_landmarks.pkl'
with open(ck_landmarks_path, 'rb') as f:
    ck_landmarks_df = pickle.load(f)
# Positional access (.iloc) so this works whatever index the pickled frame carries.
num_landmarks = len(ck_landmarks_df['landmarks'].iloc[0])
print(f"Number of landmarks: {num_landmarks}, Number of samples: {len(ck_landmarks_df.index)}")
# Drop the leading rows and renumber the remaining rows from 0.
# NOTE(review): the reason for skipping the first 500 rows is not documented here — confirm.
NUM_ROWS_TO_SKIP = 500
ck_landmarks_df = ck_landmarks_df.iloc[NUM_ROWS_TO_SKIP:].reset_index(drop=True)
ck_landmarks_df

Number of landmarks: 468


Unnamed: 0,filename,label,bbox,landmarks
0,ck_raw/neutral/S108_008_00000001.png,neutral,"[224.21793, 122.48273, -53.10254, 449.32162, 3...","[[0.564, 0.7085, 0.1967], [0.5415, 0.5957, 0.0..."
1,ck_raw/neutral/S099_004_00000001.png,neutral,"[181.2319, 94.297295, -59.701385, 417.40485, 3...","[[0.5337, 0.673, 0.1963], [0.537, 0.5645, 0.02..."
2,ck_raw/neutral/S078_006_00000001.png,neutral,"[195.918, 128.69635, -55.448437, 420.37506, 37...","[[0.54, 0.713, 0.1896], [0.543, 0.6, 0.02655],..."
3,ck_raw/neutral/S034_002_00000001.png,neutral,"[208.24384, 163.83702, -46.601257, 394.40558, ...","[[0.5493, 0.72, 0.1946], [0.5425, 0.578, 0.027..."
4,ck_raw/neutral/S133_002_00000001.png,neutral,"[200.54695, 151.09795, -53.00201, 412.79205, 3...","[[0.538, 0.692, 0.1843], [0.549, 0.5703, 0.027..."
...,...,...,...,...
410,ck_raw/contempt/S139_002_00000013.png,contempt,"[143.23639, 146.92712, -62.064163, 378.90234, ...","[[0.5405, 0.718, 0.1906], [0.5464, 0.5903, 0.0..."
411,ck_raw/contempt/S506_002_00000009.png,contempt,"[191.23018, 116.78687, -68.944695, 460.76178, ...","[[0.5483, 0.695, 0.2035], [0.54, 0.57, 0.0286]..."
412,ck_raw/contempt/S157_002_00000011.png,contempt,"[189.72801, 114.486626, -57.939888, 412.4796, ...","[[0.539, 0.6997, 0.2002], [0.54, 0.5674, 0.025..."
413,ck_raw/contempt/S503_002_00000008.png,contempt,"[219.90367, 142.75923, -71.32669, 487.42218, 4...","[[0.5215, 0.7046, 0.1896], [0.5396, 0.5703, 0...."


### Create and save the standard mesh from the mediapipe library

In [3]:
import mediapipe as mp

# One node per landmark of the first sample, each carrying its (x, y, z) as 'pos'
G = nx.Graph()
G.add_nodes_from(
    (landmark_id, {'pos': (x, y, z)})
    for landmark_id, (x, y, z) in enumerate(ck_landmarks_df['landmarks'][0])
)

# Edges come from mediapipe's face-mesh tesselation (pairs of landmark ids)
tesselation = mp.solutions.face_mesh.FACEMESH_TESSELATION
G.add_edges_from((connection[0], connection[1]) for connection in tesselation)

# Export adjacency matrix
np.savetxt("standard_mesh_adj_matrix.csv", nx.to_numpy_array(G), delimiter=",")

2024-05-30 18:29:52.010306: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Load the graph structure of the standard mesh

In [4]:
# Read back the adjacency matrix written to CSV and rebuild the graph from it
adjacency_matrix = np.loadtxt('standard_mesh_adj_matrix.csv', delimiter=',')
G = nx.from_numpy_array(adjacency_matrix)

# Sparsity = fraction of zero entries in the adjacency matrix
sparsity = 1.0 - np.count_nonzero(adjacency_matrix) / adjacency_matrix.size
print(f"Sparsity of the adjacency matrix: {100*sparsity:.2f}%")
if nx.is_connected(G):
    print(f"Graph is connected")
else:
    # A graph can be disconnected without containing any degree-0 node
    # (e.g. two separate clusters), so the isolated-node list alone may be
    # empty; report the number of components as well.
    print("Unconnected nodes:", [node for node, degree in dict(G.degree()).items() if degree == 0])
    print(f"Number of connected components: {nx.number_connected_components(G)}")

Sparsity of the adjacency matrix: 98.79%
Graph is connected


### Split the data into train, validation and test sets

In [59]:
# Integer class id for each emotion label; ids follow the order listed here
label_mapping = {
    label_name: class_id
    for class_id, label_name in enumerate([
        'neutral',
        'happiness',
        'sadness',
        'surprise',
        'fear',
        'disgust',
        'anger',
        'contempt',
    ])
}

# Reverse lookup: class id -> label name
inverse_label_mapping = dict(zip(label_mapping.values(), label_mapping.keys()))

# Function to convert df to data list using a given graph
def df_to_data_list(df, graph):
    """Convert each row of ``df`` into a torch_geometric ``Data`` object.

    Args:
        df: DataFrame with 'landmarks', 'bbox' and 'label' columns. Every
            'label' value must be a key of ``label_mapping``.
        graph: graph whose edges define the connectivity shared by all samples.

    Returns:
        A list with one ``Data`` per row, holding ``x`` (landmarks),
        ``edge_index`` (every edge in both directions), ``y`` (integer label
        as a 1-element long tensor) and ``bbox``.

    Raises:
        KeyError: if a row's label is not present in ``label_mapping``.
    """
    # The connectivity does not depend on the row, so build it once instead of
    # re-walking the graph's edges for every sample.
    directed_edges = []
    for source, target in graph.edges:
        # Store both directions of every undirected edge.
        directed_edges.append([source, target])
        directed_edges.append([target, source])
    edge_index = torch.tensor(directed_edges, dtype=torch.long).t().contiguous()

    data_list = []
    for _, row in df.iterrows():
        # NOTE(review): float16 keeps roughly 3 significant decimal digits, so
        # bbox values in the hundreds are rounded noticeably — confirm that
        # half precision is intended here.
        landmarks = torch.tensor(row['landmarks'], dtype=torch.float16)
        bbox = torch.tensor(row['bbox'], dtype=torch.float16)
        label = label_mapping[row['label']]

        # Clone so each sample owns its own edge_index tensor, as before.
        data = Data(x=landmarks, edge_index=edge_index.clone(), y=torch.tensor([label], dtype=torch.long), bbox=bbox)
        data_list.append(data)
    return data_list

# Function to split data while maintaining label ratio
def split_data(df, train_ratio=0.7, val_ratio=0.2, test_ratio=0.1, random_state=None):
    """Split ``df`` into train/validation/test frames, label by label.

    Each label group is split on its own with the requested ratios, and the
    per-label pieces are concatenated, so every split keeps roughly the label
    proportions of ``df``.

    Args:
        df: DataFrame with a 'label' column.
        train_ratio: fraction of each label group assigned to the train split.
        val_ratio: fraction assigned to the validation split.
        test_ratio: fraction assigned to the test split.
        random_state: forwarded to ``train_test_split`` for both splits; pass
            an int for a reproducible split. The default (None) keeps the
            previous behaviour of drawing from the global random state.

    Returns:
        Tuple ``(train_df, val_df, test_df)``.

    Raises:
        AssertionError: if the three ratios do not sum to 1.
    """
    assert abs((train_ratio + val_ratio + test_ratio) - 1.0) < 1e-10, "Ratios must sum to 1"

    train_list = []
    val_list = []
    test_list = []

    # Share of the non-train remainder that goes to the test split
    test_share_of_rest = test_ratio/(test_ratio + val_ratio)

    # Group by label
    for _, group in df.groupby('label'):
        # Every row in `group` has the same label, so `stratify` cannot change
        # the class balance here; the arguments are kept as they were.
        train, temp = train_test_split(
            group, train_size=train_ratio, stratify=group['label'], random_state=random_state
        )
        val, test = train_test_split(
            temp, test_size=test_share_of_rest, stratify=temp['label'], random_state=random_state
        )

        train_list.append(train)
        val_list.append(val)
        test_list.append(test)

    # Combine all the splits
    train_df = pd.concat(train_list)
    val_df = pd.concat(val_list)
    test_df = pd.concat(test_list)

    return train_df, val_df, test_df

# Seed NumPy's global RNG so the split (and the files saved from it) is the
# same on every run: scikit-learn draws from this global state when no
# random_state is passed to train_test_split.
SPLIT_SEED = 42
np.random.seed(SPLIT_SEED)

# Perform the split
train_df, val_df, test_df = split_data(ck_landmarks_df)

# Convert splits to Data lists using the graph
train_data = df_to_data_list(train_df, G)
val_data = df_to_data_list(val_df, G)
test_data = df_to_data_list(test_df, G)


# Print size of each split
print(f"Train: {len(train_data)}  Validation: {len(val_data)}  Test: {len(test_data)}")

# Print a sample data point
train_data[0]

Train: 291  Validation: 84  Test: 45


Data(x=[468, 3], edge_index=[2, 2644], y=[1], bbox=[6])

### Verify that the label distribution in the original data is maintained in the splits 

In [60]:
# Reverse lookup (class id -> label name), rebuilt from label_mapping
inverse_label_mapping = {class_id: label_name for label_name, class_id in label_mapping.items()}

def calculate_label_distribution(data_list):
    """Return the percentage share of each label found in ``data_list``.

    Each element must expose ``y.item()`` returning its label. The result maps
    label -> percentage (0-100), with keys in order of first appearance.
    """
    tally = {}
    for sample in data_list:
        key = sample.y.item()  # y is expected to hold exactly one value
        tally[key] = tally.get(key, 0) + 1

    n_samples = sum(tally.values())
    shares = {}
    for key, occurrences in tally.items():
        shares[key] = (occurrences / n_samples) * 100
    return shares

# Label percentages for the whole DataFrame and for each of the three splits
original_distribution, train_distribution, val_distribution, test_distribution = map(
    calculate_label_distribution,
    (df_to_data_list(ck_landmarks_df, G), train_data, val_data, test_data),
)

def print_ratio_differences(original, train, val, test):
    """Print a table of label percentages for the original data and each split.

    Every argument maps a label id to a percentage. Rows are printed for the
    labels present in ``original``, in ascending id order; a label missing
    from a split is shown as 0%. Names are looked up in ``inverse_label_mapping``.
    """
    ordered_ids = sorted(original)
    print("\nLabel Ratios (in percentages):")
    print("Label\t\tOriginal\tTrain\tValidation\tTest")
    for label_id in ordered_ids:
        name_column = inverse_label_mapping[label_id].ljust(10)
        orig_pct = original.get(label_id, 0)
        train_pct = train.get(label_id, 0)
        val_pct = val.get(label_id, 0)
        test_pct = test.get(label_id, 0)
        print(f"{name_column}\t{orig_pct:.0f}%\t\t\t{train_pct:.0f}%\t\t{val_pct:.0f}%\t\t\t{test_pct:.0f}%")

# Print the label percentages of ck_landmarks_df next to those of the train/val/test splits
print_ratio_differences(original_distribution, train_distribution, val_distribution, test_distribution)


Label Ratios (in percentages):
Label		Original	Train	Validation	Test
neutral   	22%			22%		21%			22%
happiness 	16%			16%		17%			16%
sadness   	7%			7%		7%			7%
surprise  	20%			20%		19%			20%
fear      	6%			6%		6%			7%
disgust   	14%			14%		14%			13%
anger     	11%			11%		11%			11%
contempt  	4%			4%		5%			4%


### Visualize the 3D face mesh

In [61]:
# Attach the landmarks of the first sample to the graph nodes as 'pos'
for node_id, (x, y, z) in enumerate(ck_landmarks_df['landmarks'][0]):
    G.nodes[node_id]['pos'] = (x, y, z)

# Node id -> (x, y, z)
pos = nx.get_node_attributes(G, 'pos')

# Edge coordinates: the two endpoints of every edge followed by a None entry
edge_x, edge_y, edge_z = [], [], []
for start, end in G.edges():
    (x0, y0, z0), (x1, y1, z1) = pos[start], pos[end]
    edge_x.extend((x0, x1, None))
    edge_y.extend((y0, y1, None))
    edge_z.extend((z0, z1, None))

# Node coordinates, one list per axis
node_x = [pos[node][0] for node in G.nodes()]
node_y = [pos[node][1] for node in G.nodes()]
node_z = [pos[node][2] for node in G.nodes()]

# Figure with the edges drawn as blue lines and the nodes as red markers
fig = go.Figure(data=[
    go.Scatter3d(
        x=edge_x, y=edge_y, z=edge_z,
        mode='lines',
        line={'color': 'blue', 'width': 2},
        hoverinfo='none',
    ),
    go.Scatter3d(
        x=node_x, y=node_y, z=node_z,
        mode='markers',
        marker={'size': 4, 'color': 'red'},
        hoverinfo='text',
    ),
])

# Title, no legend, and no axis background planes
fig.update_layout(
    title="3D Face Mesh",
    showlegend=False,
    scene={
        'xaxis': {'showbackground': False},
        'yaxis': {'showbackground': False},
        'zaxis': {'showbackground': False},
    },
)

fig.show()

### Save and load data splits to / from disk

In [62]:
# Save data splits to disk
import pickle

# Destination file of each split
train_data_path = 'ck_data/train_data_70_20_10.pkl'
val_data_path = 'ck_data/val_data_70_20_10.pkl'
test_data_path = 'ck_data/test_data_70_20_10.pkl'

# Pickle each split to its own file, in train / val / test order
for split_path, split_items in (
    (train_data_path, train_data),
    (val_data_path, val_data),
    (test_data_path, test_data),
):
    with open(split_path, 'wb') as f:
        pickle.dump(split_items, f)

In [63]:
# Load data splits from disk
import pickle

# Source file of each split
train_data_path = 'ck_data/train_data_70_20_10.pkl'
val_data_path = 'ck_data/val_data_70_20_10.pkl'
test_data_path = 'ck_data/test_data_70_20_10.pkl'

# Unpickle the three files in train / val / test order
loaded_splits = []
for split_path in (train_data_path, val_data_path, test_data_path):
    with open(split_path, 'rb') as f:
        loaded_splits.append(pickle.load(f))
train_data, val_data, test_data = loaded_splits