### In this notebook, we will:
- Load the CK+ dataset
- Build the standard mesh graph from MediaPipe's face tessellation and save its adjacency matrix
- Load the standard mesh graph structure
- Split the data into train, validation and test sets
- Verify that the label distribution in the original data is maintained in the splits
- Visualize the 3D face mesh
- Save the data splits to the disk
- Load the data splits from the disk
- Overlay the 3D landmarks of all samples that share an expression label

In [None]:
import pickle
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from torch_geometric.data import Data
import networkx as nx
import plotly.graph_objects as go

In [None]:
# Read the pickled landmark table from disk.
# NOTE: pickle can execute arbitrary code on load; only open files you trust.
ck_landmarks_path = 'ck_data/ck_landmarks.pkl'
with open(ck_landmarks_path, 'rb') as f:
    ck_landmarks_df = pickle.load(f)

# Report the dimensions of the table as loaded (i.e. before any rows are dropped)
num_landmarks = len(ck_landmarks_df['landmarks'][0])
print(f"Number of landmarks: {num_landmarks}, Number of samples: {len(ck_landmarks_df.index)}")

# Drop the first 500 rows and renumber the remaining rows from zero
ck_landmarks_df = ck_landmarks_df.iloc[500:].reset_index(drop=True)
ck_landmarks_df

### Create and save the standard mesh from the mediapipe library

In [None]:
import mediapipe as mp

# Connections of MediaPipe's face-mesh tessellation; each entry is indexed
# below as a (start, end) pair of landmark indices.
tesselation = mp.solutions.face_mesh.FACEMESH_TESSELATION

# One node per landmark of the first sample, carrying its (x, y, z) position
G = nx.Graph()
G.add_nodes_from(
    (idx, {'pos': (x, y, z)})
    for idx, (x, y, z) in enumerate(ck_landmarks_df['landmarks'][0])
)

# One undirected edge per tessellation connection
G.add_edges_from((connection[0], connection[1]) for connection in tesselation)

# Persist the mesh topology as a dense adjacency matrix in CSV form
standard_mesh_adjacency = nx.to_numpy_array(G)
np.savetxt("standard_mesh_adj_matrix.csv", standard_mesh_adjacency, delimiter=",")

### Load the graph structure of the standard mesh

In [None]:
# Rebuild the mesh graph from the adjacency matrix stored on disk
adjacency_matrix = np.loadtxt('standard_mesh_adj_matrix.csv', delimiter=',')
G = nx.from_numpy_array(adjacency_matrix)

# Sparsity = share of zero entries in the adjacency matrix
nonzero_fraction = np.count_nonzero(adjacency_matrix) / adjacency_matrix.size
sparsity = 1.0 - nonzero_fraction
print(f"Sparsity of the adjacency matrix: {100*sparsity:.2f}%")

# When the graph is not connected, list the nodes that have no edges at all
if not nx.is_connected(G):
    isolated_nodes = [node for node, degree in G.degree() if degree == 0]
    print("Unconnected nodes:", isolated_nodes)
else:
    print(f"Graph is connected")

### Split the data into train, validation and test sets

In [None]:
# Expression names in the order of their integer class ids (0..7)
_expression_names = (
    'neutral',
    'happiness',
    'sadness',
    'surprise',
    'fear',
    'disgust',
    'anger',
    'contempt',
)

# Map string labels to integers
label_mapping = {name: class_id for class_id, name in enumerate(_expression_names)}

# Inverse mapping: integer class id -> label name
inverse_label_mapping = dict(zip(label_mapping.values(), label_mapping.keys()))

# Function to convert df to data list using a given graph
def df_to_data_list(df, graph):
    """Convert every row of ``df`` into a ``torch_geometric`` ``Data`` object.

    All returned objects share one ``edge_index`` tensor built from ``graph``.

    Args:
        df: DataFrame whose rows provide 'landmarks', 'bbox' and 'label'
            entries. Each label must be a key of ``label_mapping``.
        graph: Object exposing ``.edges`` as an iterable of node-index pairs
            (called in this notebook with a networkx graph).

    Returns:
        A list with one ``Data`` per row, in row order, with fields
        ``x`` (landmarks), ``edge_index``, ``y`` (integer label, shape [1])
        and ``bbox``.

    Raises:
        KeyError: If a row's label is not present in ``label_mapping``.
    """
    # Emit every edge in both directions so the connectivity is symmetric.
    directed_pairs = []
    for edge in graph.edges:
        directed_pairs.append([edge[0], edge[1]])
        directed_pairs.append([edge[1], edge[0]])

    # PyG documents edge_index as a torch.long tensor of shape [2, num_edges];
    # int16 would also overflow for node ids above 32767. The reshape keeps the
    # [2, 0] shape for a graph without edges (a bare .t() would give shape [0]).
    edge_index = (
        torch.tensor(directed_pairs, dtype=torch.long)
        .reshape(-1, 2)
        .t()
        .contiguous()
    )

    data_list = []
    for _, row in df.iterrows():
        # NOTE(review): features are stored in half precision; presumably to
        # keep the pickled splits small — confirm the consumer casts to float32.
        landmarks = torch.tensor(row['landmarks'], dtype=torch.float16)
        bbox = torch.tensor(row['bbox'], dtype=torch.float16)
        label = label_mapping[row['label']]

        data_list.append(
            Data(
                x=landmarks,
                edge_index=edge_index,
                y=torch.tensor([label], dtype=torch.long),
                bbox=bbox,
            )
        )
    return data_list

# Function to split data while maintaining label ratio
def split_data(df, train_ratio=0.7, val_ratio=0.2, test_ratio=0.1, random_state=None):
    """Split ``df`` into train / validation / test frames, label by label.

    Rows are grouped by the 'label' column and each group is split separately
    with the same ratios, so every split keeps roughly the label proportions
    of ``df``.

    Args:
        df: DataFrame with a 'label' column.
        train_ratio: Fraction of each label group assigned to training.
        val_ratio: Fraction of each label group assigned to validation.
        test_ratio: Fraction of each label group assigned to testing.
        random_state: Forwarded to both ``train_test_split`` calls. Pass an
            int to make the split reproducible; the default ``None`` keeps the
            previous behaviour of a different split on every run.

    Returns:
        Tuple ``(train_df, val_df, test_df)``; each is the concatenation of the
        per-label pieces, in label-group order.

    Raises:
        AssertionError: If the three ratios do not sum to 1.
    """
    assert abs((train_ratio + val_ratio + test_ratio) - 1.0) < 1e-10, "Ratios must sum to 1"

    train_list = []
    val_list = []
    test_list = []

    # Each group holds a single label value, so the per-group split sizes are
    # what preserve the overall label ratio.
    for _, group in df.groupby('label'):
        train, temp = train_test_split(
            group,
            train_size=train_ratio,
            stratify=group['label'],
            random_state=random_state,
        )
        # The remainder is divided between validation and test; test_size is
        # the test share relative to that remainder, not to the whole group.
        val, test = train_test_split(
            temp,
            test_size=test_ratio/(test_ratio + val_ratio),
            stratify=temp['label'],
            random_state=random_state,
        )

        train_list.append(train)
        val_list.append(val)
        test_list.append(test)

    # Combine all the splits
    train_df = pd.concat(train_list)
    val_df = pd.concat(val_list)
    test_df = pd.concat(test_list)

    return train_df, val_df, test_df

# Split the landmark table into train / validation / test frames
train_df, val_df, test_df = split_data(ck_landmarks_df)

# Turn each frame into a list of Data objects that share the mesh graph G
train_data, val_data, test_data = (
    df_to_data_list(split_df, G) for split_df in (train_df, val_df, test_df)
)

# Report how many samples ended up in each split
print(f"Train: {len(train_data)}  Validation: {len(val_data)}  Test: {len(test_data)}")

# Display the first training sample as a sanity check
train_data[0]

### Verify that the label distribution in the original data is maintained in the splits 

In [None]:
# Integer class id -> label name (reverse of label_mapping)
inverse_label_mapping = dict(zip(label_mapping.values(), label_mapping.keys()))

def calculate_label_distribution(data_list):
    """Return the percentage share of each label found in ``data_list``.

    Each element must expose ``y.item()`` returning its label. The result maps
    every label to its share of the samples in percent, keyed in order of
    first appearance; an empty input yields an empty dict.
    """
    tally = {}
    for sample in data_list:
        key = sample.y.item()  # y is expected to hold exactly one value
        tally[key] = tally.get(key, 0) + 1

    n_samples = sum(tally.values())
    return {key: (hits / n_samples) * 100 for key, hits in tally.items()}

# Label percentages of the full dataset ...
original_distribution = calculate_label_distribution(df_to_data_list(ck_landmarks_df, G))
# ... and of each split, computed in train / validation / test order
train_distribution, val_distribution, test_distribution = map(
    calculate_label_distribution, (train_data, val_data, test_data)
)

def print_ratio_differences(original, train, val, test):
    """Print a table comparing per-label percentages across the splits.

    One row is printed for every label present in ``original`` (in sorted
    order); a label missing from another distribution is shown as 0%. Label
    names are looked up in the module-level ``inverse_label_mapping``.
    """
    ordered_labels = sorted(original)
    print("\nLabel Ratios (in percentages):")
    print("Label\t\tOriginal\tTrain\tValidation\tTest")
    for label in ordered_labels:
        name_cell = inverse_label_mapping[label].ljust(10)
        orig_pct, train_pct, val_pct, test_pct = (
            dist.get(label, 0) for dist in (original, train, val, test)
        )
        print(f"{name_cell}\t{orig_pct:.0f}%\t\t\t{train_pct:.0f}%\t\t{val_pct:.0f}%\t\t\t{test_pct:.0f}%")

# Show the original label ratios next to those of each split
print_ratio_differences(
    original=original_distribution,
    train=train_distribution,
    val=val_distribution,
    test=test_distribution,
)

### Visualize the 3D face mesh

In [None]:
# Attach the first sample's 3D coordinates to the matching graph nodes
first_sample = ck_landmarks_df['landmarks'][0]
for node_id, (x, y, z) in enumerate(first_sample):
    G.nodes[node_id]['pos'] = (x, y, z)

# Node id -> (x, y, z)
pos = nx.get_node_attributes(G, 'pos')

# Edge polylines: for every edge append start, end, then None so that plotly
# breaks the line instead of joining consecutive edges
edge_x, edge_y, edge_z = [], [], []
for start, end in G.edges():
    for axis_values, a, b in zip((edge_x, edge_y, edge_z), pos[start], pos[end]):
        axis_values.extend((a, b, None))

# Per-axis node coordinates, in graph node order
node_x, node_y, node_z = (
    [pos[node][axis] for node in G.nodes()] for axis in range(3)
)

# Mesh edges drawn as blue line segments (no hover text)
edge_trace = go.Scatter3d(
    x=edge_x, y=edge_y, z=edge_z,
    mode='lines',
    line={'color': 'blue', 'width': 2},
    hoverinfo='none',
)

# Landmarks drawn as red markers
node_trace = go.Scatter3d(
    x=node_x, y=node_y, z=node_z,
    mode='markers',
    marker={'size': 4, 'color': 'red'},
    hoverinfo='text',
)

# Assemble the figure: edges first, then nodes on top
fig = go.Figure()
fig.add_trace(edge_trace)
fig.add_trace(node_trace)

# Hide the legend and the background planes of all three axes
fig.update_layout(
    title="3D Face Mesh",
    showlegend=False,
    scene={
        axis_name: {'showbackground': False}
        for axis_name in ('xaxis', 'yaxis', 'zaxis')
    },
)

fig.show()

### Save and load data splits to / from disk

In [None]:
# Persist the three data splits as pickle files
import pickle

# Destination files (the suffix encodes the 70/20/10 split ratios)
train_data_path = 'ck_data/train_data_70_20_10.pkl'
val_data_path = 'ck_data/val_data_70_20_10.pkl'
test_data_path = 'ck_data/test_data_70_20_10.pkl'

# Write each split to its own file, in train / validation / test order
for split_path, split_items in (
    (train_data_path, train_data),
    (val_data_path, val_data),
    (test_data_path, test_data),
):
    with open(split_path, 'wb') as f:
        pickle.dump(split_items, f)
print("Data splits saved to disk.")

In [None]:
# Restore the three data splits from their pickle files
import pickle

# Source files (the suffix encodes the 70/20/10 split ratios)
train_data_path = 'ck_data/train_data_70_20_10.pkl'
val_data_path = 'ck_data/val_data_70_20_10.pkl'
test_data_path = 'ck_data/test_data_70_20_10.pkl'


def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE: pickle can execute arbitrary code on load; only open trusted files.
    with open(path, 'rb') as f:
        return pickle.load(f)


# Load the splits in train / validation / test order
train_data = _load_pickle(train_data_path)
val_data = _load_pickle(val_data_path)
test_data = _load_pickle(test_data_path)
print("Data splits loaded from disk.")

In [None]:
import pandas as pd
import plotly.graph_objects as go
# Function to plot landmarks of the same expression on top of each other
def plot_expression_landmarks(df, expression_label):
    """Overlay the 3D landmarks of every sample labelled ``expression_label``.

    Rows of ``df`` whose 'label' equals ``expression_label`` are selected and
    each one is added to a single figure as its own marker trace. The figure
    is then shown with plotly's 'browser' renderer. Nothing is returned.
    """
    # Keep only the rows of the requested expression
    matching_rows = df[df['label'] == expression_label]

    fig = go.Figure()

    # One scatter trace per sample, so all samples overlap in the same scene
    for _, row in matching_rows.iterrows():
        landmarks = row['landmarks']
        xs, ys, zs = ([point[axis] for point in landmarks] for axis in range(3))
        fig.add_trace(
            go.Scatter3d(
                x=xs, y=ys, z=zs,
                mode='markers',
                marker={'size': 2, 'opacity': 0.5},
            )
        )

    # Title and axis names
    fig.update_layout(
        title=f'3D Landmarks for Expression: {expression_label}',
        scene={'xaxis_title': 'X', 'yaxis_title': 'Y', 'zaxis_title': 'Z'},
    )

    # Open the plot in a browser tab
    fig.show(renderer='browser')

# Distinct expression labels present in the DataFrame
expressions = pd.unique(ck_landmarks_df['label'])
print(expressions)
# Only the 'neutral' expression is plotted here; iterate over `expressions`
# to produce one figure per expression instead.
plot_expression_landmarks(df=ck_landmarks_df, expression_label='neutral')