In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def create_stratified_datasets(X, y, test_size=0.2, val_size=0.25, random_state=None):
    """
    Split X and y into stratified train, validation, and test sets.

    The data is split twice with ``train_test_split``: the test set is taken
    from the full data first, then the validation set is taken from what is
    left. Both splits are stratified on the target.

    Parameters:
        X (array-like): The input data.
        y (array-like): The target variable; also used as the stratification labels.
        test_size (float): Fraction of the FULL dataset to put in the test set.
            Must be strictly between 0 and 1.
        val_size (float): Fraction of the FULL dataset to put in the validation
            set (not a fraction of what remains after the test split). It is
            rescaled to ``val_size / (1 - test_size)`` for the second split, so
            the defaults give roughly 55% train / 25% validation / 20% test.
            Must be strictly between 0 and 1, and ``test_size + val_size`` must
            leave room for a non-empty training set.
        random_state (int): Seed forwarded to both splits; pass an int to get
            a reproducible split.

    Returns:
        A tuple ``((X_train, y_train), (X_val, y_val), (X_test, y_test))``.

    Raises:
        ValueError: If ``test_size`` or ``val_size`` is not strictly between 0
            and 1, or if together they leave no data for the training set.
    """
    # Fail early with a readable message instead of an opaque error from the
    # second split (e.g. a rescaled fraction >= 1 or a division by zero).
    if not 0 < test_size < 1:
        raise ValueError(
            f"test_size must be a fraction strictly between 0 and 1, got {test_size!r}"
        )
    if not 0 < val_size < 1:
        raise ValueError(
            f"val_size must be a fraction strictly between 0 and 1, got {val_size!r}"
        )

    # val_size is expressed relative to the full dataset, but the second split
    # only sees the non-test remainder, so rescale it to that remainder.
    val_fraction = val_size / (1 - test_size)
    if val_fraction >= 1:
        raise ValueError(
            "test_size + val_size must be less than 1 so the training set is "
            f"not empty, got test_size={test_size!r} and val_size={val_size!r}"
        )

    # First split: hold out the test set from the full data.
    X_rest, X_test, y_rest, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # Second split: divide the remainder into train and validation sets.
    X_train, X_val, y_train, y_val = train_test_split(
        X_rest, y_rest, test_size=val_fraction, random_state=random_state, stratify=y_rest
    )

    return (X_train, y_train), (X_val, y_val), (X_test, y_test)

In [2]:
# Load the emotion dataset from a local pickle file (path is relative to the
# notebook's working directory).
# NOTE(review): unpickling can execute arbitrary code; only load this file if
# it comes from a trusted source.
CK_emotion = pd.read_pickle("./CK_Data_96_Emotion_LF")

In [24]:
# Show the loaded table as the cell output (rendered as a truncated preview).
CK_emotion

Unnamed: 0,Subject,Number,Code,Image,Emotion
0,S032,005,00000016,"[[0.5294118, 0.5254902, 0.49803922, 0.39215687...",3
1,S108,006,00000020,"[[0.13725491, 0.16470589, 0.19215687, 0.133333...",3
2,S052,004,00000033,"[[0.38431373, 0.39215687, 0.39215687, 0.384313...",5
3,S087,007,00000016,"[[0.21176471, 0.2, 0.20784314, 0.20784314, 0.1...",1
4,S895,002,00000007,"[[0.7882353, 0.4627451, 0.4862745, 0.6313726, ...",2
...,...,...,...,...,...
322,S014,005,00000017,"[[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.7764706...",5
323,S107,005,00000011,"[[0.2509804, 0.26666668, 0.22745098, 0.0705882...",3
324,S060,003,00000018,"[[0.31764707, 0.30588236, 0.2901961, 0.2823529...",7
325,S111,001,00000014,"[[0.1764706, 0.18039216, 0.16862746, 0.1647058...",7


In [26]:
# Labels: the 'Emotion' column, passed as y (and stratification key) to the split.
CK_emotion_target = CK_emotion['Emotion']
# Inputs: the 'Image' column, passed as X to the split.
# NOTE(review): despite the "_target" suffix this holds the features, not a target.
CK_data_target = CK_emotion['Image']

In [27]:
# Pass a fixed seed so the train/validation/test membership (and every number
# derived from it below) is the same after a kernel restart.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = create_stratified_datasets(
    CK_data_target, CK_emotion_target, random_state=42
)

In [29]:
# Report the relative class frequencies of each split, ordered by class label,
# to check that stratification kept the three distributions aligned.
print(
    f'Distribution in training set: \n{y_train.value_counts().sort_index() / len(y_train)}',
    f'Distribution in validation set: \n{y_val.value_counts().sort_index() / len(y_val)}',
    f'Distribution in testing set: \n{y_test.value_counts().sort_index() / len(y_test)}',
    sep='\n\n',
)
     

Distribution in training set: 
1    0.139665
2    0.055866
3    0.178771
4    0.078212
5    0.212291
6    0.083799
7    0.251397
Name: Emotion, dtype: float64

Distribution in validation set: 
1    0.134146
2    0.060976
3    0.182927
4    0.073171
5    0.207317
6    0.085366
7    0.256098
Name: Emotion, dtype: float64

Distribution in testing set: 
1    0.136364
2    0.045455
3    0.181818
4    0.075758
5    0.212121
6    0.090909
7    0.257576
Name: Emotion, dtype: float64


294    5
189    6
41     5
52     4
306    1
      ..
112    2
256    5
277    2
310    6
91     6
Name: Emotion, Length: 261, dtype: object