# Processing action units 

In [12]:
import pandas as pd
import pickle

## Loading the file 

In [16]:
filepath = "/Users/dionnespaltman/Desktop/V3/action_units_temp.pkl"

# Load the dataset
action_units = pd.read_pickle(filepath)

In [3]:
print(action_units.columns)

Index(['Frame', 'Face_id', 'Confidence', 'Success', ' AU01_r', ' AU02_r',
       ' AU04_r', ' AU05_r', ' AU06_r', ' AU07_r', ' AU09_r', ' AU10_r',
       ' AU12_r', ' AU14_r', ' AU15_r', ' AU17_r', ' AU20_r', ' AU23_r',
       ' AU25_r', ' AU26_r', ' AU45_r', ' AU01_c', ' AU02_c', ' AU04_c',
       ' AU05_c', ' AU06_c', ' AU07_c', ' AU09_c', ' AU10_c', ' AU12_c',
       ' AU14_c', ' AU15_c', ' AU17_c', ' AU20_c', ' AU23_c', ' AU25_c',
       ' AU26_c', ' AU28_c', ' AU45_c', 'ID', 'Timeframe'],
      dtype='object')


In [4]:
display(action_units.head(5))

Unnamed: 0,Frame,Face_id,Confidence,Success,AU01_r,AU02_r,AU04_r,AU05_r,AU06_r,AU07_r,...,AU15_c,AU17_c,AU20_c,AU23_c,AU25_c,AU26_c,AU28_c,AU45_c,ID,Timeframe
103,104,0.0,0.88,1.0,0.0,0.0,0.0,0.0,1.34,0.22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
104,105,0.0,0.98,1.0,0.0,0.0,0.0,0.0,1.34,0.22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
111,112,0.0,0.98,1.0,0.31,0.59,0.0,0.0,0.31,0.31,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
114,115,0.0,0.98,1.0,0.0,0.12,0.0,0.0,0.32,0.06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
153,154,0.0,0.98,1.0,1.26,1.67,0.0,0.0,0.39,0.26,...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,80,"[4, 5, 6]"


The temporary data has 184 unique variables (missing IDs from 147 to 287). 

In [5]:
print(len(action_units['ID'].unique()))

184


In [13]:
# Count occurrences of each value in the "Timeframe" column
timeframe_counts = action_units['Timeframe'].value_counts()

print(timeframe_counts)

[4, 5, 6]       2118735
[3, 4, 5, 6]     285556
[7]              269554
[1]              163222
[3]              134364
[2]               78159
[4]               68762
[4, 5]            40338
[4, 6]            15713
Name: Timeframe, dtype: int64


## Missing values 

In [8]:
nan_counts_au_df = action_units.isna().sum()
print(nan_counts_au_df)

Frame              0
Face_id            0
Confidence         0
Success            0
 AU01_r       243394
 AU02_r       243394
 AU04_r       243394
 AU05_r       243394
 AU06_r       243394
 AU07_r       243394
 AU09_r       243394
 AU10_r       243394
 AU12_r       243394
 AU14_r       243394
 AU15_r       243394
 AU17_r       243394
 AU20_r       243394
 AU23_r       243394
 AU25_r       243394
 AU26_r       243394
 AU45_r       243394
 AU01_c       243394
 AU02_c       243394
 AU04_c       243394
 AU05_c       243394
 AU06_c       243394
 AU07_c       243394
 AU09_c       243394
 AU10_c       243394
 AU12_c       243394
 AU14_c       243394
 AU15_c       243394
 AU17_c       243394
 AU20_c       243394
 AU23_c       243394
 AU25_c       243394
 AU26_c       243394
 AU28_c       243394
 AU45_c       243394
ID                 0
Timeframe          0
dtype: int64


## Dealing with missing values - MICE

Link: https://www.machinelearningplus.com/machine-learning/mice-imputation/?utm_content=cmp-true

In [17]:
# need to enable iterative imputer explicitly since its still experimental
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [18]:
# Define imputer
imputer = IterativeImputer(random_state=100, max_iter=10)

In [19]:
action_units.head(10)

Unnamed: 0,Frame,Face_id,Confidence,Success,AU01_r,AU02_r,AU04_r,AU05_r,AU06_r,AU07_r,...,AU15_c,AU17_c,AU20_c,AU23_c,AU25_c,AU26_c,AU28_c,AU45_c,ID,Timeframe
0,104,0.0,0.88,1.0,0.0,0.0,0.0,0.0,1.34,0.22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
1,105,0.0,0.98,1.0,0.0,0.0,0.0,0.0,1.34,0.22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
2,112,0.0,0.98,1.0,0.31,0.59,0.0,0.0,0.31,0.31,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
3,115,0.0,0.98,1.0,0.0,0.12,0.0,0.0,0.32,0.06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
4,154,0.0,0.98,1.0,1.26,1.67,0.0,0.0,0.39,0.26,...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,80,"[4, 5, 6]"
5,155,0.0,0.98,1.0,1.5,2.48,0.0,0.0,0.6,0.26,...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,80,"[4, 5, 6]"
6,156,0.0,0.88,1.0,0.88,2.77,0.0,0.0,0.59,0.02,...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,80,"[4, 5, 6]"
7,157,0.0,0.98,1.0,0.34,2.22,0.0,0.0,0.73,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,80,"[4, 5, 6]"
8,160,0.0,0.98,1.0,0.0,1.3,0.0,0.0,0.19,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,80,"[4, 5, 6]"
9,161,0.0,0.98,1.0,0.0,1.79,0.0,0.0,0.25,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,80,"[4, 5, 6]"


In [20]:
columns = list(action_units.columns)
print(columns)

['Frame', 'Face_id', 'Confidence', 'Success', 'AU01_r', 'AU02_r', 'AU04_r', 'AU05_r', 'AU06_r', 'AU07_r', 'AU09_r', 'AU10_r', 'AU12_r', 'AU14_r', 'AU15_r', 'AU17_r', 'AU20_r', 'AU23_r', 'AU25_r', 'AU26_r', ' AU45_r', 'AU01_c', 'AU02_c', 'AU04_c', 'AU05_c', 'AU06_c', 'AU07_c', 'AU09_c', 'AU10_c', 'AU12_c', 'AU14_c', 'AU15_c', 'AU17_c', 'AU20_c', 'AU23_c', 'AU25_c', 'AU26_c', ' AU28_c', 'AU45_c', 'ID', 'Timeframe']


In [21]:
# Use Numeric Features
action_units_train = action_units.loc[:, ['AU01_r', 'AU02_r', 'AU04_r', 'AU05_r', 'AU06_r', 'AU07_r', 'AU09_r', 'AU10_r', 'AU12_r', 'AU14_r', 'AU15_r', 'AU17_r', 'AU20_r', 'AU23_r', 'AU25_r', 'AU26_r', ' AU45_r', 'AU01_c', 'AU02_c', 'AU04_c', 'AU05_c', 'AU06_c', 'AU07_c', 'AU09_c', 'AU10_c', 'AU12_c', 'AU14_c', 'AU15_c', 'AU17_c', 'AU20_c', 'AU23_c', 'AU25_c', 'AU26_c', ' AU28_c', 'AU45_c']]
action_units_train.head(10)

Unnamed: 0,AU01_r,AU02_r,AU04_r,AU05_r,AU06_r,AU07_r,AU09_r,AU10_r,AU12_r,AU14_r,...,AU12_c,AU14_c,AU15_c,AU17_c,AU20_c,AU23_c,AU25_c,AU26_c,AU28_c,AU45_c
0,0.0,0.0,0.0,0.0,1.34,0.22,0.57,1.56,1.52,0.54,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.34,0.22,0.57,1.56,1.52,0.54,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.31,0.59,0.0,0.0,0.31,0.31,0.23,0.21,0.62,0.12,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.12,0.0,0.0,0.32,0.06,0.1,0.6,0.5,0.29,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.26,1.67,0.0,0.0,0.39,0.26,0.0,1.22,0.64,0.18,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
5,1.5,2.48,0.0,0.0,0.6,0.26,0.0,1.89,1.02,0.55,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
6,0.88,2.77,0.0,0.0,0.59,0.02,0.0,1.84,1.22,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
7,0.34,2.22,0.0,0.0,0.73,0.0,0.0,1.88,1.51,1.44,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
8,0.0,1.3,0.0,0.0,0.19,0.0,0.04,1.18,0.69,0.72,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
9,0.0,1.79,0.0,0.0,0.25,0.0,0.12,1.57,0.99,0.88,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


Running everything up until the imputer was very fast. Running the imputer took around 9 minutes (on the temp data!). 

In [22]:
# fit on the dataset
imputer.fit(action_units_train)

In [23]:
action_units_imputed = imputer.transform(action_units_train)
action_units_imputed[:10]

array([[0.  , 0.  , 0.  , 0.  , 1.34, 0.22, 0.57, 1.56, 1.52, 0.54, 0.11,
        1.96, 1.08, 1.3 , 0.78, 0.  , 0.65, 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.  ],
       [0.  , 0.  , 0.  , 0.  , 1.34, 0.22, 0.57, 1.56, 1.52, 0.54, 0.11,
        1.96, 1.08, 1.3 , 0.78, 0.  , 0.65, 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.  ],
       [0.31, 0.59, 0.  , 0.  , 0.31, 0.31, 0.23, 0.21, 0.62, 0.12, 0.  ,
        0.04, 0.49, 0.15, 0.  , 0.27, 0.38, 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.  ],
       [0.  , 0.12, 0.  , 0.  , 0.32, 0.06, 0.1 , 0.6 , 0.5 , 0.29, 0.2 ,
        0.36, 0.91, 0.75, 0.55, 0.  , 0.36, 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.  ],
       [1.26, 1.67, 0.  , 0.

In [24]:
# Replace with imputed values
action_units.loc[:, ['AU01_r', 'AU02_r', 'AU04_r', 'AU05_r', 'AU06_r', 'AU07_r', 'AU09_r', 'AU10_r', 'AU12_r', 'AU14_r', 'AU15_r', 'AU17_r', 'AU20_r', 'AU23_r', 'AU25_r', 'AU26_r', ' AU45_r', 'AU01_c', 'AU02_c', 'AU04_c', 'AU05_c', 'AU06_c', 'AU07_c', 'AU09_c', 'AU10_c', 'AU12_c', 'AU14_c', 'AU15_c', 'AU17_c', 'AU20_c', 'AU23_c', 'AU25_c', 'AU26_c', ' AU28_c', 'AU45_c']] = action_units_imputed
action_units.head(10)

Unnamed: 0,Frame,Face_id,Confidence,Success,AU01_r,AU02_r,AU04_r,AU05_r,AU06_r,AU07_r,...,AU15_c,AU17_c,AU20_c,AU23_c,AU25_c,AU26_c,AU28_c,AU45_c,ID,Timeframe
0,104,0.0,0.88,1.0,0.0,0.0,0.0,0.0,1.34,0.22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
1,105,0.0,0.98,1.0,0.0,0.0,0.0,0.0,1.34,0.22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
2,112,0.0,0.98,1.0,0.31,0.59,0.0,0.0,0.31,0.31,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
3,115,0.0,0.98,1.0,0.0,0.12,0.0,0.0,0.32,0.06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
4,154,0.0,0.98,1.0,1.26,1.67,0.0,0.0,0.39,0.26,...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,80,"[4, 5, 6]"
5,155,0.0,0.98,1.0,1.5,2.48,0.0,0.0,0.6,0.26,...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,80,"[4, 5, 6]"
6,156,0.0,0.88,1.0,0.88,2.77,0.0,0.0,0.59,0.02,...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,80,"[4, 5, 6]"
7,157,0.0,0.98,1.0,0.34,2.22,0.0,0.0,0.73,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,80,"[4, 5, 6]"
8,160,0.0,0.98,1.0,0.0,1.3,0.0,0.0,0.19,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,80,"[4, 5, 6]"
9,161,0.0,0.98,1.0,0.0,1.79,0.0,0.0,0.25,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,80,"[4, 5, 6]"


In [25]:
nan_counts_au_df = action_units.isna().sum()
print(nan_counts_au_df)

Frame         0
Face_id       0
Confidence    0
Success       0
AU01_r        0
AU02_r        0
AU04_r        0
AU05_r        0
AU06_r        0
AU07_r        0
AU09_r        0
AU10_r        0
AU12_r        0
AU14_r        0
AU15_r        0
AU17_r        0
AU20_r        0
AU23_r        0
AU25_r        0
AU26_r        0
 AU45_r       0
AU01_c        0
AU02_c        0
AU04_c        0
AU05_c        0
AU06_c        0
AU07_c        0
AU09_c        0
AU10_c        0
AU12_c        0
AU14_c        0
AU15_c        0
AU17_c        0
AU20_c        0
AU23_c        0
AU25_c        0
AU26_c        0
 AU28_c       0
AU45_c        0
ID            0
Timeframe     0
dtype: int64


## Saving the action units file with the imputed values 

In [26]:
# action_units.to_pickle("/Users/dionnespaltman/Desktop/V3/action_units_temp_imputed.pkl")

# action_units.to_csv("/Users/dionnespaltman/Desktop/V3/action_units_temp_imputed.csv", index=False)

In [13]:
action_units = pd.read_pickle("/Users/dionnespaltman/Desktop/V3/action_units_temp_imputed.pkl")

## Creating file with just Stage = [4, 5, 6]

In [22]:
display(action_units)

Unnamed: 0,Frame,Face_id,Confidence,Success,AU01_r,AU02_r,AU04_r,AU05_r,AU06_r,AU07_r,...,AU17_c,AU20_c,AU23_c,AU25_c,AU26_c,AU28_c,AU45_c,ID,Timeframe,Timeframe_str
0,104,0.0,0.88,1.0,0.00,0.00,0.00,0.0,1.34,0.22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]","[4, 5, 6]"
1,105,0.0,0.98,1.0,0.00,0.00,0.00,0.0,1.34,0.22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]","[4, 5, 6]"
2,112,0.0,0.98,1.0,0.31,0.59,0.00,0.0,0.31,0.31,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]","[4, 5, 6]"
3,115,0.0,0.98,1.0,0.00,0.12,0.00,0.0,0.32,0.06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]","[4, 5, 6]"
4,154,0.0,0.98,1.0,1.26,1.67,0.00,0.0,0.39,0.26,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,80,"[4, 5, 6]","[4, 5, 6]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3395911,2196,0.0,0.98,1.0,0.62,0.36,0.30,0.0,0.70,0.47,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7],[7]
3395912,2197,0.0,0.98,1.0,0.63,0.38,0.25,0.0,0.78,0.56,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7],[7]
3395913,2198,0.0,0.98,1.0,0.69,0.51,0.26,0.0,0.79,0.56,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7],[7]
3395914,2199,0.0,0.98,1.0,0.67,0.46,0.32,0.0,0.81,0.56,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7],[7]


In [15]:
# Count occurrences of Timeframe values in action_units DataFrame
timeframe_counts = action_units['Timeframe'].value_counts()

print(timeframe_counts)


[4, 5, 6]       2313881
[3, 4, 5, 6]     311923
[7]              269554
[1]              163222
[3]              134364
[2]               78159
[4]               68762
[4, 5]            40338
[4, 6]            15713
Name: Timeframe, dtype: int64


In [23]:
action_units_456 = action_units[action_units['Timeframe'].apply(lambda x: set(x) == set([4, 5, 6]))]

display(action_units_456)


Unnamed: 0,Frame,Face_id,Confidence,Success,AU01_r,AU02_r,AU04_r,AU05_r,AU06_r,AU07_r,...,AU17_c,AU20_c,AU23_c,AU25_c,AU26_c,AU28_c,AU45_c,ID,Timeframe,Timeframe_str
0,104,0.0,0.88,1.0,0.00,0.00,0.00,0.00,1.34,0.22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]","[4, 5, 6]"
1,105,0.0,0.98,1.0,0.00,0.00,0.00,0.00,1.34,0.22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]","[4, 5, 6]"
2,112,0.0,0.98,1.0,0.31,0.59,0.00,0.00,0.31,0.31,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]","[4, 5, 6]"
3,115,0.0,0.98,1.0,0.00,0.12,0.00,0.00,0.32,0.06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]","[4, 5, 6]"
4,154,0.0,0.98,1.0,1.26,1.67,0.00,0.00,0.39,0.26,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,80,"[4, 5, 6]","[4, 5, 6]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3301669,21069.0,0.0,0.93,1.0,2.08,2.22,0.89,0.16,0.22,0.47,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,27,"[4, 5, 6]","[4, 5, 6]"
3301670,21070.0,0.0,0.93,1.0,2.03,2.14,0.87,0.69,0.27,0.85,...,1.0,1.0,1.0,0.0,0.0,0.0,1.0,27,"[4, 5, 6]","[4, 5, 6]"
3301671,21071.0,0.0,0.93,1.0,2.02,2.11,1.12,0.49,0.47,0.67,...,1.0,1.0,1.0,0.0,0.0,0.0,1.0,27,"[4, 5, 6]","[4, 5, 6]"
3301672,21072.0,0.0,0.93,1.0,2.43,2.01,1.11,0.65,0.35,0.82,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,27,"[4, 5, 6]","[4, 5, 6]"


In [24]:
# Count occurrences of Timeframe values in action_units DataFrame
timeframe_counts = action_units_456['Timeframe'].value_counts()

print(timeframe_counts)

[4, 5, 6]    2313881
Name: Timeframe, dtype: int64


## TS Fresh requirements

In [25]:
# Necessary imports
!pip install tsfresh

import pandas as pd
from tsfresh import extract_features
from tsfresh.feature_extraction import EfficientFCParameters
from sklearn.preprocessing import StandardScaler
from tsfresh.feature_extraction import ComprehensiveFCParameters




## Appling TS Fresh on action units 

In [29]:
# action_units_456 = action_units_456.rename(columns={' AU45_r': 'AU45_r'})

In [30]:
columns = list(action_units_456.columns)
print(columns)

['Frame', 'Face_id', 'Confidence', 'Success', 'AU01_r', 'AU02_r', 'AU04_r', 'AU05_r', 'AU06_r', 'AU07_r', 'AU09_r', 'AU10_r', 'AU12_r', 'AU14_r', 'AU15_r', 'AU17_r', 'AU20_r', 'AU23_r', 'AU25_r', 'AU26_r', 'AU45_r', 'AU01_c', 'AU02_c', 'AU04_c', 'AU05_c', 'AU06_c', 'AU07_c', 'AU09_c', 'AU10_c', 'AU12_c', 'AU14_c', 'AU15_c', 'AU17_c', 'AU20_c', 'AU23_c', 'AU25_c', 'AU26_c', ' AU28_c', 'AU45_c', 'ID', 'Timeframe', 'Timeframe_str']


In [31]:
au_long_format = pd.melt(action_units_456, id_vars=['ID', 'Frame'],
                                   var_name='Measurement', value_name='Value',
                                   value_vars=['AU01_r', 'AU02_r',
       'AU04_r', 'AU05_r', 'AU06_r', 'AU07_r', 'AU09_r', 'AU10_r',
       'AU12_r', 'AU14_r', 'AU15_r', 'AU17_r', 'AU20_r', 'AU23_r',
       'AU25_r', 'AU26_r', 'AU45_r'])

In [32]:
display(au_long_format)

Unnamed: 0,ID,Frame,Measurement,Value
0,80,104,AU01_r,0.00
1,80,105,AU01_r,0.00
2,80,112,AU01_r,0.31
3,80,115,AU01_r,0.00
4,80,154,AU01_r,1.26
...,...,...,...,...
39335972,27,21069.0,AU45_r,1.50
39335973,27,21070.0,AU45_r,1.23
39335974,27,21071.0,AU45_r,1.43
39335975,27,21072.0,AU45_r,1.23


In [33]:
nan_counts = au_long_format.isna().sum()
print(nan_counts)

ID             0
Frame          0
Measurement    0
Value          0
dtype: int64


In Judita her article, she used the following six intensity level characteristics: sum, variance, standard deviation, maximum, mean, and mean-root square values (total number of extracted features = 102). 

In [34]:
# Convert 'ID' and 'Frame' columns to integer data type
au_long_format['ID'] = au_long_format['ID'].astype(int)
au_long_format['Frame'] = au_long_format['Frame'].astype(int)

# # Convert 'Measurement' column to string data type
au_long_format['Measurement'] = au_long_format['Measurement'].astype(str)

In [35]:
column_types = au_long_format.dtypes
print(column_types)

ID               int64
Frame            int64
Measurement     object
Value          float64
dtype: object


In [36]:
# Specificeer de gewenste kenmerken
settings = {
    # 'sum': {},
    'sum_values': {},
    'variance': {},
    'standard_deviation': {},
    'maximum': {},
    'minimum': {},
    #'median': {},
    'mean': {},
    'mean_abs_change': {},  #proxy voor gemiddelde van de wortel van de kwadraten als 'mean root square' niet direct beschikbaar is
    #'agg_linear_trend': [{'attr': 'slope', 'f_agg': 'max', 'chunk_len': 5}, {'attr': 'slope', 'f_agg': 'min', 'chunk_len': 5}]  #maximale en minimale helling
}

# Extract features met aangepaste instellingen
extracted_features_au = extract_features(
    au_long_format,
    column_id='ID',  # Identifies the time series
    column_sort='Frame',  # Orders the time points
    column_kind='Measurement',  # Distinguishes between different variables if necessary
    column_value='Value',  # The actual measurements
    default_fc_parameters=settings
)


Feature Extraction: 100%|██████████| 20/20 [00:15<00:00,  1.30it/s]


In [38]:
print(extracted_features_au.head)

<bound method NDFrame.head of      AU01_r__sum_values  AU01_r__variance  AU01_r__standard_deviation  \
23              4982.48          0.425041                    0.651952   
24              9390.23          0.448366                    0.669601   
25              6954.35          0.599805                    0.774471   
26              9707.43          0.873280                    0.934495   
27             21049.90          1.475421                    1.214669   
..                  ...               ...                         ...   
142            11750.48          0.739290                    0.859820   
143            13978.96          0.763932                    0.874032   
144             7175.00          0.461053                    0.679009   
145            11354.47          0.516678                    0.718803   
146            10215.25          0.212331                    0.460794   

     AU01_r__maximum  AU01_r__minimum  AU01_r__mean  AU01_r__mean_abs_change  \
23           

## Saving the extracted features to csv and pkl

In [39]:
extracted_features_au.to_csv('/Users/dionnespaltman/Desktop/V3/action_units_temp_456_extracted.csv', index=True)
extracted_features_au.to_pickle('/Users/dionnespaltman/Desktop/V3/action_units_temp_456_extracted.pkl')

## Understanding extracted features 

In [52]:
print(extracted_features_au.describe())

       AU01_r__sum_values  AU01_r__variance  AU01_r__standard_deviation  \
count          184.000000      1.840000e+02                1.840000e+02   
mean          7145.876891      3.950921e-01                5.827582e-01   
std           5258.113440      2.933320e-01                2.361951e-01   
min              1.590000      3.081488e-33                5.551115e-17   
25%           1468.632500      2.108621e-01                4.591160e-01   
50%           7317.655000      3.251451e-01                5.702067e-01   
75%          10545.170000      5.197051e-01                7.208982e-01   
max          22053.430000      1.617627e+00                1.271860e+00   

       AU01_r__maximum  AU01_r__minimum  AU01_r__mean  \
count       184.000000       184.000000    184.000000   
mean          4.187974        -0.041374      0.354239   
std           1.081899         0.441383      0.152848   
min           0.380000        -4.330000      0.056170   
25%           3.642500         0.000000

Running the visualization took around ... minutes. (Haven't run it yet)

In [None]:
# visualization (for example, using seaborn)
import seaborn as sns
sns.pairplot(extracted_features_au)

## Merging 

Creating a dataset including the dependent variable, ID, Stage (only 4, 5, 6) and extracted intensities.  


In [52]:
# Assuming 'ID' is the common column in both DataFrames
merged_df = pd.merge(VVR_scores, extracted_features_au, on='ID', how='inner')

# Check the length of the merged DataFrame
print(len(merged_df))
merged_df.drop('Unnamed: 0.1', axis=1, inplace=True)
merged_df.drop('Unnamed: 0', axis=1, inplace=True)

display(merged_df)

111


Unnamed: 0,ID,sum_12,sum_4567,sum_456,VVR_group,Condition,AU01_r__sum_values,AU01_r__variance,AU01_r__standard_deviation,AU01_r__maximum,...,AU26_r__minimum,AU26_r__mean,AU26_r__mean_abs_change,AU45_r__sum_values,AU45_r__variance,AU45_r__standard_deviation,AU45_r__maximum,AU45_r__minimum,AU45_r__mean,AU45_r__mean_abs_change
0,23,24.0,37.0,27.0,0,2,4982.48,0.425041,0.651952,5.00,...,0.00,0.633284,0.076328,9231.74,0.825039,0.908316,4.91,0.00,0.627753,0.133624
1,24,23.0,37.0,28.0,0,2,9390.23,0.448366,0.669601,5.00,...,0.00,1.484701,0.125851,11887.00,0.634554,0.796589,5.00,0.00,0.436942,0.098134
2,25,28.0,44.0,33.0,1,2,6954.35,0.599805,0.774471,4.53,...,0.00,0.862301,0.101969,9020.78,0.750701,0.866430,4.04,0.00,0.550652,0.085720
3,26,30.0,37.0,29.0,0,1,9707.43,0.873280,0.934495,4.73,...,0.00,0.552359,0.069582,6585.31,0.609348,0.780607,4.90,0.00,0.371673,0.056287
4,27,22.0,39.0,31.0,1,2,21049.90,1.475421,1.214669,5.99,...,-3.92,0.142027,0.386527,23027.73,1.160635,1.077328,5.04,-4.29,1.094318,0.231853
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,142,20.0,34.0,26.0,0,3,11750.48,0.739290,0.859820,5.00,...,0.00,0.448007,0.076771,10547.03,0.506052,0.711374,3.53,0.00,0.457691,0.055759
107,143,25.0,48.0,36.0,1,3,13978.96,0.763932,0.874032,5.00,...,0.00,0.960124,0.142141,6782.20,0.152370,0.390346,2.81,0.00,0.237997,0.035817
108,144,24.0,35.0,27.0,0,3,7175.00,0.461053,0.679009,4.97,...,0.00,0.551210,0.075307,7319.31,0.418456,0.646882,3.78,0.00,0.391344,0.072455
109,145,20.0,37.0,28.0,0,1,11354.47,0.516678,0.718803,5.00,...,0.00,0.641290,0.064991,16768.57,0.748187,0.864978,4.31,0.00,0.535566,0.118258


In [51]:
# merged_df.to_csv('/Users/dionnespaltman/Desktop/V3/merged_df.csv', index=True)


## Reducing features to the most important ones 
## Doesn't work yet 
In this example, y represents the target variable you are trying to predict or classify. The select_features function filters out the irrelevant features, keeping only those with significant predictive power.

In [41]:
VVR_scores = pd.read_csv('/Users/dionnespaltman/Desktop/V3/VVR_scores_final.csv')

# Double check what extracted features you need (so from what stage)
extracted_features_au = pd.read_csv('/Users/dionnespaltman/Desktop/V3/action_units_temp_456_extracted.csv')

In [42]:
print(len(VVR_scores['VVR_group']))
print(len(extracted_features_au))

320
111


In [43]:
extracted_features_au = extracted_features_au.rename(columns={'Unnamed: 0': 'ID'})

In [53]:
display(extracted_features_au.head(5))

Unnamed: 0,ID,AU01_r__sum_values,AU01_r__variance,AU01_r__standard_deviation,AU01_r__maximum,AU01_r__minimum,AU01_r__mean,AU01_r__mean_abs_change,AU02_r__sum_values,AU02_r__variance,...,AU26_r__minimum,AU26_r__mean,AU26_r__mean_abs_change,AU45_r__sum_values,AU45_r__variance,AU45_r__standard_deviation,AU45_r__maximum,AU45_r__minimum,AU45_r__mean,AU45_r__mean_abs_change
0,23,4982.48,0.425041,0.651952,5.0,0.0,0.338806,0.051614,2244.43,0.164346,...,0.0,0.633284,0.076328,9231.74,0.825039,0.908316,4.91,0.0,0.627753,0.133624
1,24,9390.23,0.448366,0.669601,5.0,0.0,0.345166,0.042494,5122.35,0.277836,...,0.0,1.484701,0.125851,11887.0,0.634554,0.796589,5.0,0.0,0.436942,0.098134
2,25,6954.35,0.599805,0.774471,4.53,0.0,0.424512,0.048749,2192.4,0.120917,...,0.0,0.862301,0.101969,9020.78,0.750701,0.86643,4.04,0.0,0.550652,0.08572
3,26,9707.43,0.87328,0.934495,4.73,0.0,0.547885,0.033221,2641.0,0.150057,...,0.0,0.552359,0.069582,6585.31,0.609348,0.780607,4.9,0.0,0.371673,0.056287
4,27,21049.9,1.475421,1.214669,5.99,-4.07,1.000328,0.187191,16193.17,1.597831,...,-3.92,0.142027,0.386527,23027.73,1.160635,1.077328,5.04,-4.29,1.094318,0.231853


I can't really do anything further, because I don't have all the video data. But I'll try to match the IDs of what I do have. 

In [59]:
from tsfresh import select_features
# from tsfresh.utilities.dataframe_functions import impute

# # Impute missing values 
# extracted_features_au = impute(extracted_features_au)

# Your target variable 
y = merged_df['VVR_group']

print(y)
display(extracted_features_au)


0      0
1      0
2      1
3      0
4      1
      ..
106    0
107    1
108    0
109    0
110    1
Name: VVR_group, Length: 111, dtype: int64


Unnamed: 0,ID,AU01_r__sum_values,AU01_r__variance,AU01_r__standard_deviation,AU01_r__maximum,AU01_r__minimum,AU01_r__mean,AU01_r__mean_abs_change,AU02_r__sum_values,AU02_r__variance,...,AU26_r__minimum,AU26_r__mean,AU26_r__mean_abs_change,AU45_r__sum_values,AU45_r__variance,AU45_r__standard_deviation,AU45_r__maximum,AU45_r__minimum,AU45_r__mean,AU45_r__mean_abs_change
0,23,4982.48,0.425041,0.651952,5.00,0.00,0.338806,0.051614,2244.43,0.164346,...,0.00,0.633284,0.076328,9231.74,0.825039,0.908316,4.91,0.00,0.627753,0.133624
1,24,9390.23,0.448366,0.669601,5.00,0.00,0.345166,0.042494,5122.35,0.277836,...,0.00,1.484701,0.125851,11887.00,0.634554,0.796589,5.00,0.00,0.436942,0.098134
2,25,6954.35,0.599805,0.774471,4.53,0.00,0.424512,0.048749,2192.40,0.120917,...,0.00,0.862301,0.101969,9020.78,0.750701,0.866430,4.04,0.00,0.550652,0.085720
3,26,9707.43,0.873280,0.934495,4.73,0.00,0.547885,0.033221,2641.00,0.150057,...,0.00,0.552359,0.069582,6585.31,0.609348,0.780607,4.90,0.00,0.371673,0.056287
4,27,21049.90,1.475421,1.214669,5.99,-4.07,1.000328,0.187191,16193.17,1.597831,...,-3.92,0.142027,0.386527,23027.73,1.160635,1.077328,5.04,-4.29,1.094318,0.231853
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,142,11750.48,0.739290,0.859820,5.00,0.00,0.509915,0.052530,4726.74,0.271937,...,0.00,0.448007,0.076771,10547.03,0.506052,0.711374,3.53,0.00,0.457691,0.055759
107,143,13978.96,0.763932,0.874032,5.00,0.00,0.490541,0.040663,5675.33,0.235525,...,0.00,0.960124,0.142141,6782.20,0.152370,0.390346,2.81,0.00,0.237997,0.035817
108,144,7175.00,0.461053,0.679009,4.97,0.00,0.383628,0.040392,3864.99,0.282831,...,0.00,0.551210,0.075307,7319.31,0.418456,0.646882,3.78,0.00,0.391344,0.072455
109,145,11354.47,0.516678,0.718803,5.00,0.00,0.362647,0.035199,7260.07,0.303435,...,0.00,0.641290,0.064991,16768.57,0.748187,0.864978,4.31,0.00,0.535566,0.118258


In [60]:
# Selecting important features 
important_features = select_features(extracted_features_au, y)

# Display important features
print(important_features.head())

Empty DataFrame
Columns: []
Index: [0, 1, 2, 3, 4]


## ...

## ...

## ...

## ...

## ...