In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
import torch
import numpy as np
from matplotlib import pyplot as plt
import os
from pathlib import Path
from torch.utils.data import Dataset, DataLoader, random_split

In [2]:
def train_and_eval(model, train_in, train_out, val_in, val_out):
    model.fit(train_in, train_out)
    predicted_val = model.predict(val_in)
    print("\nPredicted classes: ", predicted_val, "\n")

    # Evaluate model
    return accuracy_score(val_out, predicted_val)

In [3]:
path=Path(os.getcwd()).parent
DIR_PATH=str(path) + '\\'

In [4]:
df = Path(str(path) + '/data/extracted_df.csv')
data = pd.read_csv(df)

In [5]:
data['expression'].value_counts()

expression
0    410
1    336
3    166
6    159
2     89
4     72
5     53
Name: count, dtype: int64

In [7]:
df_to_work = data[['expression', 'AU01', 'AU02', 'AU04', 'AU05', 'AU06', 'AU07', 'AU09', 'AU10', 'AU11', 'AU12', 'AU14', 'AU15', 'AU17', 'AU20', 'AU23', 'AU24', 'AU25', 'AU26', 'AU28', 'AU43']]
expression = {"anger": 0, "disgust": 1, "fear": 2, "happiness": 3, "neutral": 4, "sadness": 5, "surprise": 6}

In [9]:
class DiffFER(Dataset):
    def __init__(self, data):
        super().__init__()

        # everything in pytorch needs to be a tensor
        self.inputs = torch.tensor(data.drop("expression", axis=1).to_numpy(dtype=np.float32))

        # we need to transform label (str) to a number. In sklearn, this is done internally
        self.index2label = [label for label in data["expression"].unique()]
        label2index = {label: i for i, label in enumerate(self.index2label)}

        self.labels = torch.tensor(data["expression"].apply(lambda x: torch.tensor(label2index[x])))

    def __getitem__(self, index):
        return self.inputs[index], self.labels[index]

    def __len__(self):
        return len(self.inputs)

In [11]:
# See classes
print("Unique classes", df_to_work["expression"].unique(), "\n")

# see class balance
for class0 in df_to_work["expression"].unique():
    print(f"{(df_to_work['expression'] == class0).value_counts().iloc[1]}, {class0}")

Unique classes [6 3 0 2 5 4 1] 

159, 6
166, 3
410, 0
89, 2
53, 5
72, 4
336, 1


In [12]:
# Let split the dataset for training
labels = df_to_work["expression"]
inputs = df_to_work.drop("expression", axis=1)

In [13]:
# split = 70/20/10
data_in, test_in, data_out, test_out = train_test_split(
    inputs,
    labels,
    test_size=0.1,
    random_state=42,
    stratify=labels  # balances labels across the sets
)
train_in, val_in, train_out, val_out = train_test_split(
    data_in,
    data_out,
    test_size=(0.2/0.9),  # 20% of the original data
    random_state=42,
    stratify=data_out
)
print("\nLenght of each split of the data: ", len(train_in), len(val_in), len(test_in), "\n")


Lenght of each split of the data:  899 257 129 



  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [14]:
model_1 = DecisionTreeClassifier()
print(
    "\nAccuracy of model_1: ",
    train_and_eval(model_1, train_in, train_out, val_in, val_out)
)



Predicted classes:  [3 1 0 0 2 1 4 4 4 0 0 3 4 1 3 3 4 1 0 3 1 3 5 6 0 5 1 6 1 3 4 3 0 2 3 6 6
 3 3 1 2 1 1 0 1 1 4 3 1 0 5 0 2 0 6 4 3 4 0 0 2 1 0 0 1 1 1 1 1 6 0 0 0 5
 0 2 0 1 0 3 6 0 6 0 1 1 4 0 1 1 1 0 0 0 1 3 6 0 0 6 4 0 1 0 1 3 3 2 0 1 4
 1 6 6 6 1 6 6 1 2 4 6 0 5 1 0 4 1 1 1 0 0 5 0 0 4 6 6 2 1 0 6 6 6 0 1 1 0
 1 0 6 2 0 6 2 0 3 6 6 3 3 1 5 0 0 1 3 0 2 1 1 1 3 1 1 0 3 1 1 0 0 1 2 2 1
 0 0 6 1 0 0 4 3 1 0 0 6 4 1 1 2 1 0 6 1 0 3 2 4 0 0 1 0 6 0 4 1 6 0 5 2 1
 0 0 1 1 4 0 0 0 3 0 0 1 1 1 1 0 4 6 0 0 6 2 0 1 6 0 5 0 1 4 3 3 0 3 1] 


Accuracy of model_1:  0.4591439688715953


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [15]:
model_2 = SVC()
print(
    "\nAccuracy of model_2: ",
    train_and_eval(model_2, train_in, train_out, val_in, val_out)
)


Predicted classes:  [6 1 0 0 0 1 0 0 0 0 0 0 0 0 3 3 0 0 0 3 0 3 0 6 3 0 0 3 1 3 3 3 0 0 0 3 0
 3 0 1 0 1 1 0 0 1 3 3 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1 1 0 1 1 0 0 0 6
 0 1 0 1 1 3 0 0 0 0 1 1 0 0 0 1 1 0 0 0 1 0 0 0 0 0 3 0 1 0 1 0 3 0 0 0 3
 1 6 6 1 1 3 0 1 0 3 3 0 0 3 1 0 1 1 1 0 0 0 0 0 0 0 0 0 1 0 0 6 0 0 1 1 1
 1 0 0 0 0 6 1 0 6 6 3 3 3 0 0 0 0 1 1 0 0 1 0 1 6 1 1 0 0 1 1 0 0 1 0 6 1
 0 0 0 1 0 0 0 0 1 0 0 3 0 0 1 3 1 0 0 3 0 0 3 0 0 0 1 0 3 0 0 1 0 0 3 0 1
 0 0 1 1 0 1 0 0 0 0 3 1 1 1 1 0 0 6 0 0 6 0 0 1 6 0 0 0 0 3 3 6 0 0 1] 


Accuracy of model_2:  0.622568093385214


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [None]:
print(
    "Best model accuracy on test set: ",
    accuracy_score(
        test_out,
        model_2.predict(test_in)
    )
)

Best model accuracy on test set:  0.6068376068376068


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [None]:
model_3 = KNeighborsClassifier()
print(
    "\nAccuracy of model_3: ",
    train_and_eval(model_3, train_in, train_out, val_in, val_out)
)


Predicted classes:  ['happy' 'surprise' 'happy' 'happy' 'surprise' 'disgust' 'happy' 'neutral'
 'surprise' 'happy' 'neutral' 'surprise' 'happy' 'neutral' 'surprise'
 'happy' 'fear' 'fear' 'neutral' 'happy' 'disgust' 'fear' 'neutral'
 'happy' 'happy' 'angry' 'happy' 'fear' 'fear' 'neutral' 'happy' 'neutral'
 'happy' 'neutral' 'happy' 'neutral' 'disgust' 'angry' 'happy' 'happy'
 'angry' 'happy' 'disgust' 'happy' 'neutral' 'surprise' 'surprise'
 'surprise' 'happy' 'neutral' 'surprise' 'fear' 'angry' 'neutral' 'happy'
 'neutral' 'angry' 'neutral' 'disgust' 'happy' 'neutral' 'surprise'
 'surprise' 'neutral' 'neutral' 'surprise' 'fear' 'happy' 'neutral'
 'neutral' 'angry' 'happy' 'neutral' 'angry' 'neutral' 'happy' 'neutral'
 'neutral' 'happy' 'angry' 'neutral' 'neutral' 'happy' 'happy' 'neutral'
 'angry' 'neutral' 'happy' 'disgust' 'angry' 'fear' 'happy' 'disgust'
 'happy' 'neutral' 'happy' 'surprise' 'neutral' 'surprise' 'surprise'
 'neutral' 'angry' 'neutral' 'neutral' 'neutral' 'happy' 

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [None]:
print(
    "Best model accuracy on test set: ",
    accuracy_score(
        test_out,
        model_3.predict(test_in)
    )
)

Best model accuracy on test set:  0.5555555555555556


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [16]:
# Hyperparameter search/tuning
param_grid = [
    {"kernel": ["linear"],  "degree": [1,2,3,5,10,15,20,25]}, #poly
    #{"kernel": ["rbf", "linear", "sigmoid"]}
]

best_model = GridSearchCV(SVC(), param_grid)
best_model.fit(train_in, train_out)  # Fits on all combinations and keeps best model

print(
    "\n\nBest model with best parameters on test set: ",
    accuracy_score(
        test_out,
        best_model.predict(test_in)
    )
)
print(
    "Best parameters of best model: ",
    best_model.best_params_
)

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if



Best model with best parameters on test set:  0.6356589147286822
Best parameters of best model:  {'degree': 1, 'kernel': 'linear'}


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype

In [17]:
param_grid = [
    {"criterion": ["gini", "entropy"], "max_depth": [1, 2, 5, 10, 50, 80, 100]}
]

decision_tree_search = GridSearchCV(DecisionTreeClassifier(), param_grid)
decision_tree_search.fit(train_in, train_out)
print(
    "\n\nDecision tree with best parameters on test set: ",
    accuracy_score(
        test_out,
        decision_tree_search.predict(test_in)
    )
)
print(
    "Best parameters of Decision tree: ",
    decision_tree_search.best_params_
)

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if



Decision tree with best parameters on test set:  0.5426356589147286
Best parameters of Decision tree:  {'criterion': 'gini', 'max_depth': 2}


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [18]:
# Hyperparameter search/tuning
param_grid = [
    {"n_neighbors": [2,3,4,5,6,7,8,9,10,12,14,15,16,17,18,19,20,22,24,25,30]}
]

knn_best_model = GridSearchCV(KNeighborsClassifier(), param_grid)
knn_best_model.fit(train_in, train_out)  # Fits on all combinations and keeps best model

print(
    "\n\nBest model with best parameters on test set: ",
    accuracy_score(
        test_out,
        knn_best_model.predict(test_in)
    )
)
print(
    "Best parameters of best model: ",
    knn_best_model.best_params_
)

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if



Best model with best parameters on test set:  0.5658914728682171
Best parameters of best model:  {'n_neighbors': 19}


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if

: 

In [None]:
print(
    "Best parameters of best model: ",
    best_model
)

Best parameters of best model:  GridSearchCV(estimator=SVC(),
             param_grid=[{'degree': [2, 3, 5, 10, 15, 20, 25],
                          'kernel': ['linear']}])


In [None]:
test_dataset = pd.read_csv(DIR_PATH + r'dataset/test_to_submit.csv')

In [None]:
predictions= best_model.predict(test_dataset)
predictions

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


array(['happy', 'surprise', 'neutral', 'neutral', 'surprise', 'neutral',
       'neutral', 'angry', 'happy', 'angry', 'neutral', 'neutral',
       'neutral', 'surprise', 'happy', 'neutral', 'neutral', 'angry',
       'happy', 'happy', 'neutral', 'neutral', 'surprise', 'surprise',
       'angry', 'neutral', 'surprise', 'surprise', 'neutral', 'neutral',
       'happy', 'happy', 'neutral', 'neutral', 'neutral', 'happy',
       'neutral', 'surprise', 'neutral', 'surprise', 'neutral', 'neutral',
       'neutral', 'neutral', 'happy', 'neutral', 'neutral', 'surprise',
       'surprise', 'neutral', 'angry', 'surprise', 'neutral', 'neutral',
       'neutral', 'neutral', 'angry', 'neutral', 'surprise', 'neutral',
       'happy', 'neutral', 'happy', 'angry', 'neutral', 'disgust',
       'happy', 'neutral', 'neutral', 'neutral', 'surprise', 'happy',
       'neutral', 'surprise', 'surprise', 'angry', 'neutral', 'neutral',
       'neutral', 'happy', 'surprise', 'neutral', 'happy', 'happy',
       'h

In [None]:
#predictions.tofile(DIR_PATH +r'dataset/output')

# Create a DataFrame with the predictions
output_df = pd.DataFrame(predictions, columns=['emotion_label'])

# Save the DataFrame to a text file
output_df.to_csv(DIR_PATH +r'dataset/output', index=False, header=False)