In [2]:
# import useful libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
import os
import joblib

In [3]:
# read the dataset
dataset_directory = "../processed/"
data = pd.read_csv(dataset_directory+"aus.csv")
inputs = data.drop("Unnamed: 0", axis=1)
inputs = inputs.drop(["input","face","frame"],axis=1)

inputs = inputs.dropna()

inputs.head()

labels = inputs["emotions"]
inputs = inputs.drop("emotions", axis=1)


In [4]:

# split = 70/20/10
data_in, test_in, data_out, test_out = train_test_split(
    inputs,
    labels,
    test_size=0.1,
    random_state=18,
    stratify=labels,  # balances labels across the sets
)

train_in, val_in, train_out, val_out = train_test_split(
    data_in,
    data_out,
    test_size=(0.2 / 0.9),  # 20% of the original data
    random_state=18,
    stratify=data_out,
)
train_in

Unnamed: 0,FaceRectX,FaceRectY,FaceRectWidth,FaceRectHeight,FaceScore,x_0,x_1,x_2,x_3,x_4,...,AU26,AU28,AU43,anger,disgust,fear,happiness,sadness,surprise,neutral
1236,25.736223,12.141386,184.504759,202.126955,0.997854,14.221463,12.755213,14.328785,21.170532,34.757541,...,0.379287,0.048427,0.095835,0.000573,0.000086,1.143200e-04,0.000360,0.661190,0.000435,0.337242
712,14.743028,2.997759,187.488326,207.703840,0.998749,7.541536,13.386972,21.044698,31.467807,47.116771,...,0.066171,0.345989,0.434884,0.208928,0.024792,2.353687e-03,0.004968,0.057586,0.001341,0.700031
364,8.470014,12.212509,192.603686,204.587296,0.999350,6.174214,9.634752,14.608539,21.362039,33.840603,...,0.152316,0.508313,0.824280,0.000281,0.000005,7.806845e-07,0.986076,0.000312,0.000009,0.013316
431,16.174509,14.903926,185.194891,203.781484,0.999118,7.937333,7.479537,9.962593,15.705766,27.800853,...,0.125997,0.389856,0.419714,0.001103,0.000373,1.017517e-04,0.986971,0.000895,0.000912,0.009645
846,22.175928,-2.835185,191.182425,214.655604,0.998952,16.075465,13.695630,15.072491,22.049187,35.734631,...,0.157669,0.188250,0.039044,0.029047,0.002627,2.646730e-03,0.001942,0.166528,0.001175,0.796034
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
358,21.915174,14.619707,174.617022,198.709547,0.997993,19.748803,19.781839,21.229985,24.798247,33.727691,...,0.056478,0.529340,0.715230,0.000660,0.000050,7.196333e-06,0.998514,0.000375,0.000251,0.000143
468,18.808022,10.543539,189.865959,214.632669,0.999271,15.768042,14.696542,15.879377,19.421814,29.609184,...,0.140021,0.309670,0.681390,0.000142,0.000053,7.314109e-05,0.997792,0.000994,0.000319,0.000627
871,22.309469,12.064909,169.461908,195.683779,0.993389,22.860135,23.466042,27.300253,35.888995,46.786997,...,0.263751,0.021937,0.064050,0.000167,0.000025,2.506478e-04,0.000231,0.123477,0.000357,0.875492
93,20.305311,9.821847,185.389803,207.411780,0.998977,12.425982,11.786830,13.758410,19.872629,32.564612,...,0.122129,0.092681,0.155465,0.985851,0.004189,5.906412e-03,0.000021,0.001594,0.001774,0.000664


In [27]:
# Train model 2 svc rbl

from sklearn.preprocessing import StandardScaler

# Create Pipeline with SVC
pipeline = Pipeline(
    [
        # Step 1: Preprocessing - Standardize features
        ("scaler", StandardScaler()),
        # Step 2: Model - Support Vector Classifier
        ("svc", SVC(probability=True)),  # probability=True for prediction probabilities
    ]
)

# Define Hyperparameter Grid for Tuning
param_grid = {
    # Hyperparameters for the SVC
    "svc__C": [0.1, 1, 10, 100],  # Regularization parameter
    "svc__kernel": ["linear", "rbf", "poly"],  # Kernel type
    "svc__gamma": ["scale", "auto", 0.1, 1],  # Kernel coefficient
    "svc__class_weight": [None, "balanced"],  # Handle class imbalance
}

# Grid Search with Cross-Validation
grid_search = GridSearchCV(
    pipeline,  # The pipeline we created
    param_grid,  # Hyperparameter combinations to try
    cv=5,  # 5-fold cross-validation
    scoring="accuracy",  # Metric to evaluate
    n_jobs=-1,  # Use all available cores
)

# Fit the grid search on training data
grid_search.fit(train_in, train_out)

# Best model and parameters
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

# Validate on validation set
val_score = grid_search.score(val_in, val_out)
print("Validation Set Accuracy:", val_score)

# Final evaluation on test set
test_score = grid_search.score(test_in, test_out)
print("Test Set Accuracy:", test_score)

# Optional: Detailed Classification Report
from sklearn.metrics import classification_report

# Get the best model
best_model = grid_search.best_estimator_

# Predictions on test set
y_pred = best_model.predict(test_in)

# Classification report
print("\nClassification Report:")
print(classification_report(test_out, y_pred))

# Optional: Prediction probabilities
y_pred_proba = best_model.predict_proba(test_in)

Best Parameters: {'svc__C': 0.1, 'svc__class_weight': None, 'svc__gamma': 'scale', 'svc__kernel': 'linear'}
Best Cross-Validation Score: 0.7385474860335195
Validation Set Accuracy: 0.78125
Test Set Accuracy: 0.7421875

Classification Report:
              precision    recall  f1-score   support

       angry       0.60      0.50      0.55        18
     disgust       0.50      0.56      0.53         9
        fear       1.00      0.29      0.44         7
       happy       0.97      0.94      0.95        33
     neutral       0.67      0.94      0.79        35
         sad       0.71      0.62      0.67         8
    surprise       0.77      0.56      0.65        18

    accuracy                           0.74       128
   macro avg       0.75      0.63      0.65       128
weighted avg       0.76      0.74      0.73       128



In [15]:
class MachineLearningWorkflow:
    def __init__(self, data_path, target_column):
        """
        Initialize the machine learning workflow

        Args:
            data_path (str): Path to the dataset
            target_column (str): Name of the target column
        """
        self.data_path = data_path
        self.target_column = target_column
        self.data = None
        self.X_train = None
        self.X_val = None
        self.X_test = None
        self.y_train = None
        self.y_val = None
        self.y_test = None

    def prepare_dataset(self, test_size=0.1, val_size=(0.2 / 0.9), random_state=42):
        """
        Prepare the dataset by loading and splitting

        Args:
            test_size (float): Proportion of test set
            val_size (float): Proportion of validation set
            random_state (int): Random seed for reproducibility
        """
        # Load data
        self.data = pd.read_csv(self.data_path)

        # Display class distribution
        print("Class Distribution:")
        print(self.data[self.target_column].value_counts())

        # Separate features and target

        features = self.data.dropna()
        features = features.drop(["input", "face", "frame", "Unnamed: 0"], axis=1)
        labels = features[self.target_column]

        au_columns = [col for col in features.columns if col.startswith("AU")]
        features = features[au_columns]
        print(features)

        

        # First split
        X_temp, self.X_test, y_temp, self.y_test = train_test_split(
            features,
            labels,
            test_size=test_size,
            random_state=random_state,
            stratify=labels,
        )

        # Second split
        self.X_train, self.X_val, self.y_train, self.y_val = train_test_split(
            X_temp, y_temp, test_size=val_size, random_state=random_state
        )

        return self

    def create_preprocessing_pipeline(self):
        """
        Create a preprocessing pipeline with imputation and scaling

        Returns:
            Pipeline: Preprocessing pipeline
        """
        preprocessing_pipeline = Pipeline(
            [
                ("imputer", SimpleImputer(strategy="median")),
                ("scaler", StandardScaler()),
            ]
        )

        return preprocessing_pipeline

    def evaluate_model(self, model, X_val, y_val):
        """
        Evaluate model performance

        Args:
            model: Trained model
            X_val: Validation features
            y_val: Validation labels

        Returns:
            dict: Performance metrics
        """
        y_pred = model.predict(X_val)

        metrics = {
            "accuracy": accuracy_score(y_val, y_pred),
            "precision": precision_score(y_val, y_pred, average="weighted"),
            "recall": recall_score(y_val, y_pred, average="weighted"),
            "f1_score": f1_score(y_val, y_pred, average="weighted"),
        }

        return metrics

    def tune_model_hyperparameters(self, model_class, param_grid):
        """
        Tune hyperparameters for a given model

        Args:
            model_class: Model class to tune
            param_grid (dict): Hyperparameter grid

        Returns:
            RandomizedSearchCV: Best model after hyperparameter tuning
        """
        full_pipeline = Pipeline(
            [
                ("preprocessor", self.create_preprocessing_pipeline()),
                ("classifier", model_class()),
            ]
        )

        random_search = RandomizedSearchCV(
            full_pipeline,
            param_distributions=param_grid,
            n_iter=50,
            cv=5,
            scoring="accuracy",
            random_state=42,
        )

        random_search.fit(self.X_train, self.y_train)

        print(f"Best parameters for {model_class.__name__}:")
        print(random_search.best_params_)

        return random_search

    def train_and_compare_models(self):
        """
        Train multiple models and compare their performance

        Returns:
            dict: Models with their performance metrics
        """
        # Define models and their hyperparameter grids
        models = {
            "RandomForest": {
                "class": RandomForestClassifier,
                "params": {
                    "classifier__n_estimators": [100, 200, 300],
                    "classifier__max_depth": [10, 20, 30, None],
                },
            },
            "SVM": {
                "class": SVC,
                "params": {
                    "classifier__kernel": ["rbf", "poly", "sigmoid"],
                    "classifier__C": [0.1, 1, 10, 100],
                },
            },
            "DecisionTree": {
                "class": DecisionTreeClassifier,
                "params": {
                    "classifier__max_depth": [5, 10, 15, 20],
                    "classifier__min_samples_split": [2, 5, 10],
                },
            },
        }

        model_performances = {}

        for name, model_config in models.items():
            print(f"\nTuning {name} Model")
            tuned_model = self.tune_model_hyperparameters(
                model_config["class"], model_config["params"]
            )

            # Evaluate on validation set
            metrics = self.evaluate_model(
                tuned_model.best_estimator_, self.X_val, self.y_val
            )

            model_performances[name] = {
                "best_model": tuned_model.best_estimator_,
                "metrics": metrics,
            }

        return model_performances

    def select_best_model(self, model_performances):
        """
        Select the best model based on accuracy

        Args:
            model_performances (dict): Dictionary of model performances

        Returns:
            tuple: Best model and its name
        """
        best_model_name = max(
            model_performances,
            key=lambda k: model_performances[k]["metrics"]["accuracy"],
        )

        best_model = model_performances[best_model_name]["best_model"]
        best_accuracy = model_performances[best_model_name]["metrics"]["accuracy"]

        print(f"\nBest Model: {best_model_name}")
        print(f"Validation Accuracy: {best_accuracy:.4f}")

        return best_model, best_model_name

    def save_best_model(self, best_model, best_model_name, save_dir="../model/"):
        """
        Save the best performing model

        Args:
            best_model: Best trained model
            best_model_name (str): Name of the best model
            save_dir (str): Directory to save the model
        """
        os.makedirs(save_dir, exist_ok=True)
        model_filename = f"best_emotion_model_{best_model_name}.pkl"
        model_path = os.path.join(save_dir, model_filename)

        joblib.dump(best_model, model_path)
        print(f"Model saved to {model_path}")

In [None]:
import warnings
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
import joblib


workflow = MachineLearningWorkflow(
    data_path=dataset_directory + "aus.csv", target_column="emotions"
)

workflow.prepare_dataset()
model_performances = workflow.train_and_compare_models()
best_model, best_model_name = workflow.select_best_model(model_performances)
workflow.save_best_model(best_model, best_model_name)

Class Distribution:
emotions
neutral     348
happy       340
surprise    182
angry       178
disgust      88
sad          81
fear         72
Name: count, dtype: int64
          AU01      AU02      AU04      AU05      AU06  AU07      AU09  \
0     0.462118  0.187764  0.578946  0.328081  0.538130   1.0  0.535241   
1     0.360411  0.191716  0.247170  0.269536  0.079867   0.0  0.259076   
2     0.183939  0.150695  0.490818  0.274774  0.653131   1.0  0.636941   
3     0.561900  0.431248  0.545331  0.438458  0.145115   0.0  0.367367   
4     0.437871  0.435552  0.651622  0.320687  0.240153   1.0  0.413129   
...        ...       ...       ...       ...       ...   ...       ...   
1284  0.241089  0.370427  0.334878  0.328424  0.122249   0.0  0.330490   
1285  0.397340  0.158596  0.353561  0.404856  0.127621   1.0  0.411720   
1286  0.309860  0.305702  0.351745  0.406746  0.100878   1.0  0.240786   
1287  0.387912  0.216832  0.701315  0.353338  0.193991   0.0  0.504722   
1288  0.346435  0.1



Best parameters for RandomForestClassifier:
{'classifier__n_estimators': 300, 'classifier__max_depth': 20}

Tuning SVM Model


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Best parameters for SVC:
{'classifier__kernel': 'rbf', 'classifier__C': 1}

Tuning DecisionTree Model


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Best parameters for DecisionTreeClassifier:
{'classifier__min_samples_split': 5, 'classifier__max_depth': 5}

Best Model: SVM
Validation Accuracy: 0.6680
Model saved to ../model/best_emotion_model_SVM.pkl


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
