In [77]:
import matplotlib.pyplot as plt
import tensorflow as tf
import cv2
import numpy as np
import random
import os
from tensorflow.keras.preprocessing import image
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import load_model
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import pandas as pd
from json_tricks import dump, load
from pydub import AudioSegment, effects
import librosa
import noisereduce as nr
import tensorflow as tf
import keras
import sklearn 
import librosa
import time
import noisereduce as nr  # Import the noise reduction library

## LOADING MODELS

In [195]:
# Keras models restored from local weight files.
FER2013 = load_model('Weights/FER2013_BestYet.keras')
AffectNET = load_model('Weights/AffectNet_Final.keras')
SER = load_model('Weights/best_weights.keras')

# Text classifier: start from the public checkpoint, then overwrite its
# weights with the locally saved state dict.
tokenizer = AutoTokenizer.from_pretrained("michellejieli/emotion_text_classifier")
textModel = AutoModelForSequenceClassification.from_pretrained("michellejieli/emotion_text_classifier")
# map_location="cpu" lets a checkpoint saved on a GPU machine load on a
# CPU-only one; weights_only=True restricts unpickling to tensors and plain
# containers instead of arbitrary pickled objects.
textModel.load_state_dict(
    torch.load(
        "Weights/trained_emotion_text_classifier_3.pth",
        map_location="cpu",
        weights_only=True,
    )
)

  textModel.load_state_dict(torch.load("Weights/trained_emotion_text_classifier_3.pth"))


<All keys matched successfully>

## Inputs

In [196]:
# Sample inputs, one per modality: a speech clip, a face image and a sentence.
voicePath = "/home/group02-f24/Documents/Zoghby/Audio/TESS/TESS Toronto emotional speech set data/YAF_sad/YAF_death_sad.wav"
imagePath = "/home/group02-f24/Desktop/SadBaby.jpg"
img = cv2.imread(imagePath)
# cv2.imread reports a missing/unreadable file by returning None, not by raising.
if img is None:
    raise FileNotFoundError(f"Could not read image: {imagePath}")
# Scale pixel values into [0, 1].
# NOTE(review): assumes both image models were trained on [0, 1] inputs - confirm.
img = img.astype('float32') / 255.0
textInput = "I am so devastated!"

#### MAPPING

In [197]:
# Shared six-class label space (string index -> emotion name).
emotionMapping = {'0': 'Anger','1': 'Fear','2': 'Happiness', '3': 'Sadness', '4': 'Surprise', '5': 'Neutral'}
# Index -> label lists used for the text model and the speech model outputs.
Textemotion_labels = ["Sadness", "Happiness", "Neutral", "Anger", "Fear", "Surprise"]
SER_labels =['Neutral', 'Calm', 'Happiness', 'Sadness', 'Anger', 'Fear', 'Disgust', 'Surprise']
SER_to_emotion_mapping = {4: 0, 5: 1, 2: 2, 3: 3, 7: 4, 0: 5}

# Emotion name -> shared index (still the string key of emotionMapping).
reverse_mapping = {name: key for key, name in emotionMapping.items()}

# model-specific index -> shared index; labels absent from the shared space
# (e.g. 'Calm', 'Disgust') are skipped.
TextClasifiedLabels = {
    position: int(reverse_mapping[name])
    for position, name in enumerate(Textemotion_labels)
    if name in reverse_mapping
}
SERClasifiedLabels = {
    position: int(reverse_mapping[name])
    for position, name in enumerate(SER_labels)
    if name in reverse_mapping
}

print("Mapped Text label indices:", TextClasifiedLabels)
print("Mapped SER label indices:", SERClasifiedLabels)

Mapped Text label indices: {0: 3, 1: 2, 2: 5, 3: 0, 4: 1, 5: 4}
Mapped SER label indices: {0: 5, 2: 2, 3: 3, 4: 0, 5: 1, 7: 4}


In [198]:
# Show the input shape each Keras model expects.
for caption, net in (
    ('SER input shape: ', SER),
    ('FER2013 input shape: ', FER2013),
    ('AffectNET input shape: ', AffectNET),
):
    print(caption, net.input_shape)
# Put the text model in inference mode; as the cell's last expression its
# repr is what the notebook displays.
textModel.eval()

SER input shape:  (None, 335, 50)
FER2013 input shape:  (None, 48, 48, 1)
AffectNET input shape:  (None, 96, 96, 3)


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
           

# Text Function

In [199]:
def predict_emotion(model, text):
    """Classify the emotion of one text with a sequence-classification model.

    Uses the module-level ``tokenizer`` and ``Textemotion_labels``.

    Args:
        model: torch module whose forward pass returns an object exposing
            ``logits``.
        text: a single input string (``.item()`` below requires exactly one
            prediction, so batches are not supported).

    Returns:
        Tuple ``(predicted_emotion, predicted_class, probs)`` where
        ``predicted_emotion`` is the entry of ``Textemotion_labels`` at
        ``predicted_class`` and ``probs`` is the softmax output as a NumPy
        array.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    # Run on whatever device holds the model's weights; fall back to CPU for
    # a parameter-less module.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # Force inference mode for the call so the result does not depend on some
    # other cell having called model.eval(); the previous mode is restored.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(**inputs).logits
            probs = torch.nn.functional.softmax(logits, dim=1)
            predicted_class = torch.argmax(probs, dim=1).item()
    finally:
        model.train(was_training)

    predicted_emotion = Textemotion_labels[predicted_class]
    # .cpu() is a no-op for CPU tensors and required before .numpy() otherwise.
    return predicted_emotion, predicted_class, probs.cpu().numpy()


## Voice Preprocess

In [200]:
def preprocess_audio_file(file_path, total_length=173056, target_sr=22050, top_db=30):
    """Load an audio file and return a cleaned, fixed-length mono signal.

    Steps: pydub peak normalisation, scaling to floats by the sample width,
    silence trimming, zero-padding/truncation to ``total_length`` samples,
    resampling to ``target_sr`` when needed, and noise reduction.

    Args:
        file_path: path of the audio file to read.
        total_length: number of samples kept before resampling.
        target_sr: sample rate of the returned signal.
        top_db: threshold passed to ``librosa.effects.trim``.

    Returns:
        Tuple ``(signal, sr)`` with the processed samples and their sample rate.
    """
    rawsound = AudioSegment.from_file(file_path)
    # get_array_of_samples() interleaves the channels of multi-channel audio;
    # downmix first so the sample array is a genuine mono signal.
    if rawsound.channels > 1:
        rawsound = rawsound.set_channels(1)

    normalized = effects.normalize(rawsound, headroom=5.0)
    samples = np.array(normalized.get_array_of_samples(), dtype='float32')
    # Full-scale magnitude of a signed integer sample of this byte width.
    max_val = float(2 ** (8 * rawsound.sample_width - 1))
    samples = samples / max_val

    trimmed, _ = librosa.effects.trim(samples, top_db=top_db)

    if len(trimmed) < total_length:
        padded = np.pad(trimmed, (0, total_length - len(trimmed)), mode='constant')
    else:
        padded = trimmed[:total_length]

    # NOTE(review): the length is fixed before resampling, so the returned
    # signal has total_length samples only when the file is already at
    # target_sr. Order kept as-is - presumably it matches training; confirm.
    sr_orig = rawsound.frame_rate
    if sr_orig != target_sr:
        padded = librosa.resample(padded, orig_sr=sr_orig, target_sr=target_sr)
        sr = target_sr
    else:
        sr = sr_orig
    reduced = nr.reduce_noise(y=padded, sr=sr)

    return reduced, sr

# Framing constants shared by the audio feature extraction.
total_length = 173056  # clip length in samples
frame_length = 2048    # analysis window in samples
hop_length = 512       # step between consecutive windows in samples
# Number of whole windows that fit into the clip.
expected_frames = (total_length - frame_length) // hop_length + 1

def extract_features_fixed(signal, sr, frame_length=2048, hop_length=512, n_mfcc=13, expected_frames=expected_frames):
    """Build a (frames, features) matrix with exactly ``expected_frames`` rows.

    Per frame the columns are, in order: zero-crossing rate, RMS energy,
    MFCCs, their first and second deltas, spectral centroid, spectral
    contrast and spectral roll-off.

    Args:
        signal: 1-D audio samples.
        sr: sample rate of ``signal``.
        frame_length: window size used for ZCR and RMS.
        hop_length: hop size used for every feature.
        n_mfcc: number of MFCC coefficients.
        expected_frames: number of rows in the result.

    Returns:
        Array whose first dimension is ``expected_frames``; missing frames
        are zero-padded at the end and surplus frames are dropped.
    """
    mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc, hop_length=hop_length)
    stacked_rows = (
        librosa.feature.zero_crossing_rate(y=signal, frame_length=frame_length, hop_length=hop_length, center=True),
        librosa.feature.rms(y=signal, frame_length=frame_length, hop_length=hop_length),
        mfcc,
        librosa.feature.delta(mfcc),
        librosa.feature.delta(mfcc, order=2),
        librosa.feature.spectral_centroid(y=signal, sr=sr, hop_length=hop_length),
        librosa.feature.spectral_contrast(y=signal, sr=sr, hop_length=hop_length),
        librosa.feature.spectral_rolloff(y=signal, sr=sr, hop_length=hop_length),
    )
    # Each feature is (n_values, frames); stack them and put time first.
    features = np.vstack(stacked_rows).T

    shortfall = expected_frames - features.shape[0]
    if shortfall > 0:
        return np.pad(features, ((0, shortfall), (0, 0)), mode='constant')
    if shortfall < 0:
        return features[:expected_frames, :]
    return features

def PredictSER(model, file_path, total_length=173056, target_sr=22050, top_db=30,frame_length=2048, hop_length=512, n_mfcc=13, expected_frames=335):
    """Predict the emotion of an audio file with the speech model.

    The file is preprocessed with ``preprocess_audio_file``, converted to a
    fixed-size feature matrix with ``extract_features_fixed`` and passed to
    ``model.predict`` as a batch of one.

    Args:
        model: object exposing ``predict``.
        file_path: path of the audio file.
        total_length, target_sr, top_db: forwarded to ``preprocess_audio_file``.
        frame_length, hop_length, n_mfcc, expected_frames: forwarded to
            ``extract_features_fixed``.

    Returns:
        Tuple ``(predictions, predicted_emotion)``: the raw output of
        ``model.predict`` and the label of its arg-max class ("Unknown" when
        the index has no label).
    """
    signal, sr = preprocess_audio_file(file_path, total_length=total_length, target_sr=target_sr, top_db=top_db)

    features = extract_features_fixed(signal, sr, frame_length=frame_length, hop_length=hop_length, n_mfcc=n_mfcc, expected_frames=expected_frames)

    # The model consumes batches, so add a leading batch axis of size one.
    X_input = np.expand_dims(features, axis=0)
    predictions = model.predict(X_input)

    predicted_class = int(np.argmax(predictions, axis=1)[0])
    # Label names follow the spelling used by the notebook's other label
    # tables ('Anger', not 'Angry') so printed results are consistent.
    emotion_map = {0: 'Neutral', 1: 'Calm', 2: 'Happiness', 3: 'Sadness', 4: 'Anger', 5: 'Fear', 6: 'Disgust', 7: 'Surprise'}
    predicted_emotion = emotion_map.get(predicted_class, "Unknown")

    return predictions, predicted_emotion

# FER Function

In [201]:
def fer2013inputshape(img):
    """Turn a BGR image into a single-item grayscale batch of 48x48 pixels.

    Args:
        img: BGR image array as accepted by ``cv2.resize``/``cv2.cvtColor``.

    Returns:
        The resized grayscale image with a leading batch axis and a trailing
        channel axis added.
    """
    shrunk = cv2.resize(img, (48, 48))
    gray = cv2.cvtColor(shrunk, cv2.COLOR_BGR2GRAY)
    # Channel axis goes last, batch axis goes first.
    return gray[np.newaxis, ..., np.newaxis]

# AffectNet Function

In [202]:
def AffectNetinputshape(img):
    """Turn a BGR image into a single-item RGB batch of 96x96 pixels.

    Args:
        img: BGR image array as accepted by ``cv2.resize``/``cv2.cvtColor``.

    Returns:
        The resized RGB image with a leading batch axis, i.e. a 4-D array
        ``(1, 96, 96, channels)``.
    """
    resized = cv2.resize(img, (96, 96))
    rgb = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)
    # The colour image already carries its channel axis, so only the batch
    # axis is added; an extra trailing axis would yield a 5-D array that does
    # not match a (None, 96, 96, 3) model input.
    return np.expand_dims(rgb, axis=0)

In [203]:
def SERinputshape(wav, target_len=335):
    """Fit a 1-D sequence to a fixed length and shape it as a batch of one.

    Args:
        wav: 1-D sequence of samples.
        target_len: number of entries in the result (default 335).

    Returns:
        Array of shape ``(1, target_len, 1)``; shorter inputs are zero-padded
        at the end, longer inputs are truncated.
    """
    # Truncate first so the pad width below can never be negative (np.pad
    # raises ValueError for negative widths).
    wav = np.asarray(wav)[:target_len]
    wav = np.pad(wav, (0, target_len - len(wav)), 'constant', constant_values=(0, 0))
    wav = np.expand_dims(wav, axis=-1)
    wav = np.expand_dims(wav, axis=0)
    return wav

In [204]:
# Shape the same source image once per image model.
FER2013img, AffectNETimg = fer2013inputshape(img), AffectNetinputshape(img)

### RAW PREDICTIONS

In [205]:
# Run every model on its own modality (call order kept so the progress
# output appears in the same sequence).
FER2013pred = FER2013.predict(FER2013img)
AffectNETpred = AffectNET.predict(AffectNETimg)
emotion, predclass, probs = predict_emotion(textModel, textInput)
SERpredictions, SERpredicted_emotion = PredictSER(SER, voicePath)
SERpredictions = SERpredictions.flatten()

# Text probabilities: first batch row, first six classes.
probs = probs[0, :6]

# Scatter each model's probabilities into the shared class order using the
# model-index -> shared-index tables.
ser_from, ser_to = list(SERClasifiedLabels.keys()), list(SERClasifiedLabels.values())
text_from, text_to = list(TextClasifiedLabels.keys()), list(TextClasifiedLabels.values())

reordered_predsSER = np.zeros_like(SERpredictions)
reordered_predsSER[ser_to] = SERpredictions[ser_from]

reordered_preds = np.zeros_like(probs)
reordered_preds[text_to] = probs[text_from]

# Only the first six slots of the speech vector belong to the shared space.
SERpredictions = reordered_predsSER[:6]
probs = reordered_preds
# Element-wise sum of the four aligned probability vectors.
concPred = FER2013pred + AffectNETpred + probs + SERpredictions

print('FER2013 prediction: ',FER2013pred)
print('AffectNET prediction: ',AffectNETpred)
print('Text prediction: ',probs)
print('SER prediction: ',SERpredictions)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 324ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 262ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 161ms/step
FER2013 prediction:  [[0.49917305 0.10686404 0.20733239 0.02033958 0.01804267 0.14824827]]
AffectNET prediction:  [[0.03042624 0.15741353 0.01259953 0.57219344 0.11132995 0.11603738]]
Text prediction:  [5.0969154e-04 1.1051499e-03 6.9200998e-04 9.9651402e-01 8.5336313e-04
 3.2578286e-04]
SER prediction:  [6.7629060e-08 5.7915036e-06 6.3169034e-07 9.9999011e-01 9.9687381e-10
 2.4821279e-06]


### Per-model predicted labels (argmax)

In [206]:
# Take the arg-max class of each image model into NEW names: overwriting
# FER2013pred/AffectNETpred with class indices would destroy the probability
# arrays and make this cell fail when it is run a second time.
FER2013class = int(np.argmax(FER2013pred, axis=1)[0])
AffectNETclass = int(np.argmax(AffectNETpred, axis=1)[0])
print('Text Classified Emotion:',emotionMapping[str(TextClasifiedLabels[predclass])])
print('FER2013 prediction: ',emotionMapping[str(FER2013class)])
print('AffectNET prediction: ',emotionMapping[str(AffectNETclass)])
print('SER prediction: ',SERpredicted_emotion)

Text Classified Emotion: Sadness
FER2013 prediction:  Anger
AffectNET prediction:  Sadness
SER prediction:  Sadness


### Fused predictions (sum of the per-model probabilities)

In [209]:
# Winning class of the summed scores; taken before rescaling, which does not
# change the arg-max.
finalPrediction = concPred.argmax(axis=1)
# Rescale the summed scores so they add up to one.
concPred = concPred / concPred.sum()
print("Final Prediction Class:", finalPrediction)
print("Raw Concatenated Predictions:",concPred)
print('Final Prediction: ',emotionMapping[str(finalPrediction[0])])

Final Prediction Class: [3]
Raw Concatenated Predictions: [[0.13252728 0.06634714 0.05515615 0.6472594  0.0325565  0.06615348]]
Final Prediction:  Sadness
