THIS IS FOR THE AUDIO EMOTION RECOGNITION

In [17]:
from tensorflow.keras.models import load_model
from sklearn.preprocessing import LabelEncoder
from moviepy.editor import VideoFileClip
from pydub import AudioSegment
from math import ceil
import os  # Ensure os is imported
import pandas as pd
import numpy as np
import librosa

video_path = "Video1\Angry_video.mp4"  # Change this to the path of your video file
audio_path = "Audio2\emotion.mp3"  # Output path for the extracted audio

# Load the video file and extract the audio
video_clip = VideoFileClip(video_path)
audio_clip = video_clip.audio
audio_clip.write_audiofile(audio_path)

audio_clip.close()
video_clip.close()

# Load the extracted audio
audio = AudioSegment.from_file(audio_path)

# Define the length of each chunk in milliseconds
chunk_length_ms = 3000

# Calculate the number of chunks needed
num_chunks = ceil(len(audio) / chunk_length_ms)

# Split the audio and save each chunk
for i in range(num_chunks):
    start_time = i * chunk_length_ms
    end_time = min((i + 1) * chunk_length_ms, len(audio))
    chunk = audio[start_time:end_time]
    chunk_name = f'Audio2\chunk_{i+1}.mp3'  # Naming each chunk
    chunk.export(chunk_name, format="mp3")
    print(f'Exported {chunk_name}')

# After exporting all chunks, delete the original audio file
os.remove(audio_path)
print(f'Deleted original audio file at {audio_path}')

# Directory where the chunks are saved
chunks_dir = 'Audio2'
audio_files = [f for f in os.listdir(chunks_dir) if f.endswith('.mp3')]
audio_files.sort()  # Optional, to process the files in a sorted order

df = pd.DataFrame(columns=['feature'])
bookmark = 0

for index, filename in enumerate(audio_files):
    # Adjust the condition according to your naming convention, if needed
    file_path = os.path.join(chunks_dir, filename)
    X, sample_rate = librosa.load(file_path, res_type='kaiser_fast', duration=2.5, sr=22050*2, offset=0.5)
    sample_rate = np.array(sample_rate)
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13), axis=0)
    feature = mfccs
    df.loc[bookmark] = [feature]
    bookmark += 1

# Now, df contains the features extracted from each audio chunk
#print(df)
df = pd.DataFrame(df['feature'].values.tolist())
df[:]
df.fillna('0')

model=load_model('Voice/Voice-Emotion-Detector/saved_models/Emotion_Voice_Detection_Model.h5')   
print('Imported the model named %s ' % model)

labels = ['female_angry', 'female_calm', 'female_fearful', 'female_happy', 'female_sad', 'male_angry', 'male_calm', 'male_fearful', 'male_happy', 'male_sad']

lb = LabelEncoder()

# Fit the LabelEncoder instance to your known labels
lb.fit(labels)

x_traincnn =np.expand_dims(df, axis=2)

preds = model.predict(x_traincnn, batch_size=1, verbose=1)
preds1=preds.argmax(axis=1)
abc = preds1.astype(int).flatten()
predictions = (lb.inverse_transform((abc)))
preddf = pd.DataFrame({'predictedvalues': predictions})
preddf[:]

# Assuming preddf is your DataFrame
# Remove 'male_' and 'female_' prefixes from the 'predictedvalues' column
preddf['predictedvalues'] = preddf['predictedvalues'].str.replace('female_', '').str.replace('male_', '')

print(preddf)
# Calculate the percentage of each emotion
emotion_counts = preddf['predictedvalues'].value_counts(normalize=True) * 100

# Convert the emotion_counts to a DataFrame for nicer formatting, if desired
emotion_percentage_df_audio_sound = emotion_counts.reset_index().rename(columns={'index': 'emotion', 'predictedvalues': 'percentage'})

# Display the emotion percentage DataFrame
print(emotion_percentage_df_audio_sound)


MoviePy - Writing audio in Audio2\emotion.mp3


                                                                  

MoviePy - Done.




Exported Audio2\chunk_1.mp3
Exported Audio2\chunk_2.mp3
Exported Audio2\chunk_3.mp3
Deleted original audio file at Audio2\emotion.mp3
Imported the model named <keras.engine.sequential.Sequential object at 0x000002A5D3EF8A60> 
  predictedvalues
0         fearful
1           angry
2           angry
  percentage  proportion
0      angry   66.666667
1    fearful   33.333333


# This is the text analysis to find the emotions in the text

In [38]:
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import soundfile as sf
import torch
import os
from pydub import AudioSegment
import librosa
import soundfile as sf
import numpy as np

# Check if CUDA is available for GPU usage, else use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"PyTorch is using device: {device}")

# Load the pretrained Wav2Vec2 model and processor
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h").to(device)
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")

# Directory where the chunks are saved
chunks_dir = 'Audio2'  # Update this path to your chunks directory
audio_chunks = os.listdir(chunks_dir)

# Function to transcribe a single audio file
def transcribe(audio_path):
    # Load the audio file using librosa
    speech, sr = librosa.load(audio_path, sr=16000)  # Resample to 16000 Hz
    
    # Process the speech file
    input_values = processor(speech, return_tensors="pt", sampling_rate=16000).input_values
    input_values = input_values.to(device)

    with torch.no_grad():
        logits = model(input_values).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)
    return transcription

# Iterate over audio chunks and save transcriptions to text files
for chunk in audio_chunks:
    if chunk.endswith('.mp3'):  # Make sure to process only audio files
        audio_path = os.path.join(chunks_dir, chunk)
        transcription = transcribe(audio_path)
        
        # Saving the transcription to a text file
        text_filename = chunk.replace('.mp3', '.txt')
        text_path = os.path.join(chunks_dir, text_filename)  # Saving text files in the same directory as chunks
        with open(text_path, 'w') as text_file:
            text_file.write(transcription[0])  # Assuming one transcription per audio chunk

        print(f"Transcription saved to {text_path}")


PyTorch is using device: cuda


Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You sho

Transcription saved to Audio2\chunk_1.txt
Transcription saved to Audio2\chunk_2.txt
Transcription saved to Audio2\chunk_3.txt


In [37]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import os
import pandas as pd

# Function to read text from a file
def read_text_from_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read().strip()

# Function to classify text
def classify_emotion(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        logits = model(**inputs).logits
    probabilities = torch.softmax(logits, dim=-1)
    return probabilities

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("SamLowe/roberta-base-go_emotions")
model = AutoModelForSequenceClassification.from_pretrained("SamLowe/roberta-base-go_emotions")

# Path to your directory containing .txt files
text_files_directory = 'Audio2/'
text_files = [f for f in os.listdir(text_files_directory) if f.endswith('.txt')]

labels = model.config.id2label

# Initialize a list to collect dictionaries
data = []

for text_file in text_files:
    file_path = os.path.join(text_files_directory, text_file)
    text = read_text_from_file(file_path)
    
    probabilities = classify_emotion(text, tokenizer, model)
    top_probs, top_lbls = torch.topk(probabilities, 3, dim=-1)
    
    # Append a new dictionary to the list for each file
    data.append({
        'Emotion 1': labels[top_lbls[0][0].item()], 'Prob Emotion 1': f"{top_probs[0][0].item():.4f}",
        'Emotion 2': labels[top_lbls[0][1].item()], 'Prob Emotion 2': f"{top_probs[0][1].item():.4f}",
        'Emotion 3': labels[top_lbls[0][2].item()], 'Prob Emotion 3': f"{top_probs[0][2].item():.4f}",
    })

# Convert the list of dictionaries into a DataFrame
df = pd.DataFrame(data)

# Display the DataFrame
print(df)

# Optionally, save the DataFrame to a CSV file
df.to_csv('emotion_predictions_from_audio_text.csv', index=False)


  Emotion 1 Prob Emotion 1 Emotion 2 Prob Emotion 2  Emotion 3 Prob Emotion 3
0   neutral         0.7595     anger         0.1270  annoyance         0.0659
1   neutral         0.9962  approval         0.0006  annoyance         0.0006
2   neutral         0.9301     anger         0.0355  annoyance         0.0094


This is the code to analyze the audio and generate a text file of it.

In [None]:
import torch
from transformers import (
    Wav2Vec2ForCTC, Wav2Vec2Processor,
    AutoTokenizer, AutoModelForSequenceClassification
)
import librosa
import os
import pandas as pd

# Check device (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"PyTorch is using device: {device}")

# Load Wav2Vec2 model and processor for transcription
wav2vec2_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h").to(device)
wav2vec2_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")

# Load RoBERTa model and tokenizer for emotion classification
emotion_tokenizer = AutoTokenizer.from_pretrained("SamLowe/roberta-base-go_emotions")
emotion_model = AutoModelForSequenceClassification.from_pretrained("SamLowe/roberta-base-go_emotions").to(device)

# Define labels
labels = emotion_model.config.id2label

# Function to transcribe audio
def transcribe(audio_path):
    speech, sr = librosa.load(audio_path, sr=16000)
    input_values = wav2vec2_processor(speech, return_tensors="pt", sampling_rate=16000).input_values.to(device)
    with torch.no_grad():
        logits = wav2vec2_model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = wav2vec2_processor.batch_decode(predicted_ids)
    return transcription[0]

# Function to classify text
def classify_emotion(text):
    inputs = emotion_tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        logits = emotion_model(**inputs).logits
    probabilities = torch.softmax(logits, dim=-1)
    top_probs, top_lbls = torch.topk(probabilities, 3, dim=-1)
    return [(labels[top_lbls[0][i].item()], top_probs[0][i].item()) for i in range(3)]

# Directory with audio chunks
chunks_dir = 'Audio2'  # Adjust as necessary
audio_chunks = [f for f in os.listdir(chunks_dir) if f.endswith('.mp3')]

# Initialize list for DataFrame
data = []


# Specify the directory where you want to save the text files
text_files_directory = 'Audio2'

# Ensure the directory exists
os.makedirs(text_files_directory, exist_ok=True)

# Process each chunk
for chunk in audio_chunks:
    audio_path = os.path.join(chunks_dir, chunk)
    transcription = transcribe(audio_path)
    
    # Define the path for the new text file
    text_filename = chunk.replace('.mp3', '.txt')
    text_file_path = os.path.join(text_files_directory, text_filename)
    
    # Save the transcription to the text file
    with open(text_file_path, 'w', encoding='utf-8') as text_file:
        text_file.write(transcription)
    
    print(f"Transcription saved to {text_file_path}")

    # Now proceed with emotion classification
    emotions = classify_emotion(transcription)
    
    # Prepare data for DataFrame
    row = {}  # Using text_filename to refer to the saved file
    for i, emo in enumerate(emotions):
        row[f'Emotion {i+1}'] = emo[0]
        row[f'Prob Emotion {i+1}'] = f"{emo[1]:.4f}"
    
    data.append(row)

# Create DataFrame
df = pd.DataFrame(data)

# Display DataFrame
print(df)

# Save to CSV
df.to_csv('emotion_predictions_from_audio_text.csv', index=False)

In [1]:
import os
import cv2
from paz.applications import HaarCascadeFrontalFace, MiniXceptionFER
import paz.processors as pr

class EmotionDetector(pr.Processor):
    def __init__(self):
        super(EmotionDetector, self).__init__()
        self.detect = HaarCascadeFrontalFace(draw=False)
        self.crop = pr.CropBoxes2D()
        self.classify = MiniXceptionFER()
        self.draw = pr.DrawBoxes2D(self.classify.class_names)

    def call(self, image):
        boxes2D = self.detect(image)['boxes2D']
        emotions = []
        for cropped_image, box2D in zip(self.crop(image, boxes2D), boxes2D):
            emotion = self.classify(cropped_image)['class_name']
            emotions.append(emotion)
            box2D.class_name = emotion
        image_with_boxes = self.draw(image, boxes2D)
        return image_with_boxes, emotions

# Initialize the emotion detector
detect = EmotionDetector()

video_chunks_dir = 'Video2'
images_dir = os.path.join(video_chunks_dir, 'images')
os.makedirs(images_dir, exist_ok=True)

# Process each video chunk
for chunk_filename in os.listdir(video_chunks_dir):
    if chunk_filename.endswith('.mp4'):
        chunk_path = os.path.join(video_chunks_dir, chunk_filename)
        # Extract frames and apply the EmotionDetector
        cap = cv2.VideoCapture(chunk_path)
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        emotions_all_frames = []

        for _ in range(frame_count):
            ret, frame = cap.read()
            if not ret:
                break
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            _, emotions = detect(frame)
            emotions_all_frames.extend(emotions)

            # Optionally save frame with detected emotions drawn
            # frame_save_path = os.path.join(images_dir, f"frame_{frame_index}_{chunk_filename}.jpg")
            # cv2.imwrite(frame_save_path, cv2.cvtColor(processed_frame, cv2.COLOR_RGB2BGR))

        # Calculate the average emotion for the video chunk
        # This part requires defining how you quantify and average emotions
        # Example: Count occurrences of each emotion and find the most frequent

        cap.release()



