In [1]:
#pip install ffmpeg-python

In [2]:
#!pip install git+https://github.com/openai/whisper.git  deepface ffmpeg-python

In [3]:
# Colab-only setup: mount Google Drive and switch into the project folder.
# NOTE(review): the saved output of this cell is "ValueError: mount failed",
# so in the recorded run the mount (and therefore the %cd below) did not succeed.
from google.colab import drive
drive.mount('/content/drive')
# Later cells use relative paths ("Dataset", "temp_audio.wav",
# "emotion_dataset_results.csv"), so they depend on this working directory.
# NOTE(review): only the first space in the path is backslash-escaped; the one
# before "(1)" is not — TODO confirm %cd resolves this path as intended.
%cd /content/drive/My\ Drive/LLM_MODEL_CLASSIFCATION (1)

ValueError: mount failed

In [None]:
import os
import pandas as pd
import cv2
from deepface import DeepFace
import whisper
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import ffmpeg

# Load the pretrained "gpt2" checkpoint once at module level; the functions
# below read `tokenizer` and `model` as globals.
tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path="gpt2")
model = GPT2LMHeadModel.from_pretrained(pretrained_model_name_or_path="gpt2")

# Step 1: Extract audio from the video
def extract_audio_from_video(video_path, audio_path="temp_audio.wav"):
    """Write the audio of ``video_path`` to ``audio_path`` using ffmpeg.

    Args:
        video_path: Path of the input video file.
        audio_path: Destination path for the extracted audio. An existing
            file at this path is overwritten.

    Returns:
        None. The result is the file written at ``audio_path``.
    """
    source_stream = ffmpeg.input(video_path)
    audio_job = source_stream.output(audio_path)
    audio_job.run(overwrite_output=True)

# Step 2: Transcribe audio to text using Whisper
# Loaded Whisper models keyed by model name, so the weights are read once per
# kernel session instead of once per transcribed file.
_WHISPER_MODEL_CACHE = {}


def transcribe_audio(audio_path, model_name="base"):
    """Transcribe an audio file to text with Whisper.

    Args:
        audio_path: Path of the audio file to transcribe.
        model_name: Name passed to ``whisper.load_model``. Defaults to
            ``"base"``, the model this function has always used.

    Returns:
        The ``'text'`` entry of the Whisper transcription result.
    """
    whisper_model = _WHISPER_MODEL_CACHE.get(model_name)
    if whisper_model is None:
        # First request for this model: load it and keep it for later calls.
        whisper_model = whisper.load_model(model_name)
        _WHISPER_MODEL_CACHE[model_name] = whisper_model

    result = whisper_model.transcribe(audio_path)
    return result['text']

def get_highest_non_neutral_emotion(emotion_dict):
    """Return the strongest emotion other than ``'neutral'``.

    Args:
        emotion_dict: Mapping of emotion name to score. It is not modified.

    Returns:
        A ``(emotion_name, score)`` tuple for the highest-scoring entry whose
        key is not ``'neutral'``. On ties the first such entry in the
        mapping's iteration order wins.

    Raises:
        ValueError: If the mapping has no entry other than ``'neutral'``.
    """
    # Work on a filtered copy so the caller's dict keeps its 'neutral' entry.
    candidates = {
        name: score for name, score in emotion_dict.items() if name != 'neutral'
    }
    if not candidates:
        raise ValueError("emotion_dict contains no non-neutral emotions")

    dominant_emotion = max(candidates, key=candidates.get)
    return dominant_emotion, candidates[dominant_emotion]

def analyze_video_frames(video_path, frame_stride=1):
    """Find the strongest non-neutral facial emotion seen in a video.

    Frames are read with OpenCV and passed to ``DeepFace.analyze``; for each
    analysed frame the highest-scoring non-neutral emotion is taken from the
    first result, and the best one over all analysed frames is returned.

    Args:
        video_path: Path of the video file to analyse.
        frame_stride: Analyse every ``frame_stride``-th frame. The default of
            1 analyses every frame, as before.

    Returns:
        A ``(emotion_name, score)`` tuple, or ``(None, None)`` when no frame
        could be read or analysed.

    Raises:
        ValueError: If ``frame_stride`` is smaller than 1.
    """
    if frame_stride < 1:
        raise ValueError("frame_stride must be a positive integer")

    cap = cv2.VideoCapture(video_path)
    best_emotion, best_prob = None, None
    frame_index = 0

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            analyse_this_frame = frame_index % frame_stride == 0
            frame_index += 1
            if not analyse_this_frame:
                continue

            # Analyze frame with DeepFace
            result = DeepFace.analyze(frame, actions=['emotion'], enforce_detection=False)

            # Older DeepFace releases return one dict instead of a list of
            # dicts; normalise so both shapes are handled the same way.
            if isinstance(result, dict):
                result = [result]
            if not result:
                continue

            # Get highest non-neutral emotion
            dominant_emotion, emotion_prob = get_highest_non_neutral_emotion(result[0]['emotion'])

            # Strict '>' keeps the earliest frame on ties, matching max() over
            # a list of per-frame results.
            if best_prob is None or emotion_prob > best_prob:
                best_emotion, best_prob = dominant_emotion, emotion_prob
    finally:
        # Release the capture even if DeepFace raises part-way through.
        cap.release()

    return best_emotion, best_prob

# Step 4: Generate a textual description using GPT-2
def generate_description(text, max_length=150):
    """Continue ``text`` with the module-level GPT-2 model.

    Args:
        text: Prompt text (here, the video transcription).
        max_length: Total length in tokens (prompt plus continuation) to
            generate up to when the prompt is shorter than this. Defaults to
            150, the value this function has always used.

    Returns:
        The decoded output sequence (prompt followed by the continuation),
        or ``""`` when ``text`` is empty or whitespace-only.
    """
    # There is nothing to condition on for an empty transcription (for
    # example a silent clip), so skip generation entirely.
    if not text or not text.strip():
        return ""

    min_new_tokens = 20  # room guaranteed for the continuation on long prompts
    context_limit = model.config.n_positions  # longest sequence the model accepts

    # Truncate the prompt so prompt + continuation always fits the context.
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=context_limit - min_new_tokens,
    )
    prompt_length = encoded["input_ids"].shape[1]

    # Keep the requested total length whenever the prompt fits inside it;
    # otherwise extend the budget so generation can still add tokens.
    if prompt_length < max_length:
        total_length = max_length
    else:
        total_length = prompt_length + min_new_tokens
    total_length = min(total_length, context_limit)

    with torch.no_grad():
        outputs = model.generate(
            encoded["input_ids"],
            attention_mask=encoded["attention_mask"],
            max_length=total_length,
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id,
        )
    description = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return description

def process_video(video_path):
    """Produce a transcription and an emotion-annotated description of a video.

    The audio track is extracted to a temporary WAV file and transcribed, the
    frames are analysed for the strongest non-neutral emotion, and GPT-2
    continues the transcription into a description.

    Args:
        video_path: Path of the video file to process.

    Returns:
        A ``(transcription, enhanced_description)`` tuple. The enhanced
        description is the generated text followed by a sentence naming the
        detected emotion; that sentence is left out when no emotion was
        detected.
    """
    # Define paths
    audio_path = "temp_audio.wav"

    try:
        # Extract audio from the video
        extract_audio_from_video(video_path, audio_path)

        # Transcribe the audio to text
        transcription = transcribe_audio(audio_path)
    finally:
        # Clean up even when extraction or transcription fails, so a stale
        # file is never left behind for the next video.
        if os.path.exists(audio_path):
            os.remove(audio_path)

    # Analyze the video for the most prominent emotion
    dominant_emotion, _ = analyze_video_frames(video_path)

    # Generate a textual description using GPT-2
    description = generate_description(transcription)

    # Incorporate emotion into the enhanced description
    if dominant_emotion is None:
        # No frame could be analysed; avoid writing the literal 'None'.
        enhanced_description = description
    else:
        enhanced_description = f"{description} The person seems to be displaying '{dominant_emotion}' emotion."

    return transcription, enhanced_description

# Two-character file-name prefixes and the emotion label each one stands for.
_FILENAME_PREFIX_TO_LABEL = {
    "an": "angry",
    "di": "disgust",  # was misspelled "disguise"
    "fe": "fear",
    "su": "surprise",
    "sa": "sad",
    "ha": "happy",
}

# Column order of the result table, also used when no video was found.
_RESULT_COLUMNS = ["label", "textual_description", "enhanced_textual_description"]


def process_emotion_dataset(root_folder, output_csv="emotion_dataset_results.csv"):
    """Process every video under ``root_folder`` into a table of descriptions.

    ``root_folder`` is expected to contain sub-folders of video files. Each
    file is run through ``process_video``. Its label comes from the first two
    characters of the file name; prefixes without a known mapping are used
    as the label unchanged.

    Args:
        root_folder: Directory whose sub-folders contain the video files.
        output_csv: Path the table is also written to as CSV. Pass ``None``
            to skip writing. Defaults to ``"emotion_dataset_results.csv"``.

    Returns:
        A pandas DataFrame with the columns ``label``,
        ``textual_description`` and ``enhanced_textual_description``, one row
        per processed video.
    """
    results = []

    # Sorted listings make the row order reproducible between runs.
    for folder_name in sorted(os.listdir(root_folder)):
        folder_path = os.path.join(root_folder, folder_name)
        if not os.path.isdir(folder_path):
            continue

        for video_file in sorted(os.listdir(folder_path)):
            video_path = os.path.join(folder_path, video_file)
            if not os.path.isfile(video_path):
                continue

            # Process the video
            transcription, enhanced_description = process_video(video_path)

            prefix = video_file[0:2]
            label = _FILENAME_PREFIX_TO_LABEL.get(prefix, prefix)

            results.append({
                "label": label,
                "textual_description": transcription,
                "enhanced_textual_description": enhanced_description
            })
            print(video_path)

    # Fixing the columns keeps the schema stable even with zero rows.
    df = pd.DataFrame(results, columns=_RESULT_COLUMNS)

    if output_csv is not None:
        df.to_csv(output_csv, index=False)

    return df


In [None]:
# Example usage: process every video under the dataset folder and show the table.
root_folder = "Dataset"  # relative path, resolved against the current working directory
# Besides returning the table, this call also writes emotion_dataset_results.csv.
df = process_emotion_dataset(root_folder)

print(df)