In [5]:
from moviepy.editor import VideoFileClip
from transformers import HubertForSequenceClassification, Wav2Vec2FeatureExtractor
import torchaudio
import torch

def extract_audio_from_video(video_path, audio_path):
    """
    Extract the audio track of a video file and save it as a 16-bit PCM WAV file.

    Args:
        video_path: Path of the video file to read.
        audio_path: Destination path of the WAV file to write.

    Raises:
        ValueError: If the video has no audio track.
    """
    video = VideoFileClip(video_path)
    try:
        audio = video.audio
        if audio is None:
            # Without this check a silent video fails with an opaque
            # AttributeError on None.
            raise ValueError(f"No audio track found in video: {video_path}")
        audio.write_audiofile(audio_path, codec='pcm_s16le')  # Save as PCM WAV
    finally:
        # Release the clip's resources even when writing the audio fails.
        video.close()

def classify_audio_chunks(audio_path, chunk_duration=10, confidence_threshold=0.7):
    """
    Classify consecutive fixed-length chunks of an audio file by emotion.

    The audio is resampled to the feature extractor's sampling rate, mixed
    down to mono, and split into back-to-back chunks of ``chunk_duration``
    seconds. A trailing remainder shorter than one full chunk is not
    classified, so audio shorter than ``chunk_duration`` yields an empty dict.

    Args:
        audio_path: Path of an audio file readable by ``torchaudio.load``.
        chunk_duration: Length of each chunk in seconds; must be positive.
        confidence_threshold: Minimum softmax probability of the top class
            for its label to be reported; below it the label is 'unknown'.

    Returns:
        A dictionary mapping each chunk's start time in seconds to a tuple
        ``(predicted emotion, confidence)``.

    Raises:
        ValueError: If ``chunk_duration`` is not positive.
    """
    # Fail fast, before the (slow) model loading, instead of hitting a
    # ZeroDivisionError or silently returning {} for a negative duration.
    if chunk_duration <= 0:
        raise ValueError(f"chunk_duration must be positive, got {chunk_duration!r}")

    # Load the model and feature extractor
    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/hubert-large-ls960-ft")
    model = HubertForSequenceClassification.from_pretrained("xbgoose/hubert-speech-emotion-recognition-russian-dusha-finetuned")
    model.eval()  # Make inference mode explicit (no dropout)

    # Define emotion mapping
    num2emotion = {0: 'neutral', 1: 'angry', 2: 'positive', 3: 'sad', 4: 'other'}

    # Single source of truth for the sampling rate, so the resampled audio and
    # the `sampling_rate` argument passed to the extractor cannot disagree.
    target_rate = feature_extractor.sampling_rate

    # Load the audio file
    waveform, sample_rate = torchaudio.load(audio_path, normalize=True)

    # Resample if necessary
    if sample_rate != target_rate:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_rate)
        waveform = resampler(waveform)

    # Ensure the audio input is mono
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)  # Convert to mono by averaging channels

    # Only whole chunks are processed; the remainder is dropped.
    total_length = waveform.size(1) / target_rate  # Total length in seconds
    num_chunks = int(total_length // chunk_duration)
    max_chunk_samples = int(target_rate * chunk_duration)

    results = {}
    for i in range(num_chunks):
        start_time = i * chunk_duration
        end_time = start_time + chunk_duration

        # Extract the chunk
        start_sample = int(start_time * target_rate)
        end_sample = int(end_time * target_rate)
        audio_chunk = waveform[:, start_sample:end_sample]

        # Prepare inputs for the model. The extractor is given a 1-D NumPy
        # array (channel dimension removed) rather than a torch tensor.
        inputs = feature_extractor(
            audio_chunk.squeeze(0).numpy(),
            sampling_rate=target_rate,
            return_tensors="pt",
            padding=True,
            max_length=max_chunk_samples,
            truncation=True
        )

        # Make predictions
        with torch.no_grad():
            logits = model(inputs['input_values']).logits

        # Probability and index of the most likely class for this chunk
        probabilities = torch.softmax(logits, dim=-1)[0]
        top_probability, top_index = torch.max(probabilities, dim=-1)
        confidence = top_probability.item()
        predicted_class = top_index.item()

        # Map the predicted class to emotion, or set to 'unknown' if below threshold
        if confidence >= confidence_threshold:
            predicted_emotion = num2emotion[predicted_class]
        else:
            predicted_emotion = 'unknown'

        # Store the result with confidence
        results[start_time] = (predicted_emotion, confidence)

    return results

# Demo run: extract the audio track of a sample video and classify its emotions.
# Alternative test inputs (swap in as needed):
# video_path = "/home/pe51k/PycharmProjects/secret-repo/data/video/test/test_0.mp4"
# video_path = "/home/pe51k/PycharmProjects/secret-repo/data/video/test/ПОЛОСА ПРЕПЯТСТВИЙ ДЛЯ КОТА АБРИКОСА - Кусь-шоу Весёлые челленджи [TubeRipper.com].mp4"
video_path = "/home/pe51k/PycharmProjects/secret-repo/data/video/test/Над расследованием по Северным потокам смеется вся Европа. Великий перепост [TubeRipper.com].mp4"
audio_path = "extracted_audio.wav"
confidence_threshold = 0.7  # Predictions below this confidence are reported as 'unknown'

# The WAV file must be written before the classifier can read it.
extract_audio_from_video(video_path, audio_path)

# Classify the extracted audio chunk by chunk (default chunk length).
classification_results = classify_audio_chunks(
    audio_path,
    confidence_threshold=confidence_threshold,
)

# Show the {start time: (emotion, confidence)} mapping.
print(classification_results)

MoviePy - Writing audio in extracted_audio.wav


                                                                      

MoviePy - Done.


Some weights of the model checkpoint at xbgoose/hubert-speech-emotion-recognition-russian-dusha-finetuned were not used when initializing HubertForSequenceClassification: ['hubert.encoder.pos_conv_embed.conv.weight_g', 'hubert.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing HubertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at xbgoose/hubert-speech-emotion-recognition-russian-dusha-finetuned and are newly initialized: ['hubert.encoder.pos_conv_embed.conv.parametri

{0: ('neutral', 0.898531436920166), 10: ('unknown', 0.5047914981842041), 20: ('unknown', 0.6240713000297546), 30: ('unknown', 0.4543779790401459), 40: ('angry', 0.758493185043335), 50: ('angry', 0.7123038172721863), 60: ('unknown', 0.6553771495819092), 70: ('unknown', 0.4523645341396332), 80: ('unknown', 0.616582989692688), 90: ('angry', 0.9587709307670593), 100: ('angry', 0.8447172045707703), 110: ('unknown', 0.6968732476234436), 120: ('unknown', 0.5530036687850952), 130: ('unknown', 0.5075026750564575), 140: ('angry', 0.8949615359306335), 150: ('unknown', 0.5432318449020386), 160: ('unknown', 0.5116928219795227), 170: ('neutral', 0.7228280305862427), 180: ('unknown', 0.6902819275856018), 190: ('unknown', 0.5846384167671204), 200: ('neutral', 0.9654733538627625), 210: ('neutral', 0.981601893901825), 220: ('neutral', 0.7543083429336548), 230: ('angry', 0.8148767948150635)}


In [9]:
import librosa
import numpy as np

def analyze_loudness(audio_path, chunk_duration=10, volume_threshold=0.1):
    """
    Analyze the loudness of an audio file in consecutive fixed-length chunks.

    Each chunk's mean RMS volume is compared with the mean RMS volume of the
    whole file. A trailing remainder shorter than one full chunk is not
    analyzed, so audio shorter than ``chunk_duration`` yields an empty dict.

    Args:
        audio_path: Path of an audio file readable by ``torchaudio.load``.
        chunk_duration: Length of each chunk in seconds; must be positive.
        volume_threshold: Relative deviation from the overall mean volume
            (0.1 means 10%) beyond which a chunk is labelled "loud" or "quiet".

    Returns:
        A dictionary mapping each chunk's start time in seconds to a tuple
        ``(mean volume, percent difference, loudness classification)``, where
        the classification is "loud", "quiet" or "normal".

    Raises:
        ValueError: If ``chunk_duration`` is not positive.
    """
    if chunk_duration <= 0:
        raise ValueError(f"chunk_duration must be positive, got {chunk_duration!r}")

    # No model is involved here; the audio is still brought to 16 kHz mono,
    # presumably so chunk boundaries match classify_audio_chunks.
    target_rate = 16000

    # Load the audio file
    waveform, sample_rate = torchaudio.load(audio_path, normalize=True)

    # Resample if necessary
    if sample_rate != target_rate:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_rate)
        waveform = resampler(waveform)

    # Ensure the audio input is mono
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)  # Convert to mono by averaging channels

    # Only whole chunks are analyzed; the remainder is dropped.
    total_length = waveform.size(1) / target_rate  # Total length in seconds
    num_chunks = int(total_length // chunk_duration)
    if num_chunks == 0:
        # Nothing to report; skip the whole-file RMS computation entirely.
        return {}

    # Reference level: mean RMS volume of the entire audio file
    audio_data = waveform.numpy()[0]  # Convert to numpy array
    overall_mean_volume = np.mean(librosa.feature.rms(y=audio_data))

    # Threshold expressed in percent, to compare directly with percent_diff
    threshold_percent = volume_threshold * 100

    loudness_results = {}
    for i in range(num_chunks):
        start_time = i * chunk_duration
        end_time = start_time + chunk_duration

        # Extract the chunk
        start_sample = int(start_time * target_rate)
        end_sample = int(end_time * target_rate)
        audio_chunk = waveform[:, start_sample:end_sample].numpy()[0]  # Convert to numpy array

        # Calculate the mean volume for the chunk
        chunk_mean_volume = np.mean(librosa.feature.rms(y=audio_chunk))

        # Percent difference from the overall mean volume. Completely silent
        # audio has a zero reference level; report no deviation instead of
        # dividing by zero and propagating NaN.
        if overall_mean_volume > 0:
            percent_diff = (chunk_mean_volume - overall_mean_volume) / overall_mean_volume * 100
        else:
            percent_diff = 0.0

        # Determine loudness classification
        if percent_diff > threshold_percent:
            loudness_status = "loud"
        elif percent_diff < -threshold_percent:
            loudness_status = "quiet"
        else:
            loudness_status = "normal"

        # Store the result
        loudness_results[start_time] = (chunk_mean_volume, percent_diff, loudness_status)

    return loudness_results

# Loudness pass over the same extracted audio, in 10-second chunks.
volume_threshold = 0.1  # Relative deviation from the overall mean volume that counts as loud/quiet
loudness_results = analyze_loudness(
    audio_path,
    chunk_duration=10,
    volume_threshold=volume_threshold,
)

# Show the {start time: (mean volume, percent difference, label)} mapping.
print("Loudness Analysis Results:", loudness_results)

Loudness Analysis Results: {0: (0.052737, -4.346724599599838, 'normal'), 10: (0.047385897, -14.052446186542511, 'quiet'), 20: (0.052177295, -5.3619083017110825, 'normal'), 30: (0.054364823, -1.3942159712314606, 'normal'), 40: (0.05919643, 7.369254529476166, 'normal'), 50: (0.05567366, 0.9797235950827599, 'normal'), 60: (0.055661913, 0.9584192186594009, 'normal'), 70: (0.056004174, 1.5792051330208778, 'normal'), 80: (0.05390702, -2.2245725616812706, 'normal'), 90: (0.05659374, 2.6485448703169823, 'normal'), 100: (0.05345071, -3.052212856709957, 'normal'), 110: (0.054948863, -0.3348967060446739, 'normal'), 120: (0.05337519, -3.1891945749521255, 'normal'), 130: (0.05855794, 6.211170554161072, 'normal'), 140: (0.056462802, 2.4110550060868263, 'normal'), 150: (0.056765426, 2.959948033094406, 'normal'), 160: (0.058398627, 5.922213569283485, 'normal'), 170: (0.04427266, -19.69916820526123, 'quiet'), 180: (0.05806135, 5.310468375682831, 'normal'), 190: (0.0502579, -8.84326919913292, 'normal'),