In [5]:
from moviepy.editor import VideoFileClip
from transformers import HubertForSequenceClassification, Wav2Vec2FeatureExtractor
import torchaudio
import torch

def extract_audio_from_video(video_path, audio_path):
    """
    Extract the audio track from a video file and save it as a WAV file.

    Args:
        video_path: Path of the video file to read.
        audio_path: Path of the WAV file to write (16-bit PCM, 'pcm_s16le').

    Raises:
        ValueError: If the video has no audio track.
    """
    video = VideoFileClip(video_path)
    try:
        audio = video.audio
        if audio is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError(f"No audio track found in video: {video_path}")
        audio.write_audiofile(audio_path, codec='pcm_s16le')  # Save as PCM WAV
    finally:
        # Release the underlying reader even when the export fails.
        video.close()

def classify_audio_chunks(audio_path, chunk_duration=10, confidence_threshold=0.7):
    """
    Classify consecutive fixed-length chunks of an audio file for emotions.

    The audio is resampled to the feature extractor's sampling rate, mixed
    down to mono and cut into non-overlapping chunks of ``chunk_duration``
    seconds. Only full chunks are classified: a trailing remainder shorter
    than ``chunk_duration`` is skipped, so audio shorter than one chunk
    yields an empty dictionary.

    Args:
        audio_path: Path of the audio file to load.
        chunk_duration: Length of each chunk in seconds; must be positive.
        confidence_threshold: Minimum softmax probability of the top class
            for its label to be reported.

    Returns:
        A dictionary with the chunk start time (in seconds) as key and a tuple
        (predicted emotion, confidence) as value. If the confidence is below
        the threshold, the predicted emotion is 'unknown'; the confidence of
        the top class is still reported.

    Raises:
        ValueError: If ``chunk_duration`` is not positive.
    """
    # Validate before the (expensive) model load so bad input fails fast.
    if chunk_duration <= 0:
        raise ValueError(f"chunk_duration must be positive, got {chunk_duration!r}")

    # Load the model and feature extractor
    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/hubert-large-ls960-ft")
    model = HubertForSequenceClassification.from_pretrained("xbgoose/hubert-speech-emotion-recognition-russian-dusha-finetuned")

    # Define emotion mapping
    num2emotion = {0: 'neutral', 1: 'angry', 2: 'positive', 3: 'sad', 4: 'other'}

    # Single source of truth for the rate used for resampling, chunking and
    # the extractor call, so the three can never disagree.
    target_rate = feature_extractor.sampling_rate

    # Load the audio file
    waveform, sample_rate = torchaudio.load(audio_path, normalize=True)

    # Resample if necessary
    if sample_rate != target_rate:
        transform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_rate)
        waveform = transform(waveform)

    # Ensure the audio input is mono
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)  # Convert to mono by averaging channels

    # Number of complete chunks that fit into the audio
    total_length = waveform.size(1) / target_rate  # Total length in seconds
    num_chunks = int(total_length // chunk_duration)

    # max_length must be an integer sample count even for fractional durations.
    max_samples = int(target_rate * chunk_duration)

    results = {}
    for i in range(num_chunks):
        start_time = i * chunk_duration
        end_time = start_time + chunk_duration

        # Extract the chunk
        start_sample = int(start_time * target_rate)
        end_sample = int(end_time * target_rate)
        audio_chunk = waveform[:, start_sample:end_sample]

        # Prepare inputs for the model
        inputs = feature_extractor(
            audio_chunk.squeeze(0),  # Remove the channel dimension
            sampling_rate=target_rate,
            return_tensors="pt",
            padding=True,
            max_length=max_samples,
            truncation=True
        )

        # Make predictions
        with torch.no_grad():
            logits = model(inputs['input_values']).logits

        # Top class and its softmax probability (batch size is 1)
        predicted_class = torch.argmax(logits, dim=-1).item()
        confidence = torch.softmax(logits, dim=-1)[0][predicted_class].item()

        # Map the predicted class to emotion, or set to 'unknown' if below threshold
        if confidence >= confidence_threshold:
            predicted_emotion = num2emotion[predicted_class]
        else:
            predicted_emotion = 'unknown'

        # Store the result with confidence
        results[start_time] = (predicted_emotion, confidence)

    return results

# Demo run: extract the soundtrack of one test video and classify it.
# The commented-out assignments are alternative input clips.
# video_path = "/home/pe51k/PycharmProjects/secret-repo/data/video/test/test_0.mp4"
video_path = "/home/pe51k/PycharmProjects/secret-repo/data/video/test/Над расследованием по Северным потокам смеется вся Европа. Великий перепост [TubeRipper.com].mp4"
# video_path = "/home/pe51k/PycharmProjects/secret-repo/data/video/test/ПОЛОСА ПРЕПЯТСТВИЙ ДЛЯ КОТА АБРИКОСА - Кусь-шоу Весёлые челленджи [TubeRipper.com].mp4"
audio_path = "extracted_audio.wav"
confidence_threshold = 0.7  # Predictions below this are reported as 'unknown'

# 1) Write the video's audio track to a WAV file.
extract_audio_from_video(video_path=video_path, audio_path=audio_path)

# 2) Classify the WAV chunk by chunk.
classification_results = classify_audio_chunks(
    audio_path,
    confidence_threshold=confidence_threshold,
)

# 3) Show the mapping {chunk start time: (emotion, confidence)}.
print(classification_results)

MoviePy - Writing audio in extracted_audio.wav


                                                                      

MoviePy - Done.


Some weights of the model checkpoint at xbgoose/hubert-speech-emotion-recognition-russian-dusha-finetuned were not used when initializing HubertForSequenceClassification: ['hubert.encoder.pos_conv_embed.conv.weight_g', 'hubert.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing HubertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at xbgoose/hubert-speech-emotion-recognition-russian-dusha-finetuned and are newly initialized: ['hubert.encoder.pos_conv_embed.conv.parametri

{0: ('neutral', 0.898531436920166), 10: ('unknown', 0.5047914981842041), 20: ('unknown', 0.6240713000297546), 30: ('unknown', 0.4543779790401459), 40: ('angry', 0.758493185043335), 50: ('angry', 0.7123038172721863), 60: ('unknown', 0.6553771495819092), 70: ('unknown', 0.4523645341396332), 80: ('unknown', 0.616582989692688), 90: ('angry', 0.9587709307670593), 100: ('angry', 0.8447172045707703), 110: ('unknown', 0.6968732476234436), 120: ('unknown', 0.5530036687850952), 130: ('unknown', 0.5075026750564575), 140: ('angry', 0.8949615359306335), 150: ('unknown', 0.5432318449020386), 160: ('unknown', 0.5116928219795227), 170: ('neutral', 0.7228280305862427), 180: ('unknown', 0.6902819275856018), 190: ('unknown', 0.5846384167671204), 200: ('neutral', 0.9654733538627625), 210: ('neutral', 0.981601893901825), 220: ('neutral', 0.7543083429336548), 230: ('angry', 0.8148767948150635)}
