In [None]:
!pip install moviepy

In [12]:
from moviepy.editor import VideoFileClip
from transformers import AutoModelForAudioClassification, AutoFeatureExtractor
import torchaudio
import torch

def extract_audio_from_video(video_path, audio_path):
    """
    Extract audio from a video file and save it as a WAV file.

    Args:
        video_path: Path of the video file to read.
        audio_path: Path of the WAV file to write (16-bit PCM).

    Raises:
        ValueError: If the video has no audio track.
    """
    video = VideoFileClip(video_path)
    try:
        audio = video.audio
        if audio is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError(f"No audio track found in video: {video_path}")
        audio.write_audiofile(audio_path, codec='pcm_s16le')  # Save as PCM WAV
    finally:
        # Always release the clip, even when extraction fails.
        video.close()

def classify_audio_chunks(audio_path, chunk_duration=10):
    """
    Classify consecutive fixed-length chunks of an audio file for music presence.

    Args:
        audio_path: Path of the audio file (loaded with ``torchaudio.load``).
        chunk_duration: Length of each chunk in seconds (default 10).
            Must be greater than zero.

    Returns:
        A dictionary mapping each chunk's start time in seconds
        (``i * chunk_duration``) to "music" or "non_music". Only complete
        chunks are classified: a trailing remainder shorter than
        ``chunk_duration`` is skipped, so audio shorter than one chunk
        yields an empty dictionary.

    Raises:
        ValueError: If ``chunk_duration`` is not greater than zero.
    """
    # Validate before the expensive model load; a zero duration would
    # otherwise surface as a ZeroDivisionError and a negative one would
    # silently produce an empty result.
    if chunk_duration <= 0:
        raise ValueError(
            f"chunk_duration must be greater than zero, got {chunk_duration!r}"
        )

    model_name = "MarekCech/GenreVim-Music-Detection-DistilHuBERT"
    target_rate = 16000  # Model expects 16kHz

    # Load the model and feature extractor
    extractor = AutoFeatureExtractor.from_pretrained(model_name)
    model = AutoModelForAudioClassification.from_pretrained(model_name)

    # Load the audio file
    audio_input, sample_rate = torchaudio.load(audio_path)

    # Convert to mono by averaging channels. Done before resampling so that
    # only a single channel has to be resampled.
    if audio_input.shape[0] > 1:
        audio_input = audio_input.mean(dim=0, keepdim=True)

    # Resample if necessary
    if sample_rate != target_rate:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=target_rate
        )
        audio_input = resampler(audio_input)

    # Number of complete chunks; the partial tail (if any) is dropped.
    total_length = audio_input.size(1) / target_rate  # Total length in seconds
    num_chunks = int(total_length // chunk_duration)

    results = {}
    for i in range(num_chunks):
        start_time = i * chunk_duration
        end_time = start_time + chunk_duration

        # Extract the chunk
        start_sample = int(start_time * target_rate)
        end_sample = int(end_time * target_rate)
        audio_chunk = audio_input[:, start_sample:end_sample]

        # Extract features; the extractor is given a 1-D NumPy waveform.
        inputs = extractor(
            audio_chunk.squeeze(0).numpy(),
            sampling_rate=target_rate,
            return_tensors="pt",
            padding=True,
        )

        # Make predictions
        with torch.no_grad():
            logits = model(**inputs).logits

        # Get the predicted class index
        predicted_class = logits.argmax(dim=-1).item()

        # NOTE(review): assumes class index 1 is the model's "music" label --
        # TODO confirm against model.config.id2label.
        results[start_time] = "music" if predicted_class == 1 else "non_music"

    return results

# Example usage: extract the audio track of one video, then label each
# chunk of it as "music" or "non_music".
# Alternative input videos (uncomment one to use it instead):
# video_path = "/home/pe51k/PycharmProjects/secret-repo/data/video/test/test_0.mp4"
# video_path = "/home/pe51k/PycharmProjects/secret-repo/data/video/test/Над расследованием по Северным потокам смеется вся Европа. Великий перепост [TubeRipper.com].mp4"
video_path = "/home/pe51k/PycharmProjects/secret-repo/data/video/test/ПОЛОСА ПРЕПЯТСТВИЙ ДЛЯ КОТА АБРИКОСА - Кусь-шоу Весёлые челленджи [TubeRipper.com].mp4"
# Intermediate WAV file, written relative to the current working directory.
audio_path = "extracted_audio.wav"

# Step 1: Extract audio from video
extract_audio_from_video(video_path, audio_path)

# Step 2: Classify audio in chunks (uses the default chunk_duration)
classification_results = classify_audio_chunks(audio_path)

# Output the classification results (dict of chunk start time -> label)
print(classification_results)

MoviePy - Writing audio in extracted_audio.wav


                                                                      

MoviePy - Done.
{0: 'non_music', 10: 'music', 20: 'music', 30: 'music', 40: 'non_music', 50: 'non_music', 60: 'non_music', 70: 'non_music', 80: 'music', 90: 'non_music'}
