In [2]:
import os

# Walk up from the current working directory until we are inside the
# directory named "secret-repo", so the relative paths used in later cells
# resolve from the repository root.
_start_dir = os.getcwd()
while os.path.basename(os.getcwd()) != "secret-repo":
    _previous_dir = os.getcwd()
    os.chdir("..")
    if os.getcwd() == _previous_dir:
        # chdir("..") no longer moves us: we are at the filesystem root and no
        # ancestor is called "secret-repo". Restore the original directory and
        # fail loudly instead of looping forever.
        os.chdir(_start_dir)
        raise FileNotFoundError(
            f'No "secret-repo" directory found at or above {_start_dir}'
        )
    
!ls

 abracadabra.log       lib		  notebooks     src
 data		       LICENSE.md	  README.md     venv
 extracted_audio.wav  'loss weight.png'   settings.py   weights


In [8]:
from moviepy.editor import VideoFileClip
from transformers import AutoModelForAudioClassification, AutoFeatureExtractor
import torchaudio
import torch
import os
from collections import defaultdict
import numpy as np
from lib.abracadabra.abracadabra.fingerprint import fingerprint_audio, fingerprint_file


def extract_audio_from_video(video_path, audio_path):
    """
    Extract the audio track of a video file and save it as a WAV file.

    :param video_path: Path to the input video file.
    :param audio_path: Path of the audio file to write (codec ``pcm_s16le``).
    :raises ValueError: If the opened clip exposes no audio track.
    """
    video = VideoFileClip(video_path)
    try:
        audio = video.audio
        if audio is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError(f"No audio track found in video: {video_path}")
        audio.write_audiofile(audio_path, codec='pcm_s16le')  # Save as PCM WAV
    finally:
        # Close the clip even when extraction fails so it is never leaked.
        video.close()

def classify_and_match_audio_chunks(audio_path, search_files, chunk_duration=10, threshold=5):
    """
    Classify fixed-length chunks of an audio file for music presence and, for
    chunks classified as music, look for a matching file in ``search_files``.

    The audio is resampled to 16 kHz (if needed) and averaged down to a single
    channel, then split into consecutive chunks of ``chunk_duration`` seconds.
    A trailing remainder shorter than one full chunk is not processed.

    :param audio_path: Path to the audio file to analyse.
    :param search_files: List of file paths to compare music chunks against.
    :param chunk_duration: Chunk length in seconds; must be positive.
    :param threshold: Forwarded to ``compare_files_with_timing``.
    :return: Dict mapping each chunk's start time (seconds) to one of
        ``"non_music"``, ``"no match"`` or ``"match found: <file>"``.
    :raises ValueError: If ``chunk_duration`` is not positive.
    """
    import tempfile  # local import: only needed for the scratch chunk files

    if chunk_duration <= 0:
        raise ValueError(f"chunk_duration must be positive, got {chunk_duration}")

    model_name = "MarekCech/GenreVim-Music-Detection-DistilHuBERT"
    target_sample_rate = 16000  # rate the audio is converted to before classification
    music_class_index = 1       # class index this code treats as "contains music"

    # Load the model and feature extractor
    extractor = AutoFeatureExtractor.from_pretrained(model_name)
    model = AutoModelForAudioClassification.from_pretrained(model_name)

    # Load the audio file
    audio_input, sample_rate = torchaudio.load(audio_path)

    # Resample if necessary
    if sample_rate != target_sample_rate:
        resample = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sample_rate)
        audio_input = resample(audio_input)

    # Convert to mono by averaging channels
    if audio_input.shape[0] > 1:
        audio_input = audio_input.mean(dim=0, keepdim=True)

    # Number of complete chunks that fit in the audio
    total_length = audio_input.size(1) / target_sample_rate  # seconds
    num_chunks = int(total_length // chunk_duration)

    results = {}

    # Chunk files live in a private scratch directory so they cannot collide
    # with files in the working directory and are cleaned up even on failure.
    with tempfile.TemporaryDirectory() as scratch_dir:
        for i in range(num_chunks):
            start_time = i * chunk_duration
            end_time = start_time + chunk_duration

            # Extract the chunk
            start_sample = int(start_time * target_sample_rate)
            end_sample = int(end_time * target_sample_rate)
            audio_chunk = audio_input[:, start_sample:end_sample]

            # Classify the chunk
            inputs = extractor(
                audio_chunk.squeeze(0),
                sampling_rate=target_sample_rate,
                return_tensors="pt",
                padding=True,
            )
            with torch.no_grad():
                logits = model(**inputs).logits
            predicted_class = logits.argmax(dim=-1).item()

            if predicted_class != music_class_index:
                results[start_time] = "non_music"
                continue

            # Only music chunks need to exist on disk for fingerprint matching.
            chunk_file = os.path.join(scratch_dir, f"temp_chunk_{i}.wav")
            torchaudio.save(chunk_file, audio_chunk, target_sample_rate)
            try:
                match_result = compare_files_with_timing(chunk_file, search_files, threshold=threshold)
            finally:
                # Remove the chunk right away to keep scratch usage bounded.
                os.remove(chunk_file)

            if match_result:
                results[start_time] = f"match found: {match_result}"
            else:
                results[start_time] = "no match"

    return results

def compare_files_with_timing(target_audio_file, search_files, threshold=5):
    """
    Compare an audio chunk file with a list of search files and find the best match.

    Each file is fingerprinted with ``fingerprint_file``; for every hash shared
    by the target and a search file, the pair of time offsets is recorded. A
    search file's score is the largest number of shared hashes whose offset
    difference falls into the same 0.5-wide histogram bin.

    :param target_audio_file: Path to the target audio chunk file
    :param search_files: List of file paths to compare against
    :param threshold: Minimum number of hash matches required for a file to be considered a match
    :return: The best matching file path, or None if no file qualifies
    """
    def score_match(offsets):
        """Score a candidate by the tallest bin of its time-shift histogram."""
        binwidth = 0.5
        # Time delta (search offset - target offset) for every shared hash
        tks = [search_time - target_time for search_time, target_time in offsets]
        # Use floor/ceil rather than int(): int() truncates toward zero, so for
        # a negative minimum the first bin edge would lie above the smallest
        # deltas and np.histogram would silently drop them.
        lowest = int(np.floor(min(tks)))
        highest = int(np.ceil(max(tks)))
        hist, _ = np.histogram(tks, bins=np.arange(lowest, highest + binwidth + 1, binwidth))
        return np.max(hist) if len(hist) > 0 else 0

    def best_match(matches):
        """Return the file with the highest match score (first one wins ties)."""
        matched_file = None
        best_score = 0
        for search_file, offsets in matches.items():
            if len(offsets) < threshold:
                continue  # Skip files with insufficient matches
            score = score_match(offsets)
            if score > best_score:
                best_score = score
                matched_file = search_file
        return matched_file

    # Fingerprint the target audio chunk: hash -> time offset
    # (a hash occurring more than once keeps its last offset)
    target_hashes = fingerprint_file(target_audio_file)
    target_hash_dict = {h[0]: h[1] for h in target_hashes}

    # Compare with each search file
    # NOTE(review): every search file is fingerprinted again on each call;
    # callers that invoke this per chunk may want to cache those fingerprints.
    matches = defaultdict(list)
    for search_file in search_files:
        search_hashes = fingerprint_file(search_file)
        search_hash_dict = {h[0]: h[1] for h in search_hashes}

        # Record (search offset, target offset) for every shared hash
        for h, target_offset in target_hash_dict.items():
            if h in search_hash_dict:
                matches[search_file].append((search_hash_dict[h], target_offset))

    # Return the best match
    return best_match(matches)

# Example Usage
video_path = "data/video/test/Portugal. The Man - Feel It Still (Official Music Video).mp4"
audio_path = "extracted_audio.wav"

# Candidate reference tracks: every regular file in the test audio folder.
# (An explicit list of paths, e.g. ["/path/to/file1.wav", ...], works as well.)
# Sorted so the comparison order does not depend on the arbitrary order
# returned by os.listdir; sub-directories are skipped.
search_dir = "data/audio/test"
search_files = sorted(
    f"{search_dir}/{f}"
    for f in os.listdir(search_dir)
    if os.path.isfile(os.path.join(search_dir, f))
)

# Step 1: Extract audio from video
extract_audio_from_video(video_path, audio_path)

# Step 2: Classify audio in chunks and find matches
classification_results = classify_and_match_audio_chunks(audio_path, search_files)

# Output the classification results with matches
print(classification_results)

MoviePy - Writing audio in extracted_audio.wav


                                                                       

MoviePy - Done.
{0: 'non_music', 10: 'match found: data/audio/test/Portugal. The Man - Feel It Still (Official Music Video).wav', 20: 'match found: data/audio/test/Portugal. The Man - Feel It Still (Official Music Video).wav', 30: 'match found: data/audio/test/Portugal. The Man - Feel It Still (Official Music Video).wav', 40: 'match found: data/audio/test/Portugal. The Man - Feel It Still (Official Music Video).wav', 50: 'match found: data/audio/test/Portugal. The Man - Feel It Still (Official Music Video).wav', 60: 'match found: data/audio/test/Portugal. The Man - Feel It Still (Official Music Video).wav', 70: 'match found: data/audio/test/Portugal. The Man - Feel It Still (Official Music Video).wav', 80: 'match found: data/audio/test/Portugal. The Man - Feel It Still (Official Music Video).wav', 90: 'match found: data/audio/test/Portugal. The Man - Feel It Still (Official Music Video).wav', 100: 'match found: data/audio/test/Portugal. The Man - Feel It Still (Official Music Video).wa