In [1]:
import os
import sqlite3
import hashlib
import json
import librosa  # For loading audio files
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import torch

In [2]:
# Load the input processor and the encoder from the same pretrained
# checkpoint. The generator is consumed left to right, so the processor is
# loaded first and the model second.
processor, model = (
    loader.from_pretrained("facebook/wav2vec2-base")
    for loader in (Wav2Vec2Processor, Wav2Vec2Model)
)


Some weights of the model checkpoint at facebook/wav2vec2-base were not used when initializing Wav2Vec2Model: ['project_hid.weight', 'quantizer.weight_proj.bias', 'project_q.weight', 'project_hid.bias', 'project_q.bias', 'quantizer.weight_proj.weight', 'quantizer.codevectors']
- This IS expected if you are initializing Wav2Vec2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# Folder (relative to the working directory) that holds the input audio clips.
audio_dir = os.path.join('input', 'audio')

# Create the folder, including the parent, when it does not exist yet;
# exist_ok=True makes re-running this cell a no-op.
os.makedirs(name=audio_dir, exist_ok=True)

In [4]:
# SQLite database file, relative to the working directory.
db_path = 'database/multimodal_rag.db'

# sqlite3.connect() creates a missing database *file* but not a missing parent
# directory, so make sure the folder exists before connecting.
os.makedirs(os.path.dirname(db_path), exist_ok=True)

# Connection shared by the following cells; it is committed and closed at the
# end of the notebook.
conn = sqlite3.connect(db_path)

In [5]:
# Embed every MP3 found in `audio_dir` and store one row per clip in the
# `embeddings` table. A clip is identified by the SHA-256 of its bytes, so
# re-running this cell skips files whose hash is already stored.
#
# os.listdir() returns entries in arbitrary order; sorting keeps the processing
# order (and the printed log) reproducible between runs.
for filename in sorted(os.listdir(audio_dir)):
    if not filename.lower().endswith('.mp3'):  # Process MP3 files only
        continue

    audio_path = os.path.join(audio_dir, filename)

    # Best-effort per file: a failure on one clip is reported and the loop
    # moves on to the next one.
    try:
        # Generate content hash to avoid duplicates
        with open(audio_path, "rb") as f:
            content_hash = hashlib.sha256(f.read()).hexdigest()

        # Check if audio already exists in database. conn.execute() is used
        # directly so no explicit cursor is left open per file.
        existing_row = conn.execute(
            'SELECT id FROM embeddings WHERE content_hash = ?', (content_hash,)
        ).fetchone()
        if existing_row:
            print(f"Audio already processed: {filename}")
            continue

        # Load and process MP3 file (convert MP3 to waveform)
        waveform, sr = librosa.load(audio_path, sr=16000)  # Resample to 16kHz
        inputs = processor(waveform, sampling_rate=sr, return_tensors="pt", padding=True)

        # Inference only, so gradient tracking is disabled. The clip embedding
        # is last_hidden_state averaged over dim 1 (the sequence axis per the
        # transformers docs), giving one vector per clip.
        with torch.no_grad():
            embedding = model(**inputs).last_hidden_state.mean(dim=1).numpy()

        metadata = json.dumps({
            "filename": filename,
            "path": audio_path,
            "duration": librosa.get_duration(y=waveform, sr=sr)
        })

        # Insert into database with metadata.
        # NOTE(review): tobytes() stores only the raw buffer; dtype and shape
        # are not persisted, so readers must know them to decode the blob.
        conn.execute('''
            INSERT INTO embeddings (modality, content_hash, embedding, metadata)
            VALUES (?, ?, ?, ?)
        ''', ('audio', content_hash, embedding.tobytes(), metadata))

        # Commit per clip so an interrupted run keeps every embedding that was
        # already computed instead of losing them with the open transaction.
        conn.commit()

        print(f"Processed and stored: {filename}")

    except Exception as e:
        print(f"Error processing {filename}: {str(e)}")

Audio already processed: bird-sound-249237.mp3


In [6]:
# Persist any pending inserts, then release the connection. The finally clause
# closes the connection even when the commit itself raises; in that case the
# exception propagates and the success message below is not printed.
try:
    conn.commit()
finally:
    conn.close()
print("Audio processing complete!")

Audio processing complete!
