<a href="https://colab.research.google.com/github/Ameesha02/Blog_website/blob/master/MIR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import tensorflow_text as tf_text

# TF Hub handles for the two frozen towers of the dual encoder.
audio_model_url = "https://tfhub.dev/google/vggish/1"
text_model_url = "https://tfhub.dev/google/universal-sentence-encoder/4"

# 1. Audio feature extractor (pre-trained VGGish-style model), declared here as mapping
#    (time, 96, 64, 1) spectrogram patches to 128-dimensional embeddings.
# NOTE(review): the published VGGish module may expect a 1-D waveform rather than
# spectrogram patches -- confirm the input signature against the model card.
audio_embedder = hub.KerasLayer(
    audio_model_url,
    input_shape=[None, 96, 64, 1],
    output_shape=[128],
    trainable=False,
)

# 2. Text encoder (Universal Sentence Encoder as an example; swap in a RoBERTa
#    TensorFlow model if one is available).
text_embedder = hub.KerasLayer(text_model_url, trainable=False)

# 3. Dual encoders model architecture for contrastive learning
class AudioTextRetrievalModel(tf.keras.Model):
    """Dual encoder that maps audio and text into one shared embedding space.

    The module-level ``audio_embedder`` and ``text_embedder`` are created with
    ``trainable=False`` and, per the comments in this file, emit vectors of
    different widths (128 for audio, 512 for text). Each tower therefore gets
    a trainable linear projection head so that:

    * both outputs have the same width and can be compared with a dot product;
    * the model owns trainable variables for the optimizer to update.

    Args:
        projection_dim: Width of the shared embedding space.
    """

    def __init__(self, projection_dim=256):
        super().__init__()
        self.audio_embedder = audio_embedder
        self.text_embedder = text_embedder
        # Linear heads: the only trainable parameters while both backbones stay frozen.
        self.audio_projection = tf.keras.layers.Dense(
            projection_dim, use_bias=False, name="audio_projection"
        )
        self.text_projection = tf.keras.layers.Dense(
            projection_dim, use_bias=False, name="text_projection"
        )

    def call(self, audio_inputs, text_inputs):
        """Embed a batch of aligned audio/text pairs.

        Args:
            audio_inputs: Batch of audio inputs accepted by ``audio_embedder``
                (described in this file as mel spectrogram images).
            text_inputs: Batch of raw text strings.

        Returns:
            Tuple ``(audio_emb, text_emb)``; each is ``(batch, projection_dim)``
            with L2-normalised rows, so a dot product is a cosine similarity.
        """
        audio_emb = self.audio_projection(self.audio_embedder(audio_inputs))
        text_emb = self.text_projection(self.text_embedder(text_inputs))

        # Normalize embeddings for cosine similarity
        audio_emb = tf.math.l2_normalize(audio_emb, axis=1)
        text_emb = tf.math.l2_normalize(text_emb, axis=1)
        return audio_emb, text_emb

# 4. Contrastive loss using cosine similarity and temperature scaling
def contrastive_loss(audio_emb, text_emb, temperature=0.07):
    """Symmetric cross-entropy loss over an audio/text similarity matrix.

    Row ``i`` of ``audio_emb`` and row ``i`` of ``text_emb`` are treated as the
    matching pair; every other row in the batch acts as a negative.

    Args:
        audio_emb: ``(batch, dim)`` audio embeddings.
        text_emb: ``(batch, dim)`` text embeddings; ``dim`` must equal that of
            ``audio_emb`` for the matrix product to be defined.
        temperature: Divisor applied to the similarities before the softmax.

    Returns:
        Scalar tensor: the batch mean of the audio-to-text plus text-to-audio
        cross-entropy terms.
    """
    similarity = tf.matmul(audio_emb, text_emb, transpose_b=True)
    scaled = similarity / temperature
    # The correct match for row i sits in column i.
    targets = tf.range(tf.shape(audio_emb)[0])

    def _cross_entropy(logit_matrix):
        return tf.keras.losses.sparse_categorical_crossentropy(
            targets, logit_matrix, from_logits=True
        )

    per_pair = _cross_entropy(scaled) + _cross_entropy(tf.transpose(scaled))
    return tf.reduce_mean(per_pair)

# 5. Single training step over an aligned batch of audio and text (illustrative; the loop that would call it is commented out below)
@tf.function
def train_step(model, audio_batch, text_batch, optimizer):
    """Run one optimisation step on an aligned audio/text batch.

    Args:
        model: Callable returning ``(audio_emb, text_emb)`` for the two batches
            and exposing ``trainable_variables``.
        audio_batch: Batch of audio inputs for ``model``.
        text_batch: Batch of text inputs aligned row-by-row with ``audio_batch``.
        optimizer: Optimizer whose ``apply_gradients`` updates the model.

    Returns:
        The contrastive loss computed for this batch.
    """
    with tf.GradientTape() as recorder:
        audio_vecs, text_vecs = model(audio_batch, text_batch)
        step_loss = contrastive_loss(audio_vecs, text_vecs)

    weights = model.trainable_variables
    grads = recorder.gradient(step_loss, weights)
    optimizer.apply_gradients(zip(grads, weights))
    return step_loss

# 6. Build the model and optimizer (the training loop below is illustrative only)
model = AudioTextRetrievalModel()
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)

# Assume audio_dataset and text_dataset are tf.data.Dataset batches of spectrograms and texts aligned (not shown here)
# for epoch in range(num_epochs):
#     for audio_batch, text_batch in zip(audio_dataset, text_dataset):
#         loss = train_step(model, audio_batch, text_batch, optimizer)
#         print(f'Training loss: {loss.numpy()}')

# 7. Inference example for audio retrieval given query text.
# The frozen `text_embedder` created earlier in this file is reused here; instantiating
# the same TF Hub module a second time would only duplicate its weights in memory.

# Dummy audio embeddings database as normalized vectors (e.g. 100 audio clips with 512-dim embeddings)
num_audio = 100
embedding_dim = 512
np.random.seed(42)  # fixed seed keeps the demo output reproducible
audio_embeddings_db = np.random.rand(num_audio, embedding_dim).astype(np.float32)
audio_embeddings_db /= np.linalg.norm(audio_embeddings_db, axis=1, keepdims=True)  # L2 normalize
audio_embeddings_db = tf.constant(audio_embeddings_db)

def retrieve_audio(audio_embeddings_db, query_text, text_embedder, top_k=5):
    """Return the indices of the audio embeddings most similar to a text query.

    Args:
        audio_embeddings_db: 2-D float array or tensor of shape
            ``(num_audio, dim)``. Rows are assumed to be L2-normalised so the
            dot product with the query is a cosine similarity.
        query_text: A single query string.
        text_embedder: Callable mapping a batch of strings to ``(batch, dim)``
            embeddings; ``dim`` must match ``audio_embeddings_db``.
        top_k: Maximum number of indices to return. Clamped to the number of
            rows in the database, so asking for more than exist is not an error.

    Returns:
        1-D NumPy array of row indices into ``audio_embeddings_db``, ordered
        from most to least similar.
    """
    query_emb = text_embedder(tf.constant([query_text]))
    query_emb = tf.math.l2_normalize(query_emb, axis=1)
    similarities = tf.linalg.matmul(audio_embeddings_db, query_emb, transpose_b=True)

    # Drop only the query axis: an axis-less squeeze would also collapse a
    # one-row database to a scalar, which top_k cannot rank.
    scores = tf.squeeze(similarities, axis=1)

    # top_k raises when k exceeds the number of candidates, so clamp it.
    k = tf.minimum(top_k, tf.shape(scores)[0])
    return tf.math.top_k(scores, k=k).indices.numpy()

# Query used for the retrieval demo
query_text = "rain and thunder sounds"

# Rank the dummy database against the query and keep the five best matches
top_results = retrieve_audio(
    audio_embeddings_db=audio_embeddings_db,
    query_text=query_text,
    text_embedder=text_embedder,
    top_k=5,
)

print("Top audio file indices for query:", top_results)

# Note: This code illustrates main blocks; real implementation requires:
# - Proper audio preprocessing to mel spectrograms matching input shape
# - Efficient dataset loading and batching
# - Building an indexed audio embedding database for retrieval
# - Using domain-specific pretrained models (RoBERTa text, ResNet/PANN audio encoder)
# - Training with augmented data to capture temporal/contextual audio-text relationships



Top audio file indices for query: [66 58 12 94  6]
