In [1]:
# =============================================================================
# Multimodal Audio Tool: Classifier and Generator
#
# This script demonstrates how to approach two advanced audio tasks:
# 1. CLASSIFICATION: Identify a sound from a predefined set of classes
#    (e.g., 'rain', 'dog_bark', 'children_playing'). This part is similar
#    to your previous script but adapted for multiple classes.
#
# 2. GENERATION: Generate a sound based on a text description using a
#    powerful, pre-trained model from Hugging Face.
#
# --- HOW TO USE ---
# 1.  Set up your data folders for classification: 'rain', 'dog_bark', etc.
# 2.  Install all required libraries:
    # %pip install tensorflow pandas numpy librosa matplotlib scikit-learn os
    # %pip install transformers torch
# 3.  Run the script.
# =============================================================================

%pip install tensorflow pandas numpy librosa matplotlib scikit-learn 
%pip install transformers torch
import os
import numpy as np
import pandas as pd
import librosa
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Conv2D, MaxPooling2D, Flatten
from tensorflow.keras.utils import to_categorical
import soundfile as sf

# --- Hugging Face Transformers for Text-to-Audio ---
from transformers import pipeline

# --- Configuration ---
# For Classification
DATA_DIR = './'
CLASSES = ['rain', 'dog_bark', 'children_playing'] # Add any other folder names here
TARGET_DURATION = 5.0
MAX_PAD_LEN = 216


[31mERROR: Could not find a version that satisfies the requirement os (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for os[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.
Collecting transformers
  Downloading transformers-4.55.0-py3-none-any.whl.metadata (39 kB)
Collecting torch
  Downloading torch-2.7.1-cp313-none-macosx_11_0_arm64.whl.metadata (29 kB)
Collecting filelock (from transformers)
  Downloading filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.34.3-py3-none-any.whl.metadata (14 kB)
Collecting pyyaml>=5.1 (from transformers)
  Downloading PyYAML-6.0.2-cp313-cp313-macosx_11_0_arm64.whl.metadata (2.1 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2025.7.34-cp313-cp313-macosx_11_0_arm64.whl.metadata (40 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.4-cp39-a

  from .autonotebook import tqdm as notebook_tqdm


In [3]:

# =============================================================================
# PART 1: AUDIO CLASSIFICATION
# (This section is for training a model to IDENTIFY sounds)
# =============================================================================

def extract_features(file_path):
    """Extracts Mel spectrogram features from an audio file."""
    try:
        audio, sample_rate = librosa.load(file_path, res_type='kaiser_fast')
        audio_duration = librosa.get_duration(y=audio, sr=sample_rate)
        if audio_duration > TARGET_DURATION:
            start = int((audio_duration - TARGET_DURATION) / 2 * sample_rate)
            end = start + int(TARGET_DURATION * sample_rate)
            audio = audio[start:end]
        
        melspectrogram = librosa.feature.melspectrogram(y=audio, sr=sample_rate, n_mels=128)
        pad_width = MAX_PAD_LEN - melspectrogram.shape[1]
        if pad_width > 0:
            melspectrogram = np.pad(melspectrogram, pad_width=((0, 0), (0, pad_width)), mode='constant')
        else:
            melspectrogram = melspectrogram[:, :MAX_PAD_LEN]
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None
    return melspectrogram

def train_classifier():
    """Trains a model to classify sounds based on the folders in DATA_DIR."""
    features = []
    print("--- Training Classifier: Reading audio files... ---")
    for class_label in CLASSES:
        class_path = os.path.join(DATA_DIR, class_label)
        if not os.path.isdir(class_path):
            print(f"Warning: Directory not found for class '{class_label}'. Skipping.")
            continue
        for filename in os.listdir(class_path):
            filepath = os.path.join(class_path, filename)
            if filepath.endswith(('.wav', '.mp3')):
                data = extract_features(filepath)
                if data is not None:
                    features.append([data, class_label])
    
    if not features:
        print("No audio data found for training the classifier. Aborting.")
        return None, None

    features_df = pd.DataFrame(features, columns=['feature', 'class_label'])
    X = np.array(features_df['feature'].tolist())
    y = np.array(features_df['class_label'].tolist())

    le = LabelEncoder()
    y_encoded = to_categorical(le.fit_transform(y))

    x_train, _, y_train, _ = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)
    x_train = x_train.reshape(x_train.shape[0], 128, MAX_PAD_LEN, 1)
    
    model = Sequential([
        Conv2D(16, (3, 3), activation='relu', input_shape=(128, MAX_PAD_LEN, 1)),
        MaxPooling2D((2, 2)), Dropout(0.2),
        Conv2D(32, (3, 3), activation='relu'),
        MaxPooling2D((2, 2)), Dropout(0.2),
        Flatten(),
        Dense(128, activation='relu'), Dropout(0.5),
        Dense(len(CLASSES), activation='softmax')
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    print("--- Starting classifier training... ---")
    model.fit(x_train, y_train, batch_size=32, epochs=20, validation_split=0.1)
    print("--- Classifier training complete. ---")
    return model, le

def classify_audio(model, label_encoder, file_path):
    """Identifies the class of a given audio file."""
    if model is None:
        print("Classifier model not available.")
        return
    feature = extract_features(file_path)
    if feature is not None:
        feature = feature.reshape(1, 128, MAX_PAD_LEN, 1)
        prediction = model.predict(feature)
        predicted_index = np.argmax(prediction)
        predicted_class = label_encoder.inverse_transform([predicted_index])[0]
        print(f"\n--- Classification Result ---")
        print(f"File '{os.path.basename(file_path)}' is identified as: '{predicted_class}'")
        print("---------------------------\n")

# =============================================================================
# PART 2: TEXT-TO-AUDIO GENERATION
# (This section uses a pre-trained model to GENERATE sounds from text)
# =============================================================================

def generate_audio_from_text(text_prompt):
    """Generates audio from a text description using a Hugging Face model."""
    print(f"--- Generating audio for prompt: '{text_prompt}' ---")
    print("This may take a moment as the model is loaded...")
    try:
        # Load the text-to-audio pipeline from Hugging Face
        # This model is great for sound effects and ambient sounds
        synthesiser = pipeline("text-to-audio", "facebook/musicgen-small")
        
        # Generate the audio
        music = synthesiser(text_prompt, forward_params={"do_sample": True})
        
        # Save the audio to a file
        output_filename = f"{text_prompt.replace(' ', '_')}.wav"
        sf.write(output_filename, music["audio"], samplerate=music["sampling_rate"])
        
        print(f"\n--- Generation Complete ---")
        print(f"Audio saved as '{output_filename}'")
        print("---------------------------\n")
        
    except Exception as e:
        print(f"An error occurred during audio generation: {e}")
        print("Please ensure you have a stable internet connection and all libraries are installed.")

# --- Main Execution ---
if __name__ == '__main__':
    # # --- Task 1: Train our custom classifier ---
    # # Note: This part requires you to have the data folders set up.
    # # If you don't have the data, you can comment this section out.
    # classifier_model, label_encoder = train_classifier()

    # # --- Task 2: Use the classifier to identify a sound ---
    # # We will try to classify the first rain sound file we find
    # if classifier_model and os.path.exists('rain'):
    #     rain_files = [f for f in os.listdir('rain') if f.endswith(('.wav', '.mp3'))]
    #     if rain_files:
    #         classify_audio(classifier_model, label_encoder, os.path.join('rain', rain_files[0]))

    # --- Task 3: Use the generative model to create a sound from text ---
    generate_audio_from_text("heavy rain with thunder")
    generate_audio_from_text("a dog barking in the distance")


--- Generating audio for prompt: 'heavy rain with thunder' ---
This may take a moment as the model is loaded...


Device set to use mps:0


An error occurred during audio generation: Invalid shape: (1, 1, 957440) (too many dimensions)
Please ensure you have a stable internet connection and all libraries are installed.
--- Generating audio for prompt: 'a dog barking in the distance' ---
This may take a moment as the model is loaded...


Device set to use mps:0


An error occurred during audio generation: Invalid shape: (1, 1, 957440) (too many dimensions)
Please ensure you have a stable internet connection and all libraries are installed.
