<a href="https://colab.research.google.com/github/Abhilash2240/project/blob/main/project_24(GEN_AI).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!git clone https://github.com/Abhilash2240/project



fatal: destination path 'project' already exists and is not an empty directory.


In [None]:
import os
import librosa
import librosa.display
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import tensorflow_hub as hub
import soundfile as sf
from tensorflow import keras
from tensorflow.keras import layers

def load_wav_files(folder_path, sample_rate=22050, n_mels=64, max_duration=3, fixed_length=256):
    """Load every WAV file in a folder as a fixed-size log-Mel spectrogram.

    Each file is resampled to ``sample_rate``, truncated to ``max_duration``
    seconds, converted to a Mel power spectrogram with ``n_mels`` bands,
    expressed in decibels relative to its own peak, and cropped or padded
    along the time axis to exactly ``fixed_length`` frames.

    Args:
        folder_path: Directory scanned (non-recursively) for WAV files. The
            ``.wav`` extension is matched case-insensitively.
        sample_rate: Target sampling rate passed to ``librosa.load``.
        n_mels: Number of Mel bands per spectrogram.
        max_duration: Maximum number of seconds read from each file.
        fixed_length: Number of time frames in every returned spectrogram.

    Returns:
        A float32 array of shape ``(num_files, n_mels, fixed_length)``. When
        the folder holds no WAV files the result has shape
        ``(0, n_mels, fixed_length)``.
    """
    mel_spectrograms = []

    # os.listdir returns entries in arbitrary order; sort for a reproducible dataset.
    for file in sorted(os.listdir(folder_path)):
        if not file.lower().endswith(".wav"):
            continue

        file_path = os.path.join(folder_path, file)
        y, sr = librosa.load(file_path, sr=sample_rate, duration=max_duration)
        mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
        mel_spec = librosa.power_to_db(mel_spec, ref=np.max)  # dB relative to the clip's peak (<= 0)

        # With ref=np.max, 0 dB is the LOUDEST value, so zero-padding would
        # append full-volume frames. Pad short clips with the quietest level
        # present in the clip instead.
        mel_spec = librosa.util.fix_length(
            mel_spec,
            size=fixed_length,
            axis=1,
            constant_values=mel_spec.min(),
        )
        mel_spectrograms.append(mel_spec.astype(np.float32))

    if not mel_spectrograms:
        # Keep the rank consistent so downstream shape handling still works.
        return np.empty((0, n_mels, fixed_length), dtype=np.float32)

    return np.array(mel_spectrograms, dtype=np.float32)

# Directory that holds the training clips -- change this to your WAV files folder.
FOLDER_PATH = "/content/project"

# Read the spectrograms and append a trailing channel axis in one step.
mel_specs = load_wav_files(FOLDER_PATH)[..., np.newaxis]

# Quick sanity check on what was loaded.
print("Dataset shape:", mel_specs.shape)

def normalize_batchwise(data):
    """Min-max scale every item of ``data`` independently.

    Each element ``x`` obtained by iterating over ``data`` is mapped to
    ``(x - min(x)) / (max(x) - min(x) + 1e-8)``. The small constant keeps the
    division defined when an item is constant.

    Returns:
        A new NumPy array built from the scaled items.
    """
    scaled_items = []
    for sample in data:
        lowest = np.min(sample)
        span = np.max(sample) - lowest + 1e-8
        scaled_items.append((sample - lowest) / span)
    return np.array(scaled_items)

# Rescale every spectrogram with normalize_batchwise before training.
mel_specs = normalize_batchwise(mel_specs)

# Hyperparameters handed to the TextVAE constructor.
latent_dim, text_embedding_dim = 64, 512

# Pre-trained text encoder fetched from TF Hub; used to embed text prompts.
text_encoder = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

class TextVAE(keras.Model):
    """Convolutional autoencoder over Mel spectrograms with a text-driven decode path.

    The encoder maps a ``(64, 256, 1)`` input to ``2 * latent_dim`` values that
    are split into a mean and a log-variance. The decoder maps a
    ``latent_dim`` vector back to a ``(64, 256, 1)`` output squashed by a
    sigmoid. ``generate`` skips the encoder and decodes a projection of a text
    embedding instead.
    """

    def __init__(self, latent_dim, text_embedding_dim):
        """Create the encoder, the text projection layer and the decoder.

        Args:
            latent_dim: Size of the latent vector.
            text_embedding_dim: Accepted but not read by this constructor.
        """
        super().__init__()

        # Two stride-2 convolutions shrink 64x256 to 16x64 before flattening.
        encoder_layers = [
            layers.Input(shape=(64, 256, 1)),
            layers.Conv2D(32, 3, strides=2, padding="same", activation="relu"),
            layers.Conv2D(64, 3, strides=2, padding="same", activation="relu"),
            layers.Flatten(),
            # One half of the outputs is the mean, the other the log-variance.
            layers.Dense(2 * latent_dim),
        ]
        self.encoder = keras.Sequential(encoder_layers)

        # Projects a text embedding to the latent size; only used by generate().
        self.text_fc = layers.Dense(latent_dim, activation="relu")

        # Mirror of the encoder: dense -> (16, 64, 64) -> two stride-2 upsamplings.
        decoder_layers = [
            layers.Input(shape=(latent_dim,)),
            layers.Dense(16 * 64 * 64, activation="relu"),
            layers.Reshape((16, 64, 64)),
            layers.Conv2DTranspose(64, 3, strides=2, padding="same", activation="relu"),
            layers.Conv2DTranspose(32, 3, strides=2, padding="same", activation="relu"),
            layers.Conv2DTranspose(1, 3, padding="same", activation="sigmoid"),
        ]
        self.decoder = keras.Sequential(decoder_layers)

    def encode(self, x):
        """Return the ``(mean, log_var)`` halves of the encoder output for ``x``."""
        stats = self.encoder(x)
        mean, log_var = tf.split(stats, num_or_size_splits=2, axis=1)
        return mean, log_var

    def reparameterize(self, mean, log_var):
        """Draw ``mean + std * noise`` with standard-normal noise shaped like ``mean``."""
        noise = tf.random.normal(shape=tf.shape(mean))
        std = tf.exp(0.5 * log_var)
        return mean + noise * std

    def decode(self, z):
        """Run the decoder on latent vectors ``z``."""
        return self.decoder(z)

    def call(self, inputs):
        """Encode ``inputs``, sample a latent vector and return its reconstruction."""
        mean, log_var = self.encode(inputs)
        latent = self.reparameterize(mean, log_var)
        return self.decode(latent)

    def generate(self, text_prompt):
        """Decode a spectrogram from a single text prompt.

        The prompt is embedded with the module-level ``text_encoder``,
        projected by ``text_fc`` and passed to the decoder.

        NOTE(review): ``call`` never touches ``text_fc``, so training through
        ``call`` alone does not appear to update it -- confirm this is intended.
        """
        embedding = text_encoder([text_prompt]).numpy()
        latent = self.text_fc(embedding)
        return self.decode(latent)

# Single optimisation step
@tf.function
def train_step(model, data, optimizer):
    """Apply one gradient update to ``model`` and return the batch loss.

    The loss is the mean squared error between ``model(data)`` and ``data``.

    NOTE(review): only the reconstruction error is optimised here; no KL or
    other latent regularisation term is included -- confirm this is intended.
    """
    with tf.GradientTape() as tape:
        output = model(data)
        error = output - data
        loss = tf.reduce_mean(tf.square(error))  # MSE loss

    # Read the variable list after the forward pass, as the update needs it twice.
    params = model.trainable_variables
    grads = tape.gradient(loss, params)
    optimizer.apply_gradients(zip(grads, params))
    return loss

# Build the model; compile() attaches the Adam optimizer that the loop below reads back.
vae = TextVAE(latent_dim, text_embedding_dim)
vae.compile(optimizer=keras.optimizers.Adam(), loss=keras.losses.MeanSquaredError())

# Training schedule
batch_size = 8
num_epochs = 50

# The dataset size is fixed, so the batch offsets are the same for every epoch.
batch_starts = range(0, len(mel_specs), batch_size)

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    for batch_number, batch_start in enumerate(batch_starts, start=1):
        batch_end = batch_start + batch_size
        batch_data = mel_specs[batch_start:batch_end]
        loss = train_step(vae, batch_data, vae.optimizer)
        print(f"Batch {batch_number}, Loss: {loss.numpy():.4f}")

def generate_sound_from_text(prompt, output_path="generated_sound.wav", top_db=80.0):
    """Generate a sound from a text prompt, plot its spectrogram and save a WAV file.

    The model is trained on decibel Mel spectrograms that were min-max scaled
    to [0, 1], and its decoder ends in a sigmoid, so its output is in that
    same normalized scale. ``librosa.feature.inverse.mel_to_audio`` expects a
    power Mel spectrogram, so the output is first mapped back to decibels and
    then to power before inversion.

    Args:
        prompt: Text description passed to ``vae.generate``.
        output_path: Destination path of the WAV file.
        top_db: Dynamic range, in dB, assumed when undoing the normalization.
            The per-clip minimum is not kept at training time, so the [0, 1]
            output is mapped linearly onto ``[-top_db, 0]`` dB. The default
            of 80 matches the default clipping range of
            ``librosa.power_to_db``.
    """
    sample_rate = 22050

    generated_spec = vae.generate(prompt)
    generated_spec = generated_spec.numpy().squeeze()

    # Undo the [0, 1] scaling: 0 -> -top_db dB (quietest), 1 -> 0 dB (peak).
    mel_db = generated_spec * top_db - top_db

    # Display the generated Mel spectrogram on a decibel scale
    plt.figure(figsize=(10, 4))
    librosa.display.specshow(mel_db, sr=sample_rate, x_axis="time", y_axis="mel")
    plt.colorbar(format="%+2.0f dB")
    plt.title(f"Generated Sound Effect for: {prompt}")
    plt.show()

    # mel_to_audio works on power spectrograms, not on dB values.
    mel_power = librosa.db_to_power(mel_db)
    audio_waveform = librosa.feature.inverse.mel_to_audio(mel_power, sr=sample_rate)

    # Save as a WAV file
    sf.write(output_path, audio_waveform, sample_rate)
    print(f"Audio saved as {output_path}")

# Read a text description from the user, then generate, plot and save the
# matching sound via generate_sound_from_text (default output file name).
prompt = input("Enter a sound description: ")
generate_sound_from_text(prompt)
