In [9]:
!pip install pydub

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [1]:
!pip install numpy scipy tensorflow soundfile




In [8]:
import numpy as np
import scipy.fftpack
import scipy.signal
import scipy.io.wavfile as wav
import soundfile as sf

# Constants
N_FFT = 512  # FFT Window Size
HOP_LENGTH = 256  # Step Size for FFT
N_MEL = 40  # Number of Mel Filters
N_MFCC = 40  # Number of MFCC Features

# Mel Filterbank Calculation (Precomputed)
def mel_filterbank(num_filters, fft_size, sample_rate):
    min_freq = 0
    max_freq = sample_rate // 2
    mel_min = 2595 * np.log10(1 + min_freq / 700)
    mel_max = 2595 * np.log10(1 + max_freq / 700)
    mel_points = np.linspace(mel_min, mel_max, num_filters + 2)
    hz_points = 700 * (10**(mel_points / 2595) - 1)
    bin_points = np.floor((fft_size + 1) * hz_points / sample_rate).astype(int)

    filters = np.zeros((num_filters, fft_size // 2 + 1))
    for i in range(1, num_filters + 1):
        filters[i - 1, bin_points[i - 1]:bin_points[i]] = np.linspace(0, 1, bin_points[i] - bin_points[i - 1])
        filters[i - 1, bin_points[i]:bin_points[i + 1]] = np.linspace(1, 0, bin_points[i + 1] - bin_points[i])
    return filters

# Function to Compute MFCC Features from WAV File
def extract_mfcc(audio_file, num_mfcc=40):
    # Load WAV File
    sample_rate, signal = wav.read(audio_file)
    if len(signal.shape) > 1:
        signal = np.mean(signal, axis=1)  # Convert to Mono if Stereo

    # Step 1: Compute Short-Time Fourier Transform (STFT)
    _, _, stft_output = scipy.signal.stft(signal, fs=sample_rate, nperseg=N_FFT, noverlap=HOP_LENGTH)

    # Step 2: Compute Spectrogram (Magnitude of STFT)
    spectrogram = np.abs(stft_output)

    # Step 3: Apply Mel Filterbank
    mel_filters = mel_filterbank(N_MEL, N_FFT, sample_rate)
    mel_spectrogram = np.dot(mel_filters, spectrogram)

    # Step 4: Convert to Log Scale
    log_mel_spectrogram = np.log(mel_spectrogram + 1e-10)

    # Step 5: Apply Discrete Cosine Transform (DCT) to Get MFCCs
    mfcc = scipy.fftpack.dct(log_mel_spectrogram, axis=0, norm='ortho')[:num_mfcc]

    # Take Mean Across Time to Get a Fixed-Length Feature Vector
    return np.mean(mfcc, axis=1)


In [14]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import layers, models
import scipy.fftpack
import scipy.signal

# Constants
N_FFT = 512  # FFT Window Size
HOP_LENGTH = 256  # Step Size for FFT
N_MEL = 40  # Number of Mel Filters
N_MFCC = 40  # Number of MFCC Features

# Load TensorFlow's Speech Commands Dataset
dataset_name = "speech_commands"
dataset, info = tfds.load(dataset_name, with_info=True, as_supervised=True, split=["train", "test"])

# List of desired keywords
KEYWORDS = ["up", "down", "left", "right", "stop", "go"]
KEYWORD_MAP = {word: i for i, word in enumerate(KEYWORDS)}

# Mel Filterbank Calculation
def mel_filterbank(num_filters, fft_size, sample_rate):
    min_freq = 0
    max_freq = sample_rate // 2
    mel_min = 2595 * np.log10(1 + min_freq / 700)
    mel_max = 2595 * np.log10(1 + max_freq / 700)
    mel_points = np.linspace(mel_min, mel_max, num_filters + 2)
    hz_points = 700 * (10**(mel_points / 2595) - 1)
    bin_points = np.floor((fft_size + 1) * hz_points / sample_rate).astype(int)

    filters = np.zeros((num_filters, fft_size // 2 + 1))
    for i in range(1, num_filters + 1):
        filters[i - 1, bin_points[i - 1]:bin_points[i]] = np.linspace(0, 1, bin_points[i] - bin_points[i - 1])
        filters[i - 1, bin_points[i]:bin_points[i + 1]] = np.linspace(1, 0, bin_points[i + 1] - bin_points[i])
    return filters

# Function to Compute MFCC Features from Raw Audio
def extract_mfcc(audio_array, num_mfcc=40, sample_rate=16000):
    if len(audio_array.shape) > 1:
        audio_array = np.mean(audio_array, axis=1)  # Convert to Mono if Stereo

    # Compute Short-Time Fourier Transform (STFT)
    _, _, stft_output = scipy.signal.stft(audio_array, fs=sample_rate, nperseg=N_FFT, noverlap=HOP_LENGTH)

    # Compute Spectrogram (Magnitude of STFT)
    spectrogram = np.abs(stft_output)

    # Apply Mel Filterbank
    mel_filters = mel_filterbank(N_MEL, N_FFT, sample_rate)
    mel_spectrogram = np.dot(mel_filters, spectrogram)

    # Convert to Log Scale
    log_mel_spectrogram = np.log(mel_spectrogram + 1e-10)

    # Apply Discrete Cosine Transform (DCT) to Get MFCCs
    mfcc = scipy.fftpack.dct(log_mel_spectrogram, axis=0, norm='ortho')[:num_mfcc]

    # Take Mean Across Time to Get a Fixed-Length Feature Vector
    return np.mean(mfcc, axis=1)

# Function to Process Dataset
def filter_keywords(data):
    mfcc_features_list = []
    labels = []

    for audio, label in tfds.as_numpy(data):
        word = info.features["label"].int2str(label)
        if word in KEYWORDS:
            # Convert TensorFlow tensor to NumPy array
            audio_numpy = np.array(audio, dtype=np.float32)

            # Extract MFCC features
            mfcc_features = extract_mfcc(audio_numpy)
            mfcc_features_list.append(mfcc_features)
            labels.append(KEYWORD_MAP[word])

    return np.array(mfcc_features_list), np.array(labels)

# Extract MFCC Features from Training and Test Data
X_train, y_train = filter_keywords(dataset[0])  # Train Set
X_test, y_test = filter_keywords(dataset[1])  # Test Set

# Normalize Data (Standardize MFCCs)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Convert Labels to One-Hot Encoding (Required for Classification)
y_train = to_categorical(y_train, num_classes=len(KEYWORDS))
y_test = to_categorical(y_test, num_classes=len(KEYWORDS))

# Define Neural Network Model
def create_model(input_shape, num_classes):
    model = models.Sequential([
        layers.Input(shape=input_shape),
        layers.Dense(128, activation='relu', kernel_regularizer='l2'),
        layers.Dropout(0.5),
        layers.Dense(64, activation='relu', kernel_regularizer='l2'),
        layers.Dropout(0.3),
        layers.Dense(num_classes, activation='softmax')
    ])
    return model

# Compile and Train Model
model = create_model(input_shape=(40,), num_classes=len(KEYWORDS))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)

# Evaluate Model on Test Set
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_accuracy}")


Epoch 1/50
[1m459/459[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.2500 - loss: 2.8971 - val_accuracy: 0.4726 - val_loss: 1.8152
Epoch 2/50
[1m459/459[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.4369 - loss: 1.7746 - val_accuracy: 0.4950 - val_loss: 1.5095
Epoch 3/50
[1m459/459[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.4642 - loss: 1.5446 - val_accuracy: 0.5018 - val_loss: 1.4386
Epoch 4/50
[1m459/459[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.4779 - loss: 1.4814 - val_accuracy: 0.5099 - val_loss: 1.4167
Epoch 5/50
[1m459/459[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.4778 - loss: 1.4760 - val_accuracy: 0.5023 - val_loss: 1.4112
Epoch 6/50
[1m459/459[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.4862 - loss: 1.4603 - val_accuracy: 0.5089 - val_loss: 1.3995
Epoch 7/50
[1m459/459[0m 

In [15]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import layers, models
import scipy.fftpack
import scipy.signal

# Constants
N_FFT = 512  # FFT Window Size
HOP_LENGTH = 256  # Step Size for FFT
N_MEL = 40  # Number of Mel Filters
N_MFCC = 40  # Number of MFCC Features

# Load TensorFlow's Speech Commands Dataset
dataset_name = "speech_commands"
dataset, info = tfds.load(dataset_name, with_info=True, as_supervised=True, split="train")

# List of desired keywords
KEYWORDS = ["up", "down", "left", "right", "stop", "go"]
KEYWORD_MAP = {word: i for i, word in enumerate(KEYWORDS)}

# Function to Compute MFCC Features from Raw Audio
def extract_mfcc(audio_array, num_mfcc=40, sample_rate=16000):
    if len(audio_array.shape) > 1:
        audio_array = np.mean(audio_array, axis=1)  # Convert to Mono if Stereo

    # Compute Short-Time Fourier Transform (STFT)
    _, _, stft_output = scipy.signal.stft(audio_array, fs=sample_rate, nperseg=N_FFT, noverlap=HOP_LENGTH)

    # Compute Spectrogram (Magnitude of STFT)
    spectrogram = np.abs(stft_output)

    # Apply Mel Filterbank
    mel_filters = mel_filterbank(N_MEL, N_FFT, sample_rate)
    mel_spectrogram = np.dot(mel_filters, spectrogram)

    # Convert to Log Scale
    log_mel_spectrogram = np.log(mel_spectrogram + 1e-10)

    # Apply Discrete Cosine Transform (DCT) to Get MFCCs
    mfcc = scipy.fftpack.dct(log_mel_spectrogram, axis=0, norm='ortho')[:num_mfcc]

    # Take Mean Across Time to Get a Fixed-Length Feature Vector
    return np.mean(mfcc, axis=1)

# Function to Filter Only Selected Keywords
def filter_keywords(data):
    mfcc_features_list = []
    labels = []

    for audio, label in tfds.as_numpy(data):
        word = info.features["label"].int2str(label)
        if word in KEYWORDS:  # Only process selected words
            audio_numpy = np.array(audio, dtype=np.float32)
            mfcc_features = extract_mfcc(audio_numpy)  # Extract MFCCs
            mfcc_features_list.append(mfcc_features)
            labels.append(KEYWORD_MAP[word])  # Assign label

    return np.array(mfcc_features_list), np.array(labels)

# Extract MFCC Features from Training Data (Only Selected Words)
X_train, y_train = filter_keywords(dataset)  # Train Set

# Normalize Data (Standardize MFCCs)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

# Convert Labels to One-Hot Encoding (Required for Classification)
y_train = to_categorical(y_train, num_classes=len(KEYWORDS))

# Split into Train/Test
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Define Neural Network Model
def create_model(input_shape, num_classes):
    model = models.Sequential([
        layers.Input(shape=input_shape),
        layers.Dense(128, activation='relu', kernel_regularizer='l2'),
        layers.Dropout(0.5),
        layers.Dense(64, activation='relu', kernel_regularizer='l2'),
        layers.Dropout(0.3),
        layers.Dense(num_classes, activation='softmax')
    ])
    return model

# Compile and Train Model
model = create_model(input_shape=(40,), num_classes=len(KEYWORDS))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)

# Evaluate Model on Test Set
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_accuracy}")


Epoch 1/50
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.2324 - loss: 2.9248 - val_accuracy: 0.4665 - val_loss: 1.9031
Epoch 2/50
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.4209 - loss: 1.8485 - val_accuracy: 0.4934 - val_loss: 1.5640
Epoch 3/50
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.4603 - loss: 1.5894 - val_accuracy: 0.4825 - val_loss: 1.4807
Epoch 4/50
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.4646 - loss: 1.5169 - val_accuracy: 0.5063 - val_loss: 1.4448
Epoch 5/50
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.4752 - loss: 1.4805 - val_accuracy: 0.5015 - val_loss: 1.4331
Epoch 6/50
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.4826 - loss: 1.4695 - val_accuracy: 0.5029 - val_loss: 1.4206
Epoch 7/50
[1m368/368[0m 

In [18]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import layers, models
import scipy.fftpack
import scipy.signal
from sklearn.model_selection import train_test_split

# Constants
N_FFT = 512  # FFT Window Size
HOP_LENGTH = 256  # Step Size for FFT
N_MEL = 40  # Number of Mel Filters
N_MFCC = 40  # Number of MFCC Features
FIXED_TIME_STEPS = 32  # Fixed time steps for MFCCs

# Load TensorFlow's Speech Commands Dataset
dataset_name = "speech_commands"
dataset, info = tfds.load(dataset_name, with_info=True, as_supervised=True, split="train")

# List of desired keywords
KEYWORDS = ["up", "down", "left", "right", "stop", "go"]
KEYWORD_MAP = {word: i for i, word in enumerate(KEYWORDS)}

# Mel Filterbank Calculation
def mel_filterbank(num_filters, fft_size, sample_rate):
    min_freq = 0
    max_freq = sample_rate // 2
    mel_min = 2595 * np.log10(1 + min_freq / 700)
    mel_max = 2595 * np.log10(1 + max_freq / 700)
    mel_points = np.linspace(mel_min, mel_max, num_filters + 2)
    hz_points = 700 * (10**(mel_points / 2595) - 1)
    bin_points = np.floor((fft_size + 1) * hz_points / sample_rate).astype(int)

    filters = np.zeros((num_filters, fft_size // 2 + 1))
    for i in range(1, num_filters + 1):
        filters[i - 1, bin_points[i - 1]:bin_points[i]] = np.linspace(0, 1, bin_points[i] - bin_points[i - 1])
        filters[i - 1, bin_points[i]:bin_points[i + 1]] = np.linspace(1, 0, bin_points[i + 1] - bin_points[i])
    return filters

# Function to Compute MFCC Features from Raw Audio
def extract_mfcc(audio_array, num_mfcc=40, sample_rate=16000):
    if len(audio_array.shape) > 1:
        audio_array = np.mean(audio_array, axis=1)  # Convert to Mono if Stereo

    # Compute Short-Time Fourier Transform (STFT)
    _, _, stft_output = scipy.signal.stft(audio_array, fs=sample_rate, nperseg=N_FFT, noverlap=HOP_LENGTH)

    # Compute Spectrogram (Magnitude of STFT)
    spectrogram = np.abs(stft_output)

    # Apply Mel Filterbank
    mel_filters = mel_filterbank(N_MEL, N_FFT, sample_rate)
    mel_spectrogram = np.dot(mel_filters, spectrogram)

    # Convert to Log Scale
    log_mel_spectrogram = np.log(mel_spectrogram + 1e-10)

    # Apply Discrete Cosine Transform (DCT) to Get MFCCs
    mfcc = scipy.fftpack.dct(log_mel_spectrogram, axis=0, norm='ortho')[:num_mfcc]

    # **Ensure Fixed Shape: Pad or Truncate**
    if mfcc.shape[1] < FIXED_TIME_STEPS:  # If too short, pad with zeros
        padding = np.zeros((num_mfcc, FIXED_TIME_STEPS - mfcc.shape[1]))
        mfcc = np.hstack((mfcc, padding))
    elif mfcc.shape[1] > FIXED_TIME_STEPS:  # If too long, truncate
        mfcc = mfcc[:, :FIXED_TIME_STEPS]

    return mfcc  # Returns shape (40, FIXED_TIME_STEPS)

# Function to Filter Only Selected Keywords
def filter_keywords(data):
    mfcc_features_list = []
    labels = []

    for audio, label in tfds.as_numpy(data):
        word = info.features["label"].int2str(label)
        if word in KEYWORDS:  # Only process selected words
            audio_numpy = np.array(audio, dtype=np.float32)
            mfcc_features = extract_mfcc(audio_numpy)  # Extract MFCCs
            mfcc_features_list.append(mfcc_features)
            labels.append(KEYWORD_MAP[word])  # Assign label

    return np.array(mfcc_features_list), np.array(labels)

# Extract MFCC Features from Training Data (Only Selected Words)
X_train, y_train = filter_keywords(dataset)  # Train Set

# Normalize Data (Standardize MFCCs)
scaler = StandardScaler()
X_train = np.array([scaler.fit_transform(mfcc.T).T for mfcc in X_train])  # Normalize across time axis

# Convert Labels to One-Hot Encoding
y_train = to_categorical(y_train, num_classes=len(KEYWORDS))

# Reshape for CNN Input: (samples, time_steps, features)
X_train = X_train.reshape(X_train.shape[0], FIXED_TIME_STEPS, N_MFCC, 1)  # Corrected shape

# Split into Train/Test
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# **📌 Optimized CNN Model for FPGA**
def create_fpga_friendly_cnn(input_shape, num_classes):
    model = models.Sequential([
        layers.Conv2D(16, kernel_size=(3,3), activation='relu', input_shape=input_shape),
        layers.MaxPooling2D(pool_size=(2,2)),
        layers.Conv2D(32, kernel_size=(3,3), activation='relu'),
        layers.MaxPooling2D(pool_size=(2,2)),
        layers.Flatten(),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.3),  # Small dropout to prevent overfitting
        layers.Dense(num_classes, activation='softmax')
    ])
    return model

# Compile and Train Model
model = create_fpga_friendly_cnn(input_shape=(FIXED_TIME_STEPS, N_MFCC, 1), num_classes=len(KEYWORDS))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=30, batch_size=32, validation_split=0.2)

# Evaluate Model on Test Set
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_accuracy}")


Epoch 1/30


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 19ms/step - accuracy: 0.1811 - loss: 1.7935 - val_accuracy: 0.2843 - val_loss: 1.6704
Epoch 2/30
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 19ms/step - accuracy: 0.3368 - loss: 1.5656 - val_accuracy: 0.4848 - val_loss: 1.3213
Epoch 3/30
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 18ms/step - accuracy: 0.4792 - loss: 1.3062 - val_accuracy: 0.5472 - val_loss: 1.1722
Epoch 4/30
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 19ms/step - accuracy: 0.5666 - loss: 1.1201 - val_accuracy: 0.5860 - val_loss: 1.0886
Epoch 5/30
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 17ms/step - accuracy: 0.6307 - loss: 0.9712 - val_accuracy: 0.6136 - val_loss: 1.0193
Epoch 6/30
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 16ms/step - accuracy: 0.6744 - loss: 0.8671 - val_accuracy: 0.6061 - val_loss: 1.0185
Epoch 7/30
[1m368/368[0m [

In [19]:
import joblib

# Save feature scaler (mean & std deviation)
scaler_filename = "feature_scaler.pkl"
joblib.dump(scaler, scaler_filename)
print(f"Feature scaler saved as {scaler_filename}")

# To Load on FPGA:
# scaler = joblib.load("feature_scaler.pkl")


Feature scaler saved as feature_scaler.pkl


In [22]:
import numpy as np
import librosa

# Extract MFCCs from a sample WAV file
sample_wav = "/content/UP1.wav"  # Example file

# **Load audio file correctly**
audio_array, sr = librosa.load(sample_wav, sr=16000)  # Ensure 16kHz sample rate

# Extract MFCC features
sample_mfcc = extract_mfcc(audio_array, num_mfcc=40, sample_rate=sr)

# Save as .npy file
np.save("sample_mfcc.npy", sample_mfcc)
print("Sample MFCCs saved as sample_mfcc.npy")


Sample MFCCs saved as sample_mfcc.npy


In [25]:
import numpy as np

# Function to save layer weights & biases
def save_layer_weights(model, layer_names):
    for layer_name in layer_names:
        layer = model.get_layer(name=layer_name)
        weights, biases = layer.get_weights()

        # Save weights (flattened for FPGA memory storage)
        np.savetxt(f"{layer_name}_weights.txt", weights.flatten(), fmt="%.6f")
        # Save biases
        np.savetxt(f"{layer_name}_biases.txt", biases.flatten(), fmt="%.6f")

        print(f"Saved {layer_name} weights & biases")

# ✅ Use correct layer names from the model summary
layer_names = ["conv2d", "conv2d_1", "dense_6", "dense_7"]

# Save weights & biases
save_layer_weights(model, layer_names)


Saved conv2d weights & biases
Saved conv2d_1 weights & biases
Saved dense_6 weights & biases
Saved dense_7 weights & biases
