<a href="https://colab.research.google.com/github/8kelena8/machine_learning/blob/main/CNN_and_Mel_spectrogram.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import kagglehub
path = kagglehub.dataset_download("mhantor/russian-voice-dataset")
print("Path to dataset files:", path)
!mv /root/.cache/kagglehub/datasets/mhantor/russian-voice-dataset/versions/3 /content/sample_data/

Downloading from https://www.kaggle.com/api/v1/datasets/download/mhantor/russian-voice-dataset?dataset_version_number=3...


100%|██████████| 1.30G/1.30G [00:35<00:00, 38.9MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/mhantor/russian-voice-dataset/versions/3


In [1]:
!pip install librosa scikit-learn matplotlib gradio
import os
import librosa
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from sklearn.metrics import classification_report, confusion_matrix



In [2]:
base_dir = '/content/sample_data/3'
normal_dir = os.path.join(base_dir, 'Normal Voices', 'Normal Voices')
disorder_dir = os.path.join(base_dir, 'Disorder Voices', 'Disorder Voices')

def extract_mel_feature(file_path, n_mels=128):
    y, sr = librosa.load(file_path, sr=None)
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
    return mel_spec_db

X, y = [], []

for file in os.listdir(normal_dir):
    if file.endswith('.wav'):
        filepath = os.path.join(normal_dir, file)
        mel = extract_mel_feature(filepath)
        X.append(mel)
        y.append("normal")

for file in os.listdir(disorder_dir):
    if file.endswith('.wav'):
        filepath = os.path.join(disorder_dir, file)
        mel = extract_mel_feature(filepath)
        X.append(mel)
        y.append("disorder")

lengths = [x.shape[1] for x in X]
max_len = int(np.percentile(lengths, 90))

X_padded = []
for x in X:
    if x.shape[1] > max_len:
        x = x[:, :max_len]
    else:
        pad_width = max_len - x.shape[1]
        x = np.pad(x, ((0, 0), (0, pad_width)), mode='constant')
    X_padded.append(x)

X = np.array(X_padded)[..., np.newaxis]



In [3]:
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_encoded = le.fit_transform(y)
y_onehot = to_categorical(y_encoded)

X_train, X_test, y_train, y_test = train_test_split(X, y_onehot, test_size=0.2, random_state=42)


In [4]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization
input_shape = (128, max_len, 1)
model = Sequential([
    Conv2D(8, (3, 3), activation='relu', input_shape=input_shape),
    BatchNormalization(),
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(16, (3, 3), activation='relu', strides=2),
    BatchNormalization(),
    Flatten(),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(2, activation='softmax')
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [5]:
history = model.fit(X_train, y_train, epochs=20, batch_size=16, validation_data=(X_test, y_test))

Epoch 1/20
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m134s[0m 655ms/step - accuracy: 0.9009 - loss: 0.6802 - val_accuracy: 0.9987 - val_loss: 0.0083
Epoch 2/20
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 651ms/step - accuracy: 0.9940 - loss: 0.0222 - val_accuracy: 0.9975 - val_loss: 0.0035
Epoch 3/20
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m145s[0m 666ms/step - accuracy: 0.9964 - loss: 0.0101 - val_accuracy: 0.9987 - val_loss: 0.0016
Epoch 4/20
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m146s[0m 691ms/step - accuracy: 0.9971 - loss: 0.0115 - val_accuracy: 0.9975 - val_loss: 0.0200
Epoch 5/20
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m132s[0m 662ms/step - accuracy: 0.9975 - loss: 0.0121 - val_accuracy: 0.9787 - val_loss: 0.1040
Epoch 6/20
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 642ms/step - accuracy: 0.9945 - loss: 0.0347 - val_accuracy: 0.9900 - val_loss: 0.1202
Epoc

In [6]:
y_pred_probs = model.predict(X_test)

y_pred = np.argmax(y_pred_probs, axis=1)
y_true = np.argmax(y_test, axis=1)

print("Classification Report:")
print(classification_report(y_true, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_true, y_pred))


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 452ms/step
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       378
           1       1.00      1.00      1.00       422

    accuracy                           1.00       800
   macro avg       1.00      1.00      1.00       800
weighted avg       1.00      1.00      1.00       800

Confusion Matrix:
[[378   0]
 [  0 422]]


In [7]:
print(model.input_shape)

(None, 128, 548, 1)


In [None]:
import librosa
import numpy as np
import gradio as gr

def extract_mel(file):
    expected_shape = (128, 548)

    # 1. Load the audio
    y, sr = librosa.load(file, sr=None)

    # 2. Extract mel spectrogram
    mel = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, hop_length=512)
    mel_db = librosa.power_to_db(mel, ref=np.max)

    # 3. Pad or truncate to exactly (128, 548)
    if mel_db.shape[1] < expected_shape[1]:
        mel_db = np.pad(mel_db, ((0, 0), (0, expected_shape[1] - mel_db.shape[1])), mode='constant')
    else:
        mel_db = mel_db[:, :expected_shape[1]]

    # 4. Add batch and channel dimensions: (1, 128, 548, 1)
    mel_input = mel_db[np.newaxis, :, :, np.newaxis].astype(np.float32)
    # mel_input = mel_db.T[np.newaxis, :, :, np.newaxis]

    # 5. DEBUG print
    print("INPUT SHAPE:", mel_input.shape)  # Should be (1, 128, 548, 1)

    # 6. Predict
    prediction = model.predict(mel_input)
    predicted_label = np.argmax(prediction)
    confidence = prediction[0][predicted_label]

    # 7. Return prediction
    class_labels = ["Disorder", "Normal"]
    return f"✅ Predicted: **{class_labels[predicted_label]}**\nConfidence: {confidence:.2%}"

# Gradio interface
gr.Interface(
    fn=extract_mel,
    inputs=gr.Audio(type="filepath", label="Upload .wav File"),
    outputs=gr.Textbox(label="Prediction"),
    title="🎧 Audio Classification using Mel Spectrogram",
    description="Upload a .wav file. The system extracts a Mel spectrogram and predicts using a trained CNN."
).launch(debug=True, share=True)



Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://a69ae40c993714a7f1.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


INPUT SHAPE: (1, 128, 548, 1)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 168ms/step
INPUT SHAPE: (1, 128, 548, 1)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
INPUT SHAPE: (1, 128, 548, 1)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
