In [1]:
!pip install librosa scikit-learn tensorflow



In [2]:
!pip install soundfile



In [5]:
# Mount Google Drive at /content/drive; the dataset path used later in this
# notebook (data_dir) lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import os
import librosa
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

In [7]:
def extract_mfcc(file_path, n_mfcc=13, sr=None):
    """Return one time-averaged MFCC vector for an audio file.

    Parameters
    ----------
    file_path : str or path-like
        Audio file to read with ``librosa.load``.
    n_mfcc : int, optional
        Number of MFCC coefficients to compute. Defaults to 13, the value that
        was previously hard-coded, so existing callers are unaffected.
    sr : int or None, optional
        Target sampling rate forwarded to ``librosa.load``. ``None`` (the
        default, and the previous behavior) keeps the file's native rate.

    Returns
    -------
    numpy.ndarray
        1-D array with one value per coefficient: the mean of that
        coefficient over all frames.
    """
    # Load the audio file
    audio, sample_rate = librosa.load(file_path, sr=sr)
    # Extract MFCC features
    mfcc = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc)
    # Transposing puts time on axis 0, so the mean below collapses the time
    # dimension and leaves one value per coefficient.
    return np.mean(mfcc.T, axis=0)

def load_data(data_dir):
    """Build features and labels from a folder-per-speaker dataset.

    Every immediate sub-directory of ``data_dir`` is treated as one speaker;
    its name becomes the label for each ``.wav`` file directly inside it.
    Entries of ``data_dir`` that are not directories are ignored, as are files
    without a ``.wav`` extension.

    Parameters
    ----------
    data_dir : str
        Root directory containing one sub-directory per speaker.

    Returns
    -------
    tuple of numpy.ndarray
        ``(features, labels)``: one ``extract_mfcc`` result per file, and the
        matching speaker-folder name per file, in sorted (speaker, file) order.
    """
    features = []
    labels = []

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and therefore any seeded train/test split) reproducible.
    for speaker in sorted(os.listdir(data_dir)):
        speaker_path = os.path.join(data_dir, speaker)
        if not os.path.isdir(speaker_path):
            continue
        for file_name in sorted(os.listdir(speaker_path)):
            # Case-insensitive match so files named '*.WAV' are not silently skipped.
            if not file_name.lower().endswith('.wav'):
                continue
            features.append(extract_mfcc(os.path.join(speaker_path, file_name)))
            labels.append(speaker)

    return np.array(features), np.array(labels)

In [11]:
# Load the data
data_dir = r'/content/drive/MyDrive/archive/50_speakers_audio_data'  # Replace with your folder path
X, y = load_data(data_dir)

# Fail early with a readable message instead of an obscure error further down.
if len(X) == 0:
    raise ValueError(f"No .wav files were loaded from {data_dir!r}")

# Encode the labels (speaker names) into numerical values
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data into training and testing sets.
# stratify keeps every speaker represented in both splits in the same
# proportion; it requires at least two recordings per speaker.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

KeyboardInterrupt: 

In [10]:
# Linear-kernel SVM. fit() returns the fitted estimator, so construction and
# training are chained into a single statement.
svm_classifier = SVC(probability=True, kernel='linear').fit(X_train, y_train)

# Score the held-out split: predict, then compare against the true labels.
y_pred = svm_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

NameError: name 'X_train' is not defined

In [None]:
##   FOR LARGE DATASETS
# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation and
# batch shuffling repeat from run to run.
tf.keras.utils.set_random_seed(42)

# Build a simple neural network model
model = tf.keras.Sequential([
    # Declare the input with an explicit Input object instead of passing
    # `input_shape` to the first Dense layer (deprecated in Keras 3).
    tf.keras.Input(shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(len(np.unique(y_encoded)), activation='softmax')  # Output layer with one neuron per class
])

# Labels are integer class indices (from LabelEncoder), hence the sparse loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
# NOTE(review): the test split doubles as validation data here, so the val_*
# metrics and the final test accuracy below measure the same samples.
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")


Epoch 1/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.0435 - loss: 19.7177 - val_accuracy: 0.1930 - val_loss: 4.1787
Epoch 2/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.2756 - loss: 3.3023 - val_accuracy: 0.4094 - val_loss: 2.4354
Epoch 3/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.4629 - loss: 2.1916 - val_accuracy: 0.6179 - val_loss: 1.7622
Epoch 4/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.5985 - loss: 1.6124 - val_accuracy: 0.6589 - val_loss: 1.4542
Epoch 5/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6992 - loss: 1.1658 - val_accuracy: 0.7739 - val_loss: 1.1250
Epoch 6/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7763 - loss: 0.9018 - val_accuracy: 0.8090 - val_loss: 0.9821
Epoch 7/10
[1m65/65[0m [32m━━━━━━━━

In [None]:
def predict_speaker(file_path, model=None):
    """Predict which speaker produced the audio in ``file_path``.

    Parameters
    ----------
    file_path : str
        Audio file handed to ``extract_mfcc``.
    model : optional
        Object whose ``predict`` returns one row of per-class scores (e.g. the
        Keras network). When ``None``, the module-level ``svm_classifier`` is
        used instead.

    Returns
    -------
    The speaker label decoded by the module-level ``label_encoder``.
    """
    features = extract_mfcc(file_path).reshape(1, -1)  # Single-sample batch for prediction
    # Test against None explicitly: the truthiness of an arbitrary model
    # object is not a reliable "was a model supplied?" check.
    if model is not None:
        scores = model.predict(features)
        # Get the class with the highest probability
        predicted_class_index = np.argmax(scores, axis=1)
    else:
        # predict() already yields a 1-D array of class indices; wrapping it
        # in another list would hand inverse_transform a 2-D input.
        predicted_class_index = svm_classifier.predict(features)
    # Map the class index back to the speaker name
    return label_encoder.inverse_transform(predicted_class_index)[0]

# Example usage
# NOTE(review): this file sits inside the dataset folder that load_data() read,
# so it may have been part of the training split; a correct answer here is not
# evidence that the model generalizes. Confirm with a file outside that folder.
test_file = '/content/drive/MyDrive/archive/50_speakers_audio_data/karuna/Karuna3.wav'  # Replace with your test file path
predicted_speaker = predict_speaker(test_file, model=model)
print(f"Predicted speaker: {predicted_speaker}")


  audio, sr = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 122ms/step
Predicted speaker: karuna


In [None]:
# NOTE(review): identical to the predict_speaker defined in an earlier cell;
# running this cell only redefines the same function.
def predict_speaker(file_path, model=None):
    """Predict which speaker produced the audio in ``file_path``.

    Parameters
    ----------
    file_path : str
        Audio file handed to ``extract_mfcc``.
    model : optional
        Object whose ``predict`` returns one row of per-class scores (e.g. the
        Keras network). When ``None``, the module-level ``svm_classifier`` is
        used instead.

    Returns
    -------
    The speaker label decoded by the module-level ``label_encoder``.
    """
    features = extract_mfcc(file_path).reshape(1, -1)  # Single-sample batch for prediction
    # Test against None explicitly: the truthiness of an arbitrary model
    # object is not a reliable "was a model supplied?" check.
    if model is not None:
        scores = model.predict(features)
        # Get the class with the highest probability
        predicted_class_index = np.argmax(scores, axis=1)
    else:
        # predict() already yields a 1-D array of class indices; wrapping it
        # in another list would hand inverse_transform a 2-D input.
        predicted_class_index = svm_classifier.predict(features)
    # Map the class index back to the speaker name
    return label_encoder.inverse_transform(predicted_class_index)[0]

# Example usage
# NOTE(review): this file sits inside the dataset folder that load_data() read,
# so it may have been part of the training split; a correct answer here is not
# evidence that the model generalizes. Confirm with a file outside that folder.
test_file = '/content/drive/MyDrive/archive/50_speakers_audio_data/swayam/swayam7.wav'  # Replace with your test file path
predicted_speaker = predict_speaker(test_file, model=model)
print(f"Predicted speaker: {predicted_speaker}")


  audio, sr = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
Predicted speaker: swayam


In [None]:
# NOTE(review): identical to the predict_speaker defined in an earlier cell;
# running this cell only redefines the same function.
def predict_speaker(file_path, model=None):
    """Predict which speaker produced the audio in ``file_path``.

    Parameters
    ----------
    file_path : str
        Audio file handed to ``extract_mfcc``.
    model : optional
        Object whose ``predict`` returns one row of per-class scores (e.g. the
        Keras network). When ``None``, the module-level ``svm_classifier`` is
        used instead.

    Returns
    -------
    The speaker label decoded by the module-level ``label_encoder``.
    """
    features = extract_mfcc(file_path).reshape(1, -1)  # Single-sample batch for prediction
    # Test against None explicitly: the truthiness of an arbitrary model
    # object is not a reliable "was a model supplied?" check.
    if model is not None:
        scores = model.predict(features)
        # Get the class with the highest probability
        predicted_class_index = np.argmax(scores, axis=1)
    else:
        # predict() already yields a 1-D array of class indices; wrapping it
        # in another list would hand inverse_transform a 2-D input.
        predicted_class_index = svm_classifier.predict(features)
    # Map the class index back to the speaker name
    return label_encoder.inverse_transform(predicted_class_index)[0]

# Example usage
# NOTE(review): this file sits inside the dataset folder that load_data() read,
# so it may have been part of the training split; a correct answer here is not
# evidence that the model generalizes. Confirm with a file outside that folder.
test_file = '/content/drive/MyDrive/archive/50_speakers_audio_data/anita/ani12.wav'  # Replace with your test file path
predicted_speaker = predict_speaker(test_file, model=model)
print(f"Predicted speaker: {predicted_speaker}")

  audio, sr = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
Predicted speaker: anita


In [None]:
# NOTE(review): identical to the predict_speaker defined in an earlier cell;
# running this cell only redefines the same function.
def predict_speaker(file_path, model=None):
    """Predict which speaker produced the audio in ``file_path``.

    Parameters
    ----------
    file_path : str
        Audio file handed to ``extract_mfcc``.
    model : optional
        Object whose ``predict`` returns one row of per-class scores (e.g. the
        Keras network). When ``None``, the module-level ``svm_classifier`` is
        used instead.

    Returns
    -------
    The speaker label decoded by the module-level ``label_encoder``.
    """
    features = extract_mfcc(file_path).reshape(1, -1)  # Single-sample batch for prediction
    # Test against None explicitly: the truthiness of an arbitrary model
    # object is not a reliable "was a model supplied?" check.
    if model is not None:
        scores = model.predict(features)
        # Get the class with the highest probability
        predicted_class_index = np.argmax(scores, axis=1)
    else:
        # predict() already yields a 1-D array of class indices; wrapping it
        # in another list would hand inverse_transform a 2-D input.
        predicted_class_index = svm_classifier.predict(features)
    # Map the class index back to the speaker name
    return label_encoder.inverse_transform(predicted_class_index)[0]

# Example usage
# NOTE(review): this file sits inside the dataset folder that load_data() read,
# so it may have been part of the training split; a correct answer here is not
# evidence that the model generalizes. Confirm with a file outside that folder.
test_file = '/content/drive/MyDrive/archive/50_speakers_audio_data/anita/ani9.wav'  # Replace with your test file path
predicted_speaker = predict_speaker(test_file, model=model)
print(f"Predicted speaker: {predicted_speaker}")

  audio, sr = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step
Predicted speaker: anita


In [None]:
# NOTE(review): identical to the predict_speaker defined in an earlier cell;
# running this cell only redefines the same function.
def predict_speaker(file_path, model=None):
    """Predict which speaker produced the audio in ``file_path``.

    Parameters
    ----------
    file_path : str
        Audio file handed to ``extract_mfcc``.
    model : optional
        Object whose ``predict`` returns one row of per-class scores (e.g. the
        Keras network). When ``None``, the module-level ``svm_classifier`` is
        used instead.

    Returns
    -------
    The speaker label decoded by the module-level ``label_encoder``.
    """
    features = extract_mfcc(file_path).reshape(1, -1)  # Single-sample batch for prediction
    # Test against None explicitly: the truthiness of an arbitrary model
    # object is not a reliable "was a model supplied?" check.
    if model is not None:
        scores = model.predict(features)
        # Get the class with the highest probability
        predicted_class_index = np.argmax(scores, axis=1)
    else:
        # predict() already yields a 1-D array of class indices; wrapping it
        # in another list would hand inverse_transform a 2-D input.
        predicted_class_index = svm_classifier.predict(features)
    # Map the class index back to the speaker name
    return label_encoder.inverse_transform(predicted_class_index)[0]

# Example usage
# NOTE(review): this file sits inside the dataset folder that load_data() read,
# so it may have been part of the training split; a correct answer here is not
# evidence that the model generalizes. Confirm with a file outside that folder.
test_file = '/content/drive/MyDrive/archive/50_speakers_audio_data/alok/alok5.wav'  # Replace with your test file path
predicted_speaker = predict_speaker(test_file, model=model)
print(f"Predicted speaker: {predicted_speaker}")

  audio, sr = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
Predicted speaker: alok


In [None]:
# NOTE(review): identical to the predict_speaker defined in an earlier cell;
# running this cell only redefines the same function.
def predict_speaker(file_path, model=None):
    """Predict which speaker produced the audio in ``file_path``.

    Parameters
    ----------
    file_path : str
        Audio file handed to ``extract_mfcc``.
    model : optional
        Object whose ``predict`` returns one row of per-class scores (e.g. the
        Keras network). When ``None``, the module-level ``svm_classifier`` is
        used instead.

    Returns
    -------
    The speaker label decoded by the module-level ``label_encoder``.
    """
    features = extract_mfcc(file_path).reshape(1, -1)  # Single-sample batch for prediction
    # Test against None explicitly: the truthiness of an arbitrary model
    # object is not a reliable "was a model supplied?" check.
    if model is not None:
        scores = model.predict(features)
        # Get the class with the highest probability
        predicted_class_index = np.argmax(scores, axis=1)
    else:
        # predict() already yields a 1-D array of class indices; wrapping it
        # in another list would hand inverse_transform a 2-D input.
        predicted_class_index = svm_classifier.predict(features)
    # Map the class index back to the speaker name
    return label_encoder.inverse_transform(predicted_class_index)[0]

# Example usage
# NOTE(review): this file sits inside the dataset folder that load_data() read,
# so it may have been part of the training split; a correct answer here is not
# evidence that the model generalizes. Confirm with a file outside that folder.
test_file = '/content/drive/MyDrive/archive/50_speakers_audio_data/anita/ani9.wav'  # Replace with your test file path
predicted_speaker = predict_speaker(test_file, model=model)
print(f"Predicted speaker: {predicted_speaker}")

  audio, sr = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 106ms/step
Predicted speaker: anita


In [None]:
# NOTE(review): identical to the predict_speaker defined in an earlier cell;
# running this cell only redefines the same function.
def predict_speaker(file_path, model=None):
    """Predict which speaker produced the audio in ``file_path``.

    Parameters
    ----------
    file_path : str
        Audio file handed to ``extract_mfcc``.
    model : optional
        Object whose ``predict`` returns one row of per-class scores (e.g. the
        Keras network). When ``None``, the module-level ``svm_classifier`` is
        used instead.

    Returns
    -------
    The speaker label decoded by the module-level ``label_encoder``.
    """
    features = extract_mfcc(file_path).reshape(1, -1)  # Single-sample batch for prediction
    # Test against None explicitly: the truthiness of an arbitrary model
    # object is not a reliable "was a model supplied?" check.
    if model is not None:
        scores = model.predict(features)
        # Get the class with the highest probability
        predicted_class_index = np.argmax(scores, axis=1)
    else:
        # predict() already yields a 1-D array of class indices; wrapping it
        # in another list would hand inverse_transform a 2-D input.
        predicted_class_index = svm_classifier.predict(features)
    # Map the class index back to the speaker name
    return label_encoder.inverse_transform(predicted_class_index)[0]

# Example usage
# NOTE(review): this file sits inside the dataset folder that load_data() read,
# so it may have been part of the training split; a correct answer here is not
# evidence that the model generalizes. Confirm with a file outside that folder.
test_file = '/content/drive/MyDrive/archive/50_speakers_audio_data/anita/ani12.wav'  # Replace with your test file path
predicted_speaker = predict_speaker(test_file, model=model)
print(f"Predicted speaker: {predicted_speaker}")

  audio, sr = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
Predicted speaker: anita
