In [None]:
import pandas as pd

# Read the spreadsheet that pairs each audio clip with its speaker
df = pd.read_excel(r"D:\AIDS\2nd year my\my projects\maduranga\speaker clips\Names.XLSX")

# Column 0 is taken as the clip file names, column 1 as the speaker names
file_names, speaker_names = (df.iloc[:, column].values for column in (0, 1))

In [None]:
from sklearn.model_selection import train_test_split

# Inputs are the clip file names; targets are the matching speaker names
audio_clips = file_names
labels = speaker_names

# Hold out 20% of the clips for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    audio_clips,
    labels,
    test_size=0.2,
    random_state=42,
)

# Show the first five entries of each split as a sanity check
print("Training set:", X_train[:5], y_train[:5])
print("Test set:", X_test[:5], y_test[:5])

In [None]:
import librosa
import numpy as np

def extract_mfcc(audio_path, n_mfcc=13, sr=48000):
    """Return a fixed-length MFCC feature vector for one audio file.

    The clip is loaded at ``sr`` Hz, ``n_mfcc`` coefficients are computed for
    every frame, and the frames are averaged so that each clip is represented
    by a single vector whose length does not depend on the clip duration.

    Parameters
    ----------
    audio_path : str or path-like
        Location of the audio file to read.
    n_mfcc : int, default 13
        Number of MFCC coefficients to compute.
    sr : int or None, default 48000
        Target sampling rate handed to ``librosa.load``. The default keeps the
        48 kHz rate this notebook has always used; per the librosa docs,
        ``None`` keeps the file's native rate.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``n_mfcc``: the mean of each coefficient over all
        frames.
    """
    signal, sample_rate = librosa.load(audio_path, sr=sr)
    mfccs = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=n_mfcc)
    # mfccs is (n_mfcc, n_frames); transpose and average over frames.
    return np.mean(mfccs.T, axis=0)


In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models
def create_cnn_rnn_model(input_shape, num_classes):
    """Build and compile the Conv1D + LSTM classifier.

    Parameters
    ----------
    input_shape : tuple
        Shape of one input sample, forwarded to the first Conv1D layer.
    num_classes : int
        Number of units in the final softmax layer.

    Returns
    -------
    A compiled Keras ``Sequential`` model (Adam optimizer, sparse categorical
    cross-entropy loss, accuracy metric).
    """
    stack = [
        # Convolutional feature extractor
        layers.Conv1D(32, 3, activation='relu', input_shape=input_shape, padding='same'),
        layers.MaxPooling1D(2),
        layers.Conv1D(64, 3, activation='relu', padding='same'),
        layers.MaxPooling1D(2),
        layers.Conv1D(128, 3, activation='relu', padding='same'),
        layers.GlobalAveragePooling1D(),
        # Fully connected block with dropout
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.5),
        # The 128-vector is reshaped into a one-step sequence for the LSTM
        layers.Reshape((1, 128)),
        layers.LSTM(64, return_sequences=False),
        # Per-class probabilities
        layers.Dense(num_classes, activation='softmax'),
    ]
    network = models.Sequential(stack)

    network.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network

In [None]:
import os
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Directory that holds the audio clips listed in the spreadsheet
CLIPS_DIR = r"D:\AIDS\2nd year my\my projects\maduranga\speaker clips"


def _mfcc_matrix(clip_names, clips_dir=CLIPS_DIR):
    """Return an array with one MFCC feature vector (row) per clip in ``clip_names``."""
    return np.array([extract_mfcc(os.path.join(clips_dir, f"{name}")) for name in clip_names])


# Convert speaker names to numeric labels.
# The encoder is fitted on the labels of BOTH splits: fitting on y_train alone
# makes transform(y_test) raise ValueError whenever a speaker happens to land
# only in the test split.
label_encoder = LabelEncoder()
label_encoder.fit(np.concatenate([y_train, y_test]))
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Speakers with no training clip cannot be learned; surface that instead of failing
unseen_speakers = np.setdiff1d(y_test, y_train)
if len(unseen_speakers) > 0:
    print("Warning: speakers present only in the test split:", list(unseen_speakers))

# One output unit per known speaker
num_speakers = len(label_encoder.classes_)

# Extract one MFCC feature vector per clip
X_train_mfcc = _mfcc_matrix(X_train)
X_test_mfcc = _mfcc_matrix(X_test)

# Add the trailing channel axis expected by Conv1D; the feature length is read
# from the data so it always agrees with what extract_mfcc produced
n_mfcc = X_train_mfcc.shape[1]
X_train_mfcc = X_train_mfcc.reshape(-1, n_mfcc, 1)
X_test_mfcc = X_test_mfcc.reshape(-1, n_mfcc, 1)

# Create and train the model
model = create_cnn_rnn_model(input_shape=(n_mfcc, 1), num_classes=num_speakers)
model.fit(X_train_mfcc, y_train_encoded, validation_data=(X_test_mfcc, y_test_encoded), epochs=20, batch_size=32)

In [None]:
# Model outputs (softmax scores) for every test clip
y_pred = model.predict(X_test_mfcc)
# Index of the highest-scoring class for each clip
y_pred_classes = y_pred.argmax(axis=1)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Calculate accuracy
accuracy = accuracy_score(y_test_encoded, y_pred_classes)
print(f"Accuracy: {accuracy:.2f}")

# Class ids that actually occur in the test labels or in the predictions (sorted).
# Kept under its own name so the `labels` array of speaker names defined in the
# split cell is not overwritten.
present_class_ids = np.union1d(y_test_encoded, y_pred_classes)

# Look the speaker names up by id. Passing the full label_encoder.classes_ here
# would pair names with rows positionally and mislabel them whenever some
# speaker is absent from both the test labels and the predictions.
present_class_names = label_encoder.inverse_transform(present_class_ids)

# Generate classification report
print("\nClassification Report:")
print(classification_report(y_test_encoded, y_pred_classes, labels=present_class_ids, target_names=present_class_names))

# Create confusion matrix (rows: true speaker, columns: predicted speaker)
cm = confusion_matrix(y_test_encoded, y_pred_classes, labels=present_class_ids)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=present_class_names, yticklabels=present_class_names, ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
plt.show()

Export the model

In [None]:
import joblib

# Write the trained network to disk
model.save('speaker_identification_model.h5')

# Write the fitted label encoder next to it
joblib.dump(label_encoder, 'label_encoder.joblib')