In [None]:
# collect 15 seconds of audio
# break it into 1-second clips and store them in "testing/Unknown/"
# Apply all preprocessing
# Load the model 
# Predict each file

In [73]:
# collect 15 second file - 
import pyaudio
import wave

def record_audio(filename, duration=15, rate=48000, chunk=1024, channels=2, format=pyaudio.paInt16):
    """Record audio through PyAudio and save the capture as a WAV file.

    Args:
        filename: Path of the WAV file to write.
        duration: Length of the recording in seconds.
        rate: Sampling rate in Hz.
        chunk: Largest number of frames requested from the stream per read.
        channels: Number of input channels.
        format: PyAudio sample-format constant (for example ``pyaudio.paInt16``).

    Exactly ``int(rate * duration)`` frames are captured. Reading only whole
    chunks would drop the remainder and leave the recording slightly shorter
    than requested, so the final read asks for the leftover frames.
    """
    audio = pyaudio.PyAudio()
    try:
        # Query the sample width while the PyAudio instance is still alive.
        sample_width = audio.get_sample_size(format)
        stream = audio.open(format=format, channels=channels,
                            rate=rate, input=True,
                            frames_per_buffer=chunk)
        try:
            print("Recording...")
            frames = []
            frames_left = int(rate * duration)
            while frames_left > 0:
                frames_now = min(chunk, frames_left)
                frames.append(stream.read(frames_now))
                frames_left -= frames_now
            print("Finished recording.")
        finally:
            # Release the input stream even if a read fails part-way through.
            stream.stop_stream()
            stream.close()
    finally:
        audio.terminate()

    with wave.open(filename, 'wb') as wf:
        wf.setnchannels(channels)
        wf.setsampwidth(sample_width)
        wf.setframerate(rate)
        wf.writeframes(b''.join(frames))

if __name__ == "__main__":
    # Capture one recording with the default settings; the file is written
    # to the current working directory.
    filename = "recorded_audio.wav"
    record_audio(filename=filename)

Recording...
Finished recording.


In [74]:
import wave
def get_sampling_rate(filename):
    """Return the frame rate (samples per second) stored in a WAV file's header."""
    with wave.open(filename, 'rb') as wav_reader:
        return wav_reader.getframerate()
if __name__ == "__main__":
    # Inspect the recording itself: the split clips under "testing/Unknown/"
    # are only written by a later cell, so reading one of them here fails on
    # a fresh top-to-bottom run.
    filename = "recorded_audio.wav"  # Replace with your audio file path
    sr = get_sampling_rate(filename)
    print("Sampling rate:", sr)

Sampling rate: 48000


In [75]:
# break the recording into 1-second clips under "testing/Unknown/"
from pydub import AudioSegment
import os

def split_audio(filename, output_folder, segment_length=1000):
    """Cut a WAV file into consecutive clips and export each one as WAV.

    Clips are written to ``output_folder`` as ``clip_1.wav``, ``clip_2.wav``,
    and so on, in order. The final clip holds whatever audio is left and may
    therefore be shorter than ``segment_length``.

    Args:
        filename: Path of the WAV file to split.
        output_folder: Directory that receives the clips; created if missing.
        segment_length: Clip length in the units pydub uses for slicing
            (milliseconds). Must be positive.

    Raises:
        ValueError: If ``segment_length`` is not positive.
    """
    # Fail before touching the disk: a negative step would otherwise produce
    # an empty range and the call would silently write nothing.
    if segment_length <= 0:
        raise ValueError("segment_length must be a positive number of milliseconds")

    audio = AudioSegment.from_wav(filename)

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    for clip_number, start_time in enumerate(range(0, len(audio), segment_length), start=1):
        segment = audio[start_time:start_time + segment_length]
        segment.export(os.path.join(output_folder, f"clip_{clip_number}.wav"), format="wav")

if __name__ == "__main__":
    # Source recording and the folder that receives its clips.
    filename = "recorded_audio.wav"  # Replace with your recorded audio file
    output_folder = "testing/Unknown/"
    split_audio(filename, output_folder=output_folder)

In [76]:
# preprocess - 
import os
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import librosa.display

# Root directory that contains one sub-folder of clips per speaker.
parent_dir = "testing"

# Sub-folders to scan; here all clips sit under a single placeholder folder.
speaker_folders = ["Unknown"]

def _natural_sort_key(name):
    """Sort key that orders embedded numbers numerically (clip_2 before clip_10)."""
    import re  # local import: the notebook's import cells do not bring in ``re``

    return [int(token) if token.isdigit() else token.lower()
            for token in re.split(r"(\d+)", name)]


def extract_features(parent_dir, speaker_folders, sr=48000, duration=1, n_mfcc=90):
    """Load every ``.wav`` clip of each speaker folder and compute MFCC features.

    Files are visited in natural filename order (``clip_1``, ``clip_2``, ...,
    ``clip_10``), so row ``k`` of the result always refers to the same clip.
    ``os.listdir`` alone returns names in arbitrary order.

    Args:
        parent_dir: Directory containing the speaker folders.
        speaker_folders: Folder names; the index of a folder in this list is
            used as the label of every clip inside it.
        sr: Sampling rate passed to ``librosa.load``.
        duration: Maximum number of seconds loaded from each clip.
        n_mfcc: Number of MFCC coefficients per frame.

    Returns:
        ``(features, labels)`` as NumPy arrays. Each feature entry is the
        transposed MFCC matrix of one clip, i.e. frames by coefficients.
    """
    features = []
    labels = []

    for label, speaker_folder in enumerate(speaker_folders):
        speaker_folder_path = os.path.join(parent_dir, speaker_folder)

        for filename in sorted(os.listdir(speaker_folder_path), key=_natural_sort_key):
            if not filename.endswith(".wav"):
                continue

            file_path = os.path.join(speaker_folder_path, filename)
            audio, loaded_rate = librosa.load(file_path, sr=sr, duration=duration)
            mfccs = librosa.feature.mfcc(y=audio, sr=loaded_rate, n_mfcc=n_mfcc)

            # Normalize MFCC features.
            # NOTE(review): StandardScaler standardizes column-wise and mfccs
            # is coefficients x frames here, so each frame is scaled across
            # its coefficients. Kept as-is; confirm it matches training.
            mfccs = StandardScaler().fit_transform(mfccs)

            features.append(mfccs.T)
            labels.append(label)

    # NOTE(review): stacking assumes every clip yields the same number of
    # frames; a clip of different length would make this array ragged.
    return np.array(features), np.array(labels)

# Build the feature tensor and the per-clip folder-index labels.
X, y = extract_features(parent_dir=parent_dir, speaker_folders=speaker_folders)

In [77]:
# Sanity check: one entry per clip, each a (frames, MFCC coefficients) matrix.
X.shape

(15, 94, 90)

In [78]:
# Speaker names used to decode predictions; position i names class index i.
# NOTE(review): presumably this order mirrors the label order used in
# training. Confirm against the training notebook.
actual_speakers = ["Abhishek", "Anirban", "Arunanshu", "Shivam", "Sunamdha"]

In [79]:
# Decoder for predicted class indices. The encoder is deliberately not fitted:
# its classes are assigned directly so that index i decodes to
# actual_speakers[i].
label_encoder = LabelEncoder()
label_encoder.classes_ = np.asarray(actual_speakers)

In [80]:
# predict
# import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import joblib
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, f1_score

# SECURITY NOTE: joblib.load unpickles the file, which can execute arbitrary
# code. Only load model files that come from a trusted source.
model = joblib.load('Five_speaker_model_mfcc_17mindata.joblib')  # Update with your actual model file

# NOTE(review): assumes model.predict returns one row of per-class scores per
# clip, as the argmax over axis 1 implies. Confirm against the saved model.
y_pred_probabilities = model.predict(X)
y_pred = np.argmax(y_pred_probabilities, axis=1)

# Map class indices back to speaker names; the last expression is the cell output.
y_pred_decoded = label_encoder.inverse_transform(y_pred)
y_pred_decoded
# # Evaluate the model on the test set
# y_pred_probabilities = model.predict(X_test)
# y_pred = np.argmax(y_pred_probabilities, axis=1)

# # Decode labels back to original format
# y_test_decoded = label_encoder.inverse_transform(y_test)
# y_pred_decoded = label_encoder.inverse_transform(y_pred)

# # Create a confusion matrix
# conf_matrix = confusion_matrix(y_test_decoded, y_pred_decoded, labels=speaker_folders)

# # Calculate accuracy
# accuracy = accuracy_score(y_test_decoded, y_pred_decoded)
# print(f"Test Evaluation Accuracy: {accuracy}")

# # Calculate F1 score
# f1 = f1_score(y_test_decoded, y_pred_decoded, labels=speaker_folders, average='weighted')
# print(f"Weighted F1 Score: {f1}")

# # Plot the confusion matrix
# plt.figure(figsize=(8, 6))
# sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=speaker_folders, yticklabels=speaker_folders)

# # Rotate x-axis labels by 45 degrees
# plt.xticks(rotation=45, ha="right")

# plt.title("Confusion Matrix")
# plt.xlabel("Predicted Label")
# plt.ylabel("True Label")
# plt.show()



array(['Sunamdha', 'Sunamdha', 'Sunamdha', 'Sunamdha', 'Sunamdha',
       'Sunamdha', 'Sunamdha', 'Sunamdha', 'Sunamdha', 'Sunamdha',
       'Sunamdha', 'Sunamdha', 'Sunamdha', 'Sunamdha', 'Sunamdha'],
      dtype='<U9')