In [2]:
pip install librosa numpy pandas scikit-learn matplotlib soundfile seaborn


Note: you may need to restart the kernel to use updated packages.


In [3]:
import os

dataset_path = "33samp"

# List all speakers (subfolders)
speakers = [f for f in os.listdir(dataset_path) if os.path.isdir(os.path.join(dataset_path, f))]
print("Speakers:", speakers)

# List audio files for each speaker
for speaker in speakers:
    speaker_path = os.path.join(dataset_path, speaker)
    files = os.listdir(speaker_path)
    print(f"\n{speaker} has {len(files)} audio files: {files}")

Speakers: ['.ipynb_checkpoints', 'dpk33', 'lik33', 'raj33']

.ipynb_checkpoints has 0 audio files: []

dpk33 has 33 audio files: ['10d.wav', '11d.wav', '12d.wav', '13d.wav', '14d1.wav', '15d.wav', '16d.wav', '17d.wav', '18d.wav', '19d1.wav', '1d.wav', '20d.wav', '21d.wav', '22d.wav', '23d.wav', '24d.wav', '25d.wav', '26d1.wav', '27d.wav', '28d.wav', '29d.wav', '2d.wav', '30d.wav', '31d.wav', '32d.wav', '33d.wav', '3d.wav', '4d.wav', '5d.wav', '6d.wav', '7d.wav', '8d.wav', '9d.wav']

lik33 has 33 audio files: ['L1.wav', 'L10.wav', 'L11.wav', 'L12.wav', 'L13.wav', 'L14.wav', 'L15.wav', 'L16.wav', 'L17.wav', 'L18.wav', 'L19.wav', 'L2.wav', 'L20.wav', 'L21.wav', 'L22.wav', 'L23.wav', 'L24.wav', 'L25.wav', 'L26.wav', 'L27.wav', 'L3.wav', 'L34.wav', 'L35.wav', 'L36.wav', 'L37.wav', 'L38.wav', 'L39.wav', 'L4.wav', 'L5.wav', 'L6.wav', 'L7.wav', 'L8.wav', 'L9.wav']

raj33 has 33 audio files: ['ran1.wav', 'ran10.wav', 'ran11.wav', 'ran12.wav', 'ran13.wav', 'ran14.wav', 'ran15.wav', 'ran16.wav', 

In [4]:
import librosa
import numpy as np

def extract_mfcc(file_path, n_mfcc=13, sr=16000):
    y, sr = librosa.load(file_path, sr=sr, mono=True)  # Load audio
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)  # Extract MFCCs
    return np.mean(mfcc, axis=1)  # Take the mean across time

In [5]:
file_path = "33samp/dpk33/1d.wav"  # Change as needed
mfcc_features = extract_mfcc(file_path)

print("MFCC Shape:", mfcc_features.shape)  # Expected: (13,)
print("MFCC Features:\n", mfcc_features)

MFCC Shape: (13,)
MFCC Features:
 [-330.83047    100.929474     4.2469625   45.285423    -2.5210724
   14.055159    -1.5519155   -3.897785    -1.1677694  -11.163385
   -2.744204    -3.496664    -6.263084 ]


In [8]:
import os
import pandas as pd
import numpy as np

dataset_path = "33samp"  # Your main dataset folder

# List all speakers (subfolders in "record")
speakers = [f for f in os.listdir(dataset_path) if os.path.isdir(os.path.join(dataset_path, f))]
print("Speakers found:", speakers)

data = []
labels = []

# Loop through each speaker folder (e.g., 'raj', 'deep')
for speaker in speakers:
    speaker_path = os.path.join(dataset_path, speaker)

    # Loop through each audio file in the speaker's folder
    for file in os.listdir(speaker_path):
        if file.endswith(".wav"):  # Ensure it's a .wav file
            file_path = os.path.join(speaker_path, file)
            
            # Extract MFCC features
            mfcc = extract_mfcc(file_path)
            
            data.append(mfcc)
            labels.append(speaker)  # Use the speaker name as the label

# Convert to DataFrame for visualization
df = pd.DataFrame(data)
df["label"] = labels

# Display first few rows
df.head()

Speakers found: ['.ipynb_checkpoints', 'dpk33', 'lik33', 'raj33']


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,label
0,-257.011505,106.160355,-11.713521,56.39666,-8.368375,-17.011631,-10.423728,-5.96787,-15.237088,-15.948683,-0.447853,-2.557593,-13.181442,dpk33
1,-239.96405,103.416283,-1.8021,33.679798,-10.124496,5.450778,-21.174589,-7.291982,-5.711663,-13.152877,1.912504,-5.827011,-8.687257,dpk33
2,-204.164703,104.282951,-16.939993,38.474113,-3.998943,-7.332247,-27.504417,5.321534,-16.998734,-11.899725,-2.748642,-1.303363,-9.570438,dpk33
3,-201.286301,91.200905,2.561321,40.440979,-5.163778,-3.164549,-25.540171,9.136554,-16.924547,-11.782964,-3.948802,3.239573,-4.826466,dpk33
4,-212.818787,119.914825,-9.261454,60.822887,-16.306513,-2.559133,-9.542895,-0.562665,-20.203697,-14.651353,4.602472,3.11448,-17.279627,dpk33


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Convert speaker names ('raj', 'deep', etc.) to numeric labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)  # Encode speaker names to numbers
X = np.array(data)

# Split into 80% training, 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

print(f"Training samples: {len(X_train)}, Testing samples: {len(X_test)}")

Training samples: 79, Testing samples: 20


In [10]:
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Create an SVM model with feature scaling
svm_model = make_pipeline(StandardScaler(), SVC(kernel="linear", probability=True))

# Train the model
svm_model.fit(X_train, y_train)

print("Training complete!")

Training complete!


In [11]:
from sklearn.metrics import accuracy_score, classification_report

# Predict on test data
y_pred = svm_model.predict(X_test)

# Check unique labels
print("Unique classes in y_test:", np.unique(y_test))
print("Unique classes in y_pred:", np.unique(y_pred))
print("Classes from label_encoder:", label_encoder.classes_)

# Compute accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Print classification report with correct labels
print(classification_report(y_test, y_pred, labels=np.unique(y_test), target_names=np.unique(y_test).astype(str)))


Unique classes in y_test: [0 1 2]
Unique classes in y_pred: [0 1 2]
Classes from label_encoder: ['dpk33' 'lik33' 'raj33']
Model Accuracy: 100.00%
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         8
           1       1.00      1.00      1.00         6
           2       1.00      1.00      1.00         6

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20



In [76]:
# Choose a test audio file (not used in training)
test_file = "5 speakers/raj/rn21.wav"  # Change to an actual unseen file

# Extract MFCCs
test_mfcc = extract_mfcc(test_file)

# Reshape for model input
test_mfcc = test_mfcc.reshape(1, -1)

# Predict speaker with confidence scores
probabilities = svm_model.predict_proba(test_mfcc)[0]  # Get probability scores for all classes
max_prob = max(probabilities)  # Get the highest confidence score
predicted_label = np.argmax(probabilities)  # Get the index of the best-matching speaker

# Set threshold for unknown speakers
threshold = 0.2  # Adjust if needed (try 0.6 or 0.7 if it's misclassifying)

if max_prob < threshold:
    predicted_speaker = "Unknown"
else:
    predicted_speaker = label_encoder.inverse_transform([predicted_label])[0]

print(f"Predicted Speaker: {predicted_speaker} (Confidence: {max_prob:.2f})")

  y, sr = librosa.load(file_path, sr=sr, mono=True)  # Load audio
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


FileNotFoundError: [Errno 2] No such file or directory: '5 speakers/raj/rn21.wav'