### Install dependencies

In [1]:
%pip install librosa soundfile numpy scikit-learn pyttsx3 --quiet

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


## Pulse code Modulation 

In [2]:
import librosa
import numpy as np
import os
import pyttsx3
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report

### Generate synthetic dataset

In [3]:
def generate_speech(text, filename, voice_id, rate=150):
    """Synthesize `text` with pyttsx3 and save the audio to `filename`.

    Args:
        text: Sentence to synthesize.
        filename: Output path handed to `engine.save_to_file`.
        voice_id: Id of the pyttsx3 voice to use (the `.id` of an entry
            returned by `engine.getProperty('voices')`).
        rate: Value for the engine's 'rate' property. Defaults to 150,
            the value that used to be hard-coded here.
    """
    engine = pyttsx3.init()
    engine.setProperty('voice', voice_id)  # choose which installed voice speaks
    engine.setProperty('rate', rate)
    engine.save_to_file(text, filename)  # queues the save request
    engine.runAndWait()  # processes the queue before returning

os.makedirs("voices", exist_ok=True)

engine = pyttsx3.init()
voices = engine.getProperty('voices')  # every voice the TTS driver reports
if not voices:
    # Fail with an explicit message instead of a bare IndexError on voices[0].
    raise RuntimeError("pyttsx3 reported no installed voices; cannot synthesize the dataset.")
if len(voices) < 2:
    # The fallback below reuses the first voice, so make that visible.
    print("Warning: only one TTS voice is installed; both speakers will share the same voice.")

voice_1 = voices[0].id
voice_2 = voices[1].id if len(voices) > 1 else voices[0].id  # fallback

# One training utterance per speaker label.
generate_speech("Hello, this is speaker one. I like machine learning.", "voices/speaker1.wav", voice_1)
generate_speech("Hello, this is speaker two. I prefer data science.", "voices/speaker2.wav", voice_2)


### Feature extraction (MFCCs)

In [None]:
# Loading files and extracting features(mfccs)
def extract_features(file_path):
    """Return the MFCC frames of an audio file, one row per frame.

    The file is loaded at a 16 kHz sampling rate, passed through
    `librosa.effects.trim`, and summarised as 13 MFCCs per frame.

    Args:
        file_path: Path of the audio file to load.

    Returns:
        The transposed MFCC matrix, shape (time, 13).
    """
    signal, sample_rate = librosa.load(file_path, sr=16000)
    voiced = librosa.effects.trim(signal)[0]  # keep the trimmed audio, drop the index pair
    coefficients = librosa.feature.mfcc(y=voiced, sr=sample_rate, n_mfcc=13)
    return coefficients.T

# Build the frame-level dataset: every MFCC frame is one sample, labelled with
# the index (0 or 1) of the speaker file it came from.
speaker_files = ["voices/speaker1.wav", "voices/speaker2.wav"]
frame_rows, frame_labels = [], []

for speaker_idx, wav_path in enumerate(speaker_files):
    speaker_frames = extract_features(wav_path)
    frame_rows.extend(speaker_frames)  # iterating the 2-D array yields one row per frame
    frame_labels.extend([speaker_idx] * len(speaker_frames))

X = np.array(frame_rows)
y = np.array(frame_labels)  # y[i] is the speaker label (0 or 1) of frame X[i]
print("Features shape:", X.shape)


Features shape: (280, 13)


In [5]:
# Hold out 20% of the MFCC frames for evaluation. `stratify=y` keeps the
# per-speaker frame ratio identical in both splits, so neither class ends up
# under-represented in the small test set.
# NOTE(review): frames of the same utterance land in both splits, so this score
# reflects frame-level separability, not generalization to unseen recordings.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
clf = SVC(kernel="linear")
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.96      0.96      0.96        28
           1       0.96      0.96      0.96        28

    accuracy                           0.96        56
   macro avg       0.96      0.96      0.96        56
weighted avg       0.96      0.96      0.96        56



In [6]:
# Synthesize a new sentence with the first voice and classify it.
test_wav = "voices/test_speaker1.wav"
generate_speech("This is a test phrase from speaker one.", test_wav, voice_1)

test_mfccs = extract_features(test_wav)

# The classifier labels each frame; the utterance takes the most frequent label.
predictions = clf.predict(test_mfccs)
votes = np.bincount(predictions)
predicted_label = votes.argmax()

if predicted_label == 0:
    speaker_name = 'Speaker 1'
else:
    speaker_name = 'Speaker 2'
print(f"Predicted speaker: {speaker_name}")


Predicted speaker: Speaker 1


In [8]:
def speak(text, voice_id):
    """Speak `text` aloud with the given pyttsx3 voice.

    Args:
        text: Sentence to speak.
        voice_id: Id of the pyttsx3 voice to use.
    """
    tts = pyttsx3.init()
    # Apply the voice first, then the speaking rate.
    for prop_name, prop_value in (('voice', voice_id), ('rate', 150)):
        tts.setProperty(prop_name, prop_value)
    tts.say(text)
    tts.runAndWait()
# Short spoken exchange: the second voice asks, the first voice answers.
dialogue = (
    ("How are you buddy , Whats going on ?", voice_2),
    ("I'm fine what about you", voice_1),
)
for utterance, speaker_voice in dialogue:
    speak(utterance, speaker_voice)
