In [None]:
!pip install speechbrain torchaudio transformers

In [None]:
from google.colab import files
import os

# Prompt for the first voice sample through the Colab upload widget
uploaded = files.upload()

# Confirm every file that arrived (keys of the returned mapping are the saved names)
for uploaded_name in uploaded:
    print(f"✅ Uploaded: {uploaded_name}")

# Show what is now present in the working directory
print("📂 Files in Colab:", os.listdir())


Saving Female.wav to Female (1).wav
✅ Uploaded: Female (1).wav
📂 Files in Colab: ['.config', 'Female.wav', 'reference_embedding.pth', 'Female (1).wav', 'spkrec_model', 'Male.wav', 'sample_data']


In [None]:
import torchaudio
import torch
from speechbrain.pretrained import SpeakerRecognition

# Fetch the pretrained ECAPA speaker-recognition model from the
# "speechbrain/spkrec-ecapa-voxceleb" source; its files are kept in ./spkrec_model
spk_model = SpeakerRecognition.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb", savedir="spkrec_model")

def extract_speaker_embedding(audio_file):
    """Compute a speaker embedding for one audio file.

    The file is loaded with torchaudio, downmixed to a single channel,
    resampled to 16 kHz when necessary and passed to the module-level
    ``spk_model``.

    Args:
        audio_file: Path of the audio file to embed.

    Returns:
        The tensor returned by ``spk_model.encode_batch`` with its leading
        (batch) dimension squeezed away.
    """
    signal, fs = torchaudio.load(audio_file)

    # torchaudio yields (channels, samples) whereas encode_batch treats the
    # first dimension as the batch. Without a downmix a stereo recording would
    # be embedded as two independent utterances instead of one speaker sample.
    if signal.shape[0] > 1:
        signal = signal.mean(dim=0, keepdim=True)

    # Convert to 16kHz if needed
    if fs != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=fs, new_freq=16000)
        signal = resampler(signal)

    # Extract speaker embedding
    embedding = spk_model.encode_batch(signal)
    return embedding.squeeze(0)

# Embed the reference voice sample and persist the result to disk so that
# later cells can reload it for comparison
reference_embedding = extract_speaker_embedding("Female.wav")
torch.save(reference_embedding, "reference_embedding.pth")
print("✅ Reference Voice Features Saved!")


✅ Reference Voice Features Saved!


In [None]:
# Prompt for the second voice sample (the one to be verified)
uploaded = files.upload()

# Report the name each file was saved under; nothing is renamed here
for uploaded_name in uploaded:
    print(f"✅ Uploaded: {uploaded_name}")

# Show the current contents of the working directory
print("📂 Files in Colab:", os.listdir())


Saving Male.wav to Male (1).wav
✅ Uploaded: Male (1).wav
📂 Files in Colab: ['.config', 'Female.wav', 'reference_embedding.pth', 'Female (1).wav', 'spkrec_model', 'Male (1).wav', 'Male.wav', 'sample_data']


In [None]:
# Minimum cosine similarity required to accept the speaker.
# Raising the value makes the check stricter; lowering it makes it more lenient.
SIMILARITY_THRESHOLD = 0.5

# Extract features from the test voice sample
test_embedding = extract_speaker_embedding("Male.wav")

# Load stored reference embedding. weights_only=True keeps torch.load from
# unpickling arbitrary Python objects and avoids the FutureWarning that the
# default (full pickle) loader emits; the saved object is a plain tensor.
reference_embedding = torch.load("reference_embedding.pth", weights_only=True)

# Compute cosine similarity along the embedding dimension
similarity = torch.nn.functional.cosine_similarity(test_embedding, reference_embedding, dim=-1)

# Collapse to a single Python float (average over any remaining dimensions)
similarity_score = similarity.mean().item()

# Check if the voice matches
if similarity_score > SIMILARITY_THRESHOLD:
    print(f"✅ Voice Matched! Similarity Score: {similarity_score:.2f} - Unlocking System!")
else:
    print(f"❌ Voice Not Recognized! Similarity Score: {similarity_score:.2f} - Access Denied!")


❌ Voice Not Recognized! Similarity Score: 0.12 - Access Denied!


  reference_embedding = torch.load("reference_embedding.pth")


## TensorFlow

In [None]:
!pip install tensorflow tensorflow_io librosa numpy


In [None]:
!pip install tensorflow tensorflow-io-nightly librosa numpy


In [None]:
!pip install speechbrain torchaudio librosa numpy tensorflow



In [None]:
import torchaudio
import tensorflow as tf
from speechbrain.pretrained import SpeakerRecognition

# Fetch the pretrained SpeechBrain ECAPA speaker-recognition model from the
# "speechbrain/spkrec-ecapa-voxceleb" source; its files are kept in ./spkrec_model
spk_model = SpeakerRecognition.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb", savedir="spkrec_model")

def extract_speaker_embedding(audio_file):
    """Compute a speaker embedding for one audio file as a NumPy array.

    The file is loaded with torchaudio, downmixed to a single channel,
    resampled to 16 kHz when necessary and passed to the module-level
    SpeechBrain ``spk_model``.

    Args:
        audio_file: Path of the audio file to embed.

    Returns:
        The result of ``spk_model.encode_batch`` with its leading (batch)
        dimension squeezed away, detached and converted to a NumPy array so
        that it can be handed to TensorFlow.
    """
    signal, fs = torchaudio.load(audio_file)

    # torchaudio yields (channels, samples) whereas encode_batch treats the
    # first dimension as the batch. Without a downmix a stereo recording would
    # be embedded as two independent utterances instead of one speaker sample.
    if signal.shape[0] > 1:
        signal = signal.mean(dim=0, keepdim=True)

    # Convert to 16kHz if needed
    if fs != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=fs, new_freq=16000)
        signal = resampler(signal)

    # Extract speaker embedding
    embedding = spk_model.encode_batch(signal)

    # Convert to NumPy for TensorFlow compatibility
    return embedding.squeeze(0).detach().numpy()



In [None]:


# Embed the reference voice and the voice under test
reference_embedding = extract_speaker_embedding("Female.wav")
test_embedding = extract_speaker_embedding("Male.wav")

# Move both embeddings into TensorFlow as float32 and scale them to unit length
reference_tensor = tf.linalg.l2_normalize(
    tf.convert_to_tensor(reference_embedding, dtype=tf.float32), axis=-1
)
test_tensor = tf.linalg.l2_normalize(
    tf.convert_to_tensor(test_embedding, dtype=tf.float32), axis=-1
)

# The Keras loss yields the negated cosine similarity, so the sign is flipped
# to obtain a score where larger means more similar; the mean then reduces it
# to a single value
cosine_similarity = tf.keras.losses.cosine_similarity(reference_tensor, test_tensor, axis=-1)
similarity_score = (-cosine_similarity.numpy()).mean()

# Accept only when the score exceeds the threshold:
# a higher threshold is stricter, a lower one is more relaxed
threshold = 0.6
if similarity_score > threshold:
    print(f"✅ Voice Matched! Similarity Score: {similarity_score:.2f} - Unlocking System!")
else:
    print(f"❌ Voice Not Recognized! Similarity Score: {similarity_score:.2f} - Access Denied!")



❌ Voice Not Recognized! Similarity Score: 0.12 - Access Denied!


## Speech Recognition

In [23]:
import speech_recognition as sr

def recognize_speech(audio_file):
    """Transcribe an audio file with Google Speech Recognition.

    Args:
        audio_file: Path of the audio file to transcribe.

    Returns:
        The recognised text, or ``None`` when the audio could not be
        understood or the recognition request failed.
    """
    engine = sr.Recognizer()

    # Read the whole file into memory as audio data
    with sr.AudioFile(audio_file) as audio_source:
        print("🔍 Processing audio...")
        captured_audio = engine.record(audio_source)

    # Ask the recogniser for a transcript; both failure modes yield None
    try:
        transcript = engine.recognize_google(captured_audio)
    except sr.UnknownValueError:
        print("❌ Could not understand the audio")
        return None
    except sr.RequestError:
        print("❌ Could not request results from Google API")
        return None

    print(f"✅ Speech Recognized: {transcript}")
    return transcript


In [26]:
# Transcribe both recordings (each call returns None when recognition fails)
recognized_text_female = recognize_speech("Female.wav")
recognized_text_male = recognize_speech("Male.wav")

# Only compare when both transcriptions are non-empty; the comparison is an
# exact string match
if recognized_text_female and recognized_text_male:
    print(
        "✅ The same speech was detected in both files!"
        if recognized_text_female == recognized_text_male
        else "❌ The speech content is different in both files."
    )


🔍 Processing audio...
✅ Speech Recognized: thank you
🔍 Processing audio...
✅ Speech Recognized: turn nine eight seven six five four three two one zero
❌ The speech content is different in both files.
