In [None]:
import whisper
from pyannote.audio import Audio
from pyannote.core import Segment
from pyannote.audio import Pipeline
import numpy as np
import librosa

# Load the Whisper model
model = whisper.load_model("base")

# Load audio file
audio_file = "sample.wav"

# Initialize Pyannote Audio
audio = Audio()

# Load the diarization pipeline
pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1", use_auth_token='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')

# Perform diarization
try:
    diarization = pipeline(audio_file)
except Exception as e:
    raise RuntimeError(f"Failed to process the audio file: {e}")

# Transcribe each speaker's segment
for turn, _, speaker in diarization.itertracks(yield_label=True):
    segment = Segment(turn.start, turn.end)

    # Load the waveform and sample rate
    try:
        waveform, sample_rate = audio.crop(audio_file, segment)
    except Exception as e:
        raise RuntimeError(f"Failed to load the audio segment: {e}")

    # Convert waveform to 1D numpy array expected by Whisper
    waveform = waveform.numpy().flatten()

    # Ensure sample rate is 16000 for Whisper
    if sample_rate != 16000:
        waveform = librosa.resample(waveform, orig_sr=sample_rate, target_sr=16000)

    # Transcribe the segment
    result = model.transcribe(waveform, fp16=False)  # Disable fp16 for better accuracy
    print(f"{speaker}: {result['text']}")


--------

In [15]:
import speech_recognition as sr

# Create a recognizer object
r = sr.Recognizer()

# Use the microphone as the audio source
with sr.Microphone() as source:
    print("Say something!")
    # Listen for audio input
    audio = r.listen(source)

try:
    # Use Google's speech recognition to convert audio to text
    text = r.recognize_google(audio)
    # Write the transcription to a file
    with open("transcription.txt", "w") as file:
        file.write(text)
    print("Transcription saved to transcription.txt")
except sr.UnknownValueError:
    print("Google Speech Recognition could not understand audio")
except sr.RequestError as e:
    print("Could not request results from Google Speech Recognition service; {0}".format(e))

Say something!
Google Speech Recognition could not understand audio


In [16]:
import speech_recognition as sr
import noisereduce as nr
import numpy as np

# Create a recognizer object
r = sr.Recognizer()

# Use the microphone as the audio source
with sr.Microphone() as source:
    print("Say something!")
    # Listen for audio input
    audio = r.listen(source)

# Check if audio data is received
if audio:
    # Convert audio data to numpy array
    audio_data = np.frombuffer(audio.frame_data, np.int16)

    # Perform noise reduction
    reduced_noise = nr.reduce_noise(y=audio_data, sr=audio.sample_rate)

    # Convert numpy array back to audio data
    reduced_noise_audio = sr.AudioData(reduced_noise.tobytes(), sample_rate=audio.sample_rate, sample_width=audio.sample_width)

    try:
        # Use Google's speech recognition to convert audio to text
        text = r.recognize_google(reduced_noise_audio, language="fr-FR")
        print("Transcript:", text)
        # Write the transcript to a file
        with open("transcript.txt", "a") as file:
            file.write(text + "\n")
            print('transcript updated')
    except sr.UnknownValueError:
        print("Google Speech Recognition could not understand audio.")
    except sr.RequestError as e:
        print("Could not request results from Google Speech Recognition service; {0}".format(e))
else:
    print("No audio data received.")


Say something!
Google Speech Recognition could not understand audio.


In [3]:
from pyannote.audio import Pipeline, Inference
import speech_recognition as sr
import noisereduce as nr
import numpy as np
import tempfile

# Create a recognizer object
r = sr.Recognizer()

# Set the pause threshold to 1.0 seconds
r.pause_threshold = 1.0

# Use the microphone as the audio source
with sr.Microphone() as source:
    print("Say something!")
    # Listen for audio input
    audio = r.listen(source)

# Check if audio data is received
if audio:
    # Convert audio data to numpy array
    audio_data = np.frombuffer(audio.frame_data, np.int16)

    # Perform noise reduction
    reduced_noise = nr.reduce_noise(y=audio_data.flatten(), sr=audio.sample_rate)

    # Convert numpy array back to audio data
    reduced_noise_audio = sr.AudioData(reduced_noise.tobytes(), sample_rate=audio.sample_rate, sample_width=audio.sample_width)

    # Save the reduced noise audio to a temporary file
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp:
        temp_filename = temp.name
        with open(temp_filename, "wb") as f:
            f.write(reduced_noise_audio.get_wav_data())

    # Initialize the pipeline
    pipeline = Pipeline.from_pretrained("None",
                                        use_auth_token="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")
    inference = Inference(pipeline)

    # Apply the pipeline to the temporary audio file
    diarization = inference(temp_filename)

    # Print the diarization output
    for turn, _, speaker in diarization.itertracks(yield_label=True):
        print(f"start={turn.start:.1f}s stop={turn.end:.1f}s speaker_{speaker}")


Say something!

Could not download 'None' pipeline.
It might be because the pipeline is private or gated so make
sure to authenticate. Visit https://hf.co/settings/tokens to
create your access token and retry with:

   >>> Pipeline.from_pretrained('None',
   ...                          use_auth_token=YOUR_AUTH_TOKEN)

If this still does not work, it might be because the pipeline is gated:
visit https://hf.co/None to accept the user conditions.

Could not download 'None' model.
It might be because the model is private or gated so make
sure to authenticate. Visit https://hf.co/settings/tokens to
create your access token and retry with:

   >>> Model.from_pretrained('None',
   ...                       use_auth_token=YOUR_AUTH_TOKEN)

If this still does not work, it might be because the model is gated:
visit https://hf.co/None to accept the user conditions.


AttributeError: 'NoneType' object has no attribute 'device'

In [14]:
import speech_recognition as sr
from pydub import AudioSegment
from pyannote.audio import Pipeline

# Create a recognizer object
r = sr.Recognizer()

# Use the microphone as the audio source
with sr.Microphone() as source:
    print("Say something!")
    # Listen for audio input
    audio = r.listen(source)
    text = r.recognize_google(audio, language= "fr-FR")

# Save the audio data to a file
with open("audio.wav", "wb") as f:
    f.write(audio.get_wav_data())

# Convert audio file to wav format
sound = AudioSegment.from_wav("audio.wav")
sound.export("audio.wav", format="wav")

# Load pretrained pipeline
pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1",
                                    use_auth_token='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')

# Process audio file
output = pipeline("audio.wav")
print("Diarization:\n", output)
print("\nTranscription:\n", text)


Say something!


UnknownValueError: 

In [2]:
# Import necessary libraries
from pyannote.audio import Pipeline
import speech_recognition as sr

# Define the audio file path
audio_file = "meeting-clip2.wav"

# Load pretrained pipeline for speaker diarization
diarization_pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1", use_auth_token='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')

# Process audio file for diarization
diarization = diarization_pipeline(audio_file)

# Initialize recognizer class for speech recognition
recognizer = sr.Recognizer()

# Open the audio file
with sr.AudioFile(audio_file) as source:
    # Read the entire audio file
    audio_data = recognizer.record(source)

    # Use the recognizer to convert audio to text
    transcription = recognizer.recognize_google(audio_data, language="en-EN")

# Print the diarization and transcription results
print("Diarization:\n", diarization)
print("\nTranscription:\n", transcription)



ValueError: File meeting-clip2.wav does not exist

In [1]:
from pyannote.audio import Pipeline
import speech_recognition as sr

# Parameters to select 
audio_file = "audio-test\meeting-clip2.wav"
audio_lang = ""

# Pre-set
languages = {"french": "fr-FR", "english": "en-US", "spanish": "es-ES", "german": "de-DE"}
language =  languages.get(audio_lang, "en-US")  # default to English if language not found

# Load pretrained pipeline for speaker diarization
diarization_pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1", use_auth_token='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')

# Process audio file for diarization
diarization = diarization_pipeline(audio_file)

# Initialize recognizer class for speech recognition
recognizer = sr.Recognizer()

# Open the audio file
with sr.AudioFile(audio_file) as source:
    # Read the entire audio file
    audio_data = recognizer.record(source)

    # Use the recognizer to convert audio to text
    transcription = recognizer.recognize_google(audio_data, language=language)

print("Diarization:\n", diarization)
print("\nTranscription:\n", transcription)
print("\nDetected language:", language)


  audio_file = "audio-test\meeting-clip2.wav"
torchvision is not available - cannot save figures


RequestError: recognition connection failed: [WinError 10060] Une tentative de connexion a échoué car le parti connecté n’a pas répondu convenablement au-delà d’une certaine durée ou une connexion établie a échoué car l’hôte de connexion n’a pas répondu

In [3]:
from pyannote.audio import Pipeline
import speech_recognition as sr

# Parameters to select
audio_file = "audio-test/audio.wav"
audio_lang = "english"

# Pre-set
languages = {"french": "fr-FR", "english": "en-US", "spanish": "es-ES", "german": "de-DE"}
language = languages.get(audio_lang, "en-US")  # default to English if language not found

# Load pretrained pipeline for speaker diarization
diarization_pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1", use_auth_token='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')

# Process audio file for diarization
diarization = diarization_pipeline(audio_file)

# Initialize recognizer class for speech recognition
recognizer = sr.Recognizer()

# Open the audio file
with sr.AudioFile(audio_file) as source:
    # Read the entire audio file
    audio_data = recognizer.record(source)

    # Use the recognizer to convert audio to text
    transcription = recognizer.recognize_google(audio_data, language=language)

# Print the diarization and transcription results
print("Diarization:\n", diarization)
print("\nTranscription:\n", transcription)
print("\nDetected language:", language)

# Match speakers with their transcriptions
speaker_transcriptions = {}
for turn, _, speaker in diarization.itertracks(yield_label=True):
    start_time = turn.start
    end_time = turn.end
    # Segment the transcription based on the timestamps
    # This will require additional code to convert the timestamps to indices in the transcription
    # For simplicity, let's assume we have a function `segment_transcription` that does this
    speaker_transcription = segment_transcription(transcription, start_time, end_time)
    # Add the speaker transcription to the dictionary
    speaker_transcriptions[speaker] = speaker_transcription

# Print the speaker transcriptions
for speaker, transcription in speaker_transcriptions.items():
    print(f"\nSpeaker {speaker}:\n{transcription}")


RequestError: recognition connection failed: [WinError 10060] Une tentative de connexion a échoué car le parti connecté n’a pas répondu convenablement au-delà d’une certaine durée ou une connexion établie a échoué car l’hôte de connexion n’a pas répondu

In [None]:
from pyannote.audio import Pipeline
import speech_recognition as sr

# Parameters to select
audio_file = "audio-test/audio.wav"
audio_lang = "english"

# Pre-set
languages = {"french": "fr-FR", "english": "en-US", "spanish": "es-ES", "german": "de-DE"}
language = languages.get(audio_lang, "en-US")  # default to English if language not found

# Load pretrained pipeline for speaker diarization
diarization_pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1", use_auth_token='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')

# Process audio file for diarization
diarization = diarization_pipeline(audio_file)

# Initialize recognizer class for speech recognition
recognizer = sr.Recognizer()

# Open the audio file
with sr.AudioFile(audio_file) as source:
    # Read the entire audio file
    audio_data = recognizer.record(source)

    # Use the recognizer to convert audio to text
    transcription = recognizer.recognize_google(audio_data, language=language)

# Print the diarization and transcription results
print("Diarization:\n", diarization)
print("\nTranscription:\n", transcription)
print("\nDetected language:", language)

# Match speakers with their transcriptions
speaker_transcriptions = {}
for turn, _, speaker in diarization.itertracks(yield_label=True):
    start_time = turn.start
    end_time = turn.end
    # Segment the transcription based on the timestamps
    # This will require additional code to convert the timestamps to indices in the transcription
    # For simplicity, let's assume we have a function `segment_transcription` that does this
    speaker_transcription = segment_transcription(transcription, start_time, end_time)
    # Add the speaker transcription to the dictionary
    speaker_transcriptions[speaker] = speaker_transcription

# Print the speaker transcriptions
for speaker, transcription in speaker_transcriptions.items():
    print(f"\nSpeaker {speaker}:\n{transcription}")


RequestError: recognition connection failed: [WinError 10060] Une tentative de connexion a échoué car le parti connecté n’a pas répondu convenablement au-delà d’une certaine durée ou une connexion établie a échoué car l’hôte de connexion n’a pas répondu

In [None]:
from pyannote.audio import Pipeline
import speech_recognition as sr

# Parameters to select
audio_file = "audio-test/audio.wav"
audio_lang = "english"

# Pre-set
languages = {"french": "fr-FR", "english": "en-US", "spanish": "es-ES", "german": "de-DE"}
language = languages.get(audio_lang, "en-US")  # default to English if language not found

# Load pretrained pipeline for speaker diarization
diarization_pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1", use_auth_token='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')

# Process audio file for diarization
diarization = diarization_pipeline(audio_file)

# Initialize recognizer class for speech recognition
recognizer = sr.Recognizer()

# Open the audio file
with sr.AudioFile(audio_file) as source:
    # Read the entire audio file
    audio_data = recognizer.record(source)

    # Use the recognizer to convert audio to text
    transcription = recognizer.recognize_google(audio_data, language=language)

# Print the diarization and transcription results
print("Diarization:\n", diarization)
print("\nTranscription:\n", transcription)
print("\nDetected language:", language)

# Match speakers with their transcriptions
speaker_transcriptions = {}
for turn, _, speaker in diarization.itertracks(yield_label=True):
    start_time = turn.start
    end_time = turn.end
    # Segment the transcription based on the timestamps
    # This will require additional code to convert the timestamps to indices in the transcription
    # For simplicity, let's assume we have a function `segment_transcription` that does this
    speaker_transcription = segment_transcription(transcription, start_time, end_time)
    # Add the speaker transcription to the dictionary
    speaker_transcriptions[speaker] = speaker_transcription

# Print the speaker transcriptions
for speaker, transcription in speaker_transcriptions.items():
    print(f"\nSpeaker {speaker}:\n{transcription}")


RequestError: recognition connection failed: [WinError 10060] Une tentative de connexion a échoué car le parti connecté n’a pas répondu convenablement au-delà d’une certaine durée ou une connexion établie a échoué car l’hôte de connexion n’a pas répondu

In [None]:
import whisper 

model = whisper.load_model("base")

# load audio and pad/trim it to fit 30 seconds
audio = whisper.load_audio("audio-test/audio.wav")
audio = whisper.pad_or_trim(audio)

# make log-Mel spectrogram and move to the same device as the model
mel = whisper.log_mel_spectrogram(audio).to(model.device)

# detect the spoken language
_, probs = model.detect_language(mel)
print(f"Detected language: {max(probs, key=probs.get)}")

# decode the audio
options = whisper.DecodingOptions()
result = whisper.decode(model, mel, options)

# print the recognized text
print(result.text)

100%|███████████████████████████████████████| 139M/139M [00:08<00:00, 17.0MiB/s]


FileNotFoundError: [WinError 2] Le fichier spécifié est introuvable

In [None]:
import whisper
from pyannote.audio import Pipeline

# Parameters to select
audio_file = "audio-test/audio.wav"
audio_lang = "english"

# Load pretrained pipeline for speaker diarization
diarization_pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1", use_auth_token='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')

# Process audio file for diarization
diarization = diarization_pipeline(audio_file)

# Load Whisper model
model = whisper.load_model("base")

# Transcribe audio
result = model.transcribe(audio_file, language=audio_lang)

# Print the diarization and transcription results
print("Diarization:\n", diarization)
print("\nTranscription:\n", result["text"])
print("\nDetected language:", audio_lang)

# Match speakers with their transcriptions
speaker_transcriptions = {}
for turn, _, speaker in diarization.itertracks(yield_label=True):
    start_time = turn.start
    end_time = turn.end
    # Segment the transcription based on the timestamps
    # This will require additional code to convert the timestamps to indices in the transcription
    # For simplicity, let's assume we have a function `segment_transcription` that does this
    speaker_transcription = segment_transcription(result["text"], start_time, end_time)
    # Add the speaker transcription to the dictionary
    speaker_transcriptions[speaker] = speaker_transcription

# Print the speaker transcriptions
for speaker, transcription in speaker_transcriptions.items():
    print(f"\nSpeaker {speaker}:\n{transcription}")


In [None]:
import whisper
from pyannote.audio import Audio

model = whisper.load_model("base")
audio_formated = Audio("sample.wav")

for turn, _, speaker in diarization.itertracks(yield_label=True):
    segment = audio_formated.crop(turn)
    result = model.transcribe(segment.samples)
    print(f"Speaker {speaker}: {result['text']}")

In [None]:
import librosa
import noisereduce as nr
import speechbrain as sb
import numpy as np

# Load the audio file
waveform, sample_rate = librosa.load(audio_file, sr=None)

# Preprocess the audio
# Normalize the audio
waveform = librosa.util.normalize(waveform)

# Denoise the audio using the Spectral Subtraction method from the librosa library
noise_clip = waveform[:sample_rate]  # assuming the first second of the audio is noise
waveform = nr.reduce_noise(y=waveform, sr=sample_rate, y_noise=noise_clip, stationary=False)

# Use the SpeechBrain library to enhance the voice
enhancer = sb.pretrained.VoiceFilter(source="speechbrain/vocoder-denoiser-enhance-gan")
enhanced_waveform = enhancer.enhance_batch(torch.from_numpy(waveform).unsqueeze(0)).squeeze(0).numpy()

# Save the enhanced audio
librosa.output.write_wav(audio_file, enhanced_waveform, sample_rate)