In [1]:
!pip install transformers
!pip install torchaudio librosa silero-vad pydub

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [2]:
from silero_vad import load_silero_vad, read_audio, get_speech_timestamps
from pydub import AudioSegment
from pydub.utils import mediainfo

import math
from io import BytesIO
from typing import IO, List


def _split_audio(audio_file: IO[bytes], vad_model, sampling_rate) -> List:
        """
        This is a helper function that takes an audio file and splits it into timestamps based on VAD

        Parameters:
            - audio_file (IO[bytes]) -> Audio file you want to split timestamps of
            - vad_model (silero-vad) -> Voice Activity Detection model to analyze the audio.

        Returns:
            - audio_segments (List) -> List of audio segments (as BytesIO buffers). 

        TODO: 
        1. You can try to ceil or floor the timestamps to nearest integer but there could be a problem in recognizing audio if it merges with some random sound or previous syllable. 
        2. Change from mono to stereo if stereo sound is provided
        3. Try changing the export format to mp3 and observe the results
        """

        def split_timestamps(timestamps):
            result = []

            def find_largest_gap(chunk):
                """Find the index of the largest gap between consecutive timestamps."""
                max_gap = 0
                split_index = None

                for i in range(1, len(chunk)):
                    gap = chunk[i]['start'] - chunk[i - 1]['end']
                    if gap > max_gap:
                        max_gap = gap
                        split_index = i

                return max_gap, split_index

            def process_chunk(chunk):
                """Process a single chunk and split it if its duration is > 29 seconds."""
                # Base condition: If chunk has one or fewer timestamps, add it directly
                if len(chunk) <= 1:
                    result.append(chunk)
                    return

                # Check duration of the chunk
                duration = chunk[-1]['end'] - chunk[0]['start']
                if duration <= 29:
                    result.append(chunk)  # If duration is <= 29, keep the chunk as is
                    return

                # Find the largest gap and split at that point
                max_gap, split_index = find_largest_gap(chunk)

                # If no valid split point is found, add the chunk as is
                if max_gap <= 0 or split_index is None:
                    result.append(chunk)
                    return

                # Split the chunk into two at the split_index
                left_chunk = chunk[:split_index]
                right_chunk = chunk[split_index:]

                # Process each sub-chunk recursively
                process_chunk(left_chunk)
                process_chunk(right_chunk)

            # Start processing the chunks
            process_chunk(timestamps)

            return result

        # 1. Read audio and prepare it for processing
        audio_bytes = audio_file.getvalue()

        # 2. Convert the audio into BytesIO object
        audio = BytesIO(audio_bytes)

        # 3. Read audio with it's sampling rate
        wav = read_audio(audio, sampling_rate=sampling_rate)

        # 4. Get speech timestamps from the vad model 
        speech_timestamps = get_speech_timestamps(
            wav, 
            vad_model, 
            sampling_rate=sampling_rate, 
            return_seconds=True
        )
     
        # 5. Extract processed timestamps
        processed_timestamps = split_timestamps(speech_timestamps)

        # TODO: You can try to ceil or floor the timestamps to nearest integer but there could be a problem in recognizing audio if it merges with some random sound or previous syllable. 
        cleaned_processed_timestamps = []
        for timestamps in processed_timestamps:
            cleaned_processed_timestamps.append(
                {
                    "start": math.floor(float(timestamps[0]['start'])), 
                    "end": math.ceil(float(timestamps[-1]['end']))
                }
            )
        
        # 6. Cleaning processed timestamps
        cleaned_processed_timestamps_2 = [cleaned_processed_timestamps[0]]

        for i in range(1, len(cleaned_processed_timestamps)):
            if (cleaned_processed_timestamps[i]['end']- cleaned_processed_timestamps[i]['start']) + (cleaned_processed_timestamps_2[-1]['end'] - cleaned_processed_timestamps_2[-1]['start']) < 29:
                cleaned_processed_timestamps_2[-1]['end'] = cleaned_processed_timestamps[i]['end']
            else:
                cleaned_processed_timestamps_2.append(cleaned_processed_timestamps[i])

        # Convert processed timestamps into audio segments
        audio_segments = []
        audio = AudioSegment.from_file(audio_file)
        for cleaned_processed_timestamp in cleaned_processed_timestamps:
            start_ms = cleaned_processed_timestamp['start'] * 1000  # Convert seconds to milliseconds
            end_ms = cleaned_processed_timestamp['end'] * 1000    # Convert seconds to milliseconds
            segment = audio[start_ms:end_ms]

            buffer = BytesIO()
            segment.export(buffer, format="wav") # TODO: Try changing it to wav and observe the results
            buffer.seek(0)  # Reset buffer pointer
            audio_segments.append(buffer)

        return audio_segments

In [3]:
!pip install accelerate

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [4]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
# from datasets import load_dataset
import librosa
import numpy as np
from pydub import AudioSegment
from io import BytesIO

# Set up device and data types
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Load model and processor
model_id = "openai/whisper-large-v3"  # Use your custom Whisper model
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)
processor = AutoProcessor.from_pretrained(model_id)

# Create ASR pipeline
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=0 if torch.cuda.is_available() else -1,
)

# Load your audio file
audio_file = "./recordingsv4/7f5662f8-e39c-4429-b7a4-39628fd750a1.mp3"
vad_model = load_silero_vad()  # Implement your VAD logic here

audio = open(audio_file, "rb")
audio = BytesIO(audio.read())

# Load the audio and apply Voice Activity Detection (VAD) 
waveform, sampling_rate = librosa.load(audio_file, sr=None)
audio_segments = _split_audio(audio, vad_model, sampling_rate)  # Split audio into segments

# Transcribe each audio segment
for segment in audio_segments:
    # Load audio segment
    audio_segment = AudioSegment.from_file(segment)

    # Convert Pydub AudioSegment to NumPy array
    samples = np.array(audio_segment.get_array_of_samples(), dtype=np.float32)
    samples /= np.iinfo(audio_segment.array_type).max  # Normalize to [-1, 1]

    # Resample to 16 kHz
    waveform = librosa.resample(samples, orig_sr=audio_segment.frame_rate, target_sr=16000)
    generate_kwargs = {
    "max_new_tokens": 445,
    "language": "hindi", 
    "task": "translate",
    "num_beams": 1,
    "condition_on_prev_tokens": True,
    "compression_ratio_threshold": 1.35,  # zlib compression ratio threshold (in token space)
    "temperature": (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
    "logprob_threshold": -1.0,
    "no_speech_threshold": 0.6,
    "return_timestamps": True,
    }

    result = pipe(waveform, generate_kwargs=generate_kwargs)
    # Run transcription pipeline
    # result = pipe({"array": waveform, "sampling_rate": 16000})
    print("Transcription:", result["text"])


Device set to use cuda:0
You have passed task=translate, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=translate.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
From v4.47 onwards, when a model cache is to be returned, `generate` will return a `Cache` instance instead by default (as opposed to the legacy tuple of tuples format). If you want to keep returning the legacy format, please set `return_legacy_cache=True`.


Transcription:  Good morning sir, how are you? Hello Ma'am, which account are you going to in this dividend? Good morning sir, how are you? I am asking you, which account are you going to in this dividend?




Transcription:  Hello, sir, we are not selling, tell us how are you doing? I am saying that the dividends that go in, will you tell me in which account you will tell me?




Transcription:  So, it goes to your bank only, if you get any dividend then your particular... I am asking you this particular, you are going now, they can show that yes, they are going to this bank or this bank, what is this? I don't understand, sorry, will you repeat? I am saying this, you can tell which bank they are going to, they will tell you.




Transcription:  I don't understand, will you repeat? I am saying, can you tell me which band are you going to?




Transcription:  Yes, I will definitely tell you. Rajesh, you have registered banks here. You have registered 4 banks. One is Punjab National Bank, State Bank of India.




Transcription:  And one is your Indian bank, Indian bank, State bank of India, Yash bank, these four banks you have added. In this, it is going to your primary. Primary bank I told you which was the primary. Primary bank you have kept Punjab National Bank, so your dividend is going in this. And see, if I go to close two banks in Mali's investment, then it will go away.




Transcription:  Yes, you will get removed from the bank application, you have to keep one minimum, you can get removed from that. I am not getting removed from the app, I am not getting removed from the app. Are you not getting the default option from the app? I am getting it but not getting removed.




Transcription:  Actually, you are not getting the option of default, right? I am getting it but not getting it.




Transcription:  Ok, I will request you if the bank is not being removed from there, then we will check out our team and ask them why it is not being removed from there. And they will help you for that. So once you give this mail. This is not for both of them.


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Transcription:  So, both of them... You want to move? Yes. It's okay, it will move. Yes, bank. It's okay, it will move. You can mail it once.




Transcription:  Yes Venk




Transcription:  Then you will get rid of the thyroid.




Transcription:  Ok, I have to mail it now. Yes, please mail it now. You can't do it on the phone. I am really sorry. On-call is not possible. Really sorry. You don't have to do it. Mail id is our customer care at the rate choice will be your outcome. Ok, I will send it. Ok, bye.




Transcription:  Customer care at the rate of 24.5 Sir, your voice echo is happening, but if you...


In [5]:
# from transformers import WhisperProcessor, WhisperForConditionalGeneration
# from pydub import AudioSegment
# import librosa
# import numpy as np
# from io import BytesIO
# import torch

# # Check if GPU is available
# device = "cuda" if torch.cuda.is_available() else "cpu"
# print(f"Using device: {device}")

# # Load custom model and processor
# model_name = "quinnb/whisper-Large-v3-hindi"  # Replace with your desired model
# processor = WhisperProcessor.from_pretrained(model_name)
# model = WhisperForConditionalGeneration.from_pretrained(model_name).to(device)
# model.config.forced_decoder_ids = None

# # Load audio file
# audio_file = "./recordingsv4/f83d924c-8c14-48d2-af37-b4e71413490a.mp3"
# with open(audio_file, "rb") as f:
#     audio = f.read()
# audio = BytesIO(audio)

# # Load Silero VAD (adjust based on your custom `_split_audio` logic)
# vad_model = load_silero_vad()  # Ensure you have a valid implementation for this function

# # Process the audio
# waveform, sampling_rate = librosa.load(audio_file, sr=None)  # Load audio and retain original sampling rate
# audio_segments = _split_audio(audio, vad_model, sampling_rate)  # Split into segments

# # Process each segment and transcribe
# for audio_segment in audio_segments:
#     # Load the audio segment into a Pydub AudioSegment object
#     audio_segment = AudioSegment.from_file(audio_segment)

#     # Convert AudioSegment to NumPy array
#     samples = np.array(audio_segment.get_array_of_samples(), dtype=np.float32)
#     samples /= np.iinfo(audio_segment.array_type).max  # Normalize to range [-1, 1]

#     # Resample audio to 16 kHz
#     sampling_rate = audio_segment.frame_rate
#     waveform = librosa.resample(samples, orig_sr=sampling_rate, target_sr=16000)

#     # Generate input features for Whisper
#     input_features = processor(waveform, sampling_rate=16000, return_tensors="pt").input_features
#     input_features = input_features.to(device)  # Move input features to GPU

#     # Generate transcription
#     predicted_ids = model.generate(input_features)
#     transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

#     print("Transcription:", transcription)
