In [1]:
!pip install silero-vad pydub torch transformers librosa numpy accelerate pandas

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [2]:
import os 

# Run this in the terminal - apt-get update && apt-get install -y ffmpeg

In [3]:
for root, _, files in os.walk("./audio_recordings"):
    print(len(files))

251


## Audio Chunking Algorithm

In [4]:


from silero_vad import load_silero_vad, read_audio, get_speech_timestamps
from pydub import AudioSegment
from pydub.utils import mediainfo

import math
from io import BytesIO
from typing import IO, List


def _split_audio(audio_file: IO[bytes], vad_model, sampling_rate) -> List:
        """
        This is a helper function that takes an audio file and splits it into timestamps based on VAD

        Parameters:
            - audio_file (IO[bytes]) -> Audio file you want to split timestamps of
            - vad_model (silero-vad) -> Voice Activity Detection model to analyze the audio.

        Returns:
            - audio_segments (List) -> List of audio segments (as BytesIO buffers). 

        TODO: 
        1. You can try to ceil or floor the timestamps to nearest integer but there could be a problem in recognizing audio if it merges with some random sound or previous syllable. 
        2. Change from mono to stereo if stereo sound is provided
        3. Try changing the export format to mp3 and observe the results
        """

        def split_timestamps(timestamps):
            result = []

            def find_largest_gap(chunk):
                """Find the index of the largest gap between consecutive timestamps."""
                max_gap = 0
                split_index = None

                for i in range(1, len(chunk)):
                    gap = chunk[i]['start'] - chunk[i - 1]['end']
                    if gap > max_gap:
                        max_gap = gap
                        split_index = i

                return max_gap, split_index

            def process_chunk(chunk):
                """Process a single chunk and split it if its duration is > 29 seconds."""
                # Base condition: If chunk has one or fewer timestamps, add it directly
                if len(chunk) <= 1:
                    result.append(chunk)
                    return

                # Check duration of the chunk
                duration = chunk[-1]['end'] - chunk[0]['start']
                if duration <= 29:
                    result.append(chunk)  # If duration is <= 29, keep the chunk as is
                    return

                # Find the largest gap and split at that point
                max_gap, split_index = find_largest_gap(chunk)

                # If no valid split point is found, add the chunk as is
                if max_gap <= 0 or split_index is None:
                    result.append(chunk)
                    return

                # Split the chunk into two at the split_index
                left_chunk = chunk[:split_index]
                right_chunk = chunk[split_index:]

                # Process each sub-chunk recursively
                process_chunk(left_chunk)
                process_chunk(right_chunk)

            # Start processing the chunks
            process_chunk(timestamps)

            return result

        # 1. Read audio and prepare it for processing
        audio_bytes = audio_file.getvalue()

        # 2. Convert the audio into BytesIO object
        audio = BytesIO(audio_bytes)

        # 3. Read audio with it's sampling rate
        wav = read_audio(audio, sampling_rate=sampling_rate)

        # 4. Get speech timestamps from the vad model 
        speech_timestamps = get_speech_timestamps(
            wav, 
            vad_model, 
            sampling_rate=sampling_rate, 
            return_seconds=True
        )
     
        # 5. Extract processed timestamps
        processed_timestamps = split_timestamps(speech_timestamps)

        # TODO: You can try to ceil or floor the timestamps to nearest integer but there could be a problem in recognizing audio if it merges with some random sound or previous syllable. 
        cleaned_processed_timestamps = []
        for timestamps in processed_timestamps:
            cleaned_processed_timestamps.append(
                {
                    "start": math.floor(float(timestamps[0]['start'])), 
                    "end": math.ceil(float(timestamps[-1]['end']))
                }
            )
        
        # 6. Cleaning processed timestamps
        cleaned_processed_timestamps_2 = [cleaned_processed_timestamps[0]]

        for i in range(1, len(cleaned_processed_timestamps)):
            if (cleaned_processed_timestamps[i]['end']- cleaned_processed_timestamps[i]['start']) + (cleaned_processed_timestamps_2[-1]['end'] - cleaned_processed_timestamps_2[-1]['start']) < 29:
                cleaned_processed_timestamps_2[-1]['end'] = cleaned_processed_timestamps[i]['end']
            else:
                cleaned_processed_timestamps_2.append(cleaned_processed_timestamps[i])

        # Convert processed timestamps into audio segments
        audio_segments = []
        audio = AudioSegment.from_file(audio_file)
        for cleaned_processed_timestamp in cleaned_processed_timestamps:
            start_ms = cleaned_processed_timestamp['start'] * 1000  # Convert seconds to milliseconds
            end_ms = cleaned_processed_timestamp['end'] * 1000    # Convert seconds to milliseconds
            segment = audio[start_ms:end_ms]

            buffer = BytesIO()
            segment.export(buffer, format="wav") # TODO: Try changing it to wav and observe the results
            buffer.seek(0)  # Reset buffer pointer
            audio_segments.append(buffer)

        return audio_segments

## Setting up the Whisper Model & Silero VAD

In [5]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
# from datasets import load_dataset
import librosa
import numpy as np
from pydub import AudioSegment
from io import BytesIO

# Set up device and data types
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Load model and processor
model_id = "openai/whisper-large-v3"  # Use your custom Whisper model
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)
processor = AutoProcessor.from_pretrained(model_id)

# Create ASR pipeline
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=0 if torch.cuda.is_available() else -1,
)

vad_model = load_silero_vad()  

Device set to use cuda:0


## Single File Testing

In [6]:
def single_file_testing(file_path: str):
    # 1. Loading the audio file into bytes
    audio = open(file_path, "rb")

    # 2. Converting audio into BytesIO object
    audio = BytesIO(audio.read())

    # 3. Load the audio and apply Voice Activity Detection (VAD) 
    waveform, sampling_rate = librosa.load(file_path, sr=None)
    audio_segments = _split_audio(audio, vad_model, sampling_rate)  # Split audio into segments

    # 4. Transcribe each audio segment
    raw_transcripts = []
    for segment in audio_segments:
        # Load audio segment
        audio_segment = AudioSegment.from_file(segment)
        
        # Convert Pydub AudioSegment to NumPy array
        samples = np.array(audio_segment.get_array_of_samples(), dtype=np.float32)
        samples /= np.iinfo(audio_segment.array_type).max  # Normalize to [-1, 1]
        
        # Resample to 16 kHz
        waveform = librosa.resample(samples, orig_sr=audio_segment.frame_rate, target_sr=16000)
        generate_kwargs = {
            # "max_new_tokens": 445,
            "language": "hindi", 
            "task": "translate",
            "num_beams": 1,
            "condition_on_prev_tokens": True,
            "compression_ratio_threshold": 1.35,  # zlib compression ratio threshold (in token space)
            "temperature": (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
            "logprob_threshold": -1.0,
            "no_speech_threshold": 0.6,
            "return_timestamps": True,
        }
        
        result = pipe(waveform, generate_kwargs=generate_kwargs)
        # Run transcription pipeline
        # result = pipe({"array": waveform, "sampling_rate": 16000})
        raw_transcripts.append(result["text"])

    return raw_transcripts

In [7]:
# AUDIO_FILE_PATH = f"./audio_recordings/07b9f26f-b829-42f5-8b14-2eefc63431e5_0_r (1).mp3"

# single_file_testing(AUDIO_FILE_PATH)

## Multi-File 

In [8]:
import pandas as pd

In [9]:
sales_calls_data = {
    "audio_id": [], 
    "raw_transcripts": [], 
    "cleaned_transcripts": []
}

In [10]:
for root, _, files in os.walk("./audio_recordings"):
    for index, file in enumerate(files):
        audio_file_path = os.path.join(root, file)
        audio_id = file[:-4]
        raw_transcripts = single_file_testing(audio_file_path)
        cleaned_transcripts = ' '.join([transcript.strip() for transcript in raw_transcripts])

        sales_calls_data['audio_id'].append(audio_id)
        sales_calls_data['raw_transcripts'].append(raw_transcripts)
        sales_calls_data['cleaned_transcripts'].append(cleaned_transcripts)

        print(f"{index+1}/{len(files)}")
        
    #     break

    # break

You have passed task=translate, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=translate.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
From v4.47 onwards, when a model cache is to be returned, `generate` will return a `Cache` instance instead by default (as opposed to the legacy tuple of tuples format). If you want to keep returning the legacy format, please set `return_legacy_cache=True`.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


1/251




2/251




3/251




4/251




5/251




6/251




7/251




8/251




9/251




10/251




11/251




12/251




13/251




14/251




15/251




16/251




17/251




18/251




19/251




20/251




21/251




22/251




23/251




24/251




25/251




26/251




27/251




28/251




29/251




30/251




31/251




32/251




33/251




34/251




35/251




36/251




37/251




38/251




39/251




40/251




41/251




42/251




43/251




44/251




45/251




46/251




47/251




48/251




49/251




50/251




51/251




52/251




53/251




54/251




55/251




56/251




57/251




58/251




59/251




60/251




61/251




62/251


IndexError: list index out of range

In [None]:
# sales_calls_data['cleaned_transcripts'][0]
# sales_calls_data

In [11]:
pd.DataFrame(sales_calls_data).to_csv("sales_call_transcripts.csv")