In [None]:
#!pip install torch torchaudio librosa wget text-unidecode omegaconf sox pydub ipython soundfile funasr nemo_toolkit[asr]


Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sox
  Downloading sox-1.5.0.tar.gz (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.9/63.9 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wget, sox
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9656 sha256=2c4c8691bef6c3016e2e7d6449c63396162bc52dd8f892f4b4d0e62c93009654
  Stored in directory: /root/.cache/pip/wheels/8b/f1/7f/5c94f0a7a505ca1c81cd1d9208ae2064675d97582078e6c769
  Building wheel for sox (setup.py) ... [?25l[?25hdone
  Created wheel for sox: filename=sox-1.5.0-py3-none-any.whl size=40037 sha256=141bd70928b9aa4972106a5d8dc9fce2ca7f3a782074a3cfd2280318a1304565
  Stored in directory: /root/.cache/pip/wheels/74/e7/7b/8033be3ec5e4994595d01269fc9657c8fd83a0dcbf853666

In [1]:
import torch
from nemo.collections.asr.models import EncDecMultiTaskModel
from pydub import AudioSegment
from IPython.display import display, Audio
import os
from tqdm import tqdm
from pydub.utils import mediainfo
from funasr import AutoModel
from funasr.utils.postprocess_utils import rich_transcription_postprocess

# Prefer the GPU when torch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f'running on {device}')

running on cuda


In [7]:
class TranscriptionModel:
    """Transcribe an audio file in fixed-length segments with one of two ASR models.

    Supported ``model_name`` values:

    * ``"canary_1b_flash"``   - ``nvidia/canary-1b-flash`` loaded through
      ``EncDecMultiTaskModel.from_pretrained``.
    * ``"sense_voice_small"`` - ``FunAudioLLM/SenseVoiceSmall`` loaded through
      funasr's ``AutoModel``.

    The audio is cut into ``batch_size``-second segments, each segment is written
    to a temporary ``temp_segment_<i>.wav`` file in the working directory, handed
    to the model, and removed again.
    """

    BATCH_SIZE = 30      # default segment length, in seconds
    FRAME_RATE = 16000   # sample rate (Hz) audio is converted to before transcription

    def __init__(self, **kwargs):
        """Store the configuration and load the requested model.

        Keyword Args:
            model_name (str): required; ``"canary_1b_flash"`` or ``"sense_voice_small"``.
            device (str): torch device string; defaults to ``"cuda"`` when
                available, else ``"cpu"``.
            Any remaining keyword arguments are forwarded to ``build_model``.

        Raises:
            KeyError: if ``model_name`` is not supplied.
            ValueError: if ``model_name`` is not one of the supported names.
        """
        self.kwargs = kwargs
        self.model_name = kwargs["model_name"]
        self.device = kwargs.get("device", "cuda" if torch.cuda.is_available() else "cpu")

        self.build_model(**kwargs)

    def get_length(self, input_file):
        """Return the duration of ``input_file`` in seconds (float)."""
        audio = AudioSegment.from_file(input_file)
        # len() of a pydub AudioSegment is its duration in milliseconds.
        return len(audio) / 1000

    def convert(self, input_file, output_file, format):
        """Re-encode ``input_file`` into ``output_file`` using the given pydub format name."""
        audio = AudioSegment.from_file(input_file)
        # export() hands back the open output handle; close it so the file is
        # flushed and released immediately.
        audio.export(output_file, format=format).close()

    def convertTo16kHzMono(self, input_file, output_file=None):
        """Write a mono, ``FRAME_RATE``-Hz WAV version of ``input_file``.

        Args:
            input_file: path of the audio file to convert.
            output_file: destination path. When omitted the result is written
                back to ``input_file`` (the historical behaviour), which
                OVERWRITES the original audio.
        """
        audio = AudioSegment.from_file(input_file)
        audio = audio.set_channels(1).set_frame_rate(self.FRAME_RATE)
        audio.export(output_file or input_file, format="wav").close()

    def build_model(self, **configs):
        """Instantiate ``self.model`` for ``configs["model_name"]``.

        Keyword Args:
            model_name (str): required model identifier.
            vad_model, vad_kwargs, device, hub: only read for
                ``"sense_voice_small"`` and passed through to ``AutoModel``
                (defaults: ``"fsmn-vad"``, ``{"max_single_segment_time": 30000}``,
                ``"cuda:0"``/``"cpu"``, ``"hf"``).

        Raises:
            ValueError: if the model name is not supported.
        """
        model_name = configs.pop("model_name")
        if model_name == "canary_1b_flash":
            self.model = EncDecMultiTaskModel.from_pretrained(
                model_name="nvidia/canary-1b-flash",
                map_location=self.device,
            )

        elif model_name == "sense_voice_small":
            SenseVoiceSmall_dir = "FunAudioLLM/SenseVoiceSmall"
            vad_model = configs.get("vad_model", "fsmn-vad")
            vad_kwargs = configs.get("vad_kwargs", {"max_single_segment_time": 30000})
            device = configs.get("device", "cuda:0" if torch.cuda.is_available() else "cpu")
            hub = configs.get("hub", "hf")

            self.model = AutoModel(
                model=SenseVoiceSmall_dir,
                vad_model=vad_model,
                vad_kwargs=vad_kwargs,
                device=device,
                hub=hub,
            )
        else:
            raise ValueError("Unsupported model name. Please use either 'canary_1b_flash' or'sense_voice_small'.")

    def transcribe_audio(self, **kwargs):
        """Validate the request, record the options on ``self`` and transcribe.

        Keyword Args:
            input_file (str): required path to the audio file.
            start_time (float): offset in seconds to start from (default 0).
            end_time (float): offset in seconds to stop at; clamped to the file
                length, which is also the default.
            batch_size (float): segment length in seconds (default ``BATCH_SIZE``).
            use_overlay (bool): when true, a short final segment is extended
                backwards so it is a full ``batch_size`` long (default False).
            source_lang, target_lang, pnc: canary options
                (defaults ``"en"``, ``"en"``, ``"yes"``).
            language, use_itn, batch_size_s, merge_vad, merge_length_s:
                SenseVoice options (defaults ``"auto"``, ``True``,
                ``batch_size``, ``True``, ``batch_size // 2``).

        Returns:
            tuple[dict, dict]: ``(text_by_interval, raw_output_by_interval)``
            keyed by ``"[start:end]"`` in whole seconds. ``({}, {})`` is
            returned, after printing a message, when ``input_file`` is missing
            or does not exist.

        Raises:
            ValueError: if ``self.model_name`` is not a supported model.
        """
        try:
            self.input_file = kwargs.get("input_file")
            if self.input_file is None:
                raise ValueError("input_file couldn't be found!")

            if not os.path.exists(self.input_file):
                raise FileNotFoundError(f"The file {self.input_file} does not exist.")

        except (ValueError, FileNotFoundError) as e:
            print(f"Error: {e}")
            return {}, {}

        except Exception as e:
            print(f"An unexpected error occurred: {e}")
            return {}, {}

        self.use_overlay = kwargs.get("use_overlay", False)
        self.batch_size = kwargs.get("batch_size", self.BATCH_SIZE)
        self.start_time = kwargs.get("start_time", 0)

        # Decode the file once (it used to be decoded twice) and tolerate an
        # explicit end_time=None by treating it as "until the end".
        file_length = self.get_length(self.input_file)
        requested_end = kwargs.get("end_time")
        self.end_time = file_length if requested_end is None else min(requested_end, file_length)

        if self.model_name == "canary_1b_flash":
            self.source_lang = kwargs.get("source_lang", "en")
            self.target_lang = kwargs.get("target_lang", "en")
            self.pnc = kwargs.get("pnc", "yes")

        elif self.model_name == "sense_voice_small":
            self.language = kwargs.get("language", "auto")
            self.use_itn = kwargs.get("use_itn", True)
            self.batch_size_s = kwargs.get("batch_size_s", self.batch_size)
            self.merge_vad = kwargs.get("merge_vad", True)
            self.merge_length_s = kwargs.get("merge_length_s", self.batch_size // 2)

        else:
            raise ValueError("Unsupported model name. Please use either 'canary_1b_flash' or'sense_voice_small'.")

        return self.transcribe_audio_segments(**kwargs)

    @staticmethod
    def _segment_windows(span_ms, batch_ms, use_overlay=False):
        """Split ``[0, span_ms)`` into consecutive ``batch_ms``-long windows.

        Args:
            span_ms (int): length of the audio to cover, in milliseconds.
            batch_ms (int): window length in milliseconds; must be positive.
            use_overlay (bool): when true, a final window shorter than
                ``batch_ms`` is moved back (never before 0) so that it overlaps
                the previous one and is a full ``batch_ms`` long where possible.

        Returns:
            list[tuple[int, int]]: ``(start_ms, end_ms)`` pairs relative to the
            start of the span; empty when ``span_ms`` is not positive.

        Raises:
            ValueError: if ``batch_ms`` is not positive.
        """
        if batch_ms <= 0:
            raise ValueError("batch_size must be a positive number of seconds.")

        windows = []
        for start in range(0, max(span_ms, 0), batch_ms):
            end = min(start + batch_ms, span_ms)
            # Logical `and`, not bitwise `&`: the old `a < b & flag` parsed as
            # `a < (b & flag)` and therefore never extended the last window.
            if use_overlay and end - start < batch_ms:
                start = max(0, end - batch_ms)
            windows.append((start, end))
        return windows

    def _load_normalized_audio(self, input_file):
        """Decode ``input_file`` and return it as mono audio at ``FRAME_RATE`` Hz.

        The conversion happens in memory, so the source file is never modified
        and no intermediate file is left on disk.
        """
        audio = AudioSegment.from_file(input_file)
        if audio.channels != 1 or audio.frame_rate != self.FRAME_RATE:
            audio = audio.set_channels(1).set_frame_rate(self.FRAME_RATE)
        return audio

    def transcribe_audio_segments(self, **kwargs):
        """Transcribe ``self.input_file`` between ``self.start_time`` and ``self.end_time``.

        Relies on the attributes set by ``transcribe_audio``. ``**kwargs`` is
        accepted for call compatibility and is not read here.

        Returns:
            tuple[dict, dict]: ``(text_by_interval, raw_output_by_interval)``
            keyed by ``"[start:end]"``, where both bounds are whole seconds
            measured from the beginning of the file.
        """
        audio = self._load_normalized_audio(self.input_file)

        # A negative start would make pydub slice relative to the END of the clip.
        start_time_ms = max(0, int(self.start_time * 1000))
        end_time_ms = min(int(self.end_time * 1000), len(audio))
        trimmed_audio = audio[start_time_ms:end_time_ms]

        # Windows are relative to trimmed_audio, so segment ends are clamped to
        # the trimmed length rather than to the absolute end time.
        windows = self._segment_windows(
            end_time_ms - start_time_ms,
            int(self.batch_size * 1000),
            bool(self.use_overlay),
        )

        transcriptions = {}
        transcriptions_obj = {}

        for i, (start_ms, end_ms) in enumerate(tqdm(windows, desc="Transcribing segments")):
            temp_filename = f"temp_segment_{i}.wav"
            time_interval = f"[{(start_ms + start_time_ms) // 1000}:{(end_ms + start_time_ms) // 1000}]"

            try:
                # Close the handle returned by export() so the file is fully
                # written before the model opens it and can be deleted afterwards.
                trimmed_audio[start_ms:end_ms].export(temp_filename, format="wav").close()

                if self.model_name == "canary_1b_flash":
                    # `timestamps=` is deliberately not passed: the model raised
                    # NotImplementedError ("Computing timestamps are not
                    # supported for this model yet.").
                    transcription = self.model.transcribe(
                        audio=[temp_filename],
                        source_lang=self.source_lang,
                        target_lang=self.target_lang,
                        pnc=str(self.pnc),
                    )
                    # rich_transcription_postprocess is a funasr helper, so the
                    # canary text is stored as returned.
                    transcriptions[time_interval] = transcription[0].text
                    transcriptions_obj[time_interval] = transcription

                elif self.model_name == "sense_voice_small":
                    transcription = self.model.generate(
                        input=temp_filename,
                        cache={},
                        language=self.language,
                        use_itn=self.use_itn,
                        batch_size_s=self.batch_size_s,
                        merge_vad=self.merge_vad,
                        merge_length_s=self.merge_length_s,
                    )
                    transcriptions[time_interval] = rich_transcription_postprocess(transcription[0]["text"])
                    transcriptions_obj[time_interval] = transcription
            finally:
                # Remove the segment file even when the model call fails.
                if os.path.exists(temp_filename):
                    os.remove(temp_filename)

        return transcriptions, transcriptions_obj


In [8]:
# Example usage
transcription_model = TranscriptionModel(
    model_name="sense_voice_small",  # or "canary_1b_flash"
    device="cuda" if torch.cuda.is_available() else "cpu",
)

# Build the path with os.path.join: the previous literal used backslashes
# ("\e", "\m"), which are invalid escape sequences and only resolve on Windows.
transcriptions_text, transcriptions_obj = transcription_model.transcribe_audio(
    input_file=os.path.join("audio_files", "en", "min2_speech.mp3"),
    end_time=80,  # seconds
)

funasr version: 1.2.6.
Check update of funasr, and it would cost few times. You may disable it by set `disable_update=True` in AutoModel
You are using the latest version of funasr-1.2.6


Fetching 29 files:   0%|          | 0/29 [00:00<?, ?it/s]

Detect model requirements, begin to install it: C:\Users\baybe\.cache\huggingface\hub\models--FunAudioLLM--SenseVoiceSmall\snapshots\3eb3b4eeffc2f2dde6051b853983753db33e35c3\requirements.txt
fail to install model requirements! 
error   error: subprocess-exited-with-error
  
  × Preparing metadata (pyproject.toml) did not run successfully.
  │ exit code: 1
  ╰─> [19 lines of output]
      + C:\Users\baybe\anaconda3\envs\deb\python.exe C:\Users\baybe\AppData\Local\Temp\pip-install-7hq6tk3n\numpy_231f901a3d40445894f99130f4640cd5\vendored-meson\meson\meson.py setup C:\Users\baybe\AppData\Local\Temp\pip-install-7hq6tk3n\numpy_231f901a3d40445894f99130f4640cd5 C:\Users\baybe\AppData\Local\Temp\pip-install-7hq6tk3n\numpy_231f901a3d40445894f99130f4640cd5\.mesonpy-mcgzeifm -Dbuildtype=release -Db_ndebug=if-release -Db_vscrt=md --native-file=C:\Users\baybe\AppData\Local\Temp\pip-install-7hq6tk3n\numpy_231f901a3d40445894f99130f4640cd5\.mesonpy-mcgzeifm\meson-python-native-file.ini
      The Meson 

rtf_avg: 0.007: 100%|[34m██████████[0m| 1/1 [00:00<00:00,  4.45it/s]

[A
[A
[A
rtf_avg: 0.005: 100%|[34m██████████[0m| 3/3 [00:00<00:00, 20.32it/s]
rtf_avg: 0.005, time_speech:  30.000, time_escape: 0.154: 100%|[31m██████████[0m| 1/1 [00:00<00:00,  6.27it/s]
rtf_avg: 0.003: 100%|[34m██████████[0m| 1/1 [00:00<00:00, 11.14it/s]

[A
[A
[A
rtf_avg: 0.006: 100%|[34m██████████[0m| 3/3 [00:00<00:00, 17.16it/s]
rtf_avg: 0.006, time_speech:  30.000, time_escape: 0.181: 100%|[31m██████████[0m| 1/1 [00:00<00:00,  5.36it/s]
rtf_avg: 0.004: 100%|[34m██████████[0m| 1/1 [00:00<00:00, 13.06it/s]

[A
[A
[A
rtf_avg: 0.006: 100%|[34m██████████[0m| 2/2 [00:00<00:00, 15.36it/s]
rtf_avg: 0.007, time_speech:  20.000, time_escape: 0.136: 100%|[31m██████████[0m| 1/1 [00:00<00:00,  7.10it/s]
Transcribing segments: 100%|██████████| 3/3 [00:00<00:00,  3.26it/s]


In [9]:
# Show the transcript text per segment, keyed by "[start:end]" second ranges.
transcriptions_text

{'[0:30]': "Thank you all for not sleeping in. After I heard all about all the great fun last night, I figured that, you know, Lauren and I would be the only ones here. So thank you for, for joining us.What my hope is. And I'm very pleased to be with all of you and M E I today. My hope is that I can.😊In our short time today, get you to think a little differently, especially about consumers, especially about my world, which is the supermarket and.",
 '[30:60]': "Especially about trends. So what I want to do is get started by showing you an actual advertisement from about 100 years ago. It's a picture of the hog with a child's head on the body.Says makes children and adults as fat as pigs, no cure, No pay price 50 cents Groroves tasteless chillilnic on the market over 20 years,50 cents,100 years ago. We're talking about a very expensive product here.Anybody want to take a guess what's in.",
 '[60:80]': 'That bottle, just yell it out. water, sugar water, whiskey. We know what you were doi

In [6]:
# Show the raw model output stored for each "[start:end]" segment.
transcriptions_obj

{'[0:30]': [{'key': 'temp_segment_0',
   'text': "<|en|><|HAPPY|><|Speech|><|withitn|>Thank you all for not sleeping in. After I heard all about all the great fun last night, I figured that, you know, Lauren and I would be the only ones here. So thank you for, for joining us. <|en|><|HAPPY|><|Speech|><|withitn|>What my hope is. And I'm very pleased to be with all of you and M E I today. My hope is that I can. <|en|><|NEUTRAL|><|Speech|><|withitn|>In our short time today, get you to think a little differently, especially about consumers, especially about my world, which is the supermarket and."}],
 '[30:60]': [{'key': 'temp_segment_1',
   'text': "<|en|><|NEUTRAL|><|Speech|><|withitn|>Especially about trends. So what I want to do is get started by showing you an actual advertisement from about 100 years ago. It's a picture of the hog with a child's head on the body. <|en|><|EMO_UNKNOWN|><|Speech|><|withitn|>Says makes children and adults as fat as pigs, no cure, No pay price 50 cents Gr