In [None]:
!pip install gradio
!pip install git+https://github.com/huggingface/transformers

Collecting gradio
  Downloading gradio-4.21.0-py3-none-any.whl (17.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.0/17.0 MB[0m [31m49.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl (15 kB)
Collecting fastapi (from gradio)
  Downloading fastapi-0.110.0-py3-none-any.whl (92 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.1/92.1 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ffmpy (from gradio)
  Downloading ffmpy-0.3.2.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gradio-client==0.12.0 (from gradio)
  Downloading gradio_client-0.12.0-py3-none-any.whl (310 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.7/310.7 kB[0m [31m34.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting httpx>=0.24.1 (from gradio)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━

In [None]:
import gradio as gr
from transformers import pipeline, AutoTokenizer,AutoModelForSeq2SeqLM
import numpy as np
import torch
import re

# Speech-to-text: Whisper Small checkpoint fine-tuned for Tamil.
transcriber = pipeline("automatic-speech-recognition", model="yaygomii/whisper-small-ta-fyp")


# Summarisation: seq2seq checkpoint plus its (slow, SentencePiece) tokenizer.
model_name = "csebuetnlp/mT5_m2m_crossSum_enhanced"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
target_lang = "tamil"


def get_lang_id(lang):
    """Return the vocabulary id of the token registered for ``lang``.

    The token is looked up in the model config's ``langid_map`` (second entry
    of the value stored under ``lang``) and converted to an id by the tokenizer.
    """
    lang_token = model.config.task_specific_params["langid_map"][lang][1]
    return tokenizer._convert_token_to_id(lang_token)
# Matches any run of whitespace (spaces, tabs, newlines, ...). Written as a raw
# string so that "\s" reaches the regex engine instead of being parsed as an
# (invalid) string escape sequence.
_WHITESPACE_RUN = re.compile(r"\s+")


def WHITESPACE_HANDLER(k):
    """Clean text by trimming it and collapsing whitespace runs to one space.

    Args:
        k: The text to clean.

    Returns:
        ``k`` without leading/trailing whitespace and with every internal run
        of whitespace (including newlines) replaced by a single space.
    """
    # "\s+" already covers newlines, so one substitution is sufficient.
    return _WHITESPACE_RUN.sub(" ", k.strip())




Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


tokenizer_config.json:   0%|          | 0.00/375 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/3.97k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

In [None]:
def summarize(transcript):
    """Summarise ``transcript`` with the seq2seq model, targeting ``target_lang``.

    Args:
        transcript: Text to summarise. ``None``, empty or whitespace-only input
            is accepted and short-circuits to an empty summary.

    Returns:
        The decoded summary string, or ``""`` when there is nothing to
        summarise.
    """
    # Nothing to summarise: skip the (expensive) model call instead of asking
    # the model to generate text from an all-padding input.
    if not transcript or not transcript.strip():
        return ""

    input_text = WHITESPACE_HANDLER(transcript)
    encoded = tokenizer(
        [input_text],
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=512,
    )

    output_ids = model.generate(
        input_ids=encoded["input_ids"],
        # The input is padded to 512 tokens, so tell the model explicitly which
        # positions are padding rather than relying on it being inferred.
        attention_mask=encoded["attention_mask"],
        decoder_start_token_id=get_lang_id(target_lang),
        max_length=84,
        no_repeat_ngram_size=2,
        num_beams=4,
    )[0]

    summary = tokenizer.decode(
        output_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )

    # Drop everything up to and including the first ">" (no-op when the text
    # contains none). NOTE(review): presumably strips a leftover language tag
    # at the start of the decoded text — confirm against the model's output.
    return summary[summary.find(">") + 1:]

In [None]:




def transcribe_stream(stream, new_chunk):
    """Append a newly recorded chunk to the running stream and transcribe it all.

    Args:
        stream: 1-D float32 array with the audio accumulated so far, or
            ``None`` on the first call.
        new_chunk: ``(sample_rate, samples)`` tuple for the latest chunk, where
            ``samples`` is a NumPy array (multi-channel input is assumed to be
            channel-last, i.e. ``(n_samples, n_channels)``).

    Returns:
        ``(stream, text)``: the updated accumulated audio and the transcription
        of the whole accumulated audio.
    """
    sr, y = new_chunk

    # The ASR pipeline expects a single channel: down-mix stereo to mono.
    if y.ndim > 1:
        y = y.mean(axis=1)
    y = y.astype(np.float32)

    # Peak-normalise, but leave silent/empty chunks untouched: dividing by a
    # zero peak would produce NaNs that then stay in the accumulated stream.
    peak = np.max(np.abs(y)) if y.size else 0.0
    if peak > 0:
        y /= peak

    if stream is not None:
        stream = np.concatenate([stream, y])
    else:
        stream = y

    return stream, transcriber({"sampling_rate": sr, "raw": stream})["text"]

def transcribe(audio):
    """Transcribe one complete recording.

    Args:
        audio: ``(sample_rate, samples)`` tuple, where ``samples`` is a NumPy
            array (multi-channel input is assumed to be channel-last), or
            ``None`` when no audio was provided.

    Returns:
        The transcription text, or ``""`` when ``audio`` is ``None``.
    """
    # The button can be clicked before anything was recorded or uploaded.
    if audio is None:
        return ""

    sr, y = audio

    # The ASR pipeline expects a single channel: down-mix stereo to mono.
    if y.ndim > 1:
        y = y.mean(axis=1)
    y = y.astype(np.float32)

    # Peak-normalise; skip silent/empty audio to avoid dividing by zero.
    peak = np.max(np.abs(y)) if y.size else 0.0
    if peak > 0:
        y /= peak

    return transcriber({"sampling_rate": sr, "raw": y})["text"]

def split_audio_into_chunks(audio, chunk_seconds=5):
    """Yield consecutive fixed-duration slices of an audio clip.

    Args:
        audio: ``(sample_rate, samples)`` tuple; ``samples`` is any sliceable
            sequence indexed by sample along its first axis.
        chunk_seconds: Duration of each chunk in seconds (default 5).

    Yields:
        ``(sample_rate, chunk)`` tuples in order. Every chunk holds
        ``int(sample_rate * chunk_seconds)`` samples except possibly the last,
        which holds whatever remains. Nothing is yielded for empty audio.

    Raises:
        ValueError: If the computed chunk size is not positive.
    """
    sr, y = audio
    chunk_size = int(sr * chunk_seconds)
    if chunk_size <= 0:
        raise ValueError(
            f"chunk size must be positive, got {chunk_size} samples "
            f"(sample rate {sr}, chunk_seconds {chunk_seconds})"
        )

    # Slicing past the end simply truncates, so the trailing partial chunk
    # needs no special case.
    for start in range(0, len(y), chunk_size):
        yield sr, y[start:start + chunk_size]

def file_transcribe(audio, transcript):
    """Transcribe a long clip chunk by chunk, streaming the growing transcript.

    Args:
        audio: ``(sample_rate, samples)`` tuple, where ``samples`` is a NumPy
            array (multi-channel input is assumed to be channel-last), or
            ``None`` when no file was provided.
        transcript: Text accumulated so far; new chunk transcriptions are
            appended to it, separated by a single space.

    Yields:
        ``(transcript, transcript)`` after each chunk — the same text twice,
        one value per output the caller wires up.
    """
    # NOTE(review): a non-empty incoming ``transcript`` is extended rather than
    # replaced, so text from an earlier run is kept — confirm this is intended.

    # Nothing uploaded: report the current text unchanged instead of crashing.
    if audio is None:
        yield transcript, transcript
        return

    for sr, chunk_data in split_audio_into_chunks(audio):
        # The ASR pipeline expects a single channel: down-mix stereo to mono.
        if chunk_data.ndim > 1:
            chunk_data = chunk_data.mean(axis=1)
        chunk_data = chunk_data.astype(np.float32)

        # Peak-normalise; skip silent/empty chunks to avoid dividing by zero.
        peak = np.max(np.abs(chunk_data)) if chunk_data.size else 0.0
        if peak > 0:
            chunk_data /= peak

        chunk_text = transcriber({"sampling_rate": sr, "raw": chunk_data})["text"]
        if transcript == "":
            transcript = chunk_text
        else:
            transcript += " " + chunk_text

        yield transcript, transcript

In [None]:
# Build the three-tab UI. Component variables (audio, text, ...) are reused
# per tab; each is wired to its handlers before the name is rebound.
with gr.Blocks() as demo:
    gr.Markdown("""# Tamil Transcription Service""")
    gr.Markdown("""Created by finetuning a Whisper Small model using tamil dialect based dataset""")

    # Tab 1: microphone streaming; `stream` holds the audio accumulated so far.
    with gr.Tab("Live transcribe"):
        gr.Markdown("""Record audio live and get transcriptions""")

        stream = gr.State()
        audio = gr.Audio(streaming=True)
        text = gr.Textbox()
        audio.stream(transcribe_stream, [stream, audio], [stream, text])
        # Also reset the accumulated stream; otherwise the next recording is
        # appended to the old audio and the cleared text comes back.
        clear_btn = gr.ClearButton([text, audio, stream])

    # Tab 2: one-shot transcription of a complete recording.
    with gr.Tab("Transcribe (short audio)"):
        gr.Markdown("""Record audios and get transcriptions""")

        audio = gr.Audio()
        text = gr.Textbox()
        with gr.Row():
            transcribe_btn = gr.Button()
            clear_btn = gr.ClearButton([text, audio])
        transcribe_btn.click(transcribe, audio, text)

    # Tab 3: chunked transcription of an uploaded file, plus summarisation.
    with gr.Tab("Streaming transcribe (long audio)"):
        gr.Markdown("""Upload long audios and get streaming transcriptions""")
        with gr.Row():
            with gr.Column():
                transcript = gr.State("")
                audio = gr.Audio(sources=["upload"])
                text = gr.Textbox()
                with gr.Row():
                    transcribe_btn = gr.Button()
                    # Also reset the stored transcript; otherwise the next
                    # transcription is appended to the text that was cleared.
                    clear_btn = gr.ClearButton([text, audio, transcript])
                    stop_btn = gr.Button("Cancel")
                transcribe_evt = transcribe_btn.click(file_transcribe, [audio, transcript], [text, transcript])
                # Cancel aborts the in-flight (generator) transcription event.
                stop_btn.click(fn=None, inputs=None, outputs=None, cancels=[transcribe_evt])
            with gr.Column():
                summary_text = gr.Textbox(label="Summary")
                summary_btn = gr.Button("Summarize")
                summary_btn.click(summarize, text, summary_text)


demo.launch()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://6c6202fec68ff4a640.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


