In [1]:
import gradio as gr
import json
import torch
import numpy as np

class TweetDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
    

In [2]:
def whisper_pipeline(audio_path, json_path):
    import whisper_timestamped as whisper_t

    audio = whisper_t.load_audio(audio_path)

    model = whisper_t.load_model("medium")

    # print(model)
    result = whisper_t.transcribe(model, audio, language='en')
    with open(json_path,'w') as f:
        json.dump(result,f)

    model.to(torch.device('cpu'))
    del model
    torch.cuda.empty_cache()



In [3]:
def SOLID_pipeline(json_in_path, json_out_path):
    import torch
    from transformers import BertForSequenceClassification, BertTokenizer
    import pandas as pd
    model = BertForSequenceClassification.from_pretrained('models/SOLID_finetuneHSO_1')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    data = ''
    with open(json_in_path, 'r') as f:
        data = json.loads(f.read())
        f.close()

    data = data['segments']
    df = pd.DataFrame(columns=["start","end","text"])
    for d in data:
        start = d.get("start")
        end = d.get("end")
        text = d.get("text")
        df.loc[len(df)] = [start,end,text]

    text = df['text'].values
    
    encodings = tokenizer(text.tolist(), truncation=True, padding=True)

    dataset = TweetDataset(encodings, np.zeros(text.shape))

    loader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=False)

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)

    print(model)
    
    model.eval()
    # Perform evaluation on validation set and calculate metrics as needed
    # Example: calculate accuracy
    # correct = 0
    # total = 0
    i = 0
    prediction_list = np.array([])

    with torch.no_grad():
        # test_start  = time.time()
        print('start')
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=1)
            prediction_list = np.append(prediction_list, predictions.detach().cpu().numpy())
        print('end')
        
        # test_end = time.time()
    
    df_out = df.assign(predictions=prediction_list)

    df_out.to_json(json_out_path)

    model.to(torch.device('cpu'))
    del model
    torch.cuda.empty_cache()

In [4]:
def run_audio_model(audio_path):
    data = {'default':'true', 'audio':'false'}
    temp_file_path = './json_samples/obj_sample.json'

    if audio_path != None:
        whisper_pipeline(audio_path, temp_file_path)
        with open(temp_file_path, 'r') as f:
            data = json.load(f)
            f.close()
    else:
        data = {'default':'false', 'audio':'false'}
    out_f = open(temp_file_path, 'w')
    json.dump(data, fp=out_f)
    out_f.close()
    return [data, gr.File(value=temp_file_path, file_types=['.json'])]

In [5]:
def run_text_model(input_filepath):
    output_filepath = 'json_samples/output_sample.json'
    if type(input_filepath) == dict:
        return input_filepath
    elif type(input_filepath) == gr.utils.NamedString:
        SOLID_pipeline(input_filepath, output_filepath)
        return json.load(open(output_filepath)) #'This is the text inference output'
    else:
        return None
    

In [6]:

with gr.Blocks() as interface:
    gr.Markdown("Upload audio file below, click **run** to see the output.")


    with gr.Column():
        with gr.Row():
            audio_inp = gr.File(file_types=['audio'])
            audio_out = gr.JSON()
        
        btn_audio = gr.Button("Run stt")
        
    
    with gr.Column():
        gr.Markdown("Upload JSON file below, click **run** to see the output.")
        with gr.Row():
            text_inp = gr.File(file_types=['.json'])
            text_out = gr.JSON()
        btn_text = gr.Button("Run text inference")
        btn_text.click(fn=run_text_model, inputs=text_inp, outputs=text_out)
        
    btn_audio.click(fn=run_audio_model, inputs=audio_inp, outputs=[audio_out,text_inp])



In [7]:
interface.launch(share=False)  # Share your demo with just 1 extra parameter ðŸš€


Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




IMPORTANT: You are using gradio version 4.19.2, however version 4.29.0 is available, please upgrade.
--------


  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
Traceback (most recent call last):
  File "/home/cymn/miniconda3/envs/nvidia/lib/python3.11/site-packages/gradio/queueing.py", line 495, in call_prediction
    output = await route_utils.call_process_api(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/cymn/miniconda3/envs/nvidia/lib/python3.11/site-packages/gradio/route_utils.py", line 235, in call_process_api
    output = await app.get_blocks().process_api(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/cymn/miniconda3/envs/nvidia/lib/python3.11/site-packages/gradio/blocks.py", line 1627, in process_api
    result = await self.call_function(
             ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/cymn/miniconda3/envs/nvidia/lib/python3.11/site-packages/gradio/blocks.py", line 1173, in call_function
    prediction = await anyio.to_thread.run_sync(
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Importing the dtw module. When using in academic works please cite:
  T. Giorgino. Computing and Visualizing Dynamic Time Warping Alignments in R: The dtw Package.
  J. Stat. Soft., doi:10.18637/jss.v031.i07.



100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 24089/24089 [02:58<00:00, 134.67frames/s]


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

  return dynamo.is_compiling()


end
