In [22]:
import pandas as pd
import numpy as np
import os
import time

import whisper

In [24]:
def stat_print(list_x, name, precision=4, quantile=False):
    if quantile:
        print(
            "Mean " + name + " =", np.round(np.nanmean(list_x), precision),
            "+/-", np.round(np.nanstd(list_x), precision),
            "; Median =", np.round(np.nanmedian(list_x), precision),
            "; Min. = ", np.round(np.nanmin(list_x), precision),
            "; Max. = ", np.round(np.nanmax(list_x), precision),
            "\n Quantile 1%", np.round(np.nanquantile(list_x, 0.01), precision),
            "Quantile 99%", np.round(np.nanquantile(list_x, 0.99), precision)
        )
    else:
        print(
            "Mean " + name + " =", np.round(np.nanmean(list_x), precision),
            "+/-", np.round(np.nanstd(list_x), precision),
            "; Median =", np.round(np.nanmedian(list_x), precision),
            "; Min. = ", np.round(np.nanmin(list_x), precision),
            "; Max. = ", np.round(np.nanmax(list_x), precision)
        )

In [13]:
audio_files = [f for f in os.listdir() if ".wav" in f]
print(audio_files)

['audio_bot_aws.wav', 'Subject_Jose_T=0.wav', 'Subject_Jose_T=1.wav', 'Subject_Jose_T=2.wav', 'Subject_Jose_T=3.wav', 'Subject_Jose_T=4.wav', 'Subject_Jose_T=5.wav', 'Subject_Jose_T=6.wav', 'Subject_Jose_T=7.wav']


In [3]:
model = whisper.load_model("base")

100%|███████████████████████████████████████| 139M/139M [00:11<00:00, 12.4MiB/s]


In [15]:
save_results_list = []
for f in audio_files:
    t0 = time.time()
    result = model.transcribe(f)
    delta_t = time.time() - t0
    
    save_results_list.append({
        "AudioFile": f,
        "Text": result["text"],
        "DeltaTime": delta_t,
        "compression_ratio": result["segments"][0]["compression_ratio"],
        "no_speech_prob": result["segments"][0]["no_speech_prob"],
        "Language": result["language"],
    })



Detected language: spanish
Detected language: spanish
Detected language: spanish
Detected language: spanish
Detected language: spanish
Detected language: spanish
Detected language: spanish
Detected language: spanish
Detected language: spanish


In [27]:
df_s2t = pd.DataFrame(save_results_list)
display(df_s2t.head(10))
df_s2t.to_csv("SaveOpenAIWhisper.csv", index=False)

Unnamed: 0,AudioFile,Text,DeltaTime,compression_ratio,no_speech_prob,Language
0,audio_bot_aws.wav,"De verdad, eso es increíble. Me siento muy a ...",3.800838,0.883333,0.186058,es
1,Subject_Jose_T=0.wav,"Hola María, ¿cómo estás? Yo también estoy muy...",3.859542,0.871795,0.18111,es
2,Subject_Jose_T=1.wav,Sí que yo me llamo José y estoy muy contento ...,3.912071,0.876712,0.058637,es
3,Subject_Jose_T=2.wav,"El placer es mi omaria, ¿a qué te dedicas?",3.653322,0.807692,0.178047,es
4,Subject_Jose_T=3.wav,te doyeron lyingwen que estudi hasta tres,12.668211,0.854167,0.243649,es
5,Subject_Jose_T=4.wav,"Ah, qué guay, creo que los psicólogos son muy...",4.398523,0.95,0.045852,es
6,Subject_Jose_T=5.wav,ahí me alegro mucho entonces María la verdad ...,4.214833,1.102041,0.131233,es
7,Subject_Jose_T=6.wav,seguro que sí que lo conseguirás,2.900639,0.842105,0.022074,es
8,Subject_Jose_T=7.wav,Tás fuento lo camaría. No sé qué me estás dic...,3.705381,0.836066,0.144891,es


In [25]:
stat_print(df_s2t["DeltaTime"], "DeltaTime", precision=4, quantile=False)

Mean DeltaTime = 4.7904 +/- 2.8127 ; Median = 3.8595 ; Min. =  2.9006 ; Max. =  12.6682


In [9]:
# load audio and pad/trim it to fit 30 seconds
audio = whisper.load_audio("audio_bot_aws.wav")
audio = whisper.pad_or_trim(audio)

# make log-Mel spectrogram and move to the same device as the model
mel = whisper.log_mel_spectrogram(audio).to(model.device)

# detect the spoken language
_, probs = model.detect_language(mel)
print(f"Detected language: {max(probs, key=probs.get)}")

result = model.transcribe("audio_bot_aws.wav")

# decode the audio
# options = whisper.DecodingOptions()
# result = whisper.decode(model, mel, options)

# print the recognized text
print(result["text"])

Detected language: es
Detected language: spanish
 De verdad, eso es increíble. Me siento muy a la gada.


In [14]:
result

{'text': ' De verdad, eso es increíble. Me siento muy a la gada.',
 'segments': [{'id': 0,
   'seek': 0,
   'start': 0.0,
   'end': 6.0,
   'text': ' De verdad, eso es increíble. Me siento muy a la gada.',
   'tokens': [50364,
    1346,
    13692,
    11,
    7287,
    785,
    46202,
    638,
    13,
    1923,
    40340,
    5323,
    257,
    635,
    290,
    1538,
    13,
    50664],
   'temperature': 0.0,
   'avg_logprob': -0.36580140967118113,
   'compression_ratio': 0.8833333333333333,
   'no_speech_prob': 0.18605804443359375}],
 'language': 'es'}