In [1]:
from datasets import load_from_disk

# Load the saved dataset and expose the "audio" column as NumPy arrays;
# output_all_columns=True keeps the remaining columns in each returned row.
dataset = load_from_disk("../datasets/dataset_fish").with_format(
    "numpy",
    columns=["audio"],
    output_all_columns=True,
)
dataset[0]


  from .autonotebook import tqdm as notebook_tqdm


{'audio': array([0.0005188 , 0.00064087, 0.00057983, ..., 0.00036621, 0.00064087,
        0.0005188 ], shape=(57344,), dtype=float32),
 'time_string': '12:00 AM',
 'hour': 0,
 'minute': 0,
 'period': 'AM',
 'text': 'The time is 12:00 AM.',
 'transcribed_text': ' The time is 12:00 AM.\n'}

In [2]:
from IPython.display import Audio

# Spot-check one clip by ear; the waveform is played back at 24000 Hz.
sample_audio = dataset[500]["audio"]
Audio(sample_audio, rate=24000)


In [3]:
# NOTE: this is a hack; I fixed this for subsequent runs.
# Rebuild the expected sentence for every row from its `time_string` column.
def _ground_truth_from_time_string(row):
    """Return the new ``ground_truth`` column value for one row."""
    return {"ground_truth": f"The time is {row['time_string']}."}


dataset = dataset.map(_ground_truth_from_time_string)
dataset[0]

{'audio': array([0.0005188 , 0.00064087, 0.00057983, ..., 0.00036621, 0.00064087,
        0.0005188 ], shape=(57344,), dtype=float32),
 'ground_truth': np.str_('The time is 12:00 AM.'),
 'time_string': '12:00 AM',
 'hour': 0,
 'minute': 0,
 'period': 'AM',
 'text': 'The time is 12:00 AM.',
 'transcribed_text': ' The time is 12:00 AM.\n'}

In [4]:
import pandas as pd
from dateparser import parse

def grade_row(row):
    """Grade one transcription against the row's expected hour and minute.

    The transcript must contain the literal phrase "The time is". The text
    between the first occurrence of that phrase and the next occurrence (or
    the end of the string) is stripped of whitespace and passed to
    ``dateparser.parse``.

    Args:
        row: Mapping with ``transcribed_text`` (a string, or ``None`` when no
            transcript is available), ``hour`` and ``minute``.

    Returns:
        A dict ``{"is_correct": bool, "error": str | None}``. ``error`` is
        ``"failed_prefix"`` when the phrase is absent (or the transcript is
        ``None``), ``"no_time"`` when the parser returns ``None``,
        ``"wrong_time"`` when hour or minute disagree, and ``None`` when the
        row is graded correct.

    Note:
        Hours are compared modulo 12, so this grader does not distinguish
        AM from PM: a transcript saying "5:00 AM" is accepted for hour 17.
    """
    transcript = row["transcribed_text"]

    # A missing transcript is graded like an empty one instead of raising
    # TypeError on the `in` check and aborting the whole `map` call.
    if transcript is None or "The time is" not in transcript:
        return {"is_correct": False, "error": "failed_prefix"}

    # Element [1] is the text right after the first "The time is".
    spoken_time = transcript.split("The time is")[1].strip()
    date = parse(spoken_time)
    if date is None:
        return {"is_correct": False, "error": "no_time"}

    # 12-hour comparison: the AM/PM period is intentionally not checked here.
    same_hour = date.hour % 12 == row["hour"] % 12
    same_minute = date.minute == row["minute"]
    if same_hour and same_minute:
        return {"is_correct": True, "error": None}
    return {"is_correct": False, "error": "wrong_time"}


# Grade every row with 12 worker processes; the keys returned by grade_row
# (`is_correct`, `error`) become new columns on the dataset.
dataset = dataset.map(
    function=grade_row,
    num_proc=12,
)


In [5]:
# Tally how many rows were graded correct versus incorrect.
is_correct_flags = pd.Series(dataset["is_correct"])
is_correct_flags.value_counts()

True     1247
False     193
Name: count, dtype: int64

## Error analysis of known good OAI output

In [6]:
# Keep only the rows the grader rejected, report how many there are,
# and peek at the first five.
ds_failed = dataset.filter(lambda row: not row["is_correct"])
n_failed = len(ds_failed)
print(f"Failed: {n_failed}")
ds_failed[:5]

Failed: 193


{'audio': array([array([-0.00024414, -0.00024414,  0.        , ...,  0.00033569,
                0.00024414,  0.00021362], shape=(67584,), dtype=float32),
        array([ 0.00177002,  0.00192261,  0.00158691, ..., -0.00045776,
               -0.00057983, -0.00057983], shape=(51200,), dtype=float32),
        array([-0.00183105, -0.0015564 , -0.00170898, ..., -0.00024414,
               -0.00036621,  0.        ], shape=(53248,), dtype=float32),
        array([-0.00161743, -0.00158691, -0.00128174, ...,  0.00192261,
                0.00195312,  0.00195312], shape=(53248,), dtype=float32),
        array([-0.00115967, -0.00140381, -0.00125122, ..., -0.00094604,
               -0.00088501, -0.00082397], shape=(71680,), dtype=float32)],
       dtype=object),
 'ground_truth': array(['The time is 12:01 AM.', 'The time is 12:07 AM.',
        'The time is 12:14 AM.', 'The time is 12:20 AM.',
        'The time is 12:22 AM.'], dtype='<U21'),
 'is_correct': array([False, False, False, False, False])

In [7]:
# Break the failures down by the grader's error label.
failure_reasons = pd.Series(ds_failed["error"])
failure_reasons.value_counts()

no_time          100
wrong_time        63
failed_prefix     30
Name: count, dtype: int64

In [8]:
# Export the failed rows (text columns only, no audio) for manual review.
failed_text_only = ds_failed.select_columns(
    ["transcribed_text", "error", "hour", "minute"]
)
# Trim surrounding whitespace from each transcript before writing.
failed_text_only = failed_text_only.map(
    lambda row: {"transcribed_text": row["transcribed_text"].strip()}
)
failed_text_only.to_csv("ds_failed_text_only.csv", index=False)

Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 624.80ba/s]


7770

In [49]:
# Inspect one failure where the grader could not parse a time (`no_time`):
# show its text and expected hour/minute, then listen to the clip.
no_time_failures = ds_failed.filter(lambda row: row["error"] == "no_time")
failed_gen_1 = no_time_failures[9]
print(failed_gen_1["text"])
print(failed_gen_1["hour"], failed_gen_1["minute"])

Audio(failed_gen_1["audio"], rate=24000)

Filter: 100%|██████████| 35/35 [00:00<00:00, 5243.82 examples/s]

The time is five o'clock PM.
17 0





In [60]:
# Inspect one failure where the transcript lacked "The time is"
# (`failed_prefix`): show its text and expected hour/minute, then listen.
prefix_failures = ds_failed.filter(lambda row: row["error"] == "failed_prefix")
failed_gen_1 = prefix_failures[7]
print(failed_gen_1["text"])
print(failed_gen_1["hour"], failed_gen_1["minute"])

Audio(failed_gen_1["audio"], rate=24000)

What time is 5:28 AM?
5 28
