In [1]:
import boto3
import json
import base64
import requests
import time
import pandas as pd

In [2]:
# load client and resource
sd_session = boto3.session.Session(profile_name='smart-dyspnea')
s3_client = sd_session.client('s3')
transcribe_client = sd_session.client('transcribe')

In [3]:
with open('./data/ml_data.json', 'r') as f:
    data = json.loads(f.read())

In [11]:
audio = base64.b64decode(data[76]['audio']['data'].encode('utf-8'))

In [15]:
type(data[76]['audio']['data'])

str

In [18]:
with open('./data/audio.mp3', 'wb') as f:
    f.write(audio)

In [21]:
response = s3_client.upload_file(Bucket='sd-test-ml', Key='audios/audio.mp3', Filename="./data/audio.mp3")

In [34]:
response = transcribe_client.start_transcription_job(
    TranscriptionJobName='test-clase-1',
    LanguageCode='es-ES',
    MediaFormat='mp3',
    Media={
        'MediaFileUri': f"s3://sd-test-ml/audios/audio.mp3"
    },
#    Settings={
#        'ShowAlternatives': True,
#        'MaxAlternatives': 3,
#        'VocabularyFilterName': 'string',
#        'VocabularyFilterMethod': 'remove'|'mask'
#    },
)

In [35]:
response

{'TranscriptionJob': {'TranscriptionJobName': 'test-clase-1',
  'TranscriptionJobStatus': 'IN_PROGRESS',
  'LanguageCode': 'es-ES',
  'MediaFormat': 'mp3',
  'Media': {'MediaFileUri': 's3://sd-test-ml/audios/audio.mp3'},
  'StartTime': datetime.datetime(2021, 3, 17, 18, 22, 4, 47000, tzinfo=tzlocal()),
  'CreationTime': datetime.datetime(2021, 3, 17, 18, 22, 4, 26000, tzinfo=tzlocal())},
 'ResponseMetadata': {'RequestId': '0957824a-e512-4be2-89f2-4e4d54d2dd05',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'content-type': 'application/x-amz-json-1.1',
   'date': 'Wed, 17 Mar 2021 17:22:03 GMT',
   'x-amzn-requestid': '0957824a-e512-4be2-89f2-4e4d54d2dd05',
   'content-length': '263',
   'connection': 'keep-alive'},
  'RetryAttempts': 0}}

In [52]:
response = transcribe_client.get_transcription_job(
    TranscriptionJobName='test-clase-1'
)
response

{'TranscriptionJob': {'TranscriptionJobName': 'test-clase-1',
  'TranscriptionJobStatus': 'COMPLETED',
  'LanguageCode': 'es-ES',
  'MediaSampleRateHertz': 48000,
  'MediaFormat': 'mp3',
  'Media': {'MediaFileUri': 's3://sd-test-ml/audios/audio.mp3'},
  'Transcript': {'TranscriptFileUri': 'https://s3.eu-west-1.amazonaws.com/aws-transcribe-eu-west-1-prod/294604510371/test-clase-1/fadc6ae1-7295-44f1-964e-0ae071398422/asrOutput.json?X-Amz-Security-Token=IQoJb3JpZ2luX2VjEPn%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCWV1LXdlc3QtMSJIMEYCIQDDyiIMaTCxwBsLs6YZP205uRemqrhrZYSBBngevnVifAIhAKhBJ5Nye29uhXsBe4fp0QHKXujW1Nd9Kyp1lWelDYf0KrQDCDIQAhoMNTg3MDE3NjYzNDE3IgxIINUq4rdl4D%2FWif8qkQMguMO%2FL7zkrjrFt949DGdTbfjIbyIaEKCV4kJO%2BMIN6qwdBqu8VFrVXvHEflJXtrIxirWzj%2F8Uwc%2Fyp10NXBcmGTGzRSZRnmhLrmDbmsa151ErP88DefseqFWLMDODzGLcNbR7KSXYbdU%2B3sCuHS2vqVGioHzGM2rgvz72MEr9XeFmkK0VCxw1zqZ2DJEgnJWlVWr0eewKEkviASQ8Yl6NGoGN0XtslEUkTzBaguFql653sRavjNejjEzPhyslvDFpPb3H%2BQ5MsOdFuWqavev1WPIwm2hADbxe8rNsbxSNnPkuTMGhJvt5R%2F0B5

In [53]:
output = requests.get(response["TranscriptionJob"]["Transcript"]["TranscriptFileUri"]).json()

In [54]:
output

{'jobName': 'test-clase-1',
 'accountId': '294604510371',
 'results': {'transcripts': [{'transcript': 'uno dos tres cuatro cinco seis siete ocho nueve diez once doce trece catorce quince. Vicenti siete ocho ocho nueve. En entre un noventa y dos veintitrés. Veinticuatro veinticinco, veintiséis veintisiete veintiocho veintinueve treinta.'}],
  'items': [{'start_time': '0.0',
    'end_time': '0.12',
    'alternatives': [{'confidence': '0.7208', 'content': 'uno'}],
    'type': 'pronunciation'},
   {'start_time': '0.12',
    'end_time': '0.4',
    'alternatives': [{'confidence': '0.9988', 'content': 'dos'}],
    'type': 'pronunciation'},
   {'start_time': '0.4',
    'end_time': '0.66',
    'alternatives': [{'confidence': '1.0', 'content': 'tres'}],
    'type': 'pronunciation'},
   {'start_time': '0.66',
    'end_time': '1.02',
    'alternatives': [{'confidence': '1.0', 'content': 'cuatro'}],
    'type': 'pronunciation'},
   {'start_time': '1.02',
    'end_time': '1.35',
    'alternatives': 

### Creating a filter vocabulary

In [55]:
response = transcribe_client.create_vocabulary_filter(
    VocabularyFilterName='numbers',
    LanguageCode='es-ES',
    Words=[
        'uno','siete','diez','treinta'
    ]
)

In [58]:
response = transcribe_client.start_transcription_job(
    TranscriptionJobName='test-clase-2',
    LanguageCode='es-ES',
    MediaFormat='mp3',
    Media={
        'MediaFileUri': f"s3://sd-test-ml/audios/audio.mp3"
    },
    Settings={
#        'ShowAlternatives': True,
#        'MaxAlternatives': 3,
        'VocabularyFilterName': 'numbers',
#        'VocabularyFilterMethod': 'remove'|'mask'
    },
)

In [59]:
%timeit
response = {
    'TranscriptionJob': {
      'TranscriptionJobStatus': ''
    }
}

while response['TranscriptionJob']['TranscriptionJobStatus'] != 'COMPLETED':
    time.sleep(1)
    response = transcribe_client.get_transcription_job(
        TranscriptionJobName='test-clase-2'
    )
    print(response['TranscriptionJob']['TranscriptionJobStatus'])

IN_PROGRESS
IN_PROGRESS
IN_PROGRESS
IN_PROGRESS
IN_PROGRESS
IN_PROGRESS
IN_PROGRESS
IN_PROGRESS
IN_PROGRESS
IN_PROGRESS
IN_PROGRESS
IN_PROGRESS
IN_PROGRESS
IN_PROGRESS
IN_PROGRESS
COMPLETED


In [60]:
response

{'TranscriptionJob': {'TranscriptionJobName': 'test-clase-2',
  'TranscriptionJobStatus': 'COMPLETED',
  'LanguageCode': 'es-ES',
  'MediaSampleRateHertz': 48000,
  'MediaFormat': 'mp3',
  'Media': {'MediaFileUri': 's3://sd-test-ml/audios/audio.mp3'},
  'Transcript': {'TranscriptFileUri': 'https://s3.eu-west-1.amazonaws.com/aws-transcribe-eu-west-1-prod/294604510371/test-clase-2/52869a72-a64c-4737-9b66-1fe9ac6fc4ca/asrOutput.json?X-Amz-Security-Token=IQoJb3JpZ2luX2VjEPn%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCWV1LXdlc3QtMSJHMEUCIQCHhI97bQonqrR3C9%2BHQE%2F5tXQ%2FiBticrVBZbYca%2F7BZgIgF8KSHanRa9y9lYW%2BsoKzjHXX4nmO70YnNw8jnzdKN5QqtAMIMhACGgw1ODcwMTc2NjM0MTciDEjkeEXBCJXz6VDaXCqRAw5TRmzSbcLBPL6PT6ur4aqjRpvsLrQILZOhfBTmbl0m4OXIlAzHtgZKbaCRS9E3NeL4Uzx3cYGzOch2UB%2F0EgJRTtLIkAF1uNDAK4ekmnlGraSW%2FeQet9acKX7QDTvOp%2FWXkmzE6Fo3K2RcNu6AV%2FyYq1iERVdLlUAPIr63TpbiZ%2FS3DQL%2B20tgR9yZe5sNt2evL3sl4qs4I2QVjWmyOIlQoSlpatBDgr%2BIR%2B%2Fqwlrs%2FGs5zuY3d92RskZwiOteOkJIeMnTKabkEad1u84oFUcn3sxTw6vTkOXMGWYhYG29DGW

In [61]:
output = requests.get(response["TranscriptionJob"]["Transcript"]["TranscriptFileUri"]).json()

In [62]:
output

{'jobName': 'test-clase-2',
 'accountId': '294604510371',
 'results': {'transcripts': [{'transcript': '*** dos tres cuatro cinco seis *** ocho nueve *** once doce trece catorce quince. Vicenti *** ocho ocho nueve. En entre un noventa y dos veintitrés. Veinticuatro veinticinco, veintiséis veintisiete veintiocho veintinueve ***.'}],
  'items': [{'start_time': '0.0',
    'end_time': '0.12',
    'alternatives': [{'confidence': '0.7208', 'content': '***'}],
    'type': 'pronunciation'},
   {'start_time': '0.12',
    'end_time': '0.4',
    'alternatives': [{'confidence': '0.9988', 'content': 'dos'}],
    'type': 'pronunciation'},
   {'start_time': '0.4',
    'end_time': '0.66',
    'alternatives': [{'confidence': '1.0', 'content': 'tres'}],
    'type': 'pronunciation'},
   {'start_time': '0.66',
    'end_time': '1.02',
    'alternatives': [{'confidence': '1.0', 'content': 'cuatro'}],
    'type': 'pronunciation'},
   {'start_time': '1.02',
    'end_time': '1.35',
    'alternatives': [{'confid

### Creating vocabulary

In [63]:
response = transcribe_client.create_vocabulary(
    VocabularyName='numbers',
    LanguageCode='es-ES',
    Phrases=[
        'uno',
        'siete',
        'diez',
        'treinta'
    ]
)

In [64]:
response = transcribe_client.start_transcription_job(
    TranscriptionJobName='test-clase-3',
    LanguageCode='es-ES',
    MediaFormat='mp3',
    Media={
        'MediaFileUri': f"s3://sd-test-ml/audios/audio.mp3"
    },
    Settings={
        'VocabularyName': 'numbers'
    }
)

In [67]:
response = {
    'TranscriptionJob': {
      'TranscriptionJobStatus': ''
    }
}

while response['TranscriptionJob']['TranscriptionJobStatus'] != 'COMPLETED':
    time.sleep(1)
    response = transcribe_client.get_transcription_job(
        TranscriptionJobName='test-clase-3'
    )
    print(response['TranscriptionJob']['TranscriptionJobStatus'])

COMPLETED


In [68]:
response

{'TranscriptionJob': {'TranscriptionJobName': 'test-clase-3',
  'TranscriptionJobStatus': 'COMPLETED',
  'LanguageCode': 'es-ES',
  'MediaSampleRateHertz': 48000,
  'MediaFormat': 'mp3',
  'Media': {'MediaFileUri': 's3://sd-test-ml/audios/audio.mp3'},
  'Transcript': {'TranscriptFileUri': 'https://s3.eu-west-1.amazonaws.com/aws-transcribe-eu-west-1-prod/294604510371/test-clase-3/abb85346-67bf-4ce0-a55d-140e1235db40/asrOutput.json?X-Amz-Security-Token=IQoJb3JpZ2luX2VjEPr%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCWV1LXdlc3QtMSJHMEUCIQCZS0d4ZPgIiUed1gfuv%2BTomW1owneYIvz%2BVxsZlRYnagIgVvip3hmfeTBftfNRhoPJTElTXCk8Gl0PGbU6FnsF0IAqtAMIMhACGgw1ODcwMTc2NjM0MTciDCShaUSVSplQnowvXSqRA7wMLjYmZujrOQmmOi5uVgzISNLQXO7Iu5oTTQJodM%2B2xmVeb7hHIKlTwISf%2FvGh1V1Ulk%2Bz90k9AhKtmTmyjHLuIhRv39tdcQZqppszHm9Kmo1NtTVXv1KvGHBO20EL3GxqgLlUalT0LgNY7FTj8VJnaFJJuVT2Om2EiSwfL92pB71Cd7m90Ue%2BIGGZHDA0tQpY54wTXilKw9ItHgOaFRkOvbcae%2FcVe8%2FFdahYUhnxY1%2FpCHP7l1CaJKoWuYhaYNv9cDRY4tYALSxDRF7cvGCzGIYQsVlg7McKt5zqKE8JI08lXL7%2FJLcKH

In [69]:
output = requests.get(response["TranscriptionJob"]["Transcript"]["TranscriptFileUri"]).json()

In [70]:
output

{'jobName': 'test-clase-3',
 'accountId': '294604510371',
 'results': {'transcripts': [{'transcript': 'uno dos, tres cuatro cinco, seis siete, ocho nueve, diez once, doce trece catorce quince Vicenti siete chochona deben entre un noventa y dos veintitrés veinticuatro y cinco mil seis siete veintiocho mil nueve treinta.'}],
  'items': [{'start_time': '0.0',
    'end_time': '0.12',
    'alternatives': [{'confidence': '0.7161', 'content': 'uno'}],
    'type': 'pronunciation'},
   {'start_time': '0.12',
    'end_time': '0.4',
    'alternatives': [{'confidence': '0.9989', 'content': 'dos'}],
    'type': 'pronunciation'},
   {'alternatives': [{'confidence': '0.0', 'content': ','}],
    'type': 'punctuation'},
   {'start_time': '0.4',
    'end_time': '0.66',
    'alternatives': [{'confidence': '1.0', 'content': 'tres'}],
    'type': 'pronunciation'},
   {'start_time': '0.66',
    'end_time': '1.02',
    'alternatives': [{'confidence': '1.0', 'content': 'cuatro'}],
    'type': 'pronunciation'}

### Process multiple audios

In [None]:
result = {}

In [16]:
for record in data:  
    audio_id = record['id'].split('/')[-1][20:]
    
    if audio_id not in result:
        response = transcribe_client.start_transcription_job(
            TranscriptionJobName=audio_id+'5',
            LanguageCode='es-ES',
            MediaFormat='mp3',
            Media={
                'MediaFileUri': record['audio']['s3'].replace('sources', 'processed/sources')
            }
        )

        response = {
            'TranscriptionJob': {
              'TranscriptionJobStatus': ''
            }
        }

        start = time.time()
        while response['TranscriptionJob']['TranscriptionJobStatus'] != 'COMPLETED':
            time.sleep(0.5)
            response = transcribe_client.get_transcription_job(
                TranscriptionJobName=audio_id+'5'
            )

        end = time.time()
        time_duration = end-start
        print(f"Time spent: {time_duration}")

        output = requests.get(response["TranscriptionJob"]["Transcript"]["TranscriptFileUri"]).json()    

        result[audio_id] = {
            'transcription': output['results']['transcripts'][0]['transcript'],
            'time': time_duration,
            '1': 'uno' in output['results']['transcripts'][0]['transcript'],
            '2': ' dos' in output['results']['transcripts'][0]['transcript'],
            '3': ' tres' in output['results']['transcripts'][0]['transcript'],
            '7': ' siete' in output['results']['transcripts'][0]['transcript'],
            '10': 'diez' in output['results']['transcripts'][0]['transcript'],
            '28': 'veintiocho' in output['results']['transcripts'][0]['transcript'],
            '29': 'veintinueve' in output['results']['transcripts'][0]['transcript'],
            '30': 'treinta' in output['results']['transcripts'][0]['transcript']
        }

Time spent: 19.980509281158447
Time spent: 18.137675046920776
Time spent: 22.601765155792236
Time spent: 18.653274059295654
Time spent: 18.373526096343994
Time spent: 18.72437596321106
Time spent: 22.3785560131073
Time spent: 19.479393005371094
Time spent: 19.552014112472534
Time spent: 28.684407711029053
Time spent: 27.737651348114014
Time spent: 18.896041870117188
Time spent: 21.15692687034607
Time spent: 16.460272073745728
Time spent: 21.026095867156982
Time spent: 28.61070489883423
Time spent: 17.527085065841675
Time spent: 18.423467874526978
Time spent: 19.35248899459839
Time spent: 18.114320278167725
Time spent: 18.01288080215454
Time spent: 20.243913888931274
Time spent: 31.130372047424316
Time spent: 19.933776140213013
Time spent: 19.97895312309265
Time spent: 19.45619297027588
Time spent: 20.349179983139038
Time spent: 18.367475986480713
Time spent: 18.852354049682617


In [12]:
len(result)

57

In [7]:
df = pd.DataFrame.from_dict(result, orient='index')

In [8]:
df

Unnamed: 0,transcription,time,1,7,10,30
806145f6-ebae-43e3-848f-3d6b26d803bd,"uno, dos, tres cuatro. Cinco sitios deberían s...",1.110564,True,True,False,False
d866b2c0-6eca-40e1-ad93-5f4feab08421,"uno, tres, cuatro, cinco, seis, siete, ocho, n...",1.120585,True,True,False,False
39f1a1db-904a-4069-9a35-005847b9101c,"uno tres, cuatro cinco, seis, siete, ocho, nue...",1.129963,True,True,True,False
109d0932-6702-4390-9e4c-d01177e1fb5c,co- desprecio a,1.105825,False,False,False,False
7e43bff4-5cec-48e2-8697-1e72f81a8522,"uno, dos, tres Pack cinco se siete ocho, nueve...",1.120603,True,True,False,False
26e43698-7653-46eb-9ba4-e3a4502db126,uno dos tres cuatro cinco seis siete ocho nuev...,1.094926,True,True,True,False
c453e481-e0cc-4509-b321-f8dce76be00a,"seis, siete ocho, nueve diez, o sea, los setec...",1.090188,True,True,True,True
ab9362e1-e884-4648-86a4-d22e9cb0acf5,"uno, dos, tres, cuatro, cinco, seis, siete och...",1.124651,True,True,True,True
fc67332b-fb3f-401b-a506-dd13e0bb3a35,"cuidado, porque con los tres, cuatro y siete d...",1.110053,False,True,False,False
e08b689e-100e-437f-9f33-559ff3436567,"uno, dos, tres, cuatro, cinco, seis, siete, oc...",1.132245,True,True,True,True


In [9]:
ids = [i['id'].split('/')[-1][20:] for i in data]

In [10]:
len(ids)

87

In [11]:
transcribe_jobs = []

In [12]:
response = transcribe_client.list_transcription_jobs(
    Status='COMPLETED',
    MaxResults=10
)

while len(response['TranscriptionJobSummaries'][-1]['TranscriptionJobName']) == 37:
    response = transcribe_client.list_transcription_jobs(
        Status='COMPLETED',
        NextToken=response['NextToken'],
        MaxResults=10
    )
    
    transcribe_jobs += [job['TranscriptionJobName'] for job in response['TranscriptionJobSummaries']]

In [13]:
match_jobs = [job[:-1] for job in transcribe_jobs if job[:-1] in ids]

In [14]:
len(match_jobs)

117

In [15]:
job_audios_processed = set(match_jobs)

In [16]:
len(job_audios_processed)

76

In [28]:
result = {}
for job in job_audios_processed:
        
    if job+'5' in transcribe_jobs:
        response = transcribe_client.get_transcription_job(
            TranscriptionJobName=job+'5'
        )

        output = requests.get(response["TranscriptionJob"]["Transcript"]["TranscriptFileUri"]).json()  
        
        items = [item['alternatives'][0]['content'].lower() for item in output['results']['items']]

        result[job] = {
            'transcription': output['results']['transcripts'][0]['transcript'],
            'time': 30.0,
            '1': 'uno' in items or 'un' in items,
            '2': 'dos' in items,
            '3': 'tres' in items,
            '7': 'siete' in items,
            '10': 'diez' in items,
            '28': 'veintiocho' in items,
            '29': 'veintinueve' in items,
            '30': 'treinta' in items
        }

In [29]:
len(result)

76

In [30]:
df = pd.DataFrame.from_dict(result, orient='index')
df

Unnamed: 0,transcription,time,1,2,3,7,10,28,29,30
00f3e7dd-cd43-48fd-94d9-ef87c4332442,"cuatro, seis siete, ocho nueve, diez once, doc...",30.0,False,False,False,True,True,True,True,True
a8534cb4-eda3-4373-b9e5-559130b34546,"dos, tres, cuatro, cinco, seis, siete, ocho nu...",30.0,False,True,True,True,True,True,True,True
25839531-720c-4eea-8c43-7b02512475f2,"un, dos, tres, cuatro, cinco, seis, siete, och...",30.0,True,True,True,True,True,True,True,True
99ef1a94-f43b-4869-bd36-d8a2887e348d,"uno dos, tres, cuatro cinco, seis siete, ocho ...",30.0,True,True,True,True,True,True,True,True
b5d2edee-f5cd-4902-9288-4c682df3dd4e,"dos, tres, cuatro. Tengo seis, siete, ocho año...",30.0,False,True,True,True,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...
f9295b32-8440-413b-b276-47f8a7d5932a,"uno dos tres, cuatro cinco, seis siete, ocho n...",30.0,True,True,True,True,True,True,True,True
46167a68-8dfc-49c3-b5da-9ce2dcc56436,"los días once, doce, trece, catorce, quince, d...",30.0,False,False,False,False,False,True,True,True
8a21b69d-4951-4220-a119-e9edf3601900,uno dos tres cuatro cinco seis siete ocho nuev...,30.0,True,True,True,True,True,False,False,False
a9fe218b-db71-4b21-bd1d-39b4769fef03,"uno dos, tres cuatro cinco seis siete ocho nue...",30.0,True,True,True,True,True,True,True,False


In [40]:
df.to_csv('data/aws_transcription.csv')