In [1]:
!pip install ibm_watson
!pip install pydub



In [2]:
import os
import json
import math

from ibm_watson import SpeechToTextV1

from ibm_cloud_sdk_core.authenticators import IAMAuthenticator, BasicAuthenticator
from pydub import AudioSegment

In [10]:
with open('./my-IBM-keys.json', 'r') as f_in:
    credentials = json.load(f_in)

In [11]:
api_key= credentials['api-keys'][0]

service_endpoint='https://api.eu-gb.speech-to-text.watson.cloud.ibm.com'
my_endpoint=credentials['endpoint']

## Setup STT Service

In [12]:
authenticator = IAMAuthenticator(api_key)
#authenticator = BasicAuthenticator('api-key', api_key)
speech2text = SpeechToTextV1(authenticator=authenticator)

speech2text.set_service_url(my_endpoint)

In [13]:
# test connection
speech_models = speech2text.list_models().get_result()
print(json.dumps(speech_models, indent=2))

{
  "models": [
    {
      "name": "es-MX_BroadbandModel",
      "rate": 16000,
      "language": "es-MX",
      "description": "Mexican Spanish broadband model.",
      "supported_features": {
        "custom_language_model": true,
        "speaker_labels": true
      },
      "url": "https://api.eu-gb.speech-to-text.watson.cloud.ibm.com/instances/a4e78391-c3cd-4b79-a95b-3d5eaf9f3d0a/v1/models/es-MX_BroadbandModel"
    },
    {
      "name": "zh-CN_BroadbandModel",
      "rate": 16000,
      "language": "zh-CN",
      "description": "Mandarin broadband model.",
      "supported_features": {
        "custom_language_model": false,
        "speaker_labels": true
      },
      "url": "https://api.eu-gb.speech-to-text.watson.cloud.ibm.com/instances/a4e78391-c3cd-4b79-a95b-3d5eaf9f3d0a/v1/models/zh-CN_BroadbandModel"
    },
    {
      "name": "es-CL_BroadbandModel",
      "rate": 16000,
      "language": "es-CL",
      "description": "Chilean Spanish broadband model.",
      "supported_

## Split Larger Audio File into Chunk

In [6]:
data_root = './data/'

In [8]:
os.listdir(data_root)

['p8_2021-03-10_Children_Screens.mp3',
 'p2_2021-03-10_Children_Screens.mp3',
 '2021-03-10_Children_Screens.mp3',
 'p1_2021-03-10_Children_Screens.mp3',
 'p0_2021-03-10_Children_Screens.mp3',
 'PR_file2_2021-04-01_12-06-30.mp3',
 'p5_2021-03-10_Children_Screens.mp3',
 'p3_2021-03-10_Children_Screens.mp3',
 'p6_2021-03-10_Children_Screens.mp3',
 'p4_2021-03-10_Children_Screens.mp3',
 'p7_2021-03-10_Children_Screens.mp3']

In [9]:
my_audio = AudioSegment.from_mp3(data_root + 'PR_file2_2021-04-01_12-06-30.mp3')

In [10]:
my_audio.duration_seconds

3233.671836734694

In [189]:
class SplitMp3Audio():
    def __init__(self, filename, data_root='./data/', enc="mp3"):
        
        self.root = data_root
        self.filename = filename
        self.filepath = os.path.join(self.root, self.filename)
                
        self.audio = AudioSegment.from_mp3(self.filepath)
        
        #self.silence_threshold = silence_threshold
        
        self.result_files = []
    
    def get_duration(self, sound=None, form="s"):
        
        if sound is None:
            sound = self.audio
        elif not isinstance(sound, AudioSegment):
            raise ValueError()
        
        if 's' == form:
            return sound.duration_seconds
        elif 'm' == form:
            return math.ceil(self.get_duration(sound) / 60)
        elif 'h' == form:
            return math.ceil(self.get_duration(sound, 'm') / 60)
        else:
            raise ValueError()
    
    def get_f_name(self):
        return self.filename
    
    def get_res_files(self):
        return self.result_files
    
    def clean_res_files(self):
        self.result_files = []
    
    def remove_all_res_files(self):
        # remove the created sub-files
        for i in range(len(self.result_files)):
            f_name = self.result_files.pop()
            if os.path.exists(f_name):
                os.remove(f_name)
    
    def detect_leading_silence(self, sound, silence_threshold=-50.0, chunk_size=10):
        '''
        silence_threshold in dB
        chunk_size in ms

        iterate over chunks until you find the first one with sound
        '''
        trim_ms = 0 # ms

        assert chunk_size > 0 # to avoid infinite loop
        while sound[trim_ms:trim_ms+chunk_size].dBFS < silence_threshold and trim_ms < len(sound):
            trim_ms += chunk_size

        return trim_ms
    
    def trim_start_end(self, sound, start=None, end=None, silence_threshold=-50.0, chunk_size=10): 
        
        if start is None and end is None:
            # trim by detecting silence
            start = self.detect_leading_silence(sound, silence_threshold, chunk_size)
            end = self.detect_leading_silence(sound.reverse(), silence_threshold, chunk_size)

        duration = len(sound) 
        trimmed_sound = sound[start:duration-end]
        
        return trimmed_sound
    
    def manual_trim_audio_file(self, start, end, f_name=None):
        # start and end in ms 
        
        if f_name is None: 
            sound = self.audio
        elif isinstance(f_name, str):
            sound = AudioSegment.from_mp3(f_name)
            
        trimmed_sound = self.trim_start_end(sound, start, end)
        
        return trimmed_sound
    
    def split_by_min(self, from_min, to_min, split_filename, trim=True):
        t1 = from_min * 60 * 1000
        t2 = to_min * 60 * 1000
        split_audio = self.audio[t1:t2]
        
        if trim:
            split_audio = self.trim_start_end(split_audio)
        
        file_name = os.path.join(self.root, split_filename)
        split_audio.export(file_name, format="mp3")
        self.result_files.append(file_name)
        
        return split_audio
        
    def multiple_split(self, min_per_split, trim=True, pre=None):
        
        assert min_per_split is not None
        
        assert min_per_split > 0
        
        total_mins = self.get_duration(form='m')
        
        for idx, i in enumerate(range(0, total_mins, min_per_split)):
            
            if pre is not None:
                split_name = pre + str(idx) + '_' + self.filename
            else:
                split_name = str(idx) + '_' + self.filename
                
            split_audio = self.split_by_min(i, i+min_per_split, split_name, trim=trim)
            
            print(str(idx) + ' Done')
            if i == total_mins - min_per_split:
                print('Split into {} parts. Done.'.format(i+1))

In [118]:
os.listdir()

['results0_4.json',
 'results_p1.json',
 'speech2text-ba18bb43ca58.json',
 '.ipynb_checkpoints',
 'Speech2Text_IBM.ipynb',
 'data',
 'output_f2.txt',
 'results5_9.json',
 'output',
 'results_formatted.txt',
 's2t-google-cloud.ipynb']

In [190]:
#file_name = '2021-03-10_Children_Screens.mp3'
f_name = 'PR_file2_2021-04-01_12-06-30.mp3'
base_name = f_name.split('.')[0]
audio_splitter = SplitMp3Audio(f_name, data_root)
audio_splitter.get_duration(form='m')

54

In [191]:
audio_splitter.multiple_split(min_per_split=10, pre='p')

0 Done
1 Done
2 Done
3 Done
4 Done
5 Done


In [193]:
audio_splitter.get_res_files()

[]

In [194]:
#audio_splitter.remove_all_res_files()

In [133]:
## manual curation 

f='./data/p0_PR_file2_2021-04-01_12-06-30.mp3'

trimmed_sound = audio_splitter.manual_trim_audio_file(start=130000, end=0, f_name=f)
audio_splitter.get_duration(trimmed_sound, form='m')

460940

In [136]:
trimmed_sound.export(f, format="mp3")

<_io.BufferedRandom name='./data/p0_PR_file2_2021-04-01_12-06-30.mp3'>

In [137]:
f='./data/p2_PR_file2_2021-04-01_12-06-30.mp3'

trimmed_sound = audio_splitter.manual_trim_audio_file(start=50000, end=0, f_name=f)
print(audio_splitter.get_duration(trimmed_sound, form='m'))
trimmed_sound.export(f, format="mp3")

9


<_io.BufferedRandom name='./data/p2_PR_file2_2021-04-01_12-06-30.mp3'>

## Transcribe Audio Chunks

In [138]:
audio_files = audio_splitter.get_res_files()
len(audio_files)

6

In [139]:
%%time
results = []
#n_max = 5
f_name = audio_splitter.filename

out_dir = './output/'

#audio_files = ['./data/p0_2021-03-10_Children_Screens.mp3']

for i, file in enumerate(audio_files): 
    print('Transcribing {} ..'.format(file))
    with open(file, 'rb') as audio:
        res = speech2text.recognize(audio=audio, 
                        content_type='audio/mp3',
                        model='en-GB_NarrowbandModel',
                        max_alternatives=0,
                        continuous=True,
                        inactivity_timeout=-1,
                        speaker_labels=True).get_result()
        
    if 'results' not in res:
            print('chunk {} has no results due to {}!'.format(i, res['error']))
            
    results.append(res)
    print('Done')

save_file = out_dir + base_name + '_results.json'
print('Saving results: {}'.format(save_file))
with open(save_file, 'w') as f_out: 
    json.dump(results, f_out)
    
print('Complete!')

Transcribing ./data/p0_PR_file2_2021-04-01_12-06-30.mp3 ..
Done
Transcribing ./data/p1_PR_file2_2021-04-01_12-06-30.mp3 ..
Done
Transcribing ./data/p2_PR_file2_2021-04-01_12-06-30.mp3 ..
Done
Transcribing ./data/p3_PR_file2_2021-04-01_12-06-30.mp3 ..
Done
Transcribing ./data/p4_PR_file2_2021-04-01_12-06-30.mp3 ..
Done
Transcribing ./data/p5_PR_file2_2021-04-01_12-06-30.mp3 ..
Done
Saving results ..
Complete!
CPU times: user 643 ms, sys: 122 ms, total: 765 ms
Wall time: 18min 44s


In [88]:
len(results)

6

In [81]:
# %%time
# results5_9 = []
# for i, file in enumerate(audio_files[5:]): 
#     print('Transcribing {} ..'.format(file))
#     with open(file, 'rb') as audio:
#         res = speech2text.recognize(audio=audio, 
#                         content_type='audio/mp3',
#                         model='en-GB_NarrowbandModel',
#                         max_alternatives=0,
#                         continuous=True,
#                         speaker_labels=True).get_result()
#     results5_9.append(res)
#     print('Done')

# print('Saving results ..')
# with open('results5_9.json', 'w') as f_out: 
#     json.dump(results5_9, f_out)
    
# print('Done')

In [None]:
## Clean Up

In [188]:
audio_splitter.remove_all_res_files()

In [None]:
audio_splitter.get_res_files()

## Format Transcribed Chunks into Text

### Raw Text

In [146]:
def simple_transcript2txt(results, post_fix): 
    text = []

    for i, chunk in enumerate(results):
        if 'results' not in chunk:
            print('chunk {} has no results due to {}!'.format(i, chunk['error']))
        else:
            for res in chunk['results']:
                text.append(res['alternatives'][0]['transcript'].rstrip() + '.\n')
            #print('chunk {} appended'.format(i))
    
    with open('output_{}.txt'.format(post_fix), 'w') as f_out: 
        f_out.writelines(text)
        
    return text

In [141]:
# simple_transcript2txt(results0_4, '0_4')

In [145]:
text = simple_transcript2txt(results, 'f2')

chunk 0 appended
chunk 1 appended
chunk 2 appended
chunk 3 appended
chunk 4 appended
chunk 5 appended


In [143]:
#results0_4[0]['results'][1]['alternatives'][0]['timestamps']

### Combine & Format Transcripts

In [148]:
os.listdir(out_dir)

['output_5_9.txt',
 'output_0_4.txt',
 'PR_file2_2021-04-01_12-06-30.mp3_results.json',
 'full_transcript_formatted.txt']

In [157]:
# load unformatted transcriptions 
with open(out_dir + 'PR_file2_2021-04-01_12-06-30.mp3_results.json', 'r') as f_in:
    results_f2 = json.load(f_in)
# with open('results5_9.json', 'r') as f_in:
#     results5_9 = json.load(f_in)
    
# with open('results0_4.json', 'r') as f_in:
#     results0_4 = json.load(f_in)

# res_agg = results0_4 + results5_9
# len(res_agg)
res_agg = results

In [158]:
minutes_per_chunk=10

In [159]:
def get_timespan_paragraph(ts_list: list) -> (int, int): 
    '''
    extract start and end point given timestamps of passage
    '''
    w0, start, _ = ts_list[0]
    wT, _, end = ts_list[-1]
    
    return start, end

In [160]:
# example: extract start-end-time stamp of 1st paragraph of 1st chunk

get_timespan_paragraph(results_f2[0]['results'][1]['alternatives'][0]['timestamps'])

(3.04, 3.35)

In [161]:
results[0]['results'][1]['alternatives'][0]['transcript']

'yes '

In [162]:
def get_timespan_speaker(spk_lbls):
    '''
    example entry: 
        {'from': 40.83,
      'to': 41.23,
      'speaker': 1,
      'confidence': 0.21,
      'final': False}
    '''
    spk_dict = {}
    spk = spk_lbls[0]['speaker']
    start = spk_lbls[0]['from']
    end = 0.0
    for entry in spk_lbls: 
        # detect speaker change
        if entry['speaker'] != spk:
            # add entry
            spk_dict[(start, end)] = spk
            start = entry['from']
            spk = entry['speaker']
        
        end = entry['to']
    
    spk_dict[(start, end)] = spk
    
    return spk_dict

In [163]:
'''
ts: (1.2, 5.4), text: ['asdf'], 
(1.2, 5.4) -> speaker?



'''

"\nts: (1.2, 5.4), text: ['asdf'], \n(1.2, 5.4) -> speaker?\n\n\n\n"

In [164]:
# example: get timespan + speaker lbl for 1st chunk
spk_dict = get_timespan_speaker(res_agg[0]['speaker_labels'])
spk_dict.items()

dict_items([((0.74, 460.94), 0)])

In [165]:
# aggregate speaker labels from all chunks

spk = set()
all_speakers = dict()
for i, chunk in enumerate(res_agg):
    spk_dict = get_timespan_speaker(chunk['speaker_labels'])
    all_speakers[i] = spk_dict
    [spk.add(j) for j in list(spk_dict.values())]

spk

{0, 1, 2}

In [166]:
list(zip(*list(spk_dict.keys())))

[(0.0,), (231.84,)]

In [167]:
def get_speaker_from_ts(spk_dict, ts) -> int:
    start, end = list(zip(*list(spk_dict.keys())))
    
    assert ts[0] < ts[1]
    
    for i, t1 in enumerate(end):
        if ts[0] >= start[i] and ts[1] <= end[i]:
            # time span exactly matches a single speaker
            return spk_dict[(start[i], end[i])]
        
        
        #elif ts[0] >= start[i] and ts[1] > end[i]:
        #    raise ValueError('given timespan overlaps multiple speakers')
    
    return '??'
    #raise ValueError('ts {} not corresponding to any speaker {}'.format(ts, spk_dict.items()))
        

In [168]:
# (42.42, 54.18) => 0

In [169]:
get_speaker_from_ts(spk_dict, (278.42, 280.18))

'??'

In [170]:
#def format_transcript_with_ts_spk():
'''
loop through cunks
for each paragraph, get transcript and time span
for each such time span, get speaker label

adjust times according to chunk-id

combine (ts; speaker; transcript)
''' 

'\nloop through cunks\nfor each paragraph, get transcript and time span\nfor each such time span, get speaker label\n\nadjust times according to chunk-id\n\ncombine (ts; speaker; transcript)\n'

In [171]:
get_timespan_speaker(res_agg[0]['speaker_labels'])

{(0.74, 460.94): 0}

In [172]:
test_ts = list(get_timespan_speaker(res_agg[0]['speaker_labels']).keys())

In [173]:
277.01 % 60

37.00999999999999

In [174]:
round(divmod(277.01, 60)[1], 2)

37.01

In [175]:
import datetime
import time

sec=277.01

In [176]:
str(datetime.timedelta(seconds=1577.01))

'0:26:17.010000'

In [179]:
def format_timestamps(raw_ts: (float, float), chunk_id=0, min_p_chunk=10) -> (str, str):
    '''
    adjust timestmaps to mm:ss.ms format given an offset
    '''
    offset=chunk_id * min_p_chunk
    start, end = raw_ts
    
    def convert_ts(t, offset):
        m, s = divmod(t, 60)
        s = float('{:.2f}'.format(s))
        return '{:0>2}:{:0>4}'.format(int(m)+offset, s)
    
    return convert_ts(start, offset), convert_ts(end, offset) 

In [180]:
format_timestamps((6.0, 599.99), 0)

('00:06.0', '09:59.99')

In [181]:
text = []

minutes_per_chunk=10

for i, chunk in enumerate(res_agg): 
    
    spk_dict = get_timespan_speaker(chunk['speaker_labels'])
    
    for paragraph in chunk['results']: 
        # get transcript
        transcript = paragraph['alternatives'][0]['transcript']
        
        # get time span
        ts = get_timespan_paragraph(paragraph['alternatives'][0]['timestamps'])
        
        # get speaker label
        spk = get_speaker_from_ts(spk_dict, ts)
        
        # format time stamps into mm:ss:ms
        ts = format_timestamps(ts, chunk_id=i)
        
        text.append('({}, {}) (spk {}): '.format(ts[0], ts[1], spk) + transcript + '.\n')
        
    text.append("- end of chunk {}-\n".format(i+1))

In [182]:
text[:100]

["(00:0.74, 00:2.23) (spk 0): no it's fine I don't .\n",
 '(00:3.04, 00:3.35) (spk 0): yes .\n',
 '(00:7.03, 00:7.35) (spk 0): yes .\n',
 '(00:14.72, 00:15.72) (spk 0): yep so .\n',
 '(00:17.33, 00:20.12) (spk 0): you know this is exactly where you should be right now .\n',
 "(00:20.93, 00:32.0) (spk 0): and you know I think you know I'm glad to see what you wrote in the email about your ideas and now you know you presented this morning on the pushchair and things like that I think they're clearly .\n",
 '(00:34.43, 00:35.96) (spk 0): you know I think that is .\n',
 "(00:37.36, 00:45.42) (spk 0): the only like going through the methodology kind of helps you organising clarify you already radical thinking a little bit it's going to be email up .\n",
 '(00:49.39, 00:51.72) (spk 0): just look at your poster right now because I saw that you .\n',
 '(00:53.55, 00:54.27) (spk 0): but I think .\n',
 "(00:55.18, 01:3.34) (spk 0): I think you know like in the postman when you presented there I 

In [186]:
# save transcript
out_name = out_dir + base_name + '_transcript_formatted.txt'
with open(out_name, 'w') as f_out: 
    f_out.writelines(text)
print(out_name)

./output/PR_file2_2021-04-01_12-06-30_transcript_formatted.txt


In [170]:
def ibm_watson_s2t(filename: str) -> str:
    authenticator = IAMAuthenticator(api_key)
    speech2text = SpeechToTextV1(authenticator=authenticator)
    speech2text.set_service_url(my_endpoint)

    with open(filename, 'rb') as audio_file:
        response = speech2text.recognize(
            audio=audio_file,
            content_type='audio/{}'.format(os.path.splitext(filename)[1][1:]),
            max_alternatives=1,
            model='en-GB_NarrowbandModel',
            speaker_labels=True).get_result()

    return response

## Technical Debt

- handle Audio format other than mp3
- iterate over all chunks in one loop (without getting interrupted by IBM due to too many requests)
- create basic API to use without Jupyter notebook
- add parameters to pass individual API keys and service endpoint
- add parameters to pass name and directory for input and output file
- add parameter to control STT params (language, speaker labels, inactivity_timeout)
- add parameter to control chunk size

In [22]:
'''
ApiException: Error: <HTML><HEAD>
<TITLE>Internal Server Error</TITLE>
</HEAD><BODY>
<H1>Internal Server Error - Write</H1>
The server encountered an internal error or misconfiguration and was unable to
complete your request.<P>
Reference&#32;&#35;4&#46;a5901602&#46;1615816352&#46;7132959
</BODY></HTML>
, Code: 503
'''

'\nApiException: Error: <HTML><HEAD>\n<TITLE>Internal Server Error</TITLE>\n</HEAD><BODY>\n<H1>Internal Server Error - Write</H1>\nThe server encountered an internal error or misconfiguration and was unable to\ncomplete your request.<P>\nReference&#32;&#35;4&#46;a5901602&#46;1615816352&#46;7132959\n</BODY></HTML>\n, Code: 503\n'