In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# project files stored on Drive can be accessed by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
cd "drive/MyDrive/SpeechDoctor"

/content/drive/MyDrive/SpeechDoctor


In [3]:
!pip install pydub

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


### Convert .mp3 to .wav

In [4]:
from os import path
from pydub import AudioSegment

In [5]:
# Input MP3; the WAV output name is derived from it so the two names cannot
# drift apart when the input file is changed.
src = "converted_e29i0_ShanIndaAMAccent_enhanced.mp3"
dst = path.splitext(src)[0] + ".wav"

In [6]:
# Decode the MP3 and write it back out as WAV.
sound = AudioSegment.from_mp3(src)
# export() hands back the still-open output file object; close it right away
# so the data is flushed and the handle is not leaked.
sound.export(dst, format="wav").close()

<_io.BufferedRandom name='converted_e29i0_ShanIndaAMAccent_enhanced.wav'>

### Resample audio files whose sampling rate differs from the target rate

In [7]:
import scipy.signal
import soundfile as sf

In [8]:
import math

# Load the original audio file together with its native sample rate
audio, original_sample_rate = sf.read(dst)

# Set the target sample rate
target_sample_rate = 16000

# Resampling ratio (kept for reference / later inspection)
resampling_ratio = target_sample_rate / original_sample_rate

if original_sample_rate == target_sample_rate:
    # Nothing to do: the file is already at the target rate
    resampled_audio = audio
else:
    # Polyphase resampling with the rate ratio reduced to lowest terms.
    # Unlike the FFT-based scipy.signal.resample it does not treat the
    # recording as periodic and does not slow down badly for awkward
    # sample counts. axis=0 is the time axis of the array read above.
    rate_gcd = math.gcd(target_sample_rate, original_sample_rate)
    resampled_audio = scipy.signal.resample_poly(
        audio,
        target_sample_rate // rate_gcd,
        original_sample_rate // rate_gcd,
        axis=0,
    )

# Save the resampled audio to a new file
sf.write("converted_e29i0_ShanIndaAMAccent_enhanced_resampled.wav", resampled_audio, target_sample_rate)

### Convert to a mono file

In [9]:
# Load the resampled WAV and downmix it to a single channel.
sound = AudioSegment.from_wav("converted_e29i0_ShanIndaAMAccent_enhanced_resampled.wav")
sound = sound.set_channels(1)
# export() hands back the still-open output file object; close it right away
# so the data is flushed and the handle is not leaked.
sound.export("converted_e29i0_ShanIndaAMAccent_enhanced_resampled_mono.wav", format="wav").close()

<_io.BufferedRandom name='converted_e29i0_ShanIndaAMAccent_enhanced_resampled_mono.wav'>

## Count the number of sentences and timestamps

### Use OpenAI Whisper

In [10]:
!pip install -U openai-whisper

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting openai-whisper
  Downloading openai-whisper-20230314.tar.gz (792 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m792.9/792.9 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tiktoken==0.3.1 (from openai-whisper)
  Downloading tiktoken-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m44.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ffmpeg-python==0.2.0 (from openai-whisper)
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl (25 kB)
Building wheels for collected packages: openai-whisper
  Building wheel for openai-whisper (pyproject.toml) ... [?25l[?25hdone
  Created 

In [11]:
import whisper

# available models = ['tiny.en', 'tiny', 'base.en', 'base', 'small.en', 'small', 'medium.en', 'medium', 'large-v1', 'large-v2', 'large']
# Load the "base" Whisper model; the transcription cell calls .transcribe() on it.
# NOTE(review): the name `model` is rebound to a vosk Model in a later cell,
# so re-running the transcription cell after that point would fail.
model = whisper.load_model("base")

100%|████████████████████████████████████████| 139M/139M [00:01<00:00, 141MiB/s]


In [12]:
# Transcribe the resampled mono WAV. `result` is indexed by "text" here and
# by "segments" (per-segment records with start/end/text) in the cells below.
result = model.transcribe("converted_e29i0_ShanIndaAMAccent_enhanced_resampled_mono.wav")
print(result["text"])



 Hello, my name is Prasjant and I am going to graduate with an engineering degree in computer science, ready to embark on a journey of continuous learning and technological advancement. Throughout my college-journal computer science, I have acquired a comprehensive understanding of algorithms, data structures, software development and system design. The dynamic nature of computer science has allowed me to embrace challenges, adapt to new technologies and constantly stay at the forefront of innovation. With the Solid Foundation in Computer Science principles in the programming, I am equipped to navigate the landscape and tackle complex problems head on. As the computer science graduate, I possess strong problem-solving skills, a logical mindset and the ability to think critically, which are essential in navigating the complexities of the digital era. I am proficient in multiple programming languages, including but not limited to Python, Java, C++ and JavaScript, enabling me to develop r

In [15]:
# Dump the raw segment records; each one is a dict with keys such as
# 'id', 'start', 'end', 'text' and 'tokens' (see the output below).
print(result['segments'])

[{'id': 0, 'seek': 0, 'start': 0.0, 'end': 5.36, 'text': ' Hello, my name is Prasjant and I am going to graduate with an engineering degree in computer', 'tokens': [50364, 2425, 11, 452, 1315, 307, 2114, 296, 73, 394, 293, 286, 669, 516, 281, 8080, 365, 364, 7043, 4314, 294, 3820, 50632], 'temperature': 0.0, 'avg_logprob': -0.2831454594930013, 'compression_ratio': 1.6577946768060836, 'no_speech_prob': 0.042412690818309784}, {'id': 1, 'seek': 0, 'start': 5.36, 'end': 11.88, 'text': ' science, ready to embark on a journey of continuous learning and technological advancement.', 'tokens': [50632, 3497, 11, 1919, 281, 29832, 322, 257, 4671, 295, 10957, 2539, 293, 18439, 35764, 13, 50958], 'temperature': 0.0, 'avg_logprob': -0.2831454594930013, 'compression_ratio': 1.6577946768060836, 'no_speech_prob': 0.042412690818309784}, {'id': 2, 'seek': 0, 'start': 11.88, 'end': 16.48, 'text': ' Throughout my college-journal computer science, I have acquired a comprehensive understanding', 'tokens': [5

In [29]:
# Show the time span covered by each transcribed segment.
for seg in result['segments']:
    seg_start, seg_end = seg['start'], seg['end']
    print(seg_start, '-', seg_end)

0.0 - 5.36
5.36 - 11.88
11.88 - 16.48
16.48 - 21.88
21.88 - 27.0
27.0 - 32.84
32.84 - 37.16
37.16 - 40.96
40.96 - 47.56
47.56 - 52.68
52.68 - 54.480000000000004
54.48 - 60.8
60.8 - 68.12
68.12 - 73.28
73.28 - 79.4
79.4 - 81.92
81.92 - 87.08
87.08 - 93.72
93.72 - 95.68
95.68 - 99.88
99.88 - 105.32000000000001
105.32000000000001 - 106.88
106.88 - 113.72
113.72 - 118.96
118.96 - 124.72
124.72 - 131.76
131.76 - 133.48
133.48 - 136.48
136.48 - 137.07999999999998


In [17]:
# Show each segment's text followed by a "*" marker so that the segment
# boundaries are visible in the output.
for seg in result['segments']:
    seg_text = seg['text']
    print(seg_text, "*")

 Hello, my name is Prasjant and I am going to graduate with an engineering degree in computer *
 science, ready to embark on a journey of continuous learning and technological advancement. *
 Throughout my college-journal computer science, I have acquired a comprehensive understanding *
 of algorithms, data structures, software development and system design. *
 The dynamic nature of computer science has allowed me to embrace challenges, adapt to *
 new technologies and constantly stay at the forefront of innovation. *
 With the Solid Foundation in Computer Science principles in the programming, I am equipped *
 to navigate the landscape and tackle complex problems head on. *
 As the computer science graduate, I possess strong problem-solving skills, a logical mindset *
 and the ability to think critically, which are essential in navigating the complexities *
 of the digital era. *
 I am proficient in multiple programming languages, including but not limited to Python, Java, *
 C++ and 

In [22]:
import pprint

In [33]:
def _group_segments_into_sentences(segments, terminators=".?!"):
    """Merge consecutive transcription segments into sentence records.

    Segment texts are concatenated until a segment contains one of the
    ``terminators``; that segment closes the current sentence.

    Parameters:
      segments: iterable of dicts with 'start', 'end' and 'text' keys.
      terminators (str): characters that mark the end of a sentence.

    Returns:
      list of {'start', 'end', 'sentence'} dicts, in input order. An empty
      input yields an empty list. Text left over after the last terminator
      is kept as a final sentence instead of being dropped.
    """
    grouped = []
    buffer = ""
    sentence_start = None
    sentence_end = None

    for segment in segments:
        if sentence_start is None:
            # A sentence starts where its first segment starts.
            sentence_start = segment['start']
        buffer += segment['text']
        sentence_end = segment['end']
        if any(mark in segment['text'] for mark in terminators):
            grouped.append({'start': sentence_start, 'end': sentence_end, 'sentence': buffer})
            buffer = ""
            sentence_start = None

    # Keep a trailing sentence that never received closing punctuation.
    if buffer.strip():
        grouped.append({'start': sentence_start, 'end': sentence_end, 'sentence': buffer})

    return grouped


json_sentence = _group_segments_into_sentences(result['segments'])
sentences = [entry['sentence'] for entry in json_sentence]

sentence_num = len(sentences)

print('The number of sentence is : ', sentence_num)
pprint.pprint(json_sentence)



The number of sentence is :  13
[{'end': 11.88,
  'sentence': ' Hello, my name is Prasjant and I am going to graduate with an '
              'engineering degree in computer science, ready to embark on a '
              'journey of continuous learning and technological advancement.',
  'start': 0.0},
 {'end': 21.88,
  'sentence': ' Throughout my college-journal computer science, I have '
              'acquired a comprehensive understanding of algorithms, data '
              'structures, software development and system design.',
  'start': 11.88},
 {'end': 32.84,
  'sentence': ' The dynamic nature of computer science has allowed me to '
              'embrace challenges, adapt to new technologies and constantly '
              'stay at the forefront of innovation.',
  'start': 21.88},
 {'end': 40.96,
  'sentence': ' With the Solid Foundation in Computer Science principles in '
              'the programming, I am equipped to navigate the landscape and '
              'tackle complex p

In [47]:
# `json` is otherwise only imported further down, in the Vosk section, so on
# a fresh kernel run top to bottom this cell would raise NameError. Import it
# here so the cell works in notebook order.
import json

# Serialize the sentence records and write them to disk.
json_sentence_data = json.dumps(json_sentence)
with open('sentence_json_data.json', 'w') as outfile:
    outfile.write(json_sentence_data)


## Count the number of words and timestamps

### Using the Vosk model

In [34]:
!unzip vosk-model-en-in-0.5.zip

Archive:  vosk-model-en-in-0.5.zip
   creating: vosk-model-en-in-0.5/
   creating: vosk-model-en-in-0.5/am/
 extracting: vosk-model-en-in-0.5/am/frame_subsampling_factor  
  inflating: vosk-model-en-in-0.5/am/final.mdl  
   creating: vosk-model-en-in-0.5/ivector/
  inflating: vosk-model-en-in-0.5/ivector/ivector.conf  
  inflating: vosk-model-en-in-0.5/ivector/final.dubm  
  inflating: vosk-model-en-in-0.5/ivector/final.ie  
  inflating: vosk-model-en-in-0.5/ivector/final.mat  
  inflating: vosk-model-en-in-0.5/ivector/splice.conf  
  inflating: vosk-model-en-in-0.5/ivector/global_cmvn.stats  
  inflating: vosk-model-en-in-0.5/ivector/online_cmvn.conf  
  inflating: vosk-model-en-in-0.5/README  
   creating: vosk-model-en-in-0.5/conf/
  inflating: vosk-model-en-in-0.5/conf/mfcc.conf  
  inflating: vosk-model-en-in-0.5/conf/model.conf  
   creating: vosk-model-en-in-0.5/graph/
   creating: vosk-model-en-in-0.5/graph/phones/
  inflating: vosk-model-en-in-0.5/graph/phones/word_boundary.in

In [36]:
!pip install vosk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting vosk
  Downloading vosk-0.3.45-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m40.7 MB/s[0m eta [36m0:00:00[0m
Collecting srt (from vosk)
  Downloading srt-3.5.3.tar.gz (28 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting websockets (from vosk)
  Downloading websockets-11.0.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (129 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.9/129.9 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: srt
  Building wheel for srt (setup.py) ... [?25l[?25hdone
  Created wheel for srt: filename=srt-3.5.3-py3-none-any.whl size=22428 sha256=e39bbc3c35546db350ff611a98dcd0fc0a536060d09c683e1e0da3dd21db6e61
  Stored in direc

In [38]:
class Word:
    ''' One recognized word, built from the JSON output of the vosk speech recognition API '''

    def __init__(self, dict):
        '''
        Parameters:
          dict (dict) dictionary from JSON, containing:
            conf (float): degree of confidence, from 0 to 1
            end (float): end time of the pronouncing the word, in seconds
            start (float): start time of the pronouncing the word, in seconds
            word (str): recognized word
        '''
        # Copy each expected entry onto the instance under the same name.
        for field in ("conf", "end", "start", "word"):
            setattr(self, field, dict[field])

    def to_string(self):
        ''' Returns a one-line description: word, start/end times and confidence in percent '''
        template = "{:20} from {:.2f} sec to {:.2f} sec, confidence is {:.2f}%"
        confidence_pct = self.conf*100
        return template.format(self.word, self.start, self.end, confidence_pct)

In [41]:
import wave
import json

from vosk import Model, KaldiRecognizer, SetLogLevel
# from .Word import Word as custom_Word

# Directory of the Vosk model, as extracted from vosk-model-en-in-0.5.zip above.
model_path = "vosk-model-en-in-0.5"

In [43]:
audio_filename = "converted_e29i0_ShanIndaAMAccent_enhanced_resampled_mono.wav"

# NOTE(review): this rebinds `model`, which the Whisper cells above also use.
model = Model(model_path)

# get the list of JSON dictionaries
results = []

# recognize speech using vosk model; the `with` block guarantees the audio
# file is closed even if recognition raises part-way through
with wave.open(audio_filename, "rb") as wf:
    rec = KaldiRecognizer(model, wf.getframerate())
    rec.SetWords(True)  # ask for per-word entries in each result

    while True:
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            results.append(json.loads(rec.Result()))
    # collect whatever the recognizer still holds after the last chunk
    results.append(json.loads(rec.FinalResult()))

# convert list of JSON dictionaries to list of 'Word' objects.
# Recognition sometimes returns a dictionary without word entries
# ({'text': ''}); looking up 'result' with a default skips those without
# relying on how many keys the dictionary happens to have.
list_of_Words = [
    Word(obj)
    for sentence in results
    for obj in sentence.get('result', [])
]

# plain-dict view of the words, ready for JSON serialization
word_num = len(list_of_Words)
json_words = [
    {'start': word.start, 'end': word.end, 'word': word.word}
    for word in list_of_Words
]

print(json_words)


hello                from 0.48 sec to 0.90 sec, confidence is 100.00%
my                   from 0.93 sec to 1.08 sec, confidence is 100.00%
name                 from 1.08 sec to 1.23 sec, confidence is 94.44%
is                   from 1.23 sec to 1.32 sec, confidence is 92.70%
prasanth             from 1.32 sec to 1.98 sec, confidence is 27.80%
and                  from 2.37 sec to 2.55 sec, confidence is 95.23%
i'm                  from 2.55 sec to 2.73 sec, confidence is 68.94%
going                from 2.73 sec to 2.88 sec, confidence is 100.00%
to                   from 2.88 sec to 2.94 sec, confidence is 100.00%
graduate             from 2.94 sec to 3.69 sec, confidence is 100.00%
with                 from 3.75 sec to 3.96 sec, confidence is 100.00%
an                   from 3.96 sec to 4.02 sec, confidence is 99.99%
engineering          from 4.02 sec to 4.47 sec, confidence is 100.00%
degree               from 4.47 sec to 4.89 sec, confidence is 100.00%
in                   from 

In [45]:
# Count the recognized words and build a plain-dict view of them
# (start time, end time, text) for display and JSON export.
word_num = len(list_of_Words)
json_words = [
    {'start': item.start, 'end': item.end, 'word': item.word}
    for item in list_of_Words
]

print("The number of words is : ", len(json_words))
print(json_words)

The number of words is :  286
[{'start': 0.48, 'end': 0.9, 'word': 'hello'}, {'start': 0.93, 'end': 1.08, 'word': 'my'}, {'start': 1.08, 'end': 1.23, 'word': 'name'}, {'start': 1.23, 'end': 1.320156, 'word': 'is'}, {'start': 1.320156, 'end': 1.98, 'word': 'prasanth'}, {'start': 2.369857, 'end': 2.549876, 'word': 'and'}, {'start': 2.550666, 'end': 2.73, 'word': "i'm"}, {'start': 2.73, 'end': 2.88, 'word': 'going'}, {'start': 2.88, 'end': 2.94, 'word': 'to'}, {'start': 2.94, 'end': 3.69, 'word': 'graduate'}, {'start': 3.75, 'end': 3.96, 'word': 'with'}, {'start': 3.96, 'end': 4.02, 'word': 'an'}, {'start': 4.02, 'end': 4.47, 'word': 'engineering'}, {'start': 4.47, 'end': 4.89, 'word': 'degree'}, {'start': 4.89, 'end': 5.01, 'word': 'in'}, {'start': 5.01, 'end': 5.4, 'word': 'computer'}, {'start': 5.4, 'end': 6.03, 'word': 'science'}, {'start': 6.3, 'end': 6.69, 'word': 'ready'}, {'start': 6.69, 'end': 6.84, 'word': 'to'}, {'start': 6.84, 'end': 7.32, 'word': 'embark'}, {'start': 7.32, 'e

In [48]:
# Serialize the word records to a JSON string and write it to
# words_json_data.json in the current working directory.
json_words_data = json.dumps(json_words)
with open('words_json_data.json', 'w') as outfile:
    outfile.write(json_words_data)

In [49]:
# Print the formatted description of every recognized word, one per line.
for recognized in list_of_Words:
    description = recognized.to_string()
    print(description)

hello                from 0.48 sec to 0.90 sec, confidence is 100.00%
my                   from 0.93 sec to 1.08 sec, confidence is 100.00%
name                 from 1.08 sec to 1.23 sec, confidence is 94.44%
is                   from 1.23 sec to 1.32 sec, confidence is 92.70%
prasanth             from 1.32 sec to 1.98 sec, confidence is 27.80%
and                  from 2.37 sec to 2.55 sec, confidence is 95.23%
i'm                  from 2.55 sec to 2.73 sec, confidence is 68.94%
going                from 2.73 sec to 2.88 sec, confidence is 100.00%
to                   from 2.88 sec to 2.94 sec, confidence is 100.00%
graduate             from 2.94 sec to 3.69 sec, confidence is 100.00%
with                 from 3.75 sec to 3.96 sec, confidence is 100.00%
an                   from 3.96 sec to 4.02 sec, confidence is 99.99%
engineering          from 4.02 sec to 4.47 sec, confidence is 100.00%
degree               from 4.47 sec to 4.89 sec, confidence is 100.00%
in                   from 

In [53]:
# Minimum silent gap between two consecutive words, in seconds, for the gap
# to be reported as a pause.
MIN_PAUSE_SEC = 0.5

# Walk over neighbouring word pairs and record every gap longer than the
# threshold as {'start': end of the earlier word, 'end': start of the next}.
pause = [
    {'start': current.end, 'end': following.start}
    for current, following in zip(list_of_Words, list_of_Words[1:])
    if current.end < following.start - MIN_PAUSE_SEC
]

num_pause = len(pause)
print("The number of pauses is : ", num_pause)
pprint.pprint(pause)

# Persist the pauses next to the other JSON outputs.
json_pause_data = json.dumps(pause)
with open('pause_json_data.json', 'w') as outfile:
    outfile.write(json_pause_data)


The number of puases is :  17
[{'end': 11.76, 'start': 11.01},
 {'end': 22.08, 'start': 21.0},
 {'end': 28.53, 'start': 28.02},
 {'end': 32.73, 'start': 31.68},
 {'end': 41.730916, 'start': 40.966135},
 {'end': 50.552281, 'start': 49.14},
 {'end': 54.45, 'start': 53.698982},
 {'end': 68.04, 'start': 67.199912},
 {'end': 78.307339, 'start': 77.76},
 {'end': 82.08, 'start': 80.88},
 {'end': 84.78, 'start': 84.21},
 {'end': 89.34, 'start': 88.829956},
 {'end': 95.67, 'start': 94.86},
 {'end': 103.080396, 'start': 102.24},
 {'end': 106.8, 'start': 106.103057},
 {'end': 118.95, 'start': 118.38},
 {'end': 133.470176, 'start': 132.779985}]


In [57]:
# Filler words to look for. The comma between 'aargh' and 'oh' matters:
# without it Python joins the two adjacent literals into the single string
# 'aarghoh', so neither word would ever match.
filter_words = ['Umm', 'aah', 'aargh', 'oh', 'um', 'uh', 'er', 'ah', 'like', 'well', 'so', 'right', 'literally', 'okay']

# Compare case-insensitively so capitalized entries such as 'Umm' can match
# the recognizer's lowercase words.
filler_lookup = {filler.lower() for filler in filter_words}

for word in list_of_Words:
    # Test the recognized word itself rather than parsing it back out of the
    # formatted description string.
    if word.word.lower() in filler_lookup:
        print(word.to_string())

hello                from 0.48 sec to 0.90 sec, confidence is 100.00%
my                   from 0.93 sec to 1.08 sec, confidence is 100.00%
name                 from 1.08 sec to 1.23 sec, confidence is 94.44%
is                   from 1.23 sec to 1.32 sec, confidence is 92.70%
prasanth             from 1.32 sec to 1.98 sec, confidence is 27.80%
and                  from 2.37 sec to 2.55 sec, confidence is 95.23%
i'm                  from 2.55 sec to 2.73 sec, confidence is 68.94%
going                from 2.73 sec to 2.88 sec, confidence is 100.00%
to                   from 2.88 sec to 2.94 sec, confidence is 100.00%
graduate             from 2.94 sec to 3.69 sec, confidence is 100.00%
with                 from 3.75 sec to 3.96 sec, confidence is 100.00%
an                   from 3.96 sec to 4.02 sec, confidence is 99.99%
engineering          from 4.02 sec to 4.47 sec, confidence is 100.00%
degree               from 4.47 sec to 4.89 sec, confidence is 100.00%
in                   from 