Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Update to DeepSpeech 0.5.1, remove VAD chunking added on commit 4caf459
  • Loading branch information
danry25 committed Nov 18, 2019
1 parent 1b3bbca commit 4808b32
Show file tree
Hide file tree
Showing 4 changed files with 10 additions and 166 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Expand Up @@ -27,9 +27,12 @@ lib64/
parts/
sdist/
var/
deepspeech*/
models*/
*.egg-info/
.installed.cfg
*.egg
*.wav

# PyInstaller
# Usually these files are written by a python script from a template
Expand Down
5 changes: 2 additions & 3 deletions README.md
Expand Up @@ -20,12 +20,12 @@ python3 /usr/lib/python3/dist-packages/virtualenv.py -p python3 env
Followed by installing the python packages needed:

```
pip install ffmpeg-python flask deepspeech webrtcvad uuid requests scipy
pip install ffmpeg-python flask deepspeech uuid requests scipy
```

At this point, Mozilla's DeepSpeech needs a language model:
```
wget -O - https://github.com/mozilla/DeepSpeech/releases/download/v0.3.0/deepspeech-0.3.0-models.tar.gz | tar xvfz -
wget -O - https://github.com/mozilla/DeepSpeech/releases/download/v0.5.1/deepspeech-0.5.1-models.tar.gz | tar xvfz -
```

Now, let's take this for a test spin!
Expand Down Expand Up @@ -78,6 +78,5 @@ Thanks to the following people and resources, this project exists:

## Things to improve
* Queue for recordings to be processed
* Figure out why webrtcvad is dumping binary and time increments in the console output. Not useful for our use case
* Add endpoints that act like standard proprietary HTTP voice endpoints (making this a drop in replacement)
* Add GPU support
28 changes: 5 additions & 23 deletions deepspeech_frontend/__init__.py
@@ -1,9 +1,7 @@
import os
import ffmpeg
import uuid
import webrtcvad
import time
from .chunker import read_wave, write_wave, frame_generator, vad_collector
from deepspeech import Model
import scipy.io.wavfile as wav
from flask import Flask, flash, request, redirect, url_for, send_from_directory, make_response, jsonify
Expand All @@ -15,11 +13,11 @@
BEAM_WIDTH = 500

# The alpha hyperparameter of the CTC decoder. Language Model weight
LM_WEIGHT = 1.50
LM_WEIGHT = 0.75

# Valid word insertion weight. This is used to lessen the word insertion penalty
# when the inserted word is part of the vocabulary
VALID_WORD_COUNT_WEIGHT = 2.10
VALID_WORD_COUNT_WEIGHT = 1.85

# These constants are tied to the shape of the graph used (changing them changes
# the geometry of the first layer), so make sure you use the same constants that
Expand All @@ -31,15 +29,13 @@
# Size of the context window used for producing timesteps in the input vector
N_CONTEXT = 9

# How aggressive to be when splitting audio files into chunks
aggressiveness = 1

UPLOAD_FOLDER = '/tmp'
ALLOWED_EXTENSIONS = set(['wav', 'mp3', 'flac'])

app = Flask(__name__)
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
ds = Model('models/output_graph.pbmm', N_FEATURES, N_CONTEXT, 'models/alphabet.txt', BEAM_WIDTH)
ds = Model('models/output_graph.pb', N_FEATURES, N_CONTEXT, 'models/alphabet.txt', BEAM_WIDTH)
ds.enableDecoderWithLM('models/alphabet.txt', 'models/lm.binary', 'models/trie', LM_WEIGHT,
VALID_WORD_COUNT_WEIGHT)
api_keys = []
Expand Down Expand Up @@ -105,22 +101,8 @@ def transcribe(filename):
transcribe(filename)
print("Starting transcription...")
transcription_in_progress = True
processed_data = ""
audio, sample_rate = read_wave(os.path.join(app.config['UPLOAD_FOLDER'], filename))
vad = webrtcvad.Vad(0)
frames = frame_generator(20, audio, sample_rate)
frames = list(frames)
# Change the frame generator line above and the frame size (20 by default)/window size (40 by default) when dealing with non-stop talkers!
segments = vad_collector(sample_rate, 20, 40, vad, frames)
for i, segment in enumerate(segments):
path = 'chunk-%002d.wav' % (i,)
# print(' Writing %s' % (path,))
write_wave(path, segment, sample_rate)
fs, audio = wav.read(path)
processed_data += ds.stt(audio, fs)
processed_data += " "
# print(processed_data)
os.remove(path)
fs, audio = wav.read(os.path.join(app.config['UPLOAD_FOLDER'], filename))
processed_data = ds.stt(audio, fs)
os.remove(os.path.join(app.config['UPLOAD_FOLDER'], filename))
transcription_in_progress = False
return processed_data
Expand Down
140 changes: 0 additions & 140 deletions deepspeech_frontend/chunker.py

This file was deleted.

0 comments on commit 4808b32

Please sign in to comment.