[Wav2Vec2] PyCTCDecode Integration to support language model boosted decoding (huggingface#14339)

* up

* up

* up

* make it cleaner

* correct

* make style

* add more tests

* finish

* small fix

* make style

* up

* try to solve CircleCI

* up

* fix more tests

* fix more tests

* apply Sylvain's suggestions

* fix import

* correct docs

* add pyctcdecode only to speech tests

* fix more tests

* add tf, flax and pt tests

* add pt

* fix last tests

* fix more tests

* Apply suggestions from code review

* change lines

* Apply suggestions from code review

Co-authored-by: Anton Lozhkov <aglozhkov@gmail.com>

* correct tests

* correct tests

* add doc string

Co-authored-by: Anton Lozhkov <aglozhkov@gmail.com>
2 people authored and Alberto Bégué committed Jan 27, 2022
1 parent e476ac7 commit b77eb89
Showing 16 changed files with 831 additions and 19 deletions.
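For orientation, a minimal sketch of the end-to-end workflow this PR enables, assuming a Wav2Vec2 checkpoint that ships pyctcdecode/kenlm language-model files alongside the usual tokenizer and feature extractor (the checkpoint name and the dataset sample below are illustrative, not part of this diff):

```python
import torch
from datasets import load_dataset
from transformers import Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM

# Placeholder checkpoint assumed to bundle an n-gram LM with the acoustic model.
checkpoint = "patrickvonplaten/wav2vec2-base-100h-with-lm"

processor = Wav2Vec2ProcessorWithLM.from_pretrained(checkpoint)
model = Wav2Vec2ForCTC.from_pretrained(checkpoint)

# One short 16 kHz sample, purely for illustration.
sample = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")[0]
inputs = processor(sample["audio"]["array"], sampling_rate=16_000, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

# Unlike Wav2Vec2Processor.batch_decode, which maps argmax token ids to text,
# the LM-boosted processor runs a beam search over the raw logits.
transcription = processor.batch_decode(logits.numpy()).text
print(transcription)
```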
21 changes: 18 additions & 3 deletions .circleci/config.yml
@@ -83,6 +83,7 @@ jobs:
- run: pip install .[sklearn,tf-cpu,torch,testing,sentencepiece,torch-speech,vision]
- run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cpu.html
- run: pip install tensorflow_probability
- run: pip install https://github.com/kpu/kenlm/archive/master.zip
- save_cache:
key: v0.4-{{ checksum "setup.py" }}
paths:
@@ -151,6 +152,7 @@ jobs:
- run: pip install --upgrade pip
- run: pip install .[sklearn,flax,torch,testing,sentencepiece,torch-speech,vision]
- run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cpu.html
- run: pip install https://github.com/kpu/kenlm/archive/master.zip
- save_cache:
key: v0.4-{{ checksum "setup.py" }}
paths:
@@ -187,6 +189,7 @@ jobs:
- run: pip install --upgrade pip
- run: pip install .[sklearn,flax,torch,testing,sentencepiece,torch-speech,vision]
- run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cpu.html
- run: pip install https://github.com/kpu/kenlm/archive/master.zip
- save_cache:
key: v0.4-{{ checksum "setup.py" }}
paths:
@@ -217,6 +220,7 @@ jobs:
- run: pip install --upgrade pip
- run: pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm]
- run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cpu.html
- run: pip install https://github.com/kpu/kenlm/archive/master.zip
- save_cache:
key: v0.4-torch-{{ checksum "setup.py" }}
paths:
@@ -252,6 +256,7 @@ jobs:
- run: pip install --upgrade pip
- run: pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm]
- run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cpu.html
- run: pip install https://github.com/kpu/kenlm/archive/master.zip
- save_cache:
key: v0.4-torch-{{ checksum "setup.py" }}
paths:
@@ -278,9 +283,11 @@ jobs:
keys:
- v0.4-tf-{{ checksum "setup.py" }}
- v0.4-{{ checksum "setup.py" }}
- run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
- run: pip install --upgrade pip
- run: pip install .[sklearn,tf-cpu,testing,sentencepiece,tf-speech,vision]
- run: pip install tensorflow_probability
- run: pip install https://github.com/kpu/kenlm/archive/master.zip
- save_cache:
key: v0.4-tf-{{ checksum "setup.py" }}
paths:
@@ -312,9 +319,11 @@ jobs:
keys:
- v0.4-tf-{{ checksum "setup.py" }}
- v0.4-{{ checksum "setup.py" }}
- run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
- run: pip install --upgrade pip
- run: pip install .[sklearn,tf-cpu,testing,sentencepiece,tf-speech,vision]
- run: pip install tensorflow_probability
- run: pip install https://github.com/kpu/kenlm/archive/master.zip
- save_cache:
key: v0.4-tf-{{ checksum "setup.py" }}
paths:
@@ -341,8 +350,10 @@ jobs:
keys:
- v0.4-flax-{{ checksum "setup.py" }}
- v0.4-{{ checksum "setup.py" }}
- run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
- run: pip install --upgrade pip
- run: sudo pip install .[flax,testing,sentencepiece,flax-speech,vision]
- run: pip install .[flax,testing,sentencepiece,flax-speech,vision]
- run: pip install https://github.com/kpu/kenlm/archive/master.zip
- save_cache:
key: v0.4-flax-{{ checksum "setup.py" }}
paths:
@@ -374,8 +385,10 @@ jobs:
keys:
- v0.4-flax-{{ checksum "setup.py" }}
- v0.4-{{ checksum "setup.py" }}
- run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
- run: pip install --upgrade pip
- run: sudo pip install .[flax,testing,sentencepiece,vision,flax-speech]
- run: pip install .[flax,testing,sentencepiece,vision,flax-speech]
- run: pip install https://github.com/kpu/kenlm/archive/master.zip
- save_cache:
key: v0.4-flax-{{ checksum "setup.py" }}
paths:
@@ -407,6 +420,7 @@ jobs:
- run: pip install --upgrade pip
- run: pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm]
- run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cpu.html
- run: pip install https://github.com/kpu/kenlm/archive/master.zip
- save_cache:
key: v0.4-torch-{{ checksum "setup.py" }}
paths:
@@ -443,6 +457,7 @@ jobs:
- run: pip install --upgrade pip
- run: pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm]
- run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cpu.html
- run: pip install https://github.com/kpu/kenlm/archive/master.zip
- save_cache:
key: v0.4-torch-{{ checksum "setup.py" }}
paths:
@@ -582,7 +597,7 @@ jobs:
path: ~/transformers/examples_output.txt
- store_artifacts:
path: ~/transformers/reports

run_examples_torch_all:
working_directory: ~/transformers
docker:
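kenlm is installed from its GitHub archive in each CI job above, presumably because pyctcdecode only performs language-model-boosted beam search when a kenlm model is available and does not pull kenlm in as a hard dependency. A rough sketch of what the decoder does (the vocabulary and LM path are illustrative):

```python
import numpy as np
from pyctcdecode import build_ctcdecoder

# Toy CTC vocabulary; in practice this comes from the Wav2Vec2 tokenizer's vocab.
labels = ["", "a", "b", "c", "d", "e", "f", "g", "h", " "]

# Plain beam search needs no LM; passing e.g. kenlm_model_path="path/to/lm.arpa"
# (a placeholder path) adds the n-gram scoring that motivates the kenlm install above.
decoder = build_ctcdecoder(labels)

fake_logits = np.log(np.random.dirichlet(np.ones(len(labels)), size=50))  # (time, vocab) log-probs
print(decoder.decode(fake_logits))
```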
7 changes: 6 additions & 1 deletion .github/workflows/self-push.yml
@@ -34,6 +34,7 @@ jobs:
apt install -y libsndfile1-dev
pip install --upgrade pip
pip install .[sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
pip install https://github.com/kpu/kenlm/archive/master.zip
- name: Launcher docker
uses: actions/checkout@v2
@@ -87,6 +88,7 @@ jobs:
pip install --upgrade "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html
pip install --upgrade pip
pip install .[sklearn,testing,sentencepiece,flax,flax-speech,vision]
pip install https://github.com/kpu/kenlm/archive/master.zip
- name: Launcher docker
uses: actions/checkout@v2
@@ -142,6 +144,7 @@ jobs:
# apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git
# pip install --upgrade pip
# pip install .[sklearn,testing,onnxruntime,sentencepiece,tf-speech]
# pip install https://github.com/kpu/kenlm/archive/master.zip
#
# - name: Launcher docker
# uses: actions/checkout@v2
@@ -200,7 +203,7 @@ jobs:
apt install -y libsndfile1-dev
pip install --upgrade pip
pip install .[sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
pip install https://github.com/kpu/kenlm/archive/master.zip
- name: Launcher docker
uses: actions/checkout@v2
with:
@@ -256,6 +259,7 @@ jobs:
# pip install --upgrade "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html
# pip install --upgrade pip
# pip install .[sklearn,testing,sentencepiece,flax,flax-speech,vision]
# pip install https://github.com/kpu/kenlm/archive/master.zip
#
# - name: Launcher docker
# uses: actions/checkout@v2
@@ -311,6 +315,7 @@ jobs:
# apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git
# pip install --upgrade pip
# pip install .[sklearn,testing,onnxruntime,sentencepiece,tf-speech]
# pip install https://github.com/kpu/kenlm/archive/master.zip
#
# - name: Launcher docker
# uses: actions/checkout@v2
6 changes: 6 additions & 0 deletions .github/workflows/self-scheduled.yml
@@ -36,6 +36,7 @@ jobs:
apt -y update && apt install -y libsndfile1-dev git
pip install --upgrade pip
pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
pip install https://github.com/kpu/kenlm/archive/master.zip
- name: Are GPUs recognized by our DL frameworks
run: |
@@ -102,6 +103,7 @@ jobs:
pip install --upgrade pip
pip install --upgrade "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html
pip install .[flax,integrations,sklearn,testing,sentencepiece,flax-speech,vision]
pip install https://github.com/kpu/kenlm/archive/master.zip
- name: Are GPUs recognized by our DL frameworks
run: |
@@ -141,6 +143,8 @@ jobs:
apt -y update && apt install -y libsndfile1-dev git
pip install --upgrade pip
pip install .[sklearn,testing,onnx,sentencepiece,tf-speech,vision]
pip install https://github.com/kpu/kenlm/archive/master.zip
- name: Are GPUs recognized by our DL frameworks
run: |
@@ -236,6 +240,7 @@ jobs:
apt -y update && apt install -y libsndfile1-dev git
pip install --upgrade pip
pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
pip install https://github.com/kpu/kenlm/archive/master.zip
- name: Are GPUs recognized by our DL frameworks
run: |
@@ -288,6 +293,7 @@ jobs:
apt -y update && apt install -y libsndfile1-dev git
pip install --upgrade pip
pip install .[sklearn,testing,onnx,sentencepiece,tf-speech,vision]
pip install https://github.com/kpu/kenlm/archive/master.zip
- name: Are GPUs recognized by our DL frameworks
run: |
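With the dependency available in CI, speech tests can gate LM-decoding cases on it. A minimal sketch of such a guard built on the is_pyctcdecode_available helper added in this PR (the decorator name below is illustrative, not necessarily the one used in transformers.testing_utils):

```python
import unittest

from transformers.file_utils import is_pyctcdecode_available


def require_pyctcdecode(test_case):
    """Skip a test unless pyctcdecode is installed."""
    return unittest.skipUnless(is_pyctcdecode_available(), "test requires pyctcdecode")(test_case)


class Wav2Vec2WithLMTest(unittest.TestCase):
    @require_pyctcdecode
    def test_processor_import(self):
        from transformers import Wav2Vec2ProcessorWithLM  # noqa: F401
```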
10 changes: 10 additions & 0 deletions docs/source/model_doc/wav2vec2.rst
@@ -67,9 +67,19 @@ Wav2Vec2Processor
:members: __call__, pad, from_pretrained, save_pretrained, batch_decode, decode, as_target_processor


Wav2Vec2ProcessorWithLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.Wav2Vec2ProcessorWithLM
:members: __call__, pad, from_pretrained, save_pretrained, batch_decode, decode, as_target_processor


Wav2Vec2 specific outputs
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.models.wav2vec2.processing_wav2vec2_with_lm.Wav2Vec2DecoderWithLMOutput
:members:

.. autoclass:: transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2BaseModelOutput
:members:

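Worth noting next to these autodoc entries: Wav2Vec2Processor.batch_decode maps predicted token ids to text, whereas the new Wav2Vec2ProcessorWithLM.batch_decode consumes the logits directly and returns a Wav2Vec2DecoderWithLMOutput. A small sketch (the *-with-lm checkpoint name is a placeholder and the random logits stand in for real model output):

```python
import numpy as np
from transformers import Wav2Vec2Processor, Wav2Vec2ProcessorWithLM

processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
processor_lm = Wav2Vec2ProcessorWithLM.from_pretrained("some-org/wav2vec2-base-960h-with-lm")  # placeholder

logits = np.random.randn(1, 200, 32)  # fake (batch, time, vocab) CTC logits

# Plain CTC: collapse to ids first, then map ids to strings.
ids = logits.argmax(axis=-1)
print(processor.batch_decode(ids))

# LM-boosted CTC: hand the full logits to the beam-search decoder;
# the returned Wav2Vec2DecoderWithLMOutput carries the transcriptions in .text.
print(processor_lm.batch_decode(logits).text)
```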
7 changes: 4 additions & 3 deletions setup.py
@@ -51,15 +51,15 @@
pip install -i https://testpypi.python.org/pypi transformers
Check you can run the following commands:
python -c "from transformers import pipeline; classifier = pipeline('text-classification'); print(classifier('What a nice release'))"
python -c "from transformers import pipeline; classifier = pipeline('text-classification'); print(classifier('What a nice release'))"
python -c "from transformers import *"
9. Upload the final version to actual pypi:
twine upload dist/* -r pypi
10. Copy the release notes from RELEASE.md to the tag in github once everything is looking hunky-dory.
11. Run `make post-release` (or, for a patch release, `make post-patch`). If you were on a branch for the release,
you need to go back to master before executing this.
"""

@@ -159,6 +159,7 @@
"tokenizers>=0.10.1,<0.11",
"torch>=1.0",
"torchaudio",
"pyctcdecode>=0.2.0",
"tqdm>=4.27",
"unidic>=1.0.2",
"unidic_lite>=1.0.7",
@@ -262,7 +263,7 @@ def run(self):
extras["integrations"] = extras["optuna"] + extras["ray"] + extras["sigopt"]

extras["serving"] = deps_list("pydantic", "uvicorn", "fastapi", "starlette")
extras["audio"] = deps_list("librosa")
extras["audio"] = deps_list("librosa", "pyctcdecode")
extras["speech"] = deps_list("torchaudio") + extras["audio"] # `pip install ".[speech]"` is deprecated and `pip install ".[torch-speech]"` should be used instead
extras["torch-speech"] = deps_list("torchaudio") + extras["audio"]
extras["tf-speech"] = extras["audio"]
15 changes: 15 additions & 0 deletions src/transformers/__init__.py
@@ -44,6 +44,7 @@
from .file_utils import (
_LazyModule,
is_flax_available,
is_pyctcdecode_available,
is_pytorch_quantization_available,
is_scatter_available,
is_sentencepiece_available,
@@ -471,6 +472,15 @@
name for name in dir(dummy_speech_objects) if not name.startswith("_")
]

if is_pyctcdecode_available():
_import_structure["models.wav2vec2"].append("Wav2Vec2ProcessorWithLM")
else:
from .utils import dummy_pyctcdecode_objects

_import_structure["utils.dummy_pyctcdecode_objects"] = [
name for name in dir(dummy_pyctcdecode_objects) if not name.startswith("_")
]

if is_sentencepiece_available() and is_speech_available():
_import_structure["models.speech_to_text"].append("Speech2TextProcessor")
else:
@@ -2440,6 +2450,11 @@
else:
from .utils.dummy_speech_objects import *

if is_pyctcdecode_available():
from .models.wav2vec2 import Wav2Vec2ProcessorWithLM
else:
from .utils.dummy_pyctcdecode_objects import *

if is_speech_available() and is_sentencepiece_available():
from .models.speech_to_text import Speech2TextProcessor
else:
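The dummy_pyctcdecode_objects module registered here follows the library's usual dummy-object pattern, so `from transformers import Wav2Vec2ProcessorWithLM` still resolves without pyctcdecode and only fails with a helpful message on use. A sketch of that pattern (not the file's verbatim contents):

```python
# utils/dummy_pyctcdecode_objects.py (sketch)
from ..file_utils import requires_backends


class Wav2Vec2ProcessorWithLM:
    def __init__(self, *args, **kwargs):
        # Raises an ImportError that points the user at `pip install pyctcdecode`.
        requires_backends(self, ["pyctcdecode"])

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        requires_backends(cls, ["pyctcdecode"])
```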
1 change: 1 addition & 0 deletions src/transformers/dependency_versions_table.py
@@ -70,6 +70,7 @@
"tokenizers": "tokenizers>=0.10.1,<0.11",
"torch": "torch>=1.0",
"torchaudio": "torchaudio",
"pyctcdecode": "pyctcdecode>=0.2.0",
"tqdm": "tqdm>=4.27",
"unidic": "unidic>=1.0.2",
"unidic_lite": "unidic_lite>=1.0.7",
31 changes: 31 additions & 0 deletions src/transformers/file_utils.py
@@ -237,6 +237,22 @@
_torchaudio_available = False


_pyctcdecode_available = importlib.util.find_spec("pyctcdecode") is not None
try:
_pyctcdecode_version = importlib_metadata.version("pyctcdecode")
logger.debug(f"Successfully imported pyctcdecode version {_pyctcdecode_version}")
except importlib_metadata.PackageNotFoundError:
_pyctcdecode_available = False


_librosa_available = importlib.util.find_spec("librosa") is not None
try:
_librosa_version = importlib_metadata.version("librosa")
logger.debug(f"Successfully imported librosa version {_librosa_version}")
except importlib_metadata.PackageNotFoundError:
_librosa_available = False


torch_cache_home = os.getenv("TORCH_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "torch"))
old_default_cache_path = os.path.join(torch_cache_home, "transformers")
# New default cache, shared with the Datasets library
@@ -311,6 +327,14 @@ def is_torch_available():
return _torch_available


def is_pyctcdecode_available():
return _pyctcdecode_available


def is_librosa_available():
return _librosa_available


def is_torch_cuda_available():
if is_torch_available():
import torch
@@ -736,6 +760,12 @@ def wrapper(*args, **kwargs):
`pip install pytesseract`
"""

# docstyle-ignore
PYCTCDECODE_IMPORT_ERROR = """
{0} requires the pyctcdecode library but it was not found in your environment. You can install it with pip:
`pip install pyctcdecode`
"""


BACKENDS_MAPPING = OrderedDict(
[
@@ -745,6 +775,7 @@ def wrapper(*args, **kwargs):
("flax", (is_flax_available, FLAX_IMPORT_ERROR)),
("pandas", (is_pandas_available, PANDAS_IMPORT_ERROR)),
("protobuf", (is_protobuf_available, PROTOBUF_IMPORT_ERROR)),
("pyctcdecode", (is_pyctcdecode_available, PYCTCDECODE_IMPORT_ERROR)),
("pytesseract", (is_pytesseract_available, PYTESSERACT_IMPORT_ERROR)),
("scatter", (is_scatter_available, SCATTER_IMPORT_ERROR)),
("pytorch_quantization", (is_pytorch_quantization_available, PYTORCH_QUANTIZATION_IMPORT_ERROR)),
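Registering pyctcdecode in BACKENDS_MAPPING is what lets the generic backend check raise the PYCTCDECODE_IMPORT_ERROR defined above. A hedged sketch of the guard pattern using the existing requires_backends helper (LmBoostedThing is a made-up class, not part of the PR):

```python
from transformers.file_utils import is_pyctcdecode_available, requires_backends


class LmBoostedThing:
    """Toy example of the backend-guard pattern."""

    def __init__(self):
        # Looks up "pyctcdecode" in BACKENDS_MAPPING and raises the formatted
        # PYCTCDECODE_IMPORT_ERROR if is_pyctcdecode_available() returns False.
        requires_backends(self, ["pyctcdecode"])


if is_pyctcdecode_available():
    print("pyctcdecode found: LM-boosted decoding is available")
else:
    print("pyctcdecode missing: instantiating LmBoostedThing raises an ImportError")
```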
8 changes: 7 additions & 1 deletion src/transformers/models/wav2vec2/__init__.py
@@ -17,7 +17,7 @@
# limitations under the License.
from typing import TYPE_CHECKING

from ...file_utils import _LazyModule, is_flax_available, is_tf_available, is_torch_available
from ...file_utils import _LazyModule, is_flax_available, is_pyctcdecode_available, is_tf_available, is_torch_available


_import_structure = {
@@ -27,6 +27,9 @@
"tokenization_wav2vec2": ["Wav2Vec2CTCTokenizer", "Wav2Vec2Tokenizer"],
}

if is_pyctcdecode_available():
_import_structure["processing_wav2vec2_with_lm"] = ["Wav2Vec2ProcessorWithLM"]

if is_torch_available():
_import_structure["modeling_wav2vec2"] = [
"WAV_2_VEC_2_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -61,6 +64,9 @@
from .processing_wav2vec2 import Wav2Vec2Processor
from .tokenization_wav2vec2 import Wav2Vec2CTCTokenizer, Wav2Vec2Tokenizer

if is_pyctcdecode_available():
from .processing_wav2vec2_with_lm import Wav2Vec2ProcessorWithLM

if is_torch_available():
from .modeling_wav2vec2 import (
WAV_2_VEC_2_PRETRAINED_MODEL_ARCHIVE_LIST,
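Because the wav2vec2 submodule mirrors the top-level guard, downstream code can feature-detect the LM processor and fall back to plain argmax decoding; a small sketch:

```python
from transformers.file_utils import is_pyctcdecode_available

if is_pyctcdecode_available():
    # Only registered in the lazy import structure when pyctcdecode is installed.
    from transformers.models.wav2vec2 import Wav2Vec2ProcessorWithLM as ProcessorClass
else:
    from transformers.models.wav2vec2 import Wav2Vec2Processor as ProcessorClass

print(f"Using {ProcessorClass.__name__}")
```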
(The remaining changed files are not shown on this page.)
