In [12]:
#!pip install jiwer
#!pip install pydub
#!pip install huggingface_hub
#!pip install pyannote.audio

In [1]:
import re
import unicodedata

# non-ASCII letters that are not separated by "NFKD" normalization
ADDITIONAL_DIACRITICS = {
    "œ": "oe",
    "Œ": "OE",
    "ø": "o",
    "Ø": "O",
    "æ": "ae",
    "Æ": "AE",
    "ß": "ss",
    "ẞ": "SS",
    "đ": "d",
    "Đ": "D",
    "ð": "d",
    "Ð": "D",
    "þ": "th",
    "Þ": "th",
    "ł": "l",
    "Ł": "L",
}


def remove_symbols_and_diacritics(s: str, keep=""):
    """
    Replace any other markers, symbols, and punctuations with a space,
    and drop any diacritics (category 'Mn' and some manual mappings)
    """
    return "".join(
        c
        if c in keep
        else ADDITIONAL_DIACRITICS[c]
        if c in ADDITIONAL_DIACRITICS
        else ""
        if unicodedata.category(c) == "Mn"
        else " "
        if unicodedata.category(c)[0] in "MSP"
        else c
        for c in unicodedata.normalize("NFKD", s)
    )


class TextNormalizer:
    def __init__(self):
        self.clean = remove_symbols_and_diacritics

    def __call__(self, s: str):
        s = s.lower()
        s = re.sub(r"[<\[][^>\]]*[>\]]", "", s)  # remove words between brackets
        s = re.sub(r"\(([^)]+?)\)", "", s)  # remove words between parenthesis
        s = s.replace("\n", " ")  # remove new line
        s = s.replace("\xa0", "")  # remove non-breaking space
        s = re.sub(r"mmm|euh", "", s)  # remove disfluencies
        s = self.clean(s).lower()
        s = re.sub(r"\s+", " ", s)  # replace one or more whitespace with only one
        s = re.sub(
            r"(\w)(\1{2,})", r"\1", s
        )  # replace prolonged words with standard spelling
        s = re.sub(r"\b(.+)(\b\1\b)+", r"\1", s)  # remove repeated phrases

        return s

In [13]:
import re
import jiwer
import pickle
import json

from pydub import AudioSegment
from huggingface_hub import login
from pyannote.audio import Pipeline
from huggingsound import SpeechRecognitionModel


In [14]:
from huggingface_hub import notebook_login
notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [15]:
pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization@2.1",
                                    use_auth_token=True)

Downloading (…)olve/2.1/config.yaml:   0%|          | 0.00/500 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/17.7M [00:00<?, ?B/s]

Downloading (…)/2022.07/config.yaml:   0%|          | 0.00/318 [00:00<?, ?B/s]

Downloading (…)ain/hyperparams.yaml:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

Downloading (…)bedding_model.ckpt";:   0%|          | 0.00/83.3M [00:00<?, ?B/s]

Downloading (…)an_var_norm_emb.ckpt:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

Downloading (…)"classifier.ckpt";:   0%|          | 0.00/5.53M [00:00<?, ?B/s]

Downloading (…)in/label_encoder.txt:   0%|          | 0.00/129k [00:00<?, ?B/s]

In [16]:
# spacer
spacermilli = 2000
spacer = AudioSegment.silent(duration=spacermilli)

audio = AudioSegment.from_file("/content/BOC-066_5min.m4a")  # lecun1.wav
# check input size
input_length = audio.duration_seconds * 1000
kernel_size = spacermilli

# if kernel size > input size, we change kernel size
if kernel_size > input_length:
    kernel_size = input_length

# append audio with adjusted kernel size
audio = spacer.append(audio, crossfade=kernel_size)
audio.export("temp.wav", format="wav")



<_io.BufferedRandom name='temp.wav'>

In [17]:
# pyannote pipeline
#pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=True)

print("Diarizing...")
DEMO_FILE = {"audio": "/content/temp.wav", "num_speakers": 2}
dz = pipeline(DEMO_FILE)

with open("diarization.txt", "w") as text_file:
    text_file.write(str(dz))

print(*list(dz.itertracks(yield_label=True))[:10], sep="\n")

# prepare audio files according to diarization
def millisec(timeStr):
    spl = timeStr.split(":")
    s = (int)((int(spl[0]) * 60 * 60 + int(spl[1]) * 60 + float(spl[2])) * 1000)
    return s


# group segments
print("Grouping segments...")
dzs = open("diarization.txt").read().splitlines()

groups = []
g = []
lastend = 0

for d in dzs:
    if g and (g[0].split()[-1] != d.split()[-1]):  # same speaker
        groups.append(g)
        g = []

    g.append(d)

    end = re.findall("[0-9]+:[0-9]+:[0-9]+\.[0-9]+", string=d)[1]
    end = millisec(end)
    if lastend > end:  # segment engulfed by a previous segment
        groups.append(g)
        g = []
    else:
        lastend = end
if g:
    groups.append(g)
print(*groups, sep="\n")

# save each part
print("Saving segments...")
audio = AudioSegment.from_wav("temp.wav")
gidx = -1
audio_paths = []

for g in groups:
    start = re.findall("[0-9]+:[0-9]+:[0-9]+\.[0-9]+", string=g[0])[0]
    end = re.findall("[0-9]+:[0-9]+:[0-9]+\.[0-9]+", string=g[-1])[1]
    start = millisec(start)  # - spacermilli
    end = millisec(end)  # - spacermilli
    print(start, end)
    gidx += 1
    audio[start:end].export(f"{gidx}.wav", format="wav")
    audio_paths.append(f"{gidx}.wav")


Diarizing...
(<Segment(0.784687, 17.9634)>, 'AY', 'SPEAKER_01')
(<Segment(2.86031, 3.48469)>, 'A', 'SPEAKER_00')
(<Segment(6.25219, 7.26469)>, 'B', 'SPEAKER_00')
(<Segment(13.1034, 13.4409)>, 'C', 'SPEAKER_00')
(<Segment(15.6684, 15.9384)>, 'D', 'SPEAKER_00')
(<Segment(19.8366, 33.6234)>, 'AZ', 'SPEAKER_01')
(<Segment(21.8278, 22.5028)>, 'E', 'SPEAKER_00')
(<Segment(23.2116, 23.7178)>, 'F', 'SPEAKER_00')
(<Segment(34.7709, 37.2516)>, 'G', 'SPEAKER_00')
(<Segment(38.0447, 39.0572)>, 'BA', 'SPEAKER_01')
Grouping segments...
['[ 00:00:00.784 -->  00:00:17.963] AY SPEAKER_01']
['[ 00:00:02.860 -->  00:00:03.484] A SPEAKER_00']
['[ 00:00:06.252 -->  00:00:07.264] B SPEAKER_00']
['[ 00:00:13.103 -->  00:00:13.440] C SPEAKER_00']
['[ 00:00:15.668 -->  00:00:15.938] D SPEAKER_00']
['[ 00:00:19.836 -->  00:00:33.623] AZ SPEAKER_01']
['[ 00:00:21.827 -->  00:00:22.502] E SPEAKER_00']
['[ 00:00:23.211 -->  00:00:23.717] F SPEAKER_00']
['[ 00:00:34.770 -->  00:00:37.251] G SPEAKER_00']
['[ 00:00:3

In [22]:
import locale
def getpreferredencoding(do_setlocale = True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding

#! pip install huggingsound

In [21]:
# transcription

model = SpeechRecognitionModel("jonatasgrosman/wav2vec2-xls-r-1b-french")

print("Transcribing...")
transcription = ""

results = model.transcribe(audio_paths)
with open("temp.json", "w") as f:
    json.dump(results, f)

for result in results:
    transcription += result["transcription"]

print("Loading target...")
with open("/content/P39682 - boc-066.zip (1)_5min.txt", "r", encoding="utf8") as f:
    target = f.read()

normalizer = TextNormalizer()

transcription = normalizer(transcription)
target = normalizer(target)

print("Calculating WER...")
wer = jiwer.wer(transcription, target)
print(f"WER: {wer}")

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

INFO:huggingsound.speech_recognition.model:Loading model...


Downloading (…)lve/main/config.json:   0%|          | 0.00/2.04k [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

Downloading (…)rocessor_config.json:   0%|          | 0.00/260 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/440 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

Transcribing...


100%|██████████| 41/41 [05:33<00:00,  8.14s/it]


Loading target...
Calculating WER...
WER: 0.6468797564687976


In [23]:
with open("transcription.txt", "w", encoding="utf8") as f:
    f.write(transcription)

In [24]:
with open("target.txt", "w", encoding="utf8") as f:
    f.write(target)