# First, we convert the MP3 into a 16 kHz mono WAV file.

In [None]:
from pydub import AudioSegment

# Decode the MP3, then resample to 16 kHz and downmix to a single channel, as
# this section's heading states. Without the two set_* calls the exported WAV
# would simply keep whatever sample rate and channel count the MP3 had.
f = AudioSegment.from_mp3("audio.mp3")
f = f.set_frame_rate(16000).set_channels(1)
f.export("audio.wav", format="wav")

# Transcribing

In [1]:
from transformers import Wav2Vec2ProcessorWithLM, Wav2Vec2ForCTC
import soundfile as sf
import torch
import os


# Preferred inference device for this machine. NOTE: nothing in this cell moves
# the model or the inputs onto it, so `device` is currently informational only.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

# The processor (with language-model decoding) and the CTC model are both
# loaded from the same checkpoint.
MODEL_ID = "classla/wav2vec2-xls-r-parlaspeech-hr-lm"
processor = Wav2Vec2ProcessorWithLM.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)

# Load the waveform together with its sampling rate.
speech, sample_rate = sf.read("audio.wav")

# Keep only the first N seconds for a quick smoke test.
n_seconds = 20
speech_segment = speech[: n_seconds * sample_rate]

# Forward pass without gradient tracking, then decode the logits to text.
inputs = processor(speech_segment, sampling_rate=sample_rate, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits
decoded = processor.batch_decode(logits.numpy())
transcription = decoded.text[0]

Only 0 unigrams passed as vocabulary. Is this small or artificial data?


In [17]:
# Inspect the shape of the full audio array returned by sf.read.
speech.shape

(25437888,)

In [19]:
# Transcribe the first minute of the recording this time.
seconds = 60
speech_segment = speech[: seconds * sample_rate]

# Same pipeline as before: featurise, run the model without gradients, and
# decode the logits through the processor.
inputs = processor(speech_segment, sampling_rate=sample_rate, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits
decoded = processor.batch_decode(logits.numpy())
transcription = decoded.text[0]
transcription

'ukoliko ste gledali vesti televizije b92 koje su upravo završene svakako vam je pretpostavljam poznato da je u ovim trenucima u toku sjednica vlade srbije koja priprema nacrt rezolucije o kosovu koji bi sutra trebalo da se nađe pred novim sazivom republičkog parlamenta no o tom potom o sjednici vlade i o rezoluciji nešto kasnije tokom sjednica vlade bude završen ili sutra ujutru sada o jednoj drugoj temi koja također tema dana za bivšim komandantom posebnih jedinica policije i žandarmerije goranom radosavljevićem gurijem raspisana je prema agencijskim vestima policijska potraga na nivou srbije jer se radosavljević nije pojavio na sudu gdje je pozvan u svojstvu svjedoka na suđenju optuženima za pomaganje ubistvu američkih državljana albak'

# Implementing a complete transcribing process

In [23]:
overlap_seconds = 1
import numpy as np

# The recording is transcribed in consecutive, non-overlapping chunks of
# `chunk_seconds`. All bounds are expressed in samples.
chunk_seconds = 10 * 60
chunk_samples = chunk_seconds * sample_rate
overlap_samples = overlap_seconds * sample_rate
total_samples = speech.shape[0]

transcripts = list()
for start in range(0, total_samples, chunk_samples):
    stop = start + chunk_samples
    # When this chunk ends at, past, or within `overlap_seconds` of the end of
    # the recording, extend/clamp it to the exact end. This guarantees the final
    # part of the audio is always transcribed (including the very last sample)
    # and avoids emitting a separate tail segment shorter than the overlap
    # window.
    reaches_end = stop + overlap_samples >= total_samples
    if reaches_end:
        stop = total_samples
    speech_segment = speech[start:stop]
    inputs = processor(speech_segment, sampling_rate=sample_rate, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits
    transcription = processor.batch_decode(logits.numpy()).text[0]
    transcripts.append(transcription)
    if reaches_end:
        # Everything up to the end has been consumed; do not start another chunk.
        break

# Saving the results

In [None]:
# Write all chunk transcripts as one space-separated text. The encoding is set
# explicitly because the transcripts contain non-ASCII letters (e.g. š, đ, ć),
# and the platform default encoding is not guaranteed to be able to store them.
with open("001_transcripts.txt", "w", encoding="utf-8") as f:
    f.write(" ".join(transcripts))

# Trying another setup: the plain processor with greedy (argmax) decoding

In [22]:
from transformers import Wav2Vec2ProcessorWithLM, Wav2Vec2ForCTC, Wav2Vec2Processor
import soundfile as sf
import torch
import os


# Preferred inference device for this machine. NOTE: nothing in this cell moves
# the model or the inputs onto it, so `device` is currently informational only.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

# Same checkpoint as before, but loaded through the plain Wav2Vec2Processor
# with the pad token given explicitly.
MODEL_ID = "classla/wav2vec2-xls-r-parlaspeech-hr-lm"
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID, pad_token="[PAD]")
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)

# Load the waveform together with its sampling rate.
speech, sample_rate = sf.read("audio.wav")

# Keep only the first N seconds.
n_seconds = 20
speech_segment = speech[: n_seconds * sample_rate]

# Forward pass without gradient tracking.
inputs = processor(speech_segment, sampling_rate=sample_rate, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits

# Greedy decoding: take the highest-scoring entry along the last axis of the
# logits and let the processor turn those ids back into text.
prediction_ids = logits.argmax(dim=-1)
transcription = processor.batch_decode(prediction_ids)
transcription

['ukoliko ste gledali vesti televizije b992. koje su upravo završene svakako vam je pretpostavljen poznato da je u ovim trenucima u toku sjednica vlade srbije koja priprema']

In [14]:
# Re-run featurisation and the forward pass on the current `speech_segment`,
# keeping only the logits (no gradients are tracked).
inputs = processor(speech_segment, sampling_rate=sample_rate, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits

In [20]:
# Greedy decoding of the logits from the previous cell: argmax over the last
# axis, then map the resulting ids back to text with the processor.
prediction_ids = logits.argmax(dim=-1)
transcription = processor.batch_decode(prediction_ids)
transcription

['[PAD]u[PAD]ko[PAD]l[PAD]i[PAD]ko[PAD] [PAD]ste gl[PAD]e[PAD]d[PAD]a[PAD]l[PAD]i [PAD]v[PAD]e[PAD]s[PAD]t[PAD]i[PAD] [PAD]tel[PAD]e[PAD]vi[PAD]zi[PAD]je [PAD]b[PAD]9[PAD]9[PAD]2.[PAD] koje s[PAD]u [PAD]u[PAD]p[PAD]r[PAD]av[PAD]o[PAD] [PAD]z[PAD]a[PAD]v[PAD]r[PAD]š[PAD]e[PAD]n[PAD]e[PAD] [PAD]s[PAD]va[PAD]k[PAD]a[PAD]k[PAD]o va[PAD]m je[PAD] [PAD]pret[PAD]p[PAD]o[PAD]stavljen p[PAD]o[PAD]z[PAD]n[PAD]a[PAD]t[PAD]o[PAD] d[PAD]a je u ov[PAD]i[PAD]m tre[PAD]nu[PAD]c[PAD]i[PAD]m[PAD]a[PAD] u [PAD]t[PAD]o[PAD]k[PAD]u[PAD] [PAD]s[PAD]j[PAD]e[PAD]dn[PAD]i[PAD]c[PAD]a[PAD] [PAD]v[PAD]l[PAD]a[PAD]d[PAD]e[PAD] [PAD]s[PAD]r[PAD]b[PAD]i[PAD]je [PAD]k[PAD]oja[PAD] [PAD]pri[PAD]pre[PAD]m[PAD]a[PAD]']