In [23]:
from moviepy.editor import VideoFileClip
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import torchaudio
import torchaudio.transforms as T
import torch
import numpy as np
from evaluate import load

In [2]:
# Reference text for the audio excerpt (presumably the official transcript of
# the trimmed clip -- confirm against the source recording). The literal is
# split across lines for readability; adjacent literals concatenate to one string.
original_transcript = (
    "Tak for det, formand. "
    "Og ja, det er rigtigt, at vi nu behandler beslutningsforslag B 4 om opfølgning "
    "på visse af EU-reformgruppens anbefalinger og ændring af Færøudvalgets og "
    "Grønlandsudvalgets retsgrundlag. "
    "Hvis man starter med EU-reformgruppen, kan man nævne, at vi i Europaudvalget og "
    "i Folketinget i juni 2020 besluttede at nedsætte en ekspertgruppe, der skulle "
    "kigge på, hvordan vi arbejdede med EU-sagerne."
)

# Trimming video

In [3]:
def trim_video(video_file_path, start_time, end_time, trimmed_video_path):
    """Cut a time window out of a video file and write it as a new file.

    Args:
        video_file_path: Path of the source video.
        start_time: Start of the window, passed to ``subclip``.
        end_time: End of the window, passed to ``subclip``.
        trimmed_video_path: Path the trimmed video is written to. It is
            encoded with the ``libx264`` video codec and ``aac`` audio codec.
    """
    video = VideoFileClip(video_file_path)
    try:
        video.subclip(start_time, end_time).write_videofile(
            trimmed_video_path, codec="libx264", audio_codec="aac"
        )
    finally:
        # Release the clip's underlying readers even when encoding fails;
        # otherwise they stay open for the lifetime of the kernel.
        video.close()

In [4]:
# Source recording, destination for the trimmed clip, and destination for the
# audio track that is extracted from that clip later on.
video_file_path = '../data/video/VOD_08-12-2022_M__de_i_salen.mp4'
trimmed_video_path = '../data/video/trimmed/snip.mp4'
audio_file_path = '../data/audio/snip.wav'

# Trim window in seconds: start at 1 min 47 s and keep the following 29 s.
start_time = 1 * 60 + 47
end_time = start_time + 29

# Cut the window out of the recording and write it to trimmed_video_path.
trim_video(
    video_file_path=video_file_path,
    start_time=start_time,
    end_time=end_time,
    trimmed_video_path=trimmed_video_path,
)

Moviepy - Building video ../data/video/trimmed/snip.mp4.
MoviePy - Writing audio in snipTEMP_MPY_wvf_snd.mp4


                                                                    

MoviePy - Done.
Moviepy - Writing video ../data/video/trimmed/snip.mp4



                                                               

Moviepy - Done !
Moviepy - video ready ../data/video/trimmed/snip.mp4


# Extract audio

In [5]:
def extract_audio_from_video(video_file_path, audio_file_path):
    """Write the audio track of a video file to an audio file.

    Args:
        video_file_path: Path of the video whose audio is extracted.
        audio_file_path: Path the audio is written to, using the
            ``pcm_s16le`` codec.

    Raises:
        ValueError: If the video clip has no audio track.
    """
    video = VideoFileClip(video_file_path)
    try:
        audio = video.audio
        if audio is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError(f"No audio track found in {video_file_path!r}")
        audio.write_audiofile(audio_file_path, codec='pcm_s16le')
    finally:
        # Release the clip's underlying readers even when writing fails.
        video.close()

# Pull the audio track out of the trimmed clip and write it to audio_file_path.
extract_audio_from_video(
    video_file_path=trimmed_video_path,
    audio_file_path=audio_file_path,
)

MoviePy - Writing audio in ../data/audio/snip.wav


                                                        

MoviePy - Done.




# Manipulate audio

In [7]:
# Read the extracted clip back from disk.
waveform, sample_rate = torchaudio.load(audio_file_path)

# Average over dim 0 (the channel axis) to get a single-channel signal;
# keepdim=True keeps that leading dimension in place.
waveform_mono = waveform.mean(dim=0, keepdim=True)

# Resample from the file's own rate to 16 kHz.
resampler = T.Resample(orig_freq=sample_rate, new_freq=16000)
waveform_resampled = resampler(waveform_mono)

# Write the 16 kHz mono result to disk.
torchaudio.save('../data/audio/resampled/snip.wav', waveform_resampled, 16000)

# Remove the intermediates from the namespace; the next step reloads from disk.
del resampler, waveform_resampled, waveform_mono
del waveform, sample_rate

# Transcribe

In [8]:
# Reload the resampled file written by the previous step.
waveform, sample_rate = torchaudio.load('../data/audio/resampled/snip.wav')
# Convert to NumPy and drop all singleton dimensions.
waveform_np = waveform.numpy().squeeze()

Shape of waveform:  torch.Size([1, 464000])
Sample rate:  16000


In [21]:
# Fetch the pretrained whisper-tiny checkpoint and its matching processor.
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
# Clear any forced decoder ids stored in the model config.
model.config.forced_decoder_ids = None

# Turn the raw samples into the model's input features (PyTorch tensors).
input_features = processor(
    waveform_np,
    sampling_rate=sample_rate,
    return_tensors="pt",
)["input_features"]

# Run generation to obtain the predicted token ids.
predicted_ids = model.generate(input_features)

# Decode the same ids twice: first keeping the special tokens, then with
# them stripped.
transcription1, transcription2 = (
    processor.batch_decode(predicted_ids, skip_special_tokens=False),
    processor.batch_decode(predicted_ids, skip_special_tokens=True),
)

In [22]:
# Show both decodings: with special tokens (1) and without them (2).
print(f"Transcription1: {transcription1}")
print(f"Transcription2: {transcription2}")

Transcription1: ['<|startoftranscript|><|da|><|transcribe|><|notimestamps|> Tak for det, for åmen. Jeg er det rigtig, det er beslutningst forslade b4, om opfølgelig en provisere i ureformgruppens andefalinger endringer, færdig udvalderer i konnen, så der er det skratskundlade. Og hvis man starter først med EU-reformgruppen, så er det i UFTV i juni 2000er tøb beslutet i Europa-delet og folgetændende og næsset, en ekspertgruppet, skulle klike på, at der er en vi arbejde med EU-sererne, her, hvor det er faktisk i disse måneder,<|endoftext|>']
Transcription2: [' Tak for det, for åmen. Jeg er det rigtig, det er beslutningst forslade b4, om opfølgelig en provisere i ureformgruppens andefalinger endringer, færdig udvalderer i konnen, så der er det skratskundlade. Og hvis man starter først med EU-reformgruppen, så er det i UFTV i juni 2000er tøb beslutet i Europa-delet og folgetændende og næsset, en ekspertgruppet, skulle klike på, at der er en vi arbejde med EU-sererne, her, hvor det er fakti

# Check quality

In [24]:
from datasets import load_dataset
from transformers import WhisperForConditionalGeneration, WhisperProcessor
import torch
from evaluate import load

# Load the "test" split of the "clean" configuration of librispeech_asr.
librispeech_test_clean = load_dataset(
    path="librispeech_asr",
    name="clean",
    split="test",
)


Downloading builder script: 100%|██████████| 11.5k/11.5k [00:00<00:00, 14.0MB/s]
Downloading metadata: 100%|██████████| 10.1k/10.1k [00:00<00:00, 40.8MB/s]


In [None]:

# Use the GPU when one is available and fall back to the CPU otherwise, so
# this evaluation also runs on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny").to(device)

def map_to_pred(batch):
    """Transcribe one dataset example and attach normalized texts to it.

    Reads ``batch["audio"]`` (its ``array`` and ``sampling_rate``) and
    ``batch["text"]``. Writes ``batch["reference"]`` (normalized ground
    truth) and ``batch["prediction"]`` (normalized model output), both
    produced with the tokenizer's ``_normalize``, and returns the batch.
    """
    audio = batch["audio"]
    input_features = processor(audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt").input_features
    batch["reference"] = processor.tokenizer._normalize(batch['text'])

    with torch.no_grad():
        # Inputs must live on the same device as the model.
        predicted_ids = model.generate(input_features.to(device))[0]
    # Strip special tokens at decode time rather than relying on the
    # normalizer to remove them from the text.
    transcription = processor.decode(predicted_ids, skip_special_tokens=True)
    batch["prediction"] = processor.tokenizer._normalize(transcription)
    return batch

# Run the model over every example of the split.
result = librispeech_test_clean.map(map_to_pred)

# Word error rate over the whole split, reported as a percentage.
wer = load("wer")
print(100 * wer.compute(references=result["reference"], predictions=result["prediction"]))
