In [None]:
#!pip install moviepy
#!pip install mutagen
#!pip install transformers
#!pip install datasets
#!pip install librosa

In [None]:
import moviepy.editor as mp
from mutagen.mp3 import MP3
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from datasets import load_dataset
import librosa
from transformers import AutoTokenizer, AutoFeatureExtractor, AutoModelForCTC
import timeit
from os import listdir
from os.path import isfile, join
import re

In [None]:
def extract_audio(video):
    """Extract the audio track of an MP4 file into ``recording_<stem>.mp3``.

    The video is opened at ``f"{path}{video}"``, where ``path`` is the
    module-level directory prefix. The two strings are simply concatenated,
    so ``path`` must already end with a path separator (or be empty). The
    resulting MP3 is written to the current working directory.

    Args:
        video: File name of the video; it must contain ``.mp4``.

    Raises:
        IndexError: If ``video`` does not contain ``.mp4``.
        ValueError: If the video has no audio track to extract.
    """
    # Non-greedy match: the stem is everything before the first ".mp4".
    v = re.findall(r"(.*?)\.mp4", video)[0]
    # Use the clip as a context manager so it is closed (and the resources it
    # holds are released) even if writing the audio file fails.
    with mp.VideoFileClip(f"{path}{video}") as my_clip:
        if my_clip.audio is None:
            raise ValueError(f"{video} has no audio track to extract")
        my_clip.audio.write_audiofile(f"recording_{v}.mp3")
    
def calculate_length(video, extension):
    """Return the duration, in whole seconds, of ``recording_<stem>.mp3``.

    ``<stem>`` is the part of ``video`` that precedes the first occurrence of
    ``.<extension>``. The MP3 is looked up in the current working directory.

    Args:
        video: Original file name, e.g. ``"talk.mp4"``.
        extension: Extension of ``video`` without the dot, e.g. ``"mp4"``.

    Returns:
        The audio length reported by mutagen, rounded to the nearest integer.

    Raises:
        IndexError: If ``video`` does not contain ``.<extension>``.
    """
    # Raw f-string avoids the invalid "\." escape of a plain f-string, and
    # re.escape makes the extension match literally rather than as a regex.
    v = re.findall(rf"(.*?)\.{re.escape(extension)}", video)[0]
    audio = MP3(f"recording_{v}.mp3")
    return round(audio.info.length)

def extract_text(video, length, processor, model, overlap, extension, audio_path="recording_"):
    """Transcribe an MP3 in overlapping 30-second chunks.

    The audio is loaded from ``f"{audio_path}{stem}.mp3"``, where ``stem`` is
    the part of ``video`` before the first ``.<extension>``. Each chunk's
    transcription is appended as one line to ``text_<stem>.txt`` in the
    current working directory.

    Args:
        video: Original file name, e.g. ``"talk.mp4"`` or ``"talk.mp3"``.
        length: Duration of the audio in whole seconds.
        processor: Object called on the waveform to produce
            ``input_features``; also used for ``batch_decode``.
        model: Object whose ``generate`` turns the features into token ids.
        overlap: Seconds shared by consecutive chunks; must satisfy
            ``0 <= overlap < 30``.
        extension: Extension of ``video`` without the dot.
        audio_path: Prefix concatenated (not path-joined) in front of
            ``<stem>.mp3``. The default matches the ``recording_<stem>.mp3``
            files that ``extract_audio`` writes.

    Returns:
        The list of chunk transcriptions, in order. Chunks that failed are
        reported on stdout and left out.

    Raises:
        ValueError: If ``overlap`` is outside ``[0, 30)``.
        IndexError: If ``video`` does not contain ``.<extension>``.
    """
    window = 30  # chunk length in seconds
    if not 0 <= overlap < window:
        # A negative overlap would silently skip audio between chunks; an
        # overlap >= window would give a zero or negative step.
        raise ValueError(f"overlap must be in [0, {window}), got {overlap}")
    # Raw f-string avoids the invalid "\." escape; re.escape keeps the
    # extension literal.
    v = re.findall(rf"(.*?)\.{re.escape(extension)}", video)[0]
    cc = []
    if length <= 0:
        # Nothing to transcribe; also avoids indexing an empty interval list.
        return cc
    intervals = list(range(0, length, window - overlap))
    # Every chunk is a full window except the last, which stops at `length`.
    duration = [window] * (len(intervals) - 1)
    duration.append(length - intervals[-1])
    for intv, dur in zip(intervals, duration):
        try:
            # Load `dur` seconds of the file, starting `intv` seconds in.
            y, sr = librosa.load(f"{audio_path}{v}.mp3", sr=16000, offset=intv, duration=dur)
            inputs = processor(y, return_tensors="pt", padding="longest", sampling_rate=16000)
            generated_ids = model.generate(inputs=inputs.input_features)
            transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        except (RuntimeError, NameError) as e:
            # Best effort: report the failed chunk and carry on with the rest.
            print(f"Skipping chunk at {intv}s of {video} ({length - intv}s before the end): {e!r}")
            continue
        cc.append(transcription)
        # Explicit UTF-8 so non-ASCII transcriptions do not depend on the
        # platform's default encoding.
        with open(f"text_{v}.txt", "a+", encoding="utf-8") as f:
            f.write(f"{transcription}\n")
    return cc

In [None]:
def run(video_files, audio_path, done=None):
    """Transcribe every MP4/MP3 file in ``video_files`` with Whisper.

    For an MP4, ``extract_audio`` first writes ``recording_<stem>.mp3`` to the
    working directory (it reads the video from the module-level ``path``),
    and that recording is transcribed. An MP3 is transcribed directly from
    ``f"{audio_path}{video}"``. The elapsed processing time is printed at
    the end; model loading is not included in it.

    Args:
        video_files: Iterable of file names (not full paths).
        audio_path: Directory prefix of the MP3 inputs. It is concatenated
            with the file name, so it must end with a path separator (or be
            empty).
        done: Optional iterable of file names that were already processed
            and should be skipped, for resuming an earlier run. It is not
            modified.
    """
    processed = set(done or ())
    processor = WhisperProcessor.from_pretrained("openai/whisper-large")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")
    start = timeit.default_timer()
    for video in video_files:
        if video in processed:
            # Already handled: skip quietly instead of reporting it as an
            # unsupported extension.
            continue
        if video.endswith("mp4"):
            extract_audio(video)
            length = calculate_length(video, "mp4")
            # extract_text loads f"{audio_path}{stem}.mp3"; passing the
            # "recording_" prefix points it at the file extract_audio wrote.
            extract_text(video, length, processor, model, 3, "mp4", "recording_")
        elif video.endswith("mp3"):
            # Use the audio_path argument (not the global `path`) so the
            # length is read from the same file extract_text will load.
            audio = MP3(f"{audio_path}{video}")
            length = round(audio.info.length)
            extract_text(video, length, processor, model, 3, "mp3", audio_path)
        else:
            print(video, "Extension not supported! Please provide a valid file, such as mp4 or mp3")
            continue
        processed.add(video)

    stop = timeit.default_timer()
    print('Time: ', stop - start)

In [None]:
# Directory that holds the MP4/MP3 files to transcribe.
path = "" #add path to your MP4 or MP3 files
# File names are appended to `path` by plain string concatenation
# (f"{path}{video}"), so make sure a non-empty path ends with a separator.
path = join(path, "")
# An empty path means the current working directory (listdir("") would raise).
# listdir returns names in arbitrary order, so sort for a reproducible run.
video_files = sorted(f for f in listdir(path or ".") if isfile(join(path, f)))
run(video_files, path)

Let's visualize the waveform of a recording!

In [None]:
import matplotlib.pyplot as plt

def visualize_wave(recording, duration=30, output_path='wave.png'):
    '''
    Plot the waveform of the start of an audio file, given in MP3 format.

    The audio is loaded at 16 kHz, drawn as a waveform titled 'Wave', saved
    as a transparent PNG and then shown.

    Args:
        recording: Path of the audio file to load.
        duration: Number of seconds to load from the start of the file.
        output_path: Where the PNG is written.
    '''
    # `import librosa` alone does not expose the display submodule on older
    # librosa releases, so import it explicitly.
    import librosa.display

    y, sr = librosa.load(recording, sr=16000, duration=duration)
    # Explicit figure/axes instead of the implicit pyplot state machine.
    fig, ax = plt.subplots(figsize=(7,3))
    librosa.display.waveshow(y, sr=sr, ax=ax)
    ax.set_title('Wave')
    fig.savefig(output_path, format='png', transparent=True)
    plt.show()