In [1]:
import os
import torch
import subprocess
import pandas as pd
import soundfile as sf
from pydub import AudioSegment
from pydub.silence import split_on_silence
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

In [2]:
def parse_time(t):
    """Turn a ``"<start>_<end>"`` string into ``(start, duration)``.

    Both parts are parsed as integers; the duration is ``end - start``.
    Raises ``ValueError`` when the string does not hold exactly two
    integer fields separated by an underscore.
    """
    bounds = [int(field) for field in t.split("_")]
    begin, finish = bounds
    return begin, finish - begin
    
def format_audio(audio_path, audio_file, output_path, format = 'flac', sample_rate = 16000):
    """Convert one audio file to a mono file at ``sample_rate`` Hz with ffmpeg.

    Parameters
    ----------
    audio_path : str
        Directory that contains the input file.
    audio_file : str
        Name of the input file, including its extension.
    output_path : str
        Directory for the converted file; created when it does not exist.
    format : str, default 'flac'
        Extension given to the output file.
    sample_rate : int, default 16000
        Value passed to ffmpeg's ``-ar`` option.

    Returns
    -------
    str
        Path of the converted file. The path is returned even when the
        conversion fails; the failure is reported with a printed message so
        a batch run keeps going.
    """
    # splitext only strips the final extension, so "a.b.mp3" becomes "a.b"
    # instead of collapsing to "a" and colliding with other files.
    audio_name = os.path.splitext(audio_file)[0]
    audio = os.path.join(audio_path, audio_file)
    export_path = os.path.join(output_path, f"{audio_name}.{format}")
    # Argument list (no shell): works on every platform and needs no manual
    # quoting of paths that contain spaces or quotes.
    cmd = [
        "ffmpeg",
        "-i", audio,
        "-vn",                    # drop any video stream
        "-ac", "1",               # single audio channel
        "-ar", str(sample_rate),
        "-y",                     # overwrite an existing output file
        export_path,
    ]
    try:
        os.makedirs(output_path, exist_ok=True)
        return_code = subprocess.call(cmd)
        if return_code != 0:
            print("Error: ", f"ffmpeg exited with code {return_code} for {audio}")
    except OSError as ex:
        # Raised e.g. when the ffmpeg executable cannot be found.
        print("Error: ", ex)
    return export_path

def speech2text(path):
    """Transcribe one audio file with the notebook-level Wav2Vec2 objects.

    Relies on the globals ``processor``, ``model`` and ``DEVICE`` defined in
    other cells of this notebook.

    Parameters
    ----------
    path : str
        Audio file to read with ``soundfile``.

    Returns
    -------
    The decoded transcription of the first (only) item in the batch.
    """
    # Load the samples together with the file's own sampling rate.
    audio_input, sample_rate = sf.read(path)
    # Let the processor prepare the input and hand back a PyTorch tensor.
    input_values = processor(audio_input, sampling_rate=sample_rate, return_tensors="pt").input_values.to(DEVICE)
    # Inference only: disabling gradient tracking avoids building an autograd
    # graph for every clip, which saves memory and time.
    with torch.no_grad():
        logits = model(input_values).logits
    # Greedy decoding: most likely token id at every position.
    predicted_ids = torch.argmax(logits, dim=-1)[0]

    transcription = processor.decode(predicted_ids)

    return transcription

def split_audio(audiopath, audio_name, output_path, export_format = 'mp3',
                min_silence_len = 2000, silence_thresh = -70):
    """Split an audio file on silences and save every chunk as its own file.

    Parameters
    ----------
    audiopath : str
        Directory that contains the input file.
    audio_name : str
        Name of the input file; its extension tells pydub the input format.
    output_path : str
        Directory for the chunks; created (with parents) when missing.
    export_format : str, default 'mp3'
        Format the chunks are encoded in. It is also used as the file
        extension so that name and content agree.
    min_silence_len : int, default 2000
        Minimum silence length in milliseconds that triggers a split.
    silence_thresh : int, default -70
        Level in dBFS below which audio counts as silence.

    Chunks are written as ``0000.<export_format>``, ``0001.<export_format>``, ...
    """
    # Take the format from the final extension only; with no extension pass
    # None so the format is detected rather than guessed from the file name.
    audiotype = os.path.splitext(audio_name)[1].lstrip(".") or None
    # Read in audio
    print('Loading audio..')
    sound = AudioSegment.from_file(os.path.join(audiopath, audio_name), format=audiotype)
    print("Loading done")
    # Split wherever the signal stays below silence_thresh for at least
    # min_silence_len milliseconds.
    print('Start split')
    chunks = split_on_silence(sound, min_silence_len=min_silence_len, silence_thresh=silence_thresh)
    print("Split done")
    # Create the saving folder, including missing parents.
    os.makedirs(output_path, exist_ok=True)
    # Save the chunks
    print('Saving...')
    for i, chunk in enumerate(chunks):
        save_name = os.path.join(output_path, '%04d.%s' % (i, export_format))
        chunk.export(save_name, format=export_format)
        # Progress line: chunk index and len(chunk) (pydub lengths are in ms).
        print('%04d' % i, len(chunk))
    print('Save done')

In [3]:
# Device used for the model and its inputs.
DEVICE = 'cpu'

# Every folder is resolved against the directory the notebook runs in.
BASE_PATH = os.path.abspath(os.curdir)
AUDIO_FOLDER, SPLIT_AUDIO_FOLDER, CLEAN_SPEECH_PATH = (
    os.path.join(BASE_PATH, "Audio", stage)
    for stage in ("Original", "Splitted", "Cleaned")
)
# Spelling of this name is kept as-is: the saving cell refers to it.
TRANSCIPTION_PATH = os.path.join(BASE_PATH, "Transcription")

# File name of the recording inside AUDIO_FOLDER.
AUDIO = "audio.wav"

In [4]:
# Load the processor and the CTC model from the same pretrained checkpoint,
# then move the model to the configured device.
_CHECKPOINT = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(_CHECKPOINT)
model = Wav2Vec2ForCTC.from_pretrained(_CHECKPOINT)
model = model.to(DEVICE)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Cut the original recording into chunks stored in the split folder.
split_audio(
    audiopath=AUDIO_FOLDER,
    audio_name=AUDIO,
    output_path=SPLIT_AUDIO_FOLDER,
)

In [6]:
# Only the files directly inside the split folder are chunks; a missing
# folder simply yields no files. Sorting keeps the zero-padded chunk names,
# and therefore the transcript rows, in chunk order (os.walk gives no
# ordering guarantee).
_, _, _chunk_files = next(os.walk(SPLIT_AUDIO_FOLDER), (SPLIT_AUDIO_FOLDER, [], []))
AUDIO_NAMES = sorted(_chunk_files)

# ffmpeg does not create the output folder on its own.
os.makedirs(CLEAN_SPEECH_PATH, exist_ok=True)
CLEAN_SPEECH = [format_audio(SPLIT_AUDIO_FOLDER, audio_name, CLEAN_SPEECH_PATH, "flac") for audio_name in AUDIO_NAMES]

# format_audio already returns the full path of each converted file, so no
# further joining with CLEAN_SPEECH_PATH is needed.
AUDIO_PATH = list(CLEAN_SPEECH)

In [7]:
# One row per chunk: original chunk name, path of the converted file, and the
# text returned by speech2text for that file.
subtitles = pd.DataFrame({"Name": AUDIO_NAMES, "Path": AUDIO_PATH}, columns=["Name", "Path", "Text"])
subtitles["Text"] = subtitles["Path"].apply(speech2text)

In [8]:
# Preview the first rows of the transcription table.
subtitles.head(n=5)

Unnamed: 0,Name,Path,Text
0,0000.m4a,e:\Graduate\2021-2022 Term 2\STA561\Project\te...,WHAT IS TAMERONS MOSS NATIONALITY


In [9]:
# Write the transcript (chunk name + text) to CSV, without the local paths.
filename = "transcription.csv"
save = subtitles.drop(columns=["Path"])
# Replace empty transcriptions with a single space.
save.loc[save["Text"] == "", "Text"] = " "
# to_csv does not create missing folders, so make sure the target exists.
os.makedirs(TRANSCIPTION_PATH, exist_ok=True)
save.to_csv(os.path.join(TRANSCIPTION_PATH, filename), index=False)