# Splitting Audio Files

In [1]:
# Manipulating audio
from pydub import AudioSegment
from pydub.silence import split_on_silence
from pydub.utils import mediainfo

from pydub.playback import play

# Modifying strings
import re

# Machine Learning Model
import torch
import whisper

# Progress bar
from tqdm.notebook import tqdm

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
#DEVICE = "cpu"

In [8]:
# Choose File
filename_dir = "french_audio/split_testing/"
filename = "01.2 Nationality, Language, Country.mp3"

filename_prefix = filename[:4]
filename_title = filename[4:-4]

track = AudioSegment.from_mp3(filename_dir + filename)
original_bitrate = mediainfo(filename_dir + filename)["bit_rate"]

In [9]:
# Parameters empirically tuned
chunks = split_on_silence(track, min_silence_len=600, silence_thresh=-30, keep_silence=200)

def chunk_filename(filename_prefix, i):
    return filename_prefix + "_" + str(i) + ".mp3"

# Save split up audio
for i, chunk in enumerate(chunks):
    chunk.export(chunk_filename(filename_prefix, i).format(i), format="mp3")

### Load Model and Apply to Chunks

In [4]:
model = whisper.load_model("medium", device=DEVICE)

In [10]:
phrases_fr = []
phrases_en = []

decode_options = {"fp16": True}

num_chunks = len(chunks)
for i in tqdm(range(num_chunks)):
    # Ignoring the English title
    if i > 1:
        result_fr = model.transcribe(chunk_filename(filename_prefix, i), language="fr", task="transcribe", **decode_options)
        result_en = model.transcribe(chunk_filename(filename_prefix, i), language="fr", task="translate", **decode_options)

        phrases_fr.append(result_fr["text"])
        phrases_en.append(result_en["text"])


  0%|          | 0/135 [00:00<?, ?it/s]

In [13]:
pairs = []

for i, (phrase_fr, phrase_en) in enumerate(zip(phrases_fr, phrases_en)):
    # Remove trailing whitespace
    phrase_fr_strip = phrase_fr.lstrip()
    phrase_en_strip = phrase_en.lstrip()

    # Un-capitalize phrases
    # TODO Code in logic that takes into account final full stop
    # TODO Deal with ALL CAPS case and make it all lower case
    phrase_fr_clean = phrase_fr_strip[0].lower() + phrase_fr_strip[1:]
    phrase_en_clean = phrase_en_strip[0].lower() + phrase_en_strip[1:]

    pairs.append((phrase_fr_clean, phrase_en_clean))

print(pairs)

[('un passeport', 'a passport'), ("les papiers d'identité.", 'identity papers'), ("une carte d'identité", 'an identity card.'), ('un continent.', 'a continent.'), ('a bie!', 'a B'), ('une nation', 'a nation'), ('la nationalité', 'nationality'), ('fatima va demander la nationalité française.', 'fatima will ask French nationality.'), ("être d'origine", 'being of origin'), ("c'est un français d'origine italienne.", "it's a Frenchman of Italian origin."), ('éTRANGER', 'stranger'), ('éTRANGER', 'foreign'), ('un étranger', 'a stranger'), ('une étrangère', 'a foreigner'), ('immigrés.', 'immigrate'), ('un immigré', 'an immigrant'), ('une immigrée', 'an immigrant'), ("les immigrés ont parfois du mal à s'intégrer.", 'immigrants sometimes have trouble integrating themselves.'), ("l'immigration", 'immigration'), ('émigrez.', 'emigrate'), ('une langue', 'a language'), ('la langue maternelle', 'the mother tongue'), ('une langue étrangère', 'a foreign language'), ('bILANGUE', 'be taking a breath'), (