# Splitting Audio Files

In [1]:
# Finding audio files
import glob

# Manipulating audio
from pydub import AudioSegment
from pydub.silence import split_on_silence
from pydub.utils import mediainfo

# Testing audio
from pydub.playback import play

# Generating Anki deck
import genanki
import random

# Resetting random seed
from datetime import datetime

# Modifying strings
import re

# Machine Learning Model
import torch
import whisper

# Saving output
import pandas as pd
import os.path

# Checking confidence
import numpy as np
import matplotlib.pyplot as plt

# Progress bar
from tqdm.notebook import tqdm

# Default to the CPU and switch to the GPU only when torch reports CUDA support
DEVICE = "cpu"
if torch.cuda.is_available():
    DEVICE = "cuda"
#DEVICE = "cpu"

# Half precision is requested on anything other than the CPU,
# so this flag follows DEVICE rather than being set by hand
fp16_true = DEVICE != "cpu"
decode_options = dict(fp16=fp16_true)

In [2]:
# Load the Whisper "medium" checkpoint onto DEVICE; generate_phrases() reads
# this module-level `model` rather than taking it as an argument.
# Be careful to only run this cell once because it eats up VRAM
model = whisper.load_model("medium", device=DEVICE)

In [3]:
# Formatting for each phrase
def sanitise(phrase):
    """Sanitise a transcribed phrase so it reads like a vocabulary entry.

    The following steps are applied:
    - Remove surrounding whitespace and full stops, including any whitespace
      left between the text and a trailing full stop.
    - Lower-case the whole phrase when it is entirely upper case.
    - Otherwise lower-case only the first character, provided the phrase is
      longer than one character, does not end with "?", and either starts
      with a listed article or is not in title case.
    - Leave anything else (e.g. a title-case name, a question) unchanged.

    Args:
        phrase (str): Phrase to sanitise.

    Returns:
        str: Sanitised phrase.
    """

    # Often preceding proper nouns ("L" covers elided forms such as "L'...")
    exceptions = ("The", "La", "Le", "L", "Les")

    # Strip whitespace, then full stops, then whatever whitespace that exposed
    stripped = phrase.strip().strip(".").strip()

    # All uppers are usually errors
    if stripped.isupper():
        return stripped.lower()

    # First word, treating an apostrophe as a word boundary
    first_word = re.split(r" |'", stripped)[0]

    # Title case is sometimes a noun, so it is only un-capitalised when it
    # starts with an article; the length check also covers the empty string
    may_lower_first = first_word in exceptions or not stripped.istitle()
    if may_lower_first and len(stripped) > 1 and not stripped.endswith("?"):
        return stripped[0].lower() + stripped[1:]

    return stripped

# This is the function that wraps all the difficult parts
def generate_phrases(chunk_filename_list, decode_options):
    """Run every audio chunk through the loaded model twice.

    Each file is first transcribed with language "fr" and then translated
    with the same language setting; a progress bar tracks the files.

    Args:
        chunk_filename_list (list): Filenames of the audio chunks.
        decode_options (dict): Extra keyword options passed to the model.

    Returns:
        phrases_fr (list): Text of each transcription, in input order.
        phrases_en (list): Text of each translation, in input order.
    """

    french_texts, english_texts = [], []

    for audio_path in tqdm(chunk_filename_list):
        # Same call for both passes; only the task differs
        # (transcription runs first, translation second)
        results = {
            task: model.transcribe(audio_path, language="fr", task=task, **decode_options)
            for task in ("transcribe", "translate")
        }

        french_texts.append(results["transcribe"]["text"])
        english_texts.append(results["translate"]["text"])

    return french_texts, english_texts

# Return numbers "xx.y" and section name
def split_filename(filename):
    """Cut a track filename into its leading number and its section title.

    The split is purely positional: the first four characters form the
    prefix, the fifth character is skipped, and the last four characters are
    dropped (presumably the ".mp3" extension -- confirm against the files).

    Args:
        filename (str): Filename without any directory part.

    Returns:
        tuple: (prefix, title) as two strings.
    """
    # TODO Fix to be more generic
    prefix_span = slice(None, 4)
    title_span = slice(5, -4)

    return (filename[prefix_span], filename[title_span])

# Generating new filenames for each chunk
def generate_chunk_filename(filename_prefix, i):
    """Build the mp3 filename for chunk number i of a track.

    Args:
        filename_prefix (str): Prefix shared by all chunks of the track.
        i (int): Index of the chunk.

    Returns:
        str: "<filename_prefix>_<i>.mp3".
    """
    stem = "_".join((filename_prefix, str(i)))
    return stem + ".mp3"

In [4]:
# Package Name (used as the top level of each deck name)
pkg_name = "Mastering French Vocabulary"

# Create chapter names if available
# Keys are the chapter numbers parsed from the start of each filename below
#pkg_chapters = {}
pkg_chapters = {
    1: "Personal Information",
    2: "The Human Body",
    3: "Health and Medicine",
    4: "Psyche, Mind, Behaviour",
    5: "Food, Clothing, Shopping",
    6: "Living",
    7: "Private Life, Social Relationships",
    8: "Education and Training",
    9: "Professional and Work World",
    10: "Leisure Activities",
    11: "Travel and Tourism",
    12: "Art, Music, Literature",
    13: "History, Religion, Philosophy",
    14: "State, Law, Politics",
    15: "Economy and Business",
    16: "Communication and Mass Media",
    17: "Transportation, Vehicles",
    18: "Nature, Environment, Ecology",
    19: "Time and Space",
    20: "Colours and Shapes",
    21: "Quantities, Measurements, Numbers",
    22: "General Terms",
    23: "Verbal Communication",
    24: "Language Structures",
}

# Find all mp3 files
# NOTE: "dir" shadows the builtin of the same name; the name is kept so that
# anything else referring to it keeps working
dir = "french_audio/"
save_dir = "split_audio/" # Place to save split tracks TODO Implement this later

# Choosing files from directory (indices into the sorted list of mp3 files)
file_numbers = [0, 1]

filename_dir_list = sorted(glob.glob(dir + "*.mp3"))

# Chop off directory. basename() is used instead of splitting on "/" because
# glob returns paths with the separator of the operating system it runs on
filename_only_list = [os.path.basename(f_dir) for f_dir in filename_dir_list]

# Extract chapters from filenames: the integer before the first "."
chapter_list = [int(filename.split(".", 1)[0]) for filename in filename_only_list]

# Only selecting specific files
filenames = [filename_only_list[file_number] for file_number in file_numbers]
chapter_names = [pkg_chapters[chapter_list[file_number]] for file_number in file_numbers]

# Creating pairs of prefix and titles
split_filenames = [split_filename(filename) for filename in filenames]

# Decode each selected mp3 into memory
tracks = [AudioSegment.from_mp3(dir + filename) for filename in filenames]
#original_bitrates = [mediainfo(dir + filename)["bit_rate"] for filename in filenames] # Redundant because later chunks inherit same bitrate

In [5]:
# One list of chunk filenames per original track, in track order
chunk_filename_lists = []

# Split each original track on silence, name the pieces, then write them out
for (prefix, _title), audio in zip(split_filenames, tracks):
    # Parameters empirically tuned
    pieces = split_on_silence(audio, min_silence_len=600, silence_thresh=-40, keep_silence=300)
    # Discard first two chunks because they're always English
    pieces = pieces[2:]

    # Number the remaining chunks from zero
    piece_names = [generate_chunk_filename(prefix, index) for index in range(len(pieces))]
    chunk_filename_lists.append(piece_names)

    # Save split up audio, leaving any file that already exists untouched
    for piece, piece_name in zip(pieces, piece_names):
        if os.path.isfile(piece_name):
            continue
        piece.export(piece_name, format="mp3")

### Apply Model to Chunks

In [6]:
# One entry per track: a list of ((french, english), chunk_filename) items
pairs_chunk_lists = []

# Costly operation: every chunk goes through the model twice
for chunk_filename_list in chunk_filename_lists:
    phrases_fr, phrases_en = generate_phrases(chunk_filename_list, decode_options)

    # Sanitise all outputs, keeping each French/English pair together
    pairs = [(sanitise(phrase_fr), sanitise(phrase_en)) for phrase_fr, phrase_en in zip(phrases_fr, phrases_en)]

    # Store a real list rather than the zip object itself: zip() returns a
    # one-shot iterator, so the deck-building cell would find it empty the
    # second time it is run and this costly cell would have to be repeated
    pairs_chunk_lists.append(list(zip(pairs, chunk_filename_list)))

  0%|          | 0/109 [00:00<?, ?it/s]

  0%|          | 0/133 [00:00<?, ?it/s]

## Creating Anki Deck

In [7]:
model_name = 'English/French with Audio'

# Now have "phrases_fr", "phrases_en" and "chunk_filename_list", which is enough to create a deck
# Seeding with the model name means the ID drawn below is the same on every run
random.seed(model_name)
audio_model_id = random.randrange(1 << 30, 1 << 31)

# Defining model
# Model with audio: three fields and a single card template
model_audio = genanki.Model(
  model_id=audio_model_id,
  name=model_name,
  fields=[{'name': field_name} for field_name in ('English', 'French', 'Audio')],
  templates=[
    dict(
      name='Card',
      qfmt='{{English}}<br><br>{{type:French}}',
      afmt='{{FrontSide}}<hr id="answer">{{French}}<br>{{Audio}}',
    ),
  ],
  css=""".card {
      font-family: "Times New Roman", Times, serif;
      font-size: 56pt;
      text-align: center;
      color: black;
      }
      #typeans{
      font-family: "Times New Roman", Times, serif !important;
      font-size: 40pt !important;
      text-align: center;
      color: black;
      }
      input[type=text] {
      width: 100%;
      text-align: center;
      padding: 12px 0px;
      margin: 8px 0;
      box-sizing: border-box;
      background-color: #eee;
      }""",
)

In [11]:
# Allowing for multiple decks
deck_list = []

# Re-seed from the current time so the deck IDs drawn below vary between runs
random.seed(datetime.now().timestamp())

# Create Decks: one per selected track
for index, ((_, filename_title), chapter_name) in enumerate(zip(split_filenames, chapter_names)):
    deck_id = random.randrange(1 << 30, 1 << 31)

    # Adding "::" creates hierarchy within Anki
    deck_name = "::".join((pkg_name, chapter_name, filename_title))
    my_deck = genanki.Deck(deck_id, deck_name)

    # Loop through words: one note per transcribed chunk
    for (phrase_fr, phrase_en), chunk_filename in pairs_chunk_lists[index]:
        # Field order follows the model: English, French, Audio
        sound_tag = f"[sound:{chunk_filename}]"
        my_deck.add_note(genanki.Note(model=model_audio, fields=[phrase_en, phrase_fr, sound_tag]))

    # Add to deck list
    deck_list.append(my_deck)

In [12]:
# Flattening list of chunk_filename_lists into one list of media files
chunk_filename_flat = []
for sublist in chunk_filename_lists:
    chunk_filename_flat.extend(sublist)

# Create package holding every deck plus the audio the notes refer to
my_package = genanki.Package(deck_list)
my_package.media_files = chunk_filename_flat

# The output file is named after the track prefixes joined with "+"
prefixes = [prefix for prefix, _title in split_filenames]
package_name = "+".join(prefixes)
my_package.write_to_file(package_name + ".apkg")