# Splitting Audio Files

In [1]:
# Finding audio files
import glob

# Manipulating audio
from pydub import AudioSegment
from pydub.silence import split_on_silence
from pydub.utils import mediainfo

from pydub.playback import play

# Generating Anki deck
import genanki
import random

# Modifying strings
import re

# Machine Learning Model
import torch
import whisper

# Saving output
import pandas as pd
import os.path

# Checking confidence
import numpy as np
import matplotlib.pyplot as plt

# Progress bar
from tqdm.notebook import tqdm

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
#DEVICE = "cpu"

# Has to be changed if on CPU vs GPU
decode_options = {"fp16": True}

In [5]:
# Be careful to only run this cell once because it eats up VRAM
model = whisper.load_model("medium", device=DEVICE)

In [103]:
# Package Name
pkg_name = "Mastering French Vocabulary"

# Create chapter names if available
#pkg_chapters = {}
pkg_chapters = { 1 : "Personal Information", 2 : "The Human Body", 3 : "Health and Medicine", 4 : "Psyche, Mind, Behaviour",
                5 : "Food, Clothing, Shopping", 6 : "Living", 7 : "Private Life, Social Relationships", 8 : "Education and Training",
                9 : "Professional and Work World", 10 : "Leisure Activities", 11 : "Travel and Tourism", 12 : "Art, Music, Literature",
                13 : "History, Religion, Philosophy", 14 : "State, Law, Politics", 15 : "Economy and Business", 16 : "Communication and Mass Media",
                17 : "Transportation, Vehicles", 18 : "Nature, Environment, Ecology", 19 : "Time and Space", 20 : "Colours and Shapes",
                21 : "Quantities, Measurements, Numbers", 22 : "General Terms", 23 : "Verbal Communication", 24 : "Language Structures" }

# Find all mp3 files
dir = "french_audio/"
filename_dir_list = sorted(glob.glob(dir + "*.mp3"))
filename_only_list = [ f_dir.split("/")[-1] for f_dir in filename_dir_list ]
chapter_list = [ int(filename.split(".", 1)[0]) for filename in filename_only_list]

# Choose file
file_number = 0
filename = filename_only_list[file_number]
chapter = pkg_chapters[chapter_list[file_number]]

def split_filename(filename):
    # TODO Fix to be more generic
    prefix = filename[:4]
    title = filename[5:-4]

    return (prefix, title)

filename_prefix, filename_title = split_filename(filename)

track = AudioSegment.from_mp3(dir + filename)
original_bitrate = mediainfo(dir + filename)["bit_rate"]

[1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 17, 17, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 20, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 24, 24]


In [85]:
# Parameters empirically tuned
# Discard first two chunks because they're English
chunks = split_on_silence(track, min_silence_len=600, silence_thresh=-30, keep_silence=200)[2:]

def chunk_filename(filename_prefix, i):
    return filename_prefix + "_" + str(i) + ".mp3"

chunk_filename_list = [ chunk_filename(filename_prefix, i) for i in range(len(chunks)) ]

# Save split up audio
for i, (chunk, chunk_name) in enumerate(zip(chunks, chunk_filename_list)):
    # Only write if filename doesn't exist
    if not os.path.isfile(chunk_name):
        chunk.export(chunk_name, format="mp3")

### Load Model and Apply to Chunks

In [87]:
phrases_fr = []
phrases_en = []

num_chunks = len(chunks)
for i in tqdm(range(num_chunks)):
    # Ignoring the English title
    result_fr = model.transcribe(chunk_filename(filename_prefix, i), language="fr", task="transcribe", **decode_options)
    result_en = model.transcribe(chunk_filename(filename_prefix, i), language="fr", task="translate", **decode_options)

    # Saving text
    phrases_fr.append(result_fr["text"])
    phrases_en.append(result_en["text"])

  0%|          | 0/109 [00:00<?, ?it/s]

In [88]:
def sanitise(phrase):
    # Strip whitespace then full stop
    stripped = phrase.strip().strip(".")

    # Often preceding proper nouns
    exceptions = ["The", "La", "Le", "L", "Les"]

    phrase_clean = None
    # Un-capitalize phrases
    # All uppers are usually errors
    if stripped.isupper():
        phrase_clean = stripped.lower()
    # Title case is sometimes a noun, also check for empty string
    # Also check it's not a question
    elif (re.split(" |'", stripped)[0] in exceptions or not stripped.istitle()) and len(stripped) > 1 and not stripped[-1] == "?":
        phrase_clean = stripped[0].lower() + stripped[1:]
    else:
        phrase_clean = stripped

    return phrase_clean

# Sanitise all outputs
pairs = [ tuple(map(sanitise, phrase_pair)) for phrase_pair in zip(phrases_fr, phrases_en)]

print(pairs)

[('le nom', 'the name'), ('le nom de famille', 'family name'), ('le prénom', 'the first name'), ("s'appeler", 'to call'), ("Comment t'appelles-tu?", "What's your name?"), ("Comment tu t'appelles?", "What's your name?"), ('Monsieur', 'Sir'), ('Messieurs,', 'Gentlemen!'), ('Madame', 'Madam'), ('Mesdames', 'ladies and gentlemen'), ('madame Martin naît du pont', 'madame Martin, born in Dupont'), ('Mademoiselle', 'Miss'), ('Mesdemoiselles', 'Mademoiselle'), ('habitez quelque chose', 'to live something'), ('habitez à', 'Habit A'), ("j'habite une maison neuve", 'i live in a new house'), ("j'habite à Paris", 'i live in Paris'), ("l'address", 'address'), ('un domicile', 'a home'), ('sans domicile fixe', 'no fixed home'), ('les coordonnées', 'the coordinates'), ('donne-moi les coordonnées de Paul', "give me Paul's coordinates!"), ('une rue', 'a street'), ('26 rue du Labrador', '26 Rue du Labrador'), ('une route', 'a route'), ('une avenue', 'an avenue'), ('un boulevard', 'a boulevard'), ('une pla

#### CSV Saving

In [29]:
# Foolishly unzipping previously unzipped quantity
# Putting it in a data frame
data = pd.DataFrame(dict(zip(["phrases_fr", "phrases_en"], zip(*pairs))))

# Saving to CSV
CSV_title = re.sub(",? ", "_", filename_title)
data.to_csv(CSV_title + ".csv")

## Creating Anki Deck

In [89]:
# Now have "phrases_fr", "phrases_en" and "chunk_filename_list", which is enough to create a deck
random.seed(1)

# Defining model
# Model with audio
model_audio = genanki.Model(
  random.randrange(1 << 30, 1 << 31),
  'English/French with Audio',
  fields=[
    {'name': 'English'},
    {'name': 'French'},
    {'name': 'Audio'}
  ],
  templates=[
    {
      'name': 'Card',
      'qfmt': '{{English}}<br><br>{{type:French}}',
      'afmt': '{{FrontSide}}<hr id="answer">{{French}}<br>{{Audio}}',
    },
    ],
  css=""".card {
      font-family: "Times New Roman", Times, serif;
      font-size: 56pt;
      text-align: center;
      color: black;
      }
      #typeans{
      font-family: "Times New Roman", Times, serif !important;
      font-size: 40pt !important;
      text-align: center;
      color: black;
      }
      input[type=text] {
      width: 100%;
      text-align: center;
      padding: 12px 0px;
      margin: 8px 0;
      box-sizing: border-box;
      background-color: #eee;
      }"""
)

In [100]:
# Create Deck
# Adding "::" creates hierarchy within Anki
my_deck = genanki.Deck(
  random.randrange(1 << 30, 1 << 31), # model_id
  pkg_name + "::" + chapter + "::" + filename_title)

# Loop through words
for i, ((phrase_fr, phrase_en), chunk_filename) in enumerate(zip(pairs, chunk_filename_list)):
    note = genanki.Note(model=model_audio, fields=[phrase_en, phrase_fr, "[sound:{}]".format(chunk_filename)])

    my_deck.add_note(note)

deck_list = []
deck_list.append(my_deck)
#deck_list.append(parent_deck)

In [101]:
# Create package
my_package = genanki.Package(deck_list)
my_package.media_files = chunk_filename_list

my_package.write_to_file(filename_prefix + ".apkg")