# Splitting Audio Files

In [1]:
# Finding audio files
import glob

# Manipulating audio
from pydub import AudioSegment
from pydub.silence import split_on_silence
from pydub.utils import mediainfo

from pydub.playback import play

# Generating Anki deck
import genanki
import random

# Modifying strings
import re

# Machine Learning Model
import torch
import whisper

# Saving output
import pandas as pd
import os.path

# Checking confidence
import numpy as np
import matplotlib.pyplot as plt

# Progress bar
from tqdm.notebook import tqdm

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# The value is passed to whisper.load_model() further down.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Uncomment to force CPU execution even when a GPU is present
#DEVICE = "cpu"

In [44]:
# Find all mp3 files
# NOTE: `dir` shadows the builtin of the same name; the name is kept so that
# anything else relying on it keeps working.
dir = "french_audio/split_testing/"
filename_dir_list = sorted(glob.glob(dir + "*.mp3"))
# os.path.basename handles every separator the platform accepts, whereas
# splitting on "/" leaves directory parts behind when glob returns "\\"
filename_only_list = [os.path.basename(f_dir) for f_dir in filename_dir_list]

# Choose file (index into the sorted list above)
file_number = 1
filename = filename_only_list[file_number]
# Reuse the path glob returned rather than rebuilding it by concatenation
file_path = filename_dir_list[file_number]

# TODO Fix to be more generic
# Current assumption: 4-character prefix, one separator character, then the
# title, then the file extension
filename_stem = os.path.splitext(filename)[0]
filename_prefix = filename_stem[:4]
filename_title = filename_stem[5:]

# Load the audio and remember the bitrate reported for the source file
track = AudioSegment.from_mp3(file_path)
original_bitrate = mediainfo(file_path)["bit_rate"]

In [47]:
# Silence-detection settings below were tuned empirically
all_chunks = split_on_silence(
    track,
    min_silence_len=600,
    silence_thresh=-30,
    keep_silence=200,
)
# The first two chunks are English, so they are dropped
chunks = all_chunks[2:]

def chunk_filename(filename_prefix, i):
    """Build the mp3 file name for chunk number ``i``.

    The result has the form ``<filename_prefix>_<i>.mp3``.
    """
    suffix = "_" + str(i) + ".mp3"
    return filename_prefix + suffix

# One output file name per chunk, in chunk order
chunk_filename_list = [chunk_filename(filename_prefix, i) for i in range(len(chunks))]

# Save split up audio
for chunk, chunk_name in zip(chunks, chunk_filename_list):
    # Only write if filename doesn't exist, so re-running the cell does not
    # re-encode chunks that are already on disk
    if not os.path.isfile(chunk_name):
        # Export at the bitrate read from the source file instead of the
        # encoder's default
        chunk.export(chunk_name, format="mp3", bitrate=original_bitrate)

### Load Model and Apply to Chunks

In [5]:
# Load the "medium" Whisper model onto the device chosen in DEVICE
model = whisper.load_model("medium", device=DEVICE)

In [6]:
# Transcribed French text and its English translation, one entry per chunk
phrases_fr = []
phrases_en = []

# Confidence (average log-probability of the first segment), one per chunk
logprobs_fr = []
logprobs_en = []

# The flag has to differ between CPU and GPU, so derive it from DEVICE
# instead of editing it by hand
decode_options = {"fp16": DEVICE == "cuda"}


def _first_segment_logprob(result):
    """Return the first segment's avg_logprob, or NaN if there are no segments."""
    segments = result["segments"]
    # A chunk without any segment would otherwise raise IndexError and leave
    # the phrase and confidence lists with different lengths
    return segments[0]["avg_logprob"] if segments else float("nan")


num_chunks = len(chunks)
# Iterate over the file names that were actually written to disk
for chunk_name in tqdm(chunk_filename_list):
    # Ignoring the English title
    # Same audio, two tasks: French transcription and English translation
    result_fr = model.transcribe(chunk_name, language="fr", task="transcribe", **decode_options)
    result_en = model.transcribe(chunk_name, language="fr", task="translate", **decode_options)

    # Saving text
    phrases_fr.append(result_fr["text"])
    phrases_en.append(result_en["text"])

    # Saving confidence
    logprobs_fr.append(_first_segment_logprob(result_fr))
    logprobs_en.append(_first_segment_logprob(result_en))


  0%|          | 0/135 [00:00<?, ?it/s]

In [25]:
def sanitise(phrase):
    """Normalise the capitalisation of a transcribed phrase.

    Surrounding whitespace and then surrounding full stops are removed.
    After that:

    * a phrase that is entirely upper case is lower-cased completely;
    * a phrase longer than one character that is not a question gets its
      first character lower-cased when it either starts with one of the
      listed articles or is not in title case;
    * anything else is returned as stripped.
    """
    text = phrase.strip().strip(".")

    # All-caps output is usually a transcription error
    if text.isupper():
        return text.lower()

    # Words that often precede a proper noun
    leading_articles = ("The", "La", "Le", "L", "Les")
    first_word = re.split(" |'", text)[0]

    # Title case may be a noun and is kept, unless it starts with an article
    lower_first = first_word in leading_articles or not text.istitle()
    # The length test also protects against the empty string
    long_enough = len(text) > 1
    is_question = text.endswith("?")

    if lower_first and long_enough and not is_question:
        return text[0].lower() + text[1:]
    return text

# Clean up both languages of every phrase, keeping (French, English) order
pairs = [(sanitise(fr), sanitise(en)) for fr, en in zip(phrases_fr, phrases_en)]

print(pairs)

[('un passeport', 'a passport'), ("les papiers d'identité", 'identity papers'), ("une carte d'identité", 'an identity card'), ('un continent', 'a continent'), ('a bie!', 'a b'), ('une nation', 'a nation'), ('la nationalité', 'Nationality'), ('fatima va demander la nationalité française', 'fatima will ask French nationality'), ("être d'origine", 'being of origin'), ("c'est un français d'origine italienne", "it's a Frenchman of Italian origin"), ('étranger', 'Stranger'), ('étranger', 'Overseas'), ('un étranger', 'a stranger'), ('une étrangère', 'a foreigner'), ('Immigrés', 'immigrate'), ('un immigré', 'an immigrant'), ('une immigrée', 'an immigrant'), ("les immigrés ont parfois du mal à s'intégrer", 'immigrants sometimes have trouble integrating themselves'), ("l'immigration", 'Immigration'), ('Émigrez', 'emigrate'), ('une langue', 'a language'), ('la langue maternelle', 'the mother tongue'), ('une langue étrangère', 'a foreign language'), ('bilangue', 'b catch'), ('le rop', 'the dress')

In [29]:
# Putting the (French, English) pairs in a data frame, one pair per row.
# Passing the rows directly keeps both columns even when `pairs` is empty.
data = pd.DataFrame(pairs, columns=["phrases_fr", "phrases_en"])

# Saving to CSV
# Each space, together with a comma directly before it, becomes an underscore
CSV_title = re.sub(r",? ", "_", filename_title)
data.to_csv(CSV_title + ".csv")

## Creating Anki Deck

In [48]:
# "phrases_fr", "phrases_en" and "chunk_filename_list" exist by this point,
# which is enough to create a deck.
# Fixed seed: the IDs drawn from `random` below repeat from run to run as long
# as they are drawn in the same order.
random.seed(1)

# Model without audio
no_audio_fields = [
    {'name': 'English'},
    {'name': 'French'},
]
no_audio_templates = [
    {
        'name': 'Card',
        'qfmt': '{{English}}',
        'afmt': '{{FrontSide}}<hr id="answer">{{French}}',
    },
]
model_no_audio = genanki.Model(
    model_id=random.randrange(1 << 30, 1 << 31),
    name='English/French without Audio',
    fields=no_audio_fields,
    templates=no_audio_templates,
)

# Model with audio: same as the text-only layout plus an "Audio" field that
# is shown on the answer side
audio_fields = [
    {'name': 'English'},
    {'name': 'French'},
    {'name': 'Audio'},
]
audio_templates = [
    {
        'name': 'Card',
        'qfmt': '{{English}}',
        'afmt': '{{FrontSide}}<hr id="answer">{{French}}<br>{{Audio}}',
    },
]
model_audio = genanki.Model(
    model_id=random.randrange(1 << 30, 1 << 31),
    name='English/French with Audio',
    fields=audio_fields,
    templates=audio_templates,
)

In [50]:
# Create Deck
# The ID is the next value drawn from the `random` module's global generator
# NOTE(review): the ID does not depend on filename_title — confirm that decks
# built from different files are meant to be able to share an ID
deck_id = random.randrange(1 << 30, 1 << 31)
my_deck = genanki.Deck(deck_id=deck_id, name=filename_title)

# Loop through words
# The loop variable must not be named `chunk_filename`: that would rebind the
# module-level function of the same name to a string and break later calls.
for (phrase_fr, phrase_en), chunk_name in zip(pairs, chunk_filename_list):
    # Field order follows model_audio: English, French, Audio
    note = genanki.Note(
        model=model_audio,
        fields=[phrase_en, phrase_fr, "[sound:{}]".format(chunk_name)],
    )

    my_deck.add_note(note)

In [52]:
# Create package
# Bundle the deck together with the audio chunks its notes refer to
my_package = genanki.Package(my_deck, media_files=chunk_filename_list)

# The package file is named after the filename prefix
my_package.write_to_file(filename_prefix + ".apkg")