# Splitting Audio Files

In [55]:
# Manipulating audio
from pydub import AudioSegment
from pydub.silence import split_on_silence
from pydub.utils import mediainfo

from pydub.playback import play

# Modifying strings
import re

# Machine Learning Model
import torch
import whisper

# Saving output
import pandas as pd

# Checking confidence
import numpy as np
import matplotlib.pyplot as plt

# Progress bar
from tqdm.notebook import tqdm

# Run on the GPU when PyTorch reports a usable CUDA device, otherwise on the CPU
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
# Manual override, kept disabled:
#DEVICE = "cpu"

In [18]:
# Choose File
filename_dir = "french_audio/split_testing/"
filename = "01.2 Nationality, Language, Country.mp3"

# File names look like "<prefix> <title>.<extension>". Split on the first space
# and drop everything after the last dot instead of slicing at fixed offsets, so
# a prefix of any length (e.g. "10.12") or a longer extension still works.
filename_prefix, _sep, title_with_ext = filename.partition(" ")
filename_title = title_with_ext.rsplit(".", 1)[0]

# Build the path once and reuse it for decoding and for the metadata lookup
track_path = filename_dir + filename
track = AudioSegment.from_mp3(track_path)
original_bitrate = mediainfo(track_path)["bit_rate"]

In [3]:
# Cut the track wherever a silent stretch is found.
# The three keyword values below were tuned empirically.
chunks = split_on_silence(
    track,
    min_silence_len=600,
    silence_thresh=-30,
    keep_silence=200,
)

def chunk_filename(filename_prefix, i):
    """Return the mp3 file name for chunk ``i``: ``<filename_prefix>_<i>.mp3``."""
    pieces = (filename_prefix, "_", str(i), ".mp3")
    return "".join(pieces)

# Save split up audio, one mp3 per chunk, named by chunk_filename.
# The name is already complete, so it is passed as-is: running str.format over it
# did nothing useful and would fail or alter the name if the prefix held braces.
for i, chunk in enumerate(chunks):
    chunk.export(chunk_filename(filename_prefix, i), format="mp3")

### Load Model and Apply to Chunks

In [57]:
# Drop any previously loaded model so its memory can be handed back before a reload.
# A bare `del model` raises NameError on a fresh kernel (nothing loaded yet), so the
# name is removed only if it exists.
globals().pop("model", None)
torch.cuda.empty_cache()

In [58]:
# Load the "medium" Whisper checkpoint onto the device selected in DEVICE
model = whisper.load_model(
    "medium",
    device=DEVICE,
)

OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 5.76 GiB total capacity; 4.02 GiB already allocated; 38.69 MiB free; 4.03 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [45]:
# French transcriptions and English translations, one entry per processed chunk
phrases_fr = []
phrases_en = []

# Whisper's avg_logprob for each entry above, used later as a confidence signal
logprobs_fr = []
logprobs_en = []

# fp16 decoding is meant for the GPU; derive the flag from DEVICE so it no longer
# has to be edited by hand when switching between CPU and GPU.
decode_options = {"fp16": DEVICE == "cuda"}

# Chunks 0 and 1 are skipped: per the original note they hold the English title.
NUM_TITLE_CHUNKS = 2


def _first_segment_logprob(result):
    """Return the first segment's ``avg_logprob``, or NaN when there are no segments.

    Indexing ``result["segments"][0]`` directly raises IndexError on an empty
    segment list, which would abort the whole run part-way through.
    """
    segments = result["segments"]
    return segments[0]["avg_logprob"] if segments else float("nan")


num_chunks = len(chunks)
# Iterate only over the chunks that are actually processed so the progress bar
# total matches the amount of work.
for i in tqdm(range(NUM_TITLE_CHUNKS, num_chunks)):
    chunk_path = chunk_filename(filename_prefix, i)

    # Same audio, two tasks: French transcription and translation to English
    result_fr = model.transcribe(chunk_path, language="fr", task="transcribe", **decode_options)
    result_en = model.transcribe(chunk_path, language="fr", task="translate", **decode_options)

    # Saving text
    phrases_fr.append(result_fr["text"])
    phrases_en.append(result_en["text"])

    # Saving confidence (NaN marks a chunk for which no segment was returned)
    logprobs_fr.append(_first_segment_logprob(result_fr))
    logprobs_en.append(_first_segment_logprob(result_en))


  0%|          | 0/135 [00:00<?, ?it/s]

In [48]:
# Print the complete list of French transcriptions gathered in phrases_fr
print(phrases_fr)

[' Un passeport.', " Les papiers d'identité.", " une carte d'identité.", ' un continent.', ' un pays.', ' une nation.', ' La nationalité.', ' Fatima va demander la nationalité française.', " être d'origine.", " C'est un français d'origine italienne.", ' étranger', ' étrangère', ' Un étranger.', ' Une étrangère.', ' Immigré.', ' un immigré.', ' Une immigrée.', " Les immigrés ont parfois du mal à s'intégrer.", " l'immigration", ' émigré.', ' Une langue.', ' La langue maternelle.', ' Une langue étrangère.', ' BILANG', ' Le rebs', " L'Afrique.", " L'Amérique.", ' Lasi.', " l'Australie.", ' La France.', ' Français, Française.', " l'Allemagne.", ' allemand allemande', ' La Grande Bretagne', ' Britannique.', " L'Angleterre", ' Anglais, Anglaise.', " L'Italie.", ' Italien, Italienne.', " l'Espagne.", ' Espagnol, Espagnol', ' le Portugal.', ' Portugais, Portugaises.', ' La Belgique.', ' Belge.', ' Les Pays-Bas.', ' néerlandais, néerlandaises.', ' La Hollande.', ' Hollandaise', ' Le Luxembourg',

In [56]:
# Display the French phrase whose recorded log-probability is the lowest
phrases_fr[np.asarray(logprobs_fr).argmin()]

' Le rebs'

In [31]:
def sanitise(phrase, articles=("The",)):
    """Normalise the punctuation and capitalisation of a transcribed phrase.

    Steps, in order:

    1. Strip surrounding whitespace and full stops, then strip whitespace again
       so that input such as ``"Bonjour ."`` does not keep a trailing space.
    2. If the phrase is entirely upper case, lower-case all of it (all-caps
       output is usually a transcription error).
    3. Otherwise lower-case only the first character, unless the phrase is in
       title case (which may be a proper noun). A title-case phrase is still
       un-capitalised when its first space-separated word is one of
       ``articles``.

    Parameters
    ----------
    phrase : str
        Raw phrase text.
    articles : collection of str, optional
        Leading words that force the first character to lower case even for a
        title-case phrase. Compared for equality against the first
        space-separated word. Defaults to ``("The",)``, the only word handled
        before this parameter existed.

    Returns
    -------
    str
        The cleaned phrase. Empty and single-character phrases are returned
        without the first-character rule being applied.
    """
    stripped = phrase.strip().strip(".").strip()

    # All uppers are usually errors
    if stripped.isupper():
        return stripped.lower()

    # Nothing sensible to un-capitalise; also protects stripped[0] below
    if len(stripped) <= 1:
        return stripped

    first_word = stripped.split(" ", 1)[0]
    # Title case is sometimes a noun, so it is kept unless an article leads it
    if first_word in articles or not stripped.istitle():
        return stripped[0].lower() + stripped[1:]

    return stripped

# Clean both halves of every (French, English) pair with sanitise
pairs = [(sanitise(fr), sanitise(en)) for fr, en in zip(phrases_fr, phrases_en)]

print(pairs)

[('un passeport', 'a passport!'), ("les papiers d'identité", 'identity papers'), ("une carte d'identité", 'a identity card'), ('un continent', 'a continent'), ('un pays', 'a country!'), ('une nation', 'a nation'), ('la nationalité', 'Nationality'), ('fatima va demander la nationalité française', 'fatima will ask the French nationality'), ("être d'origine", 'to be of origin'), ("c'est un français d'origine italienne", "it's a French from Italian origin"), ('étranger', 'Stranger'), ('étrangère', 'Stranger'), ('un étranger', 'a stranger'), ('une étrangère', 'a stranger'), ('Immigré', 'Immigrate!'), ('un immigré', 'an immigrant'), ('une immigrée', 'a immigrant'), ("les immigrés ont parfois du mal à s'intégrer", 'immigrants sometimes have trouble integrating themselves'), ("l'immigration", 'Immigration'), ('émigré', 'Emigré'), ('une langue', 'a language'), ('la langue maternelle', 'the mother tongue'), ('une langue étrangère', 'a foreign language'), ('bilang', 'bye now!'), ('le contre premi

In [29]:
# Build the table directly from the (French, English) tuples. Naming the columns
# explicitly avoids re-zipping the pairs and keeps both column headers even when
# `pairs` is empty.
data = pd.DataFrame(pairs, columns=["phrases_fr", "phrases_en"])

# Saving to CSV, named after the track title with ", " and " " turned into "_"
CSV_title = re.sub(",? ", "_", filename_title)
data.to_csv(CSV_title + ".csv")