In [5]:
import pandas as pd
import numpy as np
from pydub import AudioSegment
from pydub.silence import split_on_silence, detect_silence
import gc

import logging
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.DEBUG)
from scipy.io import wavfile
file = "/home/peterr/macocu/task6_speech/data/00009874.flac.wav"

def get_duration(file:str) -> float:
    rate, data = wavfile.read(file)
    return len(data)/rate

get_duration(file)


pd.set_option("display.max_colwidth", 100)
f = "/home/nikolal/projects/parlaspeech/transcripts_normalized.txt"
df = pd.read_table(f, names=["unknown1", "path", "sentence"],
        #nrows=2000
        )

df["path"] = "/home/peterr/macocu/task6_speech/data/" + df.path.str.replace("flac/", "") + ".wav"
df["duration"] = df.path.apply(get_duration)
df = df.drop(columns="unknown1")
import os
df["filesize"] = df["path"].apply(os.path.getsize)
for char in ['*', ':', ';']:
    df["sentence"] = df.sentence.str.replace(char, "")
df.sort_values(by="filesize", ascending=True, inplace=True)
df.reset_index(drop=True, inplace=True)
df["to_edit"] = df.duration > 20
df.loc[df.to_edit, :].sample(5)
df["cuts"] = 0

In [6]:
def splitter(file, MIN=5, MAX=10):
    def is_ok(pauses, duration, MIN=MIN, MAX=MAX):
        pauses = [0, *pauses, duration]
        durations = [MIN * 1000 <= e - s <= MAX * 1000 for s, e in zip(pauses[:-1], pauses[1:])]
        return all(durations)

    def _splitter(pauses, duration, MIN=MIN, MAX=MAX):
        from itertools import combinations, chain
        def powerset(iterable):
            "powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)"
            s = list(iterable)
            return chain.from_iterable(combinations(s, r) for r in range(len(s)+1))
        all_combinations = list(powerset(pauses))
        logging.debug(f"Testing {len(all_combinations)} combinations....")
        for i, combination in enumerate(all_combinations):
            if i & 1024 == 0:
                gc.collect()
            if is_ok(combination, duration, MIN=MIN, MAX=MAX):
                return list(combination)
        logging.debug(f"No solution found so that {MIN=}s <= duration <= {MAX=}s.")
        return None

    def _splitter_heuristic(pauses, duration, MIN=MIN, MAX=MAX):
        pauses = [0, *pauses, duration]
        subsequent_pauses = [[s,e] for s, e in zip(pauses, pauses[1:]) if MIN <= e-s <= MAX]
        res = set()
        for s, e in subsequent_pauses:
            if s != 0:
                res.add(s)
            if e != duration:
                res.add(e)
        result = sorted(list(res))
        if _is_ok(pauses, duration, MIN=MIN, MAX=MAX):
            return result
        else:
            return None 
    audio = AudioSegment.from_wav(file)
    duration = audio.duration_seconds * 1000
    logging.debug(f"Duration: {duration/1000} s")
    res = None
    for silence in [1000, 800, 500, 300, 200]:
        logging.info(f"Testing silence {silence}")
        detected_silences = detect_silence(audio, min_silence_len=silence, silence_thresh=-40) # In seconds
        if detected_silences == []:
            logging.info(f"No silences detected")
            continue
        nr_of_silences = len(detected_silences)
        logging.debug(f"Got {nr_of_silences} silences.")
        centroids = np.array(detected_silences).mean(axis=1)
        centroids = centroids.tolist()
        res = None
        if nr_of_silences > 15:
            res = _splitter_heuristic(centroids, duration, MIN=MIN, MAX=MAX)
        else:
            res = _splitter(centroids, duration, MIN=MIN, MAX=MAX)
        if res != None:
            logging.debug(f" Success! Found splitting: {res}")
            break
    if res == None:
        raise Exception("No splitting was found.")
    centroids = np.array(res).tolist()
    cuts = [0, *centroids, duration]

    return cuts
    # new_filenames = list()
    # for i, (start, end) in enumerate(zip(cuts[0:],cuts[1:])):
    #     new_file = file.replace(".flac.wav", f"_{i:2}.flac.wav")
    #     segment = audio[start:end]
    #     logging.info(f"Exporting segment nr. {i+1} as f{new_file}")
    #     segment.export(new_file, format="wav")
    #     new_filenames.append(new_file)
    # return new_filenames



In [7]:
#df = pd.read_csv("/home/peterr/macocu/task6_speech/8_results.csv")
#df["cuts"] = np.nan
df[df.to_edit]["cuts"].unique()

array([0])

In [8]:

i = 0
for row in df.loc[df.to_edit,].iterrows():
    if row[1]["cuts"] != 0:
        logging.debug(f"File {row[1]['path']} was already processed.")
        continue
    try:
        file = row[1]["path"]
        cuts = splitter(file)
        df.loc[row[0], "cuts"] = str(cuts)
        logging.info(f"Finished number {i}.")
        i += 1
    except:
        logging.error(f"Could not calculate splits for {row[1]['path']}...")
        continue
    finally:
        df.to_csv("/home/peterr/macocu/task6_speech/8_results.csv", index = False)

2021-12-14 10:32:08,453 - DEBUG - Duration: 20.1 s
2021-12-14 10:32:08,454 - INFO - Testing silence 1000
2021-12-14 10:32:09,020 - DEBUG - Got 1 silences.
2021-12-14 10:32:09,021 - DEBUG - Testing 2 combinations....
2021-12-14 10:32:09,070 - DEBUG - No solution found so that MIN=5s <= duration <= MAX=10s.
2021-12-14 10:32:09,071 - INFO - Testing silence 800
2021-12-14 10:32:09,527 - DEBUG - Got 3 silences.
2021-12-14 10:32:09,528 - DEBUG - Testing 8 combinations....
2021-12-14 10:32:09,673 - DEBUG - No solution found so that MIN=5s <= duration <= MAX=10s.
2021-12-14 10:32:09,674 - INFO - Testing silence 500
2021-12-14 10:32:10,015 - DEBUG - Got 8 silences.
2021-12-14 10:32:10,016 - DEBUG - Testing 256 combinations....
2021-12-14 10:32:14,539 - DEBUG - No solution found so that MIN=5s <= duration <= MAX=10s.
2021-12-14 10:32:14,540 - INFO - Testing silence 300
2021-12-14 10:32:14,796 - DEBUG - Got 12 silences.
2021-12-14 10:32:14,798 - DEBUG - Testing 4096 combinations....
2021-12-14 10

In [3]:
file

'/home/peterr/macocu/task6_speech/data/00009874.flac.wav'