In [1]:
import pandas as pd
import numpy as np
from pydub import AudioSegment
from pydub.silence import split_on_silence, detect_silence
import gc

import logging
# Timestamped log records; at INFO level the splitter's DEBUG messages stay hidden.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
pd.set_option("display.max_colwidth", 100)
df = pd.read_csv("/home/peterr/macocu/task6_speech/8_results.csv")

# Flag recordings whose `duration` exceeds 10 (presumably seconds -- confirm against
# how the CSV was produced) and keep the ones whose `cuts` value is still 0 / "0".
df["to_edit"] = df["duration"] > 10
subset = df[df["to_edit"] & df["cuts"].isin([0, "0"])]

subset.head()

Unnamed: 0,path,sentence,duration,filesize,to_edit,cuts
12826,/home/peterr/macocu/task6_speech/data/00019849.flac.wav,a prihodi samo naknada za korištenje voda prelaze sedamsto milijuna kuna i trebali bi biti kao u...,10.099,323246,True,0
12827,/home/peterr/macocu/task6_speech/data/00021653.flac.wav,i zaista vi svojim primjerom ste pokazali u praksi kako to treba funkcionirati ne samo deklarato...,10.1,323278,True,0
12828,/home/peterr/macocu/task6_speech/data/00012903.flac.wav,znači novi zakon o javnoj nabavi bi trebao voditi računa o tome da se ne može bez razloga bez op...,10.1,323278,True,0
12829,/home/peterr/macocu/task6_speech/data/00020435.flac.wav,dobro je da su tu isključene tvrtke kćeri koje rade konsolidirani izvještaj jer one već imaju za...,10.1,323278,True,0
12830,/home/peterr/macocu/task6_speech/data/00014246.flac.wav,probajte stati malo razmisliti pročitati odreagirati kompletirati tu cijelu priču i onda izaći s...,10.1,323278,True,0


In [2]:
def splitter(file, MIN=5, MAX=10,
             silence_lengths=(1000, 900, 800, 500, 400, 300, 200),
             silence_thresh=-30, max_exhaustive=15):
    """Find cut points that split a WAV file into segments lasting MIN..MAX seconds.

    Silences are detected with progressively shorter minimum lengths; the
    midpoint of every silence is a candidate cut.  The first silence length
    that yields a valid set of cuts wins.

    Args:
        file: Path of the WAV file to analyse.
        MIN: Minimum allowed segment length, in seconds.
        MAX: Maximum allowed segment length, in seconds.
        silence_lengths: Minimum silence lengths (milliseconds) to try, in order.
        silence_thresh: Loudness threshold passed to pydub's ``detect_silence``.
        max_exhaustive: With more detected silences than this, the greedy
            heuristic replaces the exhaustive subset search.

    Returns:
        ``[0, cut_1, ..., cut_n, duration]`` with all values in milliseconds.

    Raises:
        ValueError: If no silence length produces a valid splitting.
    """
    from itertools import chain, combinations

    def _is_ok(pauses, duration, MIN=MIN, MAX=MAX):
        """Return True when every segment between consecutive edges lasts MIN..MAX seconds."""
        edges = [0, *pauses, duration]
        return all(MIN * 1000 <= end - begin <= MAX * 1000
                   for begin, end in zip(edges[:-1], edges[1:]))

    def _splitter(pauses, duration, MIN=MIN, MAX=MAX):
        """Exhaustively try subsets of `pauses`, smallest subsets first.

        Returns the first valid subset as a list, or None when none is valid.
        """
        pauses = list(pauses)
        logging.debug(f"Testing {2 ** len(pauses)} combinations....")
        # Iterate lazily: the candidates are never materialised, so the manual
        # garbage collection of the earlier version is unnecessary.  (Its guard
        # `i & 1024 == 0` also fired on roughly every second iteration, which
        # made the search very slow.)
        candidates = chain.from_iterable(
            combinations(pauses, size) for size in range(len(pauses) + 1)
        )
        for combination in candidates:
            if _is_ok(combination, duration, MIN=MIN, MAX=MAX):
                return list(combination)
        logging.debug(f"No solution found so that {MIN=}s <= duration <= {MAX=}s.")
        return None

    def _splitter_heuristic(pauses, duration, MIN=MIN, MAX=MAX):
        """Greedy splitter: cut at the first pause lying at least MIN after the previous cut.

        Returns the list of cuts (without 0 and `duration`), or None when some
        segment would have to be longer than MAX.
        """
        points = [0, *pauses, duration]
        last_index = len(points) - 1
        start = 0
        results = []
        end_reached = False  # True when the final entry of `results` is `duration`
        for index, point in enumerate(points):
            gap = point - start
            if gap < MIN * 1000:
                continue
            if gap > MAX * 1000:
                return None
            results.append(point)
            start = point
            end_reached = index == last_index
        if end_reached:
            # Strip the end-of-file sentinel; only real cuts are returned.
            return results[:-1]
        if not results:
            # The whole recording is shorter than MIN: there is nothing to cut.
            return results
        # The tail after the last cut is shorter than MIN.  Dropping that cut merges
        # the tail into the previous segment, which must still respect MAX.
        merged_start = results[-2] if len(results) > 1 else 0
        if duration - merged_start > MAX * 1000:
            return None
        return results[:-1]

    audio = AudioSegment.from_wav(file)
    duration = audio.duration_seconds * 1000  # milliseconds, like pydub's silence ranges
    logging.debug(f"Duration: {duration/1000} s")
    res = None
    for silence in silence_lengths:
        logging.debug(f"Testing silence {silence}")
        # detect_silence reports [start, end] pairs in milliseconds.
        detected_silences = detect_silence(audio, min_silence_len=silence, silence_thresh=silence_thresh)
        if not detected_silences:
            logging.debug("No silences detected")
            continue
        nr_of_silences = len(detected_silences)
        logging.debug(f"Got {nr_of_silences} silences.")
        # Use the midpoint of each silence as the candidate cut.
        centroids = np.array(detected_silences).mean(axis=1).tolist()
        if nr_of_silences > max_exhaustive:
            # 2**n subsets is too many to enumerate; fall back to the greedy pass.
            logging.info("Using heuristic splitter")
            res = _splitter_heuristic(centroids, duration, MIN=MIN, MAX=MAX)
        else:
            res = _splitter(centroids, duration, MIN=MIN, MAX=MAX)
        if res is not None:
            logging.debug(f" Success! Found splitting: {res}")
            break
    if res is None:
        raise ValueError("No splitting was found.")

    return [0, *res, duration]

In [3]:
# Compute cut points for every long recording that has not been processed yet
# (its `cuts` value is still the placeholder 0 / "0").

# `cuts` receives stringified lists below; keep the column as object dtype so the
# assignment never has to coerce a numeric column.
df["cuts"] = df["cuts"].astype(object)
pending = df.loc[df.to_edit & df.cuts.isin([0, "0"])]

i = 0  # number of recordings split successfully
for index, row in pending.iterrows():
    # NOTE: the earlier version skipped rows with `cuts == 0` as "already
    # processed", although 0 is exactly the marker of an unprocessed row.
    path = row["path"]
    try:
        cuts = splitter(path)
    except Exception as exc:  # keep going, but report why this file failed
        logging.error(f"Could not calculate splits for {path}: {exc}")
        continue
    df.loc[index, "cuts"] = str(cuts)
    logging.info(f"Finished number {i}.")
    i += 1

2021-12-15 13:08:10,740 - ERROR - Could not calculate splits for /home/peterr/macocu/task6_speech/data/00019849.flac.wav...
2021-12-15 13:08:36,278 - ERROR - Could not calculate splits for /home/peterr/macocu/task6_speech/data/00021653.flac.wav...
2021-12-15 13:08:58,508 - ERROR - Could not calculate splits for /home/peterr/macocu/task6_speech/data/00012903.flac.wav...
2021-12-15 13:09:22,273 - ERROR - Could not calculate splits for /home/peterr/macocu/task6_speech/data/00020435.flac.wav...
2021-12-15 13:09:25,232 - ERROR - Could not calculate splits for /home/peterr/macocu/task6_speech/data/00014246.flac.wav...
2021-12-15 13:09:48,782 - ERROR - Could not calculate splits for /home/peterr/macocu/task6_speech/data/00004541.flac.wav...
2021-12-15 13:09:51,004 - ERROR - Could not calculate splits for /home/peterr/macocu/task6_speech/data/00002504.flac.wav...
2021-12-15 13:09:52,726 - ERROR - Could not calculate splits for /home/peterr/macocu/task6_speech/data/00020881.flac.wav...
2021-12-

```
Heuristic splitter:
start is 0.
Go through the pauses until you reach a pause for which MIN <= pause - start <= MAX holds.
That pause is a split: add it to the split results.
Set start to that pause.
Continue from that pause.
You are done when you reach the end of the recording.
```

In [5]:
MIN, MAX = 5, 20

def _splitter_heuristic(pauses, duration, MIN=MIN, MAX=MAX):
    pauses = [0, *pauses, duration]
    
    start = 0
    results = list()
    for p in pauses:
        if p - start < MIN*1000:
            continue
        if p-start > MAX * 1000:
            return None
        results.append(p)
        start = p
    return results[:-1]





In [16]:
# Try the greedy splitter on one recording, using the shortest silence window (200 ms).
file = "/home/peterr/macocu/task6_speech/data/00014074.flac.wav"
audio = AudioSegment.from_file(file)
duration = audio.duration_seconds * 1000  # seconds -> milliseconds
detected_silences = detect_silence(audio, silence_thresh=-30, min_silence_len=200)
# The midpoint of each detected silence is a candidate cut.
pauses = np.mean(np.array(detected_silences), axis=1).tolist()
print("Pauses: ", pauses)
res = _splitter_heuristic(pauses, duration, MAX=15, MIN=5)

res


Pauses:  [6358.0, 11784.0, 14918.5, 19551.0, 21170.5, 24648.5, 26313.5, 27595.0, 30030.5, 30945.0, 34873.5, 37019.5, 38669.0, 39481.5, 41433.0, 43483.5, 45590.5, 47016.0]


[6358.0, 11784.0, 19551.0, 24648.5, 30030.5, 37019.5]

In [20]:
# Write the frame back to the same CSV that was loaded at the top of the notebook,
# overwriting it in place. index=False leaves the row index out of the file; the
# helper column `to_edit` added earlier is written along with the data.
df.to_csv("/home/peterr/macocu/task6_speech/8_results.csv", index=False)