In [1]:
import pandas as pd
import numpy as np
from pydub import AudioSegment
from pydub.silence import split_on_silence, detect_silence
import gc

import logging
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.DEBUG)
from scipy.io import wavfile
file = "/home/peterr/macocu/task6_speech/data/00009874.flac.wav"

def get_duration(file:str) -> float:
    rate, data = wavfile.read(file)
    return len(data)/rate

get_duration(file)


pd.set_option("display.max_colwidth", 100)
f = "/home/nikolal/projects/parlaspeech/transcripts_normalized.txt"
df = pd.read_table(f, names=["unknown1", "path", "sentence"],
        #nrows=2000
        )

df["path"] = "/home/peterr/macocu/task6_speech/data/" + df.path.str.replace("flac/", "") + ".wav"
df["duration"] = df.path.apply(get_duration)
df = df.drop(columns="unknown1")
import os
df["filesize"] = df["path"].apply(os.path.getsize)
for char in ['*', ':', ';']:
    df["sentence"] = df.sentence.str.replace(char, "")
df.sort_values(by="filesize", ascending=True, inplace=True)
df.reset_index(drop=True, inplace=True)
df["to_edit"] = df.duration > 20
df.loc[df.to_edit, :].sample(5)
df["cuts"] = ""

2021-12-13 21:25:20,756 - INFO - NumExpr defaulting to 8 threads.


In [2]:

def splitter(file, MIN=5, MAX=10):
    def is_ok(pauses, duration, MIN=10, MAX=20):
        pauses = [0, *pauses, duration]
        durations = [MIN * 1000 <= e - s <= MAX * 1000 for s, e in zip(pauses[:-1], pauses[1:])]
        return all(durations)

    def _splitter(pauses, duration, MIN=10, MAX=20):
        from itertools import combinations, chain
        def powerset(iterable):
            "powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)"
            s = list(iterable)
            return chain.from_iterable(combinations(s, r) for r in range(len(s)+1))
        all_combinations = list(powerset(pauses))
        logging.debug(f"Testing {len(all_combinations)} combinations....")
        for i, combination in enumerate(all_combinations):
            if i & 1024 == 0:
                gc.collect()
            if is_ok(combination, duration, MIN=MIN, MAX=MAX):
                return list(combination)
        logging.debug(f"No solution found so that {MIN=}s <= duration <= {MAX=}s.")
        return None
    audio = AudioSegment.from_wav(file)
    duration = audio.duration_seconds * 1000
    logging.debug(f"Duration: {duration/1000} s")
    res = None
    for silence in [1000, 800, 500, 300, 200]:
        logging.info(f"Testing silence {silence}")
        detected_silences = detect_silence(audio, min_silence_len=silence, silence_thresh=-40) # In seconds
        if detected_silences == []:
            logging.info(f"No silences detected")
            continue
        nr_of_silences = len(detected_silences)
        logging.debug(f"Got {nr_of_silences} silences.")
        centroids = np.array(detected_silences).mean(axis=1)
        centroids = centroids.tolist()
        res = None
        if nr_of_silences > 10:
            res = _splitter(centroids[0::2], duration, MIN=MIN, MAX=MAX)
            if res == None:
                res = _splitter(centroids[1::2], duration, MIN=MIN, MAX=MAX)
        if res == None:
            res = _splitter(centroids, duration, MIN=MIN, MAX=MAX)
        if res != None:
            logging.debug(f" Success! Found splitting: {res}")
            break
    if res == None:
        raise Exception("No splitting was found.")
    centroids = np.array(res).tolist()
    cuts = [0, *centroids, duration]

    return cuts
    # new_filenames = list()
    # for i, (start, end) in enumerate(zip(cuts[0:],cuts[1:])):
    #     new_file = file.replace(".flac.wav", f"_{i:2}.flac.wav")
    #     segment = audio[start:end]
    #     logging.info(f"Exporting segment nr. {i+1} as f{new_file}")
    #     segment.export(new_file, format="wav")
    #     new_filenames.append(new_file)
    # return new_filenames



In [3]:
df = pd.read_csv("/home/peterr/macocu/task6_speech/8_results.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,path,sentence,duration,filesize,to_edit,cuts
0,0,/home/peterr/macocu/task6_speech/data/00019728.flac.wav,imamo odgovor na repliku,0.6,19278,False,
1,1,/home/peterr/macocu/task6_speech/data/00013519.flac.wav,pa kako je to moguće,0.6,19278,False,
2,2,/home/peterr/macocu/task6_speech/data/00021417.flac.wav,kaznenog zakona samo ide u prilog tezi da katalog kaznenih djela unatoč tome što je dopunjen i d...,0.629,20206,False,
3,3,/home/peterr/macocu/task6_speech/data/00017866.flac.wav,ponovit ću još jedanput,0.7,22478,False,
4,4,/home/peterr/macocu/task6_speech/data/00004197.flac.wav,evo to je moj odgovor,0.7,22478,False,


In [4]:

i = 0
for row in df.loc[df.to_edit,].iterrows():
    if pd.notna(row[1]["cuts"]):
        continue
    try:
        file = row[1]["path"]
        cuts = splitter(file)
        df.loc[row[0], "cuts"] = str(cuts)
        logging.info(f"Finished number {i}.")
        i += 1
    except:
        logging.error(f"Could not calculate splits for {row[1]['path']}...")
        continue
    finally:
        df.to_csv("/home/peterr/macocu/task6_speech/8_results.csv", index = False)

2021-12-13 21:25:44,478 - DEBUG - Duration: 20.1 s
2021-12-13 21:25:44,478 - INFO - Testing silence 1000
2021-12-13 21:25:45,039 - DEBUG - Got 3 silences.
2021-12-13 21:25:45,039 - DEBUG - Testing 8 combinations....
2021-12-13 21:25:45,179 - DEBUG - No solution found so that MIN=5s <= duration <= MAX=10s.
2021-12-13 21:25:45,180 - INFO - Testing silence 800
2021-12-13 21:25:45,649 - DEBUG - Got 3 silences.
2021-12-13 21:25:45,650 - DEBUG - Testing 8 combinations....
2021-12-13 21:25:45,790 - DEBUG - No solution found so that MIN=5s <= duration <= MAX=10s.
2021-12-13 21:25:45,791 - INFO - Testing silence 500
2021-12-13 21:25:46,136 - DEBUG - Got 4 silences.
2021-12-13 21:25:46,138 - DEBUG - Testing 16 combinations....
2021-12-13 21:25:46,398 - DEBUG - No solution found so that MIN=5s <= duration <= MAX=10s.
2021-12-13 21:25:46,399 - INFO - Testing silence 300
2021-12-13 21:25:46,661 - DEBUG - Got 6 silences.
2021-12-13 21:25:46,662 - DEBUG - Testing 64 combinations....
2021-12-13 21:25: