In [1]:
import pandas as pd
import numpy as np
from pydub import AudioSegment
from pydub.silence import split_on_silence, detect_silence
import gc

import logging
# Root logger: timestamped "<time> - <level> - <message>" lines, INFO and above.
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
from scipy.io import wavfile
# Path of a single sample recording, used for a quick get_duration call further down.
file = "/home/peterr/macocu/task6_speech/data/00009874.flac.wav"

def get_duration(file:str) -> float:
    """Return the playing time of a WAV file in seconds.

    The file is loaded with ``scipy.io.wavfile.read``; the number of samples
    (length of the first axis of the returned array) is divided by the
    sample rate reported by the file.
    """
    sample_rate, samples = wavfile.read(file)
    n_samples = len(samples)
    return n_samples / sample_rate

import os

# Smoke-test get_duration on one known recording. The value is not displayed
# (this is not the cell's last expression), but the call fails fast if the
# file cannot be read.
get_duration(file)

# Show up to 100 characters of each cell so transcripts are readable in output.
pd.set_option("display.max_colwidth", 100)

# Transcript listing read with pandas' default tab separator; the three columns
# are named here because the file is read without a header row.
f = "/home/nikolal/projects/parlaspeech/transcripts_normalized.txt"
df = pd.read_table(f, names=["unknown1", "path", "sentence"],
        #nrows=2000
        )
df = df.drop(columns="unknown1")

# Map the listed "flac/<name>" entries onto the local "<name>.wav" files.
# regex=False: the pattern is a literal string, independent of the pandas version.
df["path"] = "/home/peterr/macocu/task6_speech/data/" + df.path.str.replace("flac/", "", regex=False) + ".wav"
df["duration"] = df.path.apply(get_duration)      # seconds
df["filesize"] = df["path"].apply(os.path.getsize)  # bytes

# Strip stray punctuation from the transcripts. These must be literal
# replacements: '*' is not a valid regular expression on its own, and the
# default of `regex` differs between pandas 1.x and 2.x.
for char in ['*', ':', ';']:
    df["sentence"] = df.sentence.str.replace(char, "", regex=False)

# Smallest files first, with a fresh 0..n-1 index.
df = df.sort_values(by="filesize", ascending=True).reset_index(drop=True)

# Recordings longer than 20 s are the ones that need to be split.
df["to_edit"] = df.duration > 20
df.loc[df.to_edit, :].sample(5)

2021-12-13 14:58:25,211 - INFO - NumExpr defaulting to 8 threads.


Unnamed: 0,path,sentence,duration,filesize,to_edit
21019,/home/peterr/macocu/task6_speech/data/00008523.flac.wav,i ono što je iz izvješća vidljivo da je hbor imao dobit od dvijesto pet milijuna kuna a ovdje je...,23.4,748878,True
22637,/home/peterr/macocu/task6_speech/data/00004185.flac.wav,evo kolega aleksić ja se uvijek ispočetka fasciniram da ne kažem šokiram kako se uopće moglo dog...,36.2,1158478,True
20097,/home/peterr/macocu/task6_speech/data/00001554.flac.wav,držao sam da gospodin grbin kao pravnik razlikuje poziciju savjetnika u vladi kako one savjetnik...,20.4,652878,True
21414,/home/peterr/macocu/task6_speech/data/00003330.flac.wav,zato i ne čudi izjava hernadya ja ću je prenijeti u cijelosti zapravo on priznaje kako se loše u...,25.1,803278,True
19997,/home/peterr/macocu/task6_speech/data/00001945.flac.wav,smatram da bi to trebao biti politički konsenzus svih političkih opcija da nam prioritet budu po...,20.2,646478,True


In [2]:

def splitter(file, MIN=5, MAX=10):
    """Find cut points that split a WAV file into segments of bounded length.

    Silences are detected with progressively shorter minimum silence lengths
    (1000, 800, 500, 300, 200 ms; ``silence_thresh=-40``). For each setting
    the midpoints of the detected silences are the candidate cut points.
    Subsets of the candidates are tried in order of increasing size, and the
    first subset for which every resulting segment lasts between ``MIN`` and
    ``MAX`` seconds (inclusive) is used.

    Args:
        file: Path to a WAV file readable by ``AudioSegment.from_wav``.
        MIN: Minimum allowed segment length, in seconds.
        MAX: Maximum allowed segment length, in seconds.

    Returns:
        Cut positions in milliseconds: ``[0, *chosen_midpoints, duration]``.

    Raises:
        Exception: If none of the silence settings yields a valid splitting.
    """
    import math
    from itertools import combinations

    min_ms = MIN * 1000
    max_ms = MAX * 1000

    def is_ok(pauses, duration):
        """True if cutting at `pauses` leaves only segments within [min_ms, max_ms]."""
        edges = [0, *pauses, duration]
        return all(min_ms <= end - start <= max_ms
                   for start, end in zip(edges[:-1], edges[1:]))

    def cut_count_bounds(n_pauses, duration):
        """Range of subset sizes that can possibly satisfy the length limits.

        k cuts give k+1 segments whose lengths sum to `duration`, so a valid
        splitting needs ceil(duration/max_ms)-1 <= k <= floor(duration/min_ms)-1.
        Both bounds are widened by one as a guard against float rounding;
        `is_ok` remains the authoritative check.
        """
        fewest, most = 0, n_pauses
        if max_ms > 0:
            fewest = max(0, math.ceil(duration / max_ms) - 2)
        if min_ms > 0:
            most = min(n_pauses, int(duration // min_ms))
        return fewest, most

    def find_cuts(pauses, duration):
        """Return the first valid subset of `pauses` (fewest cuts first), or None."""
        fewest, most = cut_count_bounds(len(pauses), duration)
        sizes = range(fewest, most + 1)
        total = sum(math.comb(len(pauses), size) for size in sizes)
        logging.debug(f"Testing {total} combinations....")
        # Generate candidates lazily: materialising the whole powerset needs
        # 2**len(pauses) tuples in memory before the first one is even tested.
        for size in sizes:
            for combination in combinations(pauses, size):
                if is_ok(combination, duration):
                    return list(combination)
        logging.debug(f"No solution found so that {MIN=}s <= duration <= {MAX=}s.")
        return None

    audio = AudioSegment.from_wav(file)
    duration = audio.duration_seconds * 1000  # milliseconds
    logging.debug(f"Duration: {duration/1000} s")

    for silence in [1000, 800, 500, 300, 200]:
        logging.info(f"Testing silence {silence}")
        # pydub reports silences as [start, end] pairs in milliseconds.
        detected_silences = detect_silence(audio, min_silence_len=silence, silence_thresh=-40)
        if not detected_silences:
            logging.warning("No silences detected")
            continue
        logging.debug(f"Got {len(detected_silences)} silences.")
        # Cut in the middle of each silence.
        centroids = np.array(detected_silences).mean(axis=1).tolist()

        res = find_cuts(centroids, duration)
        if res is not None:
            logging.debug(f" Success! Found splitting: {res}")
            return [0, *res, duration]

    raise Exception("No splitting was found.")
    # new_filenames = list()
    # for i, (start, end) in enumerate(zip(cuts[0:],cuts[1:])):
    #     new_file = file.replace(".flac.wav", f"_{i:2}.flac.wav")
    #     segment = audio[start:end]
    #     logging.info(f"Exporting segment nr. {i+1} as f{new_file}")
    #     segment.export(new_file, format="wav")
    #     new_filenames.append(new_file)
    # return new_filenames



In [3]:
# Compute cut points for every recording flagged in `to_edit` and save the table.
df["cuts"] = ""  # stays empty for rows that are not split or whose splitting failed
i = 0  # number of recordings split successfully so far
for idx, file in df.loc[df.to_edit, "path"].items():
    try:
        cuts = splitter(file)
    except Exception as exc:
        # splitter raises when it finds no valid splitting. Skip that file but
        # leave a trace in the log; unlike a bare `except`, this also lets
        # KeyboardInterrupt stop a long run.
        logging.warning(f"Skipping {file}: {exc}")
        continue
    # Stored as text so the list survives the CSV round trip in a single cell.
    df.loc[idx, "cuts"] = str(cuts)
    logging.info(f"Finished number {i}.")
    i += 1
df.to_csv("/home/peterr/macocu/task6_speech/8_results.csv")

2021-12-13 14:58:27,104 - INFO - Testing silence 1000
2021-12-13 14:58:27,718 - INFO - Testing silence 800
2021-12-13 14:58:28,325 - INFO - Testing silence 500
2021-12-13 14:58:32,842 - INFO - Testing silence 300
2021-12-13 14:58:34,030 - INFO - Finished number 0.
2021-12-13 14:58:34,031 - INFO - Testing silence 1000
2021-12-13 14:58:34,581 - INFO - Testing silence 800
2021-12-13 14:58:35,091 - INFO - Testing silence 500
2021-12-13 14:58:35,985 - INFO - Finished number 1.
2021-12-13 14:58:35,987 - INFO - Testing silence 1000
2021-12-13 14:58:36,537 - INFO - Testing silence 800
2021-12-13 14:58:37,006 - INFO - Testing silence 500
2021-12-13 14:58:37,352 - INFO - Testing silence 300
2021-12-13 14:58:37,791 - INFO - Finished number 2.
2021-12-13 14:58:37,792 - INFO - Testing silence 1000
2021-12-13 14:58:38,342 - INFO - Testing silence 800
2021-12-13 14:58:38,810 - INFO - Testing silence 500
2021-12-13 14:58:39,195 - INFO - Testing silence 300
2021-12-13 14:58:39,596 - INFO - Finished num