In [14]:
import os
import shutil
from pydub.utils import mediainfo
import random
import re


In [2]:
def get_sample_rate(src):
    info = mediainfo(src)
    return int(info["sample_rate"])

def get_total_samples(src):
    info = mediainfo(src)
    return int(info["duration_ts"])

def cut(src, dest, start_sample, end_sample):
    # print("cut", int(start_sample*(sample_rate / 22050)), int(start_sample*(sample_rate / 22050)) + (end_sample-start_sample))
    os.system(f'ffmpeg -i "{src}" -af atrim=start_sample={start_sample}:end_sample={end_sample} "{dest}" -y')

In [3]:
singings = 0
for dirname, _, filenames in os.walk("raw_singing"):
    for f in filenames:
        if (".wav" in f):
            singings += get_total_samples(os.path.join(dirname, f)) // 661500 - 2

speechs = 0
for dirname, _, filenames in os.walk("raw_speech"):
    for f in filenames:
        if (".wav" in f):
            speechs += get_total_samples(os.path.join(dirname, f)) // 661500

singings, speechs

(274, 419)

In [4]:
for dirname, _, fns in os.walk("./"):
    print(dirname == "xxm_mixed")

False
False
False
False
False
False


In [10]:
CHUNK_SIZE = 661500


def get_cut_list(src_dir, dst_dir, skip_head_n = 0, skip_tail_n = 0):
    cut_list = []
    
    for i, filename in enumerate(filter(lambda x: ".wav" in x, os.listdir(src_dir))):
        src =  os.path.join(src_dir, filename)
        alias = os.path.join(dst_dir, re.findall("BV[0-9a-zA-Z]*", filename)[0] + f"_{i}")
        total_samples = get_total_samples(src)
        
        for j in range(skip_head_n, total_samples // CHUNK_SIZE - skip_tail_n):
            cut_list.append((src, f"{alias}_{j}.wav", CHUNK_SIZE * j, CHUNK_SIZE * (j+1)))

    return cut_list

speech_cut_list = get_cut_list("raw_speech", "xxm_speech")
singing_cut_list = get_cut_list("raw_singing", "xxm_singing", skip_head_n=1, skip_tail_n=1)

print(len(speech_cut_list), speech_cut_list[8])
print(len(singing_cut_list), singing_cut_list[8])

419 ('raw_speech\\BV17M41197j6_书？满满快快出一本~.wav', 'xxm_speech\\BV17M41197j6_0_8.wav', 5292000, 5953500)
274 ('raw_singing\\BV1xM411R7zn_人间不值得-咻咻满-2023年10月25日-直播歌切.wav', 'xxm_singing\\BV1xM411R7zn_1_4.wav', 2646000, 3307500)


In [11]:
for x in singing_cut_list:
    cut(*x)

In [12]:
for x in random.sample(speech_cut_list, len(singing_cut_list)):
    cut(*x)

In [15]:
# Rename mixed
def rename_audio_copy_label(src_dir):
    for dirname, _, filenames in os.walk(src_dir):
        for f in filenames:
            if ".wav" in f:
                BV = re.findall("BV[0-9a-zA-Z]*", f)[0]
                print(dirname, BV)
                os.rename(os.path.join(dirname, f), os.path.join(dirname, BV+".wav"))
                shutil.copyfile(f"label/{BV}.csv",  os.path.join(dirname, BV+".csv"))


rename_audio_copy_label("xxm_mixed")
rename_audio_copy_label("xxm_mixed_test")

xxm_mixed BV1uk4y1F7nv
xxm_mixed BV1bT411S7ck
xxm_mixed BV1WN41167kE
xxm_mixed BV15z4y1M7ee
xxm_mixed BV1sL411k71J
xxm_mixed BV1rD4y1A7PY
xxm_mixed BV1mu4y1e7oL
xxm_mixed BV1V84y1e7s9
xxm_mixed BV1uN411E7KD
xxm_mixed BV1ST411t77z
xxm_mixed_test BV1JN4y1U7ba


FileNotFoundError: [Errno 2] No such file or directory: 'label/BV1JN4y1U7ba.csv'

In [None]:
# # Resample to 44.1K
# file_list = list(filter(lambda x: ".m4a" in x, os.listdir("../xxm_collection/bili0901")))

# for file in file_list:
#     sample_rate = get_sample_rate("../xxm_collection/bili0901/"+file)
#     print(file, sample_rate)
#     if (sample_rate == 44100):
#         os.system(f'cp "../xxm_collection/bili0901/{file}" raw_singing/')
#     else:
#         os.system(f'ffmpeg -i "../xxm_collection/bili0901/{file}" -ar 44100 raw_singing/{file}')
# # for i, filename in enumerate(file_list):
# #     print(i, filename)
# #     process("../xxm_collection/bili0901/"+filename, "xxm_singing/"+str(i))

In [None]:
# file_list = list(filter(lambda x: ".m4a" in x, os.listdir("raw_singing")))
# file_list = random.choices(file_list, k=137)
# file_list

# for i, filename in  enumerate(file_list):
#     print(i, filename)
#     process("raw_singing/"+filename, "xxm_singing/"+str(i), onlyone=True)