In [1]:
import librosa
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split as tts


In [2]:
# Notebook directory: path to this notebook and to the unpacked `calls` and `CallsDataset` datasets.
dir_path = %pwd
# "false" signals live in {dir_path+"\\calls\\false"}
# "true" signals live in {dir_path+"\\calls\\true"}

```python
# To build the tables, all files from the demo and working directories were
# sorted into two folders: calls\false and calls\true.
def dct():
    """Describe every audio file in the current working directory.

    Each regular file in the current directory is loaded with
    ``librosa.load`` and one table row is produced for it.

    Returns:
        pandas.DataFrame with the columns
        ``id`` (1-based running number), ``name`` (file name) and
        ``duration`` (length in whole seconds, truncated).
    """
    records = {'id': [], 'name': [], 'duration': []}
    # os.listdir() returns entries in arbitrary order, so sort them to make the
    # id assignment reproducible; skip sub-directories, which cannot be loaded.
    files = sorted(entry for entry in os.listdir() if os.path.isfile(entry))
    for count, file in enumerate(files, start=1):
        y, sr = librosa.load(file)
        records['id'].append(count)
        records['name'].append(file)
        # number of samples / sample rate = seconds; int() truncates
        records['duration'].append(int(len(y) / sr))
    return pd.DataFrame(records)

# path = dir_path

#%cd {dir_path+"\\calls\\false"}
# Сохраняем измененную таблицу в новый CSV-файл
#dct().to_csv(path+'false.csv', index=False)

#%cd {dir_path+"\\calls\\true"}
# Сохраняем измененную таблицу в новый CSV-файл
#dct().to_csv(path+'true.csv', index=False)

%cd {dir_path}

print('false.csv', 'duration', np.median(pd.read_csv('false.csv')['duration']))
print('true.csv', 'duration', np.median(pd.read_csv('true.csv')['duration']))
```

```
false.csv duration 4.0
true.csv duration 36.0
```

```python
# Time split: length of one segment in seconds.
sec = 20
segments_sec_label = f'segments_{sec}sec'
print(segments_sec_label)


# Values for the `comment` column: which augmentation to apply to a signal,
# depending on its length.
def aug_gen(text=''):
    """Return 'augs-- gen' + text + one randomly chosen augmentation tag.

    Exactly one entry of the list below is drawn with ``np.random.choice``;
    the empty string means "no extra augmentation".
    """
    augmentation = np.random.choice(['', ' white_noise', ' mx2y',
                                     ' mxy', ' low_hz', ' avg',
                                     ' slow_down', ' echo', ' relu',
                                     ' mn2y', ' mny', ' reverse',
                                     ' stft', ' noised_echo', ' high_hz',
                                     ' pink_noise'])
    return 'augs-- gen' + text + augmentation


def comment(x):
    """Build the comment for a clip of duration `x`.

    Clips shorter than `sec` additionally get the ' repeat' tag.
    Example result: 'augs-- gen repeat white_noise'.
    """
    if x < sec:
        return aug_gen(" repeat")
    return aug_gen()

# Value for the `original_path` column: source directory of a recording.
def original_path(label):
    """Return the source directory for a recording label.

    Only the part of `label` before the first space is used as the lookup key.
    Raises KeyError when that key is not one of the known prefixes.
    """
    # NOTE(review): 'Ложь_тишин' differs in spelling from 'Истина_тишина' —
    # confirm it matches the real directory name on disk.
    directory_by_prefix = {
        'FalseSilent': "CallsDataset\\Ложь_тишин",
        'FalseSounds': "CallsDataset\\Ложь_посторонние_звуки",
        'FalseSpeech': "CallsDataset\\Ложь_разговоры",
        'TrueSilent': "CallsDataset\\Истина_тишина",
        'TrueSounds': "CallsDataset\\Истина_посторонние_звуки",
        'TrueSpeech': "CallsDataset\\Истина_разговоры",
        'false': "calls\\false",
        'true': "calls\\true",
    }
    prefix = label.split(' ')[0]
    return directory_by_prefix[prefix]

# Add the `path` column (destination file of every row) to the table.
def new_file_names(data):
    """Return a copy of `data` with a fresh 0..n-1 index and a `path` column.

    The path has the form ``dataset\\<sample>\\<true|false>\\<label> id-<id>.mp3``,
    where the class folder is 'true' when ``cls == 1`` and 'false' otherwise.

    Args:
        data: DataFrame with the columns 'label', 'id', 'sample' and 'cls'.

    Returns:
        A new DataFrame; the frame passed in is not modified.
    """
    data = data.reset_index(drop=True)
    paths = []
    # Columns are read by name: positional access on a string-labelled Series
    # (row[0], row[1], ...) is deprecated in pandas 2.x and removed in 3.0.
    for label, id_, sample, cls in zip(data['label'], data['id'], data['sample'], data['cls']):
        cls_dir = 'true' if cls == 1 else 'false'
        name = str(label) + ' id-' + str(id_) + '.mp3'
        paths.append(f"dataset\\{sample}\\{cls_dir}\\" + name)
    data['path'] = paths
    return data

# Repeat rows of the table.
def add_rows(data, k=None):
    """Repeat every row of `data` and return the sorted result.

    Args:
        data: DataFrame with the columns 'id', 'label' and 'segment_number'
            (and the `segments_sec_label` column when `k` is None).
        k: how many times each row must occur in the result. When None, the
            count is taken per row from the `segments_sec_label` column.
            A count of 0 or 1 keeps the row exactly once.

    Returns:
        A new DataFrame sorted by ['id', 'label', 'segment_number'] with a
        fresh 0..n-1 index. The frame passed in is not modified.
    """
    if k is None:
        counts = data[segments_sec_label].to_numpy()
    else:
        counts = np.full(len(data), k)
    # A row is never removed: counts below 1 are treated as 1.
    counts = np.maximum(counts, 1).astype(int)
    # One positional take instead of a pd.concat per row (which was quadratic
    # and relied on the index labels being exactly 0..n-1 when k is None).
    # Copies of a row are now adjacent before sorting; this only matters for
    # distinct rows that tie on all three sort keys.
    positions = np.repeat(np.arange(len(data)), counts)
    data = data.iloc[positions]
    data = data.sort_values(['id', 'label', 'segment_number']).reset_index(drop=True)
    return data

# Expand a per-file table into a per-segment table.
# Adds: label, file_type, original_path, cls, sample,
#       original_duration, segments_<sec>sec, segment_type, segment_number,
#       start_sec, stop_sec.
# Repeats rows (one per segment) and updates id and duration.
# The `comment` and `path` columns are added later, in result_data().
def upgrade_data(label):
    """Read one class table from CSV and return it with one row per segment.

    Relies on the notebook-level names `sec`, `segments_sec_label`,
    `original_path` and `add_rows`.

    Args:
        label: name of the CSV file to read. The part before the first '.'
            decides the class: `cls` is 1 when it equals 'true', otherwise 0.
            The file must provide the columns 'id', 'name' and 'duration'.

    Returns:
        DataFrame with the columns listed in `columns_lst` below, sorted by
        id and segment_number, with ids renumbered 0..n-1 (one id per row).
    """
    data = pd.read_csv(label)
    
    # name --> label, file_type (split at the first '.')
    def split_name(data, del_name=False):
        """Split 'name' into 'label' and 'file_type'; optionally drop 'name'."""
        data[["label", "file_type"]] = data["name"].str.split(".", n=1, expand=True)
        if del_name: data.drop("name", axis=1, inplace=True)
        return data    
    data = split_name(data, del_name=False)
    
    # original_path: source directory + label + '.mp3'
    data['original_path'] = data['label'].apply(lambda label: original_path(label)+'\\'+label+'.mp3')
    
    # cls: the same value for every row, derived from the CSV file name
    data = data.assign(cls=lambda x: 1 if label.split('.')[0]=='true' else 0)
    
    # sample
    def sample(data):
        """Assign each row to 'val', 'test' or 'train'.

        Rows are visited in order of decreasing duration and receive the
        labels of `s` cyclically (per ten rows: 2 val, 1 test, 7 train).
        The result is returned sorted by id.
        """
        s = ['val', 'val', 'test', 'train', 'train', 'train', 'train', 'train', 'train', 'train']
        sample = []
        count = 0
        data = data.sort_values('duration', ascending=False)
        for x in data['duration']:
            sample.append(s[count])
            # advance through the pattern and wrap around at its end
            if count < len(s)-1: count += 1
            else: count = 0
        data['sample'] = sample
        return data.sort_values('id', ascending=True)
    data = sample(data)
    
    # segments_<sec>sec, segment_number, start_sec, stop_sec, segment_type, duration, original_duration
    def segment_options(data):
        """Cut every recording into segments and describe each one in its own row."""
        # original_duration: keep the full length; 'duration' is overwritten below
        data['original_duration'] = data['duration']
        # number of segments per recording: whole `sec`-long windows that fit, times `shift`
        shift = 2
        data[segments_sec_label] = data['original_duration'].apply(lambda x: int(x/sec)*shift)
        def segment_time(data):
            """Return (start, finish) second lists for the rows of one recording."""
            # every row of one recording carries the same values, hence .item()
            segments = np.unique(data[segments_sec_label]).item()
            duration = np.unique(data['original_duration']).item()
            # distance between consecutive segment starts
            s = int(sec/shift)
            start = [x*s for x in range(segments)]
            # every segment spans shift*s seconds, except the last one, which runs to the end
            finish = [(x*s)+shift*s for x in range(segments-1)]+[duration]
            return start, finish  
        # segment_type: default for all rows; rows that become segments are overwritten below
        data = data.assign(segment_type=lambda x: 'full_audio')
        # segment_number
        data = data.assign(segment_number=lambda x: 0)
        # start_sec, stop_sec: defaults covering the whole recording
        data = data.assign(start_sec=lambda x: 0).assign(stop_sec=lambda x: data['duration'])    
        # repeat rows: each recording gets as many rows as it has segments
        data = add_rows(data)    
        # data_0_1: recordings with 0 or 1 segments stay as full audio;
        # the remaining rows are filled in per recording and collected in new_data
        data_0_1 = data.query(f'{segments_sec_label} in [0,1]')
        data = data.query(f'{segments_sec_label} not in [0,1]').reset_index(drop=True)
        # empty frame with the same columns, used as the accumulator
        new_data = data[0:0]
        for i in np.unique(data['id']):
            fragment = data.query(f'id == {i}').reset_index(drop=True)
            fragment['segment_number'] = np.arange(len(fragment))
            start, stop = segment_time(fragment)
            fragment['start_sec'] = start
            fragment['stop_sec'] = stop
            fragment['segment_type'] = ['segment']*len(fragment)
            new_data = pd.concat([new_data, fragment])
        data = pd.concat([data_0_1, new_data]).reset_index(drop=True)   
        # update duration: now the length of the row's own segment
        data['duration'] = data['stop_sec']-data['start_sec']
        return  data
    data = segment_options(data)
    # update id: one running number per row instead of one per recording
    data = data.sort_values(['id','segment_number'])
    data['id'] = np.arange(len(data))
    # keep only these columns, in this order ('name', 'file_type' are dropped)
    columns_lst = ['label', 'cls', 'id', 'sample', 'original_duration', segments_sec_label, 
                   'segment_type', 'segment_number', 'duration', 'start_sec', 'stop_sec', 'original_path']
    data = data[columns_lst]
    return data.reset_index(drop=True)

# true, false*k --> all
def result_data(false, true, false_k=4, reversion_shift=True):
    """Combine the two class tables into the final dataset table.

    Relies on the notebook-level helpers `add_rows`, `comment` and
    `new_file_names`.

    Args:
        false: table of the "false" class (presumably the output of
            upgrade_data — it needs the columns used below).
        true: table of the "true" class, same layout.
        false_k: how many times each row of `false` is repeated.
        reversion_shift: when True, only rows whose segment_number is a
            multiple of 4 are kept in the "train" and "val" splits.

    Returns:
        DataFrame with both classes, ids renumbered 0..n-1 and the extra
        columns `comment` and `path`.
    """
    # false --> false * k
    false = add_rows(false, k=false_k)

    # Limit the number of "true" rows per split to (number of "false" rows in that split) * k.
    def balanced_true_class(false, true, k=1):
        def query(sample):
            rows = true.copy()
            rows = rows.query(f'sample == "{sample}"')
            # keep the longest clips when the split has to be truncated
            rows = rows.sort_values('duration', ascending=False)
            rows = rows.reset_index(drop=True)
            limit = len(false.query(f'sample == "{sample}"')) * k
            return rows[:limit]
        return pd.concat([query("train"), query("val"), query("test")]).reset_index(drop=True)
    true = balanced_true_class(false, true, k=1)

    # true + false --> all
    data = pd.concat([true, false], ignore_index=True)
    # all * 2
    data = pd.concat([data] * 2, ignore_index=True)

    # Short segments spoil the statistics of the test split, so they are removed from it.
    # A boolean mask is used instead of drop(index=...): the concatenated frame
    # used to carry duplicated index labels, so dropping by label also removed
    # unrelated rows that happened to share a label.
    short_test = (data['sample'] == 'test') & (data['duration'] < 5)
    data = data.loc[~short_test]

    #######################
    data = data.sort_values(['cls', 'id', 'label', 'segment_number']).reset_index(drop=True)
    smpls = ["train", "val"]

    # Reversion shift: spacing between the starts of the segments kept in `smpls`.
    if reversion_shift:
        # Keep every 4th segment of a signal and drop the rest. The modulo test
        # replaces the former fixed list np.arange(0, 559, 4), which discarded
        # every segment numbered 559 or higher.
        extra = data['sample'].isin(smpls) & (data['segment_number'] % 4 != 0)
        data = data.loc[~extra].copy()

    # comment
    data['comment'] = data['duration'].apply(comment)

    # update id
    data = data.sort_values(['cls', 'id', 'label', 'segment_number']).reset_index(drop=True)
    data['id'] = np.arange(len(data))

    # new_file_name
    data = new_file_names(data)
    return data

%cd {dir_path}
false = upgrade_data('false.csv').query('original_duration >= 15')
true = upgrade_data('true.csv').query('original_duration >= 15')
all_data = result_data(false, true, false_k=45, reversion_shift=True)
all_data.to_csv('revshift_20-2.csv', index=False)

# Show the result.
def items(data):
    """Print a summary of `data` per class (cls 0 / 1) and per split.

    For every class the total row count is printed, followed by one line per
    split ("train", "val", "test") with the number of rows, the summed
    duration and the median duration (printed under the label 'Average').
    Returns None.
    """
    def report(frame, cls, sample):
        durations = frame.query(f'cls == {cls} and sample == "{sample}"')['duration']
        parts = [
            # number of rows in the split
            sample, ': ', str(len(durations)), ' files. ',
            # total duration of the split
            str(sum(durations)), ' sec. ',
            # median duration of the split
            'Average: ', str(np.median(durations)), ' sec.',
        ]
        return print(''.join(parts))

    for cls in [0, 1]:
        head = '\nfalse ' if cls == 0 else '\n\ntrue '
        print(head, len(data.query(f'cls == {cls}')))
        for sample in ["train", "val", "test"]:
            report(data, cls, sample)
            
# Print the per-class / per-split summary of the table built above.
items(all_data)
```


```
segments_20sec
C:\Users\dtata\Downloads\AI Machine Learning\emergency_calls\back_up\dataset_revshift

false  6840
train: 3780 files. 72720 sec. Average: 20.0 sec.
val: 1710 files. 33840 sec. Average: 20.0 sec.
test: 1350 files. 25380 sec. Average: 20.0 sec.


true  1660
train: 918 files. 18178 sec. Average: 20.0 sec.
val: 292 files. 5778 sec. Average: 20.0 sec.
test: 450 files. 8882 sec. Average: 20.0 sec.
```

```python
# Final polishing step: balance the number of augmentations.
# The "false" class has to be thinned out in the train and val splits.
# NOTE(review): this cell reads 'revshift_20-2.csv' and overwrites the same file
# at the end, so it is not idempotent — regenerate the CSV with the previous
# cell before running it again.
all_data = pd.read_csv('revshift_20-2.csv')

# NOTE(review): comments produced by aug_gen() start with 'augs-- gen', so the
# `comment != "augs-- "` condition does not appear to exclude any row — confirm intent.
# Keep every 4th "false" row of train ...
train_data_cls0 = all_data.query('sample == "train" and cls == 0 and comment != "augs-- "').reset_index(drop=True)
train_data_cls0 = train_data_cls0.iloc[::4]
train_data_cls1 = all_data.query('sample == "train" and cls == 1')

# ... and every 6th "false" row of val.
val_data_cls0 = all_data.query('sample == "val" and cls == 0 and comment != "augs-- "').reset_index(drop=True)
val_data_cls0 = val_data_cls0.iloc[::6]
val_data_cls1 = all_data.query('sample == "val" and cls == 1')

test_data = all_data.query('sample == "test"')
all_data = pd.concat([train_data_cls0, train_data_cls1, val_data_cls0, val_data_cls1, test_data])

# New feature `count`: how many times one segment occurs in the table
# (count = original segment + augmented copies of it).
cols = ['label', 'segment_number']
# name='count' fixes the column name explicitly: a bare reset_index() yields a
# column called 0 on pandas < 2.0 and 'count' on pandas >= 2.0.
data_vc = all_data.value_counts(cols).reset_index(name='count')
all_data = pd.merge(all_data, data_vc, on=cols)

# Rebuild the test split from the stored table: one row per label (its first
# segment), marked as not augmented.
test_data = pd.read_csv('revshift_20-2.csv')
test_data = test_data.sort_values(['label', 'segment_number'])
test_data = test_data.query('sample == "test"').drop_duplicates(['label'])
test_data = test_data.assign(comment='augs-- ').assign(count=1)

# Replace the old test rows with the rebuilt ones and keep only clips of at least 15 seconds.
all_data = pd.concat([all_data.query('sample != "test"'),
                      test_data]).query('duration >= 15').reset_index(drop=True)
# Second component of the original path, i.e. the name of the source class directory.
all_data['original_dir_cls'] = all_data['original_path'].apply(lambda x: x.split("\\")[1])
items(all_data)
all_data.to_csv('revshift_20-2.csv', index=False)
```


```
false  1236
train: 945 files. 18184 sec. Average: 20.0 sec.
val: 285 files. 5640 sec. Average: 20.0 sec.
test: 6 files. 118 sec. Average: 20.0 sec.


true  1251
train: 918 files. 18178 sec. Average: 20.0 sec.
val: 292 files. 5778 sec. Average: 20.0 sec.
test: 41 files. 804 sec. Average: 20.0 sec.
```