In [1]:
import os
from sklearn.model_selection import train_test_split
import shutil
import pandas as pd

In [2]:
base_dir = '/data/private/SU/bbchip13/chainsaw_classification/'
chainsaw_dir = base_dir+'wav_chainsaw_5sec/'
no_chainsaw_dir = base_dir+'/ESC-50-master/split_wav/'

result_base_dir = base_dir+'data/'

In [3]:
merge_list = []

chainsaw_filenames = [chainsaw_dir+filename for filename in os.listdir(chainsaw_dir)
                         if filename.endswith('.wav')]
chainsaw_filenames = [filename for filename in chainsaw_filenames
                         if os.path.getsize(filename) == 441078]

x_train_chainsaw, x_test_chainsaw \
    = train_test_split(chainsaw_filenames, test_size = 0.25, random_state = 7)
x_train_chainsaw, x_val_chainsaw \
    = train_test_split(x_train_chainsaw, test_size = 0.33, random_state = 7)
    
merge_list += [(os.path.basename(filename), 1, 'train') for filename in x_train_chainsaw]
merge_list += [(os.path.basename(filename), 1, 'val') for filename in x_val_chainsaw]
merge_list += [(os.path.basename(filename), 1, 'test') for filename in x_test_chainsaw]

os.makedirs(result_base_dir, exist_ok=True)
for filename in chainsaw_filenames:
    shutil.copy(filename, result_base_dir)

no_chainsaw_dirs = [no_chainsaw_dir+dirname for dirname in os.listdir(no_chainsaw_dir)
                       if os.path.isdir(no_chainsaw_dir+dirname)]
for dirname in no_chainsaw_dirs:
    no_chainsaw_filenames = [dirname+'/'+filename for filename in os.listdir(dirname)
                             if filename.endswith('.wav')]
    
    x_train_no_chainsaw, x_test_no_chainsaw \
        = train_test_split(no_chainsaw_filenames, test_size = 0.33, random_state = 7)
    x_train_no_chainsaw, x_val_no_chainsaw \
        = train_test_split(x_train_no_chainsaw, test_size = 0.33, random_state = 7)
        
    merge_list += [(os.path.basename(filename), 0, 'train') for filename in x_train_no_chainsaw]
    merge_list += [(os.path.basename(filename), 0, 'val') for filename in x_val_no_chainsaw]
    merge_list += [(os.path.basename(filename), 0, 'test') for filename in x_test_no_chainsaw]
    
    for filename in no_chainsaw_filenames:
        shutil.copy(filename, result_base_dir)

In [4]:
merge_list[:10]

[('3-144827-A-11.wav', 0, 'train'),
 ('2-125966-A-11.wav', 0, 'train'),
 ('4-195497-B-11.wav', 0, 'train'),
 ('2-137162-A-11.wav', 0, 'train'),
 ('4-167063-A-11.wav', 0, 'train'),
 ('3-155642-B-11.wav', 0, 'train'),
 ('2-132157-A-11.wav', 0, 'train'),
 ('5-200461-B-11.wav', 0, 'train'),
 ('4-204618-A-11.wav', 0, 'val'),
 ('1-28135-B-11.wav', 0, 'val'),
 ('2-155801-A-11.wav', 0, 'val'),
 ('2-102852-A-11.wav', 0, 'val'),
 ('1-91359-B-11.wav', 0, 'val'),
 ('5-219379-C-11.wav', 0, 'val'),
 ('3-164120-A-11.wav', 0, 'val'),
 ('1-39901-B-11.wav', 0, 'val'),
 ('1-43760-A-11.wav', 0, 'val'),
 ('3-144827-B-11.wav', 0, 'test'),
 ('5-219379-A-11.wav', 0, 'test'),
 ('5-208810-A-11.wav', 0, 'test'),
 ('3-155642-A-11.wav', 0, 'test'),
 ('5-200461-A-11.wav', 0, 'test'),
 ('1-28135-A-11.wav', 0, 'test'),
 ('3-166422-A-11.wav', 0, 'test'),
 ('1-39901-A-11.wav', 0, 'test'),
 ('2-124662-A-11.wav', 0, 'test'),
 ('5-213077-A-11.wav', 0, 'test'),
 ('4-195497-A-11.wav', 0, 'test'),
 ('3-164630-A-11.wav', 0, '

In [5]:
df = pd.DataFrame({
                    'filename':[filename for filename, _, _ in merge_list],
                    'label':[label for _, label, _ in merge_list],
                    'type':[data_type for _, _, data_type in merge_list]
                  })
df = df.sample(frac=1).reset_index(drop=True)
df.head

<bound method NDFrame.head of                filename  label   type
0     3-112557-A-23.wav      0   test
1      1-17150-A-12.wav      0   test
2        1455_64.07.wav      1   test
3      4-132383-A-2.wav      0  train
4      4-159426-A-9.wav      0  train
5        1070_55.21.wav      1   test
6        1527_49.08.wav      1  train
7     2-135649-B-45.wav      0   test
8         847_60.15.wav      1  train
9      4-130584-A-4.wav      0  train
10     1-24524-B-19.wav      0  train
11     4-161303-B-5.wav      0    val
12       820_425.46.wav      1  train
13    5-261464-A-23.wav      0    val
14        523_47.35.wav      1   test
15       1039_81.95.wav      1  train
16    5-240671-A-44.wav      0   test
17     1-38559-A-14.wav      0    val
18      1152_119.31.wav      1  train
19     2-98392-A-23.wav      0    val
20        345_18.41.wav      1  train
21        339_65.26.wav      1   test
22     1-80840-A-13.wav      0  train
23      1243_320.39.wav      1   test
24    5-254832-B-15.

In [6]:
df.to_csv('data_annotations.csv', index=False)

In [None]:
df = pd.read_csv('data_annotations.csv')
for idx, filename, label, data_type in df.itertuples():
    result_dir = result_base_dir+data_type+'/'
    os.makedirs(result_dir, exist_ok=True)
    shutil.move(result_base_dir+filename, result_dir)