# Dataset split

This file divides our dataset into train, validation and test set. We can't use preprogrammed functions to do that, because we divided each of our files into other smallers (i.e. one sound to six images). Putting images made out of same mp3 file might lead to the *data leakage* and make our results *not trustworthy* and *biased*.

In [19]:
import os

# define number of files for all sets
train = 0.8 # 80% of all sound should be in the train set
val = 0.1 # 10% validation set
test = 0.1 # 10% test set
kfolds = 1
basePath="./data/rawData/" # path with sound files. Can be downloaded with "AM_downloadData"
imPath = "./data/prepared/" # path with images (melspectrograms)
                                # Can be generated with "AM_prepareData" after downloading sound files
destPath = "./data/splited/" # destination path - where the split dataset should be copied
                            # This folder will be used to train CNNs

# first find all of the mp3 files in the directory
birds=[] # list of all bird spiecies (Ember, Phyll...)
singleBirdList=[] # list of files for one bird
allFilesList=[] # list of all files for all birds. A list of singleBirdLists. 
for root, dirs, files in os.walk(basePath):
    if root == basePath:
        birds=dirs
print(birds)

trainSet=[]
testSet=[]
valSet=[]

birdsShort=[] # list of short file names
birdNumber=0
for nr,bird in enumerate(birds):
    for root, dirs, files in os.walk(basePath+bird):
        for file in files:
            if file.endswith(".mp3"):
                singleBirdList.append(os.path.join(root, file))
    if len(singleBirdList) > 50:
        birdsShort.append(str(birdNumber)+bird[:5])
        birdNumber = birdNumber+1;
        print("Found ", len(singleBirdList), ' mp3 files for ', bird)
        trainSet.append(int(train*len(singleBirdList)))
        valSet.append(int(val*len(singleBirdList)))
        roundDiff = len(singleBirdList)-(int(train*len(singleBirdList))+int(test*len(singleBirdList))+int(val*len(singleBirdList))) 
        testSet.append(int(test*len(singleBirdList)) + roundDiff)
        print("Size of train: ", int(train*len(singleBirdList)),", val: ",int(val*len(singleBirdList)) ,", test: ",int(test*len(singleBirdList)))
        allFilesList.append(singleBirdList)
    singleBirdList=[]


print(trainSet)
print(valSet)
print(testSet)


['Alaudaarvensis', 'Erithacusrubecula', 'Fringillacoelebs', 'Parusmajor', 'Phoenicurusochruros', 'Phoenicurusphoenicurus', 'Phylloscopuscollybita', 'Troglodytestroglodytes', 'Turdusmerula', 'Turdusphilomelos']
Found  195  mp3 files for  Alaudaarvensis
Size of train:  156 , val:  19 , test:  19
Found  547  mp3 files for  Erithacusrubecula
Size of train:  437 , val:  54 , test:  54
Found  462  mp3 files for  Fringillacoelebs
Size of train:  369 , val:  46 , test:  46
Found  424  mp3 files for  Parusmajor
Size of train:  339 , val:  42 , test:  42
Found  268  mp3 files for  Phoenicurusochruros
Size of train:  214 , val:  26 , test:  26
Found  162  mp3 files for  Phoenicurusphoenicurus
Size of train:  129 , val:  16 , test:  16
Found  288  mp3 files for  Phylloscopuscollybita
Size of train:  230 , val:  28 , test:  28
Found  331  mp3 files for  Troglodytestroglodytes
Size of train:  264 , val:  33 , test:  33
Found  353  mp3 files for  Turdusmerula
Size of train:  282 , val:  35 , test:  3

In [20]:
# randomly choose mp3 files for each set  

from random import sample

trainFiles=[]
valFiles=[]
testFiles=[]

for index, singleBirdList in enumerate(allFilesList):
    randFiles = sample(range(len(singleBirdList)), len(singleBirdList))
    start = 0
    end = trainSet[index]
    trainFiles.append(randFiles[start:end])
    start = end 
    end = start + valSet[index]
    valFiles.append(randFiles[start:end])
    start = end 
    end = start + testSet[index] 
    testFiles.append(randFiles[start:end])
    print("Selected random files number:\n train: ", len(trainFiles[index]),"/",trainSet[index],
          ", val: ",len(valFiles[index]),"/",valSet[index],
          ", test: ",len(testFiles[index]),"/",testSet[index])



Selected random files number:
 train:  156 / 156 , val:  19 / 19 , test:  20 / 20
Selected random files number:
 train:  437 / 437 , val:  54 / 54 , test:  56 / 56
Selected random files number:
 train:  369 / 369 , val:  46 / 46 , test:  47 / 47
Selected random files number:
 train:  339 / 339 , val:  42 / 42 , test:  43 / 43
Selected random files number:
 train:  214 / 214 , val:  26 / 26 , test:  28 / 28
Selected random files number:
 train:  129 / 129 , val:  16 / 16 , test:  17 / 17
Selected random files number:
 train:  230 / 230 , val:  28 / 28 , test:  30 / 30
Selected random files number:
 train:  264 / 264 , val:  33 / 33 , test:  34 / 34
Selected random files number:
 train:  282 / 282 , val:  35 / 35 , test:  36 / 36
Selected random files number:
 train:  280 / 280 , val:  35 / 35 , test:  36 / 36


# Copying files to a new directory
In previous blocks we've randomly selected which files will belong to which sets. Now we have to find corresponding images to those files and copy them to new directory.

* new directory - i.e. ..data/mels/final
    - train 
        - one folder for each class
        - Ember
        - Phyll
        - ...
    - val
        - folders for each class
    - test
        - folders for each class

In [44]:
def extractName(string):
    return string.split("\\")[-1].replace(' ', '')[:-4]

In [22]:
# sort all the lists to make copying files easier
sets=[trainFiles,valFiles,testFiles]
for fileSet in sets:
    for index,files in enumerate(fileSet):
        fileSet[index].sort()

In [24]:
# change full names to short

for root, dirs, files in os.walk(basePath):
    if root == basePath:
        birds=dirs
# birdsShort=[]
# for bird in birds:
#    birdsShort.append(bird[:5])

setNames = ["train/","val/","test/"]

print("Long: ", birds,"\nShort: ",birdsShort)


Long:  ['Alaudaarvensis', 'Erithacusrubecula', 'Fringillacoelebs', 'Parusmajor', 'Phoenicurusochruros', 'Phoenicurusphoenicurus', 'Phylloscopuscollybita', 'Troglodytestroglodytes', 'Turdusmerula', 'Turdusphilomelos'] 
Short:  ['0Alaud', '1Erith', '2Fring', '3Parus', '4Phoen', '5Phoen', '6Phyll', '7Trogl', '8Turdu', '9Turdu']


In [47]:

import shutil

counter=0
for birdNumber, bird in enumerate(birdsShort): # for each class (bird) check where the file should be copied
    print(counter)
    counter=0
    for setName, fileSet in zip(setNames, sets): # check for all datasets: train, val and test sests 
        for setNumber in fileSet[birdNumber]:
            for fileNumber, file in enumerate(allFilesList[birdNumber]):
                if setNumber == fileNumber: # if file number to copy is same as number of file, then copy it
                    
                    for root, dirs, files in os.walk(imPath):
                        for file2 in files:
                            extracted_name = extractName(file)
                            # print()
                            # print("Extracted name:", extracted_name)
                            # print("Current file:", file2)
                            # print("Is in", extracted_name in file2)

                            if extracted_name in file2:
                                counter=counter+1
                                source=root+"/"+file2
                                
                                destination = destPath+setName+bird+"/"
                                if not os.path.exists(destination):
                                    os.makedirs(destination)
                                shutil.copy2(source, destination)
                                #print(source, "   ->   ", destination)

            

0
2662
11886
7913
5448
4465
3037
3515
7422
15305
