In [None]:
from random         import randint
from math           import floor
from matplotlib     import pyplot
from numpy          import mean, cov, std
from scipy.stats    import pearsonr
from os.path import exists
from os import makedirs

import csv

Dataset settings

In [None]:
# singer (vocal database) pitch range setup
# Keys are singer ids as strings (assign() looks them up with str(sample.singer)).
# Values are (lowest, highest) pitch values; both ends are passed to randint(),
# so the range is inclusive.
PITCH_RANGE = {
    "1": (57,76), # V01:  A2(57)-E4(76)
    "2": (54,77), # V02:  F#2(54)-F4(77)
    "3": (53,69), # V03:  F2(53)-A3(69)
    "4": (45,72), # V04:  A1(45)-C4(72)
    "5": (52,79), # V05:  E2(52)-G4(79)
    "6": (52,78)  # V06:  E2(52)-F#4(78)  (78 is two octaves above F#2(54))
}

# parameter value range setup
# Every range is a (min, max) pair; assignParam() passes both ends to randint(),
# so the bounds are inclusive.
_FULL_SCALE     = (0, 127)
_VIBRATE_RANGE  = (1, 5)


def _param_ranges(bre=_FULL_SCALE, bri=_FULL_SCALE, cle=_FULL_SCALE,
                  gen=_FULL_SCALE, gwl=_FULL_SCALE, ope=_FULL_SCALE):
    """Return one parameter-range table; any parameter not given spans the full scale."""
    return {
        "BRE": bre,
        "BRI": bri,
        "CLE": cle,
        "GEN": gen,
        "GWL": gwl,
        "OPE": ope,
        "Vibrate": _VIBRATE_RANGE
    }


PARAM_RANGE = {
    # full range: one table shared by every singer
    "a": {
        "global": _param_ranges()
    },
    # limited range: "global" is None, so the table is looked up per singer id
    "b": {
        "global": None,
        "1": _param_ranges(bre=(0,63), bri=(15,127), cle=(0,31), gen=(44,106)),
        "2": _param_ranges(bre=(0,95), bri=(32,127), cle=(0,42), gen=(31,111)),
        # NOTE(review): singer 3's BRE upper bound (5) is far below every other
        # singer's -- confirm it is intentional.
        "3": _param_ranges(bre=(0,5),  bri=(15,127), cle=(0,31), gen=(38,104)),
        "4": _param_ranges(bre=(0,63), bri=(15,127), cle=(0,31), gen=(36,100), gwl=(0,112)),
        "5": _param_ranges(bre=(0,78), bri=(20,127), cle=(0,31), gen=(32,127)),
        "6": _param_ranges(bre=(0,64), bri=(20,127), cle=(0,64), gen=(32,112))
    }
}

# setups of dataset generation
def _even_setup(singer_ls, per_singer, param_range):
    """Build one dataset setup that gives every listed singer `per_singer` samples."""
    singer_ls = list(singer_ls)
    return {
        "sample_size"   : per_singer * len(singer_ls),
        "singer_ls"     : singer_ls,
        "singer_distri" : [per_singer] * len(singer_ls),   # sample distribution of singer
        "param_range"   : param_range
    }


_SINGERS_1_TO_5 = range(1, 6)
_SINGERS_1_TO_6 = range(1, 7)

# NOTE(review): the set labels below are carried over unchanged. "1" and "4" are
# the largest sets and leave out singer 6 -- confirm they really are the testing
# (not training) sets.
GEN_SETUP = {
    # testing set of full parameter range
    "1": _even_setup(_SINGERS_1_TO_5, 4270, PARAM_RANGE["a"]),
    # validation set of full parameter range
    "2": _even_setup(_SINGERS_1_TO_6, 1000, PARAM_RANGE["a"]),
    # training set of full parameter range
    "3": _even_setup(_SINGERS_1_TO_6, 1000, PARAM_RANGE["a"]),
    # testing set of limited parameter range
    "4": _even_setup(_SINGERS_1_TO_5, 4270, PARAM_RANGE["b"]),
    # validation set of limited parameter range
    "5": _even_setup(_SINGERS_1_TO_6, 1000, PARAM_RANGE["b"]),
    # training set of limited parameter range
    "6": _even_setup(_SINGERS_1_TO_6, 1000, PARAM_RANGE["b"]),
    # NOTE(review): unlabeled in the original; same configuration as "5" and "6"
    "7": _even_setup(_SINGERS_1_TO_6, 1000, PARAM_RANGE["b"])
}

In [None]:
# current generate dataset
# Key into GEN_SETUP that selects which dataset this run generates; the same
# value is used as the dataset's folder name under DATA_FOLDER_PATH.
DATASET = "7"

Constants

In [None]:
# constant
import os
import sys

# Make the sibling "00_Constant" folder importable. The path is built from
# separate components so it resolves on any OS, not only where "\\" is the
# path separator.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), "..", "00_Constant"))
if parent_dir not in sys.path:
    sys.path.append(parent_dir)
# must come after the sys.path update above
import fyp_constants as constants

# path
# Joined with os.path.join instead of a literal "\\" so the same notebook runs
# on any OS. The folder-name constants are expected to be relative names
# (an absolute component would make os.path.join drop everything before it).
DATA_FOLDER_PATH        = constants.DATA_FOLDER_PATH
DATASET_FOLDER_PATH     = os.path.join(DATA_FOLDER_PATH, DATASET)
CSV_FOLDER_PATH         = os.path.join(DATASET_FOLDER_PATH, constants.DATA_CSV_FOLDER_NM)
SAMPLECSV_FOLDER_PATH   = os.path.join(DATASET_FOLDER_PATH, constants.DATA_SAMPLE_FOLDER_NM)
RAW_FOLDER_PATH         = os.path.join(DATASET_FOLDER_PATH, constants.DATA_RAW_FOLDER_NM)
SAMPLEWAV_FOLDER_PATH   = os.path.join(DATASET_FOLDER_PATH, constants.DATA_SAMPLEWAV_FOLDER_NM)
PROCESSED_FOLDER_PATH   = os.path.join(DATASET_FOLDER_PATH, constants.DATA_PROCESSED_FOLDER_NM)
ALLSAMPLE_CSVFILE_PATH  = os.path.join(CSV_FOLDER_PATH, constants.ALLSAMPLE_CSVFILE_NM)

NOTESEQ_CSVFILE_PATH    = constants.NOTESEQ_CSVFILE_PATH
LYRIC_CSVFILE_PATH      = constants.LYRIC_CSVFILE_PATH

# sample setting
SAMPLE_LEN              = constants.SAMPLE_LEN
NOTELEN_LS              = constants.NOTELEN_LS
NOTELEN_UNIT            = constants.NOTELEN_UNIT
NOTESCALE               = constants.NOTESCALE
PARAMETERTYPE_LS        = constants.PARAMETERTYPE_LS

# note sequence
# Each CSV row is one note sequence: a list of note lengths parsed as floats.
# The file is opened with newline="" as the csv module recommends, and blank
# rows are skipped so they cannot turn into zero-note sequences.
with open(NOTESEQ_CSVFILE_PATH, "r", newline="") as seqFile:
    noteSeq_ls = [
        [float(note_len) for note_len in line]
        for line in csv.reader(seqFile, delimiter=",")
        if line
    ]
NOTESEQ_LS      = noteSeq_ls
NOTESEQ_COUNT   = len(NOTESEQ_LS)

#lyric
# Only the first row of the lyric CSV is used; each cell is one lyric.
with open(LYRIC_CSVFILE_PATH, "r", newline="") as lyric_csv:
    csv_reader = csv.reader(lyric_csv, delimiter=",")
    # default of None: an empty file would otherwise raise a bare StopIteration
    LYRIC_LS = next(csv_reader, None)
if not LYRIC_LS:
    # fail here with a clear message instead of later inside randint(0, -1)
    raise ValueError(f"no lyrics found in the first row of {LYRIC_CSVFILE_PATH}")
LYRIC_COUNT = len(LYRIC_LS)

In [None]:
# check setup
# number of note sequences, then number of lyrics, one per line
print(NOTESEQ_COUNT, LYRIC_COUNT, sep="\n")

# which dataset is being generated and where it will be written
for label, value in (("Dataset:", DATASET), ("Setting folder:", DATASET_FOLDER_PATH)):
    print(label, value)

Sample & Note classes

In [None]:
class Note:
    """A single note: its length, its numeric pitch value and its lyric."""

    def __init__(self, length, pitch_v, lyric):
        self.length, self.pitch_v, self.lyric = length, pitch_v, lyric

    def getPitch(self):
        """Return the pitch as text: the NOTESCALE entry followed by an octave number.

        The pitch value is split by the size of NOTESCALE; the remainder picks the
        note name and the quotient, minus 2, is the octave number.
        """
        octave, degree = divmod(self.pitch_v, len(NOTESCALE))
        return NOTESCALE[degree] + str(octave - 2)

    def getLenValue(self):
        """Return the note length multiplied by NOTELEN_UNIT, rounded down to an int."""
        # NOTE(review): floor() truncates, so a floating-point product that lands
        # just below a whole number loses one unit -- confirm this is acceptable.
        scaled_length = self.length * NOTELEN_UNIT
        return floor(scaled_length)


class Sample:
    """One generated sample: a singer id, a sequence of notes and parameter values."""

    def __init__(self):
        # singer id
        self.singer:int             = None
        self.note_seq:list[Note]    = []
        self.parameter:dict         = {}

    def set_singer(self, s:int):
        """Record the singer id of this sample."""
        self.singer = s

    def add_note(self, note:Note):
        """Append a note to the end of the note sequence."""
        self.note_seq.append(note)

    def set_parameter(self, type, value):
        """Store `value` under the parameter name `type`."""
        self.parameter[type] = value

    # convert this sample to a string for a CSV record
    def toCsvStr(self):
        """Return this sample as one comma-separated record (no trailing newline).

        Fields, in order: singer, note count, pitch values, pitch names, lyrics,
        note length values, note lengths (each suffixed with "s"), then the
        BRE, BRI, CLE, GEN, GWL, OPE and Vibrate parameter values. Within a
        per-note field the entries are separated by single spaces.
        """
        notes = self.note_seq
        fields = [
            self.singer,
            len(notes),
            " ".join(str(note.pitch_v) for note in notes),
            " ".join(note.getPitch() for note in notes),
            " ".join(note.lyric for note in notes),
            " ".join(str(note.getLenValue()) for note in notes),
            " ".join(str(note.length) + "s" for note in notes),
        ]
        for name in ("BRE", "BRI", "CLE", "GEN", "GWL", "OPE", "Vibrate"):
            fields.append(self.parameter[name])
        return ",".join(str(field) for field in fields)

Generate samples

In [None]:
# assign setting to a list of samples
def assign(sample_ls : list[Sample]):
    """Give every sample a singer, a parameter setting and a random note sequence.

    Singers are distributed first; then, per sample, the parameters are drawn,
    one note-length sequence is picked at random from NOTESEQ_LS, and each
    length in it becomes a Note with a random pitch (within the singer's
    PITCH_RANGE) and a random lyric from LYRIC_LS. Samples are modified in place.
    """
    # assign singer (vocal database)
    distributeSinger(sample_ls)

    for sample in sample_ls:
        # parameters are drawn before the notes
        assignParam(sample)

        # pick one note-length sequence at random
        chosen_seq = NOTESEQ_LS[randint(0, NOTESEQ_COUNT - 1)]

        for note_len in chosen_seq:
            # pitch: random value inside the singer's range
            bounds = PITCH_RANGE[str(sample.singer)]
            pitch_v = randint(bounds[0], bounds[1])

            # lyric: random entry of the lyric list
            lyric = LYRIC_LS[randint(0, LYRIC_COUNT - 1)]

            sample.add_note(Note(length=note_len, pitch_v=pitch_v, lyric=lyric))
            

# distribute singer (vocal database) for a list of sample
def distributeSinger(sample_ls:list[Sample]):
    """Assign singer ids to the samples, in order, following the dataset setup.

    The first singer_distri[0] samples get singer_ls[0], the next
    singer_distri[1] samples get singer_ls[1], and so on, using the
    GEN_SETUP entry selected by DATASET. Samples are modified in place.

    Raises:
        ValueError: if singer_ls and singer_distri differ in length, if an
            amount is negative, or if the amounts do not add up to
            len(sample_ls). Nothing is modified in that case.
    """
    setup       = GEN_SETUP[DATASET]
    singer_ls   = setup["singer_ls"]
    distri      = setup["singer_distri"]

    # Validate before touching any sample: a mismatch used to surface either as
    # an IndexError half-way through or as samples silently left without singer.
    if len(singer_ls) != len(distri):
        raise ValueError(
            f"dataset {DATASET}: {len(singer_ls)} singers but {len(distri)} distribution entries"
        )
    if any(amt < 0 for amt in distri):
        raise ValueError(f"dataset {DATASET}: singer_distri contains a negative amount")
    if sum(distri) != len(sample_ls):
        raise ValueError(
            f"dataset {DATASET}: singer_distri sums to {sum(distri)} but {len(sample_ls)} samples were given"
        )

    remaining = iter(sample_ls)
    for singer_id, amt in zip(singer_ls, distri):
        # assign the next (amt) samples to this singer
        for _ in range(amt):
            next(remaining).set_singer(singer_id)
    return


# generate attribute parameter
def assignParam(sample:Sample):
    """Draw a random value for every parameter type and store it on `sample`.

    The ranges come from the "param_range" of the current dataset setup: its
    "global" table when one is set, otherwise the table keyed by the sample's
    singer id. Both ends of a range are passed to randint().
    """
    param_setup = GEN_SETUP[DATASET]["param_range"]

    shared_range = param_setup["global"]
    if shared_range is None:
        # no shared table: use the one of this sample's singer
        param_range = param_setup[str(sample.singer)]
    else:
        param_range = shared_range

    for param_type in PARAMETERTYPE_LS:
        bounds = param_range[param_type]
        sample.set_parameter(param_type, randint(bounds[0], bounds[1]))
        
        
# check that the parameters are diverse enough
def checkParam(sample_ls:list[Sample]):
    """Print summary statistics of the parameter values of `sample_ls`.

    Printed, in order: the number of samples, the mean of every parameter type,
    and the Pearson correlation of every pair of parameter types.
    """
    values_by_type = {
        param_type: [sample.parameter[param_type] for sample in sample_ls]
        for param_type in PARAMETERTYPE_LS
    }

    print("Count:")
    print(f"{len(sample_ls)}\n")

    print("Mean:")
    for param_type, values in values_by_type.items():
        print(f"{param_type} {mean(values)}", end="; ")
    print("\n")

    print("Correlation:")
    # every unordered pair once: each type against the types listed after it
    for idxA, typeA in enumerate(PARAMETERTYPE_LS):
        for typeB in PARAMETERTYPE_LS[idxA + 1:]:
            corr, _ = pearsonr(values_by_type[typeA], values_by_type[typeB])
            print(f"{typeA}-{typeB}: {corr}")

In [None]:
# Main

# create the necessary folders
def createFolder(path):
    """Create the directory `path`, including missing parents.

    Does nothing when the directory already exists. Raises FileExistsError if
    `path` exists but is not a directory.
    """
    # exist_ok=True removes the gap between checking exists() and creating
    makedirs(path, exist_ok=True)
# parents are listed before their sub-folders
for folder_path in (
    DATA_FOLDER_PATH,
    DATASET_FOLDER_PATH,
    CSV_FOLDER_PATH,
    SAMPLECSV_FOLDER_PATH,
    RAW_FOLDER_PATH,
    SAMPLEWAV_FOLDER_PATH,
    PROCESSED_FOLDER_PATH,
):
    createFolder(folder_path)


# init samples: one empty Sample per record of the selected dataset
sample_count = GEN_SETUP[DATASET]["sample_size"]
sample_ls = [Sample() for _ in range(sample_count)]

# assign setting to the samples
assign(sample_ls)
# summarise the sample setting
checkParam(sample_ls)


# write to csv, 2 types of csv file will be created
# 1) allSample.csv: a csv file that store the data of all the samples
# 2) [sample idx].csv: every sample will have its own csv file, for sample generating
with open(ALLSAMPLE_CSVFILE_PATH, "w") as allSample_csvFile:
    allSample_csvFile.write("#,singer,note count,pitch value seq,pitch seq,lyric,note length value seq,note length seq,BRE,BRI,CLE,GEN,GWL,OPE,Vibrate\n")

    # samples are numbered from 1; sample_cnt stays 0 when there are no samples
    sample_cnt = 0
    for sample_cnt, sample in enumerate(sample_ls, start=1):
        sampleStr = f"{sample_cnt},{sample.toCsvStr()}"

        # append the record to the allSample.csv
        allSample_csvFile.write(f"{sampleStr}\n")

        # create a type 2 csv file for this sample (same record, no trailing newline);
        # os.path.join keeps the separator consistent with the folder path
        sample_file_path = os.path.join(SAMPLECSV_FOLDER_PATH, f"{sample_cnt}.csv")
        with open(sample_file_path, "w") as sample_file:
            sample_file.write(sampleStr)