# GENRE TAGGING IDMT-SMT DATASET
The following notebook shows the process our data collector used to parse through
the IDMT-SMT audio signal dataset and how the simple genre classification system 
was created for our metadata file.
## IMPORT
In the .py files we import the class below, but in this notebook we only need to ensure that the class cell below has been run before running main.

In [None]:
import xml.sax
import glob 
import os
import pandas as pd
import xml.etree.ElementTree as ET

## FILE HANDLER
### Class for parsing XML Files
This class allows us to extract the data we need from the XML files provided by the IDMT-SMT audio signal dataset, and to easily create our own metadata CSV file for manipulating and processing audio files, which helps us process our own audio files as well as train our machine learning model. This class searches through each individual file in the Lists directory of the downloaded dataset and creates a dictionary of key-value pairs; each value is a list that contains all the data collected from the XML for that key. This data can then be sent back to our main file to be processed into a CSV file using pandas DataFrame functions.

In [None]:
class FileHandler(xml.sax.ContentHandler):
    """SAX content handler that collects selected element values into lists.

    For every element whose name is a key of ``_TAG_TO_FIELD`` the element's
    text is appended to the matching list in ``self.dict``. All other elements
    (and any text outside a tracked element) are ignored.

    Text is buffered between ``startElement`` and ``endElement`` because a SAX
    parser is allowed to deliver one element's text through several
    ``characters`` calls; the value is only committed once the element closes.
    An element without any text therefore contributes an empty string instead
    of repeating the value left over from the previous record.
    """

    # XML element name -> name of the instance attribute / ``self.dict`` column.
    _TAG_TO_FIELD = {
        "fileID": "fileID",
        "instrumentsetting": "instrumentSetting",
        "playstyle": "playStyle",
        "midinr": "midi",
        "string": "string",
        "fret": "fret",
        "fxgroup": "fxGroup",
        "fxtype": "fxType",
        "fxsetting": "fxSetting",
        "filenr": "fileTag",
    }

    def __init__(self):
        super().__init__()
        # Name of the element most recently opened and not yet closed ("" if none).
        self.currentData = ""
        # Most recently committed value of each tracked element.
        self.fileID = ""
        self.instrumentSetting = ""
        self.playStyle = ""
        self.midi = ""
        self.string = ""
        self.fret = ""
        self.fxGroup = ""
        self.fxType = ""
        self.fxSetting = ""
        self.fileTag = ""
        # Column name -> list of collected values; key order is the column order
        # a consumer sees when building a table from this dict.
        self.dict = {"fileID": [], "instrumentSetting": [], "playStyle": [],
                     "midi": [], "string": [], "fret": [], "fxGroup": [],
                     "fxType": [], "fxSetting": [], "fileTag": []}
        # Text chunks received so far for the element named by currentData.
        self._chunks = []

    def startElement(self, tag, attributes):
        """Remember the element that was just opened and start a fresh text buffer."""
        self.currentData = tag
        self._chunks = []

    def endElement(self, tag):
        """Commit the buffered text if the element being closed is a tracked one.

        ``self.currentData`` (not ``tag``) selects the column: it is cleared after
        every close, so only an element closed right after its own start tag,
        with no child element in between, is recorded.
        """
        field = self._TAG_TO_FIELD.get(self.currentData)
        if field is not None:
            text = "".join(self._chunks)
            setattr(self, field, text)
            self.dict[field].append(text)
        self.currentData = ""
        self._chunks = []

    def characters(self, tag):
        """Buffer a chunk of character data.

        ``tag`` is the text chunk handed over by the parser (the parameter name
        is kept for compatibility). Chunks are only kept while a tracked
        element is open.
        """
        if self.currentData in self._TAG_TO_FIELD:
            self._chunks.append(tag)

## FILE PARSER FUNCTION
### initializes our handler class

In [None]:
def file_parse_xml(path):
    """Run a SAX parse of ``path`` with a fresh FileHandler.

    Returns the handler's ``dict`` attribute, i.e. the mapping of column
    names to the lists of values collected while parsing.
    """
    collector = FileHandler()
    reader = xml.sax.make_parser()
    reader.setContentHandler(collector)
    reader.parse(path)
    return collector.dict

## CREATE CSV FILE


In [None]:
def create_csv(data, csvName):
    """Write ``data`` to the CSV file ``csvName``, appending if it already exists.

    ``data`` is passed to ``pd.DataFrame``. When the file does not exist yet it
    is created with a header row; otherwise the rows are appended without
    repeating the header. The DataFrame index is never written.
    """
    already_there = os.path.exists(csvName)
    frame = pd.DataFrame(data)
    frame.to_csv(
        csvName,
        mode='a' if already_there else 'w',
        index=False,
        header=not already_there,
    )

## EFFECT ID TABLE CREATION 

In [None]:
def effect_table_creation(path, csvName="effectData.csv"):
    """Extract the effect description(s) from one XML file and add them to a CSV.

    Every ``fxinformation`` element found in the document becomes one row with
    the group/type/setting names and IDs, a genre label derived from the
    setting ID, and two list-valued cells holding the parameter names and
    values found under ``paraminformation``.

    Args:
        path: XML file to read (anything ``ET.parse`` accepts).
        csvName: CSV file handed to ``create_csv``; defaults to the file name
            this function has always written to.

    Raises:
        AttributeError: if an ``fxinformation`` lacks one of the
            ``fxgroup``/``fxtype``/``fxsetting`` ``ID`` or ``name`` children,
            or a ``parameter`` lacks ``name``/``value``.
    """
    # fxsetting ID -> genre label; every other ID is labelled 'other'.
    genreBySetting = {'1': 'indie', '2': 'rock', '3': 'metal'}

    root = ET.parse(path).getroot()
    effectDict = {'fxName':[], 'fxNameID':[], 'fxType':[], 'fxTypeID':[], 'fxSetting':[],
                  'fxSettingID':[], 'genre':[], 'paramName':[], 'paramID':[]}

    for fxinformation in root.findall('.//fxinformation'):
        fxsettingID = fxinformation.find('.//fxsetting/ID').text

        effectDict['fxName'].append(fxinformation.find('.//fxgroup/name').text)
        effectDict['fxNameID'].append(fxinformation.find('.//fxgroup/ID').text)
        effectDict['fxSetting'].append(fxinformation.find('.//fxsetting/name').text)
        effectDict['fxSettingID'].append(fxsettingID)
        effectDict['fxType'].append(fxinformation.find('.//fxtype/name').text)
        effectDict['fxTypeID'].append(fxinformation.find('.//fxtype/ID').text)
        effectDict['genre'].append(genreBySetting.get(fxsettingID, 'other'))

        # Always add one entry per row, even without <paraminformation>:
        # skipping the append would leave these two columns shorter than the
        # others and the DataFrame construction in create_csv would fail.
        paramNameList = []
        paramIDList = []
        param_info = fxinformation.find('.//paraminformation')
        if param_info is not None:
            for param in param_info.findall('.//parameter'):
                paramNameList.append(param.find('name').text)
                paramIDList.append(param.find('value').text)
        effectDict['paramName'].append(paramNameList)
        effectDict['paramID'].append(paramIDList)

    create_csv(effectDict, csvName)

## EFFECT XML PATH AGGREGATION

In [None]:
def effect_xml_file(root, limit=3):
    """Collect up to ``limit`` XML file paths from each entry of ``root``.

    Args:
        root: directory whose entries are scanned for ``*.xml`` files.
        limit: maximum number of files taken per entry (default 3, the value
            that used to be hard-coded); ``None`` takes all of them.

    Returns:
        A flat list of paths. Entries and the files inside them are visited in
        sorted order, so the same files are selected on every run; the order
        returned by ``os.listdir``/``glob.glob`` is otherwise arbitrary.
    """
    fnameList = []

    for effectDir in sorted(os.listdir(root)):
        subDir = os.path.join(root, effectDir)
        # Escape the directory part so characters such as '[' in a folder name
        # are matched literally instead of being read as glob syntax.
        fnames = sorted(glob.glob(f"{glob.escape(subDir)}/*.xml"))[:limit]
        fnameList.extend(fnames)

    return fnameList

## START HERE

In [None]:
def main():
    """Build fileData.csv and the effect table from the dataset's list XMLs.

    Every ``*.xml`` one directory level below the Lists folder is parsed and
    its data passed to ``create_csv`` for "fileData.csv"; afterwards the paths
    returned by ``effect_xml_file`` are fed to ``effect_table_creation``.
    """
    root = "dataset/monophonic/Lists"

    # Collect the data for all the files in IDMT-SMT dataset
    for xmlPath in glob.glob(f"{root}/*/*.xml"):
        create_csv(file_parse_xml(xmlPath), "fileData.csv")

    # Collect the data for all the effects in IDMT-SMT dataset
    for effectPath in effect_xml_file(root):
        effect_table_creation(effectPath)