## Build a list of unit dictionaries containing the data below
- {
- 'musicname' : 'original filename',
- 'staff_id' : int,
- 'start' : int, (number of start measure)
- 'end' : int, (number of end measure)
- 'tempo' : int,
- 'timesig' : '4/4' or '6/8' or..., (time signature)
- 'beats' : '1/4,0; 1/2,1; 1/8,1; 1/8,0', (假設為4/4拍，左為一單位的拍子，順序為：四分休止符、二分音符、八分音符、八分休止符，1/4 + 1/2 + 1/8 + 1/8 = 1) (實際上拍子長度為小數型式如0.25, 0.125, 0.0625)
- 'instrument' : string, (此Staff代表的樂器)
- 'instsrc' : string (此Staff選用的樂器)
- }

## 演算法及注意事項
- 不管同staff中偶爾出現的複數旋律，tag < track >
- tempo取同單位中的最大值
- 若同單位中發生time signature改變(ex: 4/4 to 6/8), 也就是於第二或三或四小節發生拍子改變，則捨棄此單位
- 關於記錄Staff所選用的樂器的'instsrc', mscx的metadata的< Part >中可能有多個< Channel >, 只記錄第一個< Channel >中< program value="number" >的number

## 休止符 tag < Rest>
* 先將拍子長度存入BeatStr,
* 之後發現連續的休止符再修改BeatStr,
* 取出上個休止符長度，合併長度後覆蓋過BeatStr中上個休止符長度。

## 連結線 tag < Tie >
- 先不將拍子長度存入BeatStr,
- 並先將拍子長度存入暫存的tiedBeats,
- 之後確認連結線結束後再將tiedBeats存入BeatStr.

## 多連音 tag < Tuplet >
- 一組多連音必在同一小節內
- 所有音符數量不一定等於actualNotes值
- 所有音符長度相加後 * normalNotes值 / actualNotes值，即為實際長度
- 發現一包含10個1/16或1/32音長的< Tuplet >會造成音長比較問題，須以 "差之絕對值小於約0.000000001" 取代 "相等" 之if判斷式

## 未來可簡化或重構的部分
- timeSig change detection
- 四小節單位結算部分
- for else最末小節處理部分

## OOP in the future
- MscxObj: self.staffList, self.tempoDic, self.instrumentDic, self.usedInstSrcDic
- MscxObj: loopMeasure(), lastMeasure(), per4Measure(), genDataDic()
- UnitObj: self.musicname, self.timesig, self.beats, self.tempo
- UnitObj: self.start, self.end, self.staff_id, self.instrument
- UnitObj: getTempo(), getInstrument(), getBeats(), ifChordOrRest(), ifDots()
- UnitObj: ifTuplet(), ifTie(), ifTimeSigChanges(), lastRestOrNot()
- UnitObj: pickleDump(), pickleLoad()
- MainClass: mainList, beatDic, instSrcDic, fileLoopForADir, doubleCheck()

In [1]:
from bs4 import BeautifulSoup as bs
from decimal import *
import os
import sys
import pickle
import pymongo
import csv
from pymongo import MongoClient

#Beat dictionary: maps a duration name to its length as a fraction of a
#whole note. 'measure' starts as None; getBeats() overwrites it with the
#length of one measure (sigN/sigD) of the current time signature.
beatDic = {'measure': None}
beatDic.update(
    (durationName, Decimal(1) / denominator)
    for durationName, denominator in (
        ('whole', 1),
        ('half', 2),
        ('quarter', 4),
        ('eighth', 8),
        ('16th', 16),
        ('32nd', 32),
        ('64th', 64),
        ('128th', 128),
    )
)

#Build an instrument source dictionary from instrument_table.csv.
#Every CSV row must hold exactly two fields (key, value); genDataDic() looks
#the dictionary up with the <program value="..."> string of a staff.
instSrcDic = {}
with open('C:/Users/BigData/git/DownloadMusic/instrument_table.csv','r') as infile:
    instSrcDic.update(csv.reader(infile))

#Generate unit-data dictionary containing below properties
def genDataDic(staff_id, timeSig, beatStr, filename, start, end):
    """Assemble the dictionary that describes one unit of measures.

    staff_id -- staff number, also the key into instrumentDic/usedInstSrcDic
    timeSig  -- time signature string such as '4/4'
    beatStr  -- accumulated 'length,flag;' pairs; its last character (the
                trailing ';') is dropped before storing
    filename -- stored under 'musicname'
    start, end -- first and last measure number of the unit

    The tempo comes from the module-level tempoDic: the 'only1' entry when
    present, otherwise the entry keyed by the unit's end measure.
    Raises KeyError when any of the module-level dictionaries lacks the
    required key.
    """
    trimmedBeats = beatStr[:-1]
    unitTempo = tempoDic['only1'] if 'only1' in tempoDic else tempoDic[end]
    return dict(
        staff_id=staff_id,
        timesig=timeSig,
        beats=trimmedBeats,
        musicname=filename,
        start=start,
        end=end,
        tempo=unitTempo,
        instrument=instrumentDic[staff_id],
        instsrc=instSrcDic[usedInstSrcDic[staff_id]],
    )

In [2]:
#Called by doLastMeasure() to simplify the process
#of determining if the Time Signature changed
def diffTimeSigsOrNot(staff, diffTimeSigs, timeSig, beatStr, unitBeatSum, mod, start, end, filename):
    """Store the final unit of a staff, or report why it is dropped.

    When diffTimeSigs is set the unit is only reported as ignored.
    Otherwise unitBeatSum must equal `mod` measures of the current
    beatDic['measure'] length (within 1e-9); on success the unit dictionary
    is appended to the module-level mainList, on failure the script exits.
    """
    separator = '------------------------------------'

    #A time signature change inside the unit: nothing is stored
    if diffTimeSigs != False:
        print('Time Signature changed, ignore this unit.')
        print(separator)
        return

    expectedSum = beatDic['measure'] * mod
    if not (abs(unitBeatSum - expectedSum) < 0.000000001):
        print('False, unitBeatSum: ' + str(unitBeatSum))
        sys.exit('Error: Incorrect sum of beats of a unit')

    print('beatStr: ' + beatStr)
    print(separator)
    print('True, unitBeatSum: ' + str(unitBeatSum))
    print(separator)
    mainList.append(genDataDic(staff, timeSig, beatStr, filename, start, end))

#Called by getBeats() to simplify the process handling the last measure
def doLastMeasure(staff, i, diffTimeSigs, sigN, sigD, beatStr, unitBeatSum, filename):
    """Work out the extent of the final unit and hand it to diffTimeSigsOrNot().

    i is the number of the last measure. The final unit holds i % 4 measures,
    or 4 when i is a multiple of 4, and therefore starts that many measures
    (minus one) before i.
    """
    measuresInUnit = i % 4 or 4
    firstMeasure = i - measuresInUnit + 1
    timeSig = str(sigN) + '/' + str(sigD)
    diffTimeSigsOrNot(staff, diffTimeSigs, timeSig, beatStr, unitBeatSum,
                      measuresInUnit, firstMeasure, i, filename)

In [3]:
#Called by countBeats() to determine if last note is a Rest
#whose beat should be merged with this Rest
def lastRestOrNot(beatStr, thisBeat):
    """Return beatStr with the rest `thisBeat` added.

    beatStr is a sequence of 'length,flag;' pairs where flag '0' marks a
    rest. If the last pair is a rest, its length is replaced by the sum of
    both rests; otherwise a new 'thisBeat,0;' pair is appended.
    """
    restPair = str(thisBeat) + ',0;'

    #First entry of a unit
    if not beatStr:
        return restPair

    #Previous entry is a note: just append the rest
    if beatStr[-2] != '0':
        return beatStr + restPair

    #Previous entry is a rest: merge both lengths into one pair.
    #pieces is [earlier pairs, last pair, ''] or [last pair, '']
    pieces = beatStr.rsplit(';', 2)
    previousRest = Decimal(pieces[-2].split(',')[0])
    mergedPair = str(previousRest + thisBeat) + ',0;'
    if len(pieces) > 2:
        return pieces[0] + ';' + mergedPair
    return mergedPair

#Called by countBeats() to handle tag <Tie>
def ifTie(beatStr, thisBeat, note, tiedBeats):
    """Add a chord to beatStr, joining tied chords into one entry.

    A chord containing <Tie> starts or continues a tie: its length is
    accumulated in tiedBeats instead of being written. A chord containing
    <endSpanner> but no <Tie> closes the tie: the accumulated length plus
    this chord is written as a single 'length,1;' pair and tiedBeats goes
    back to 0. Returns (beatStr, tiedBeats).

    Exits the script if a tie is pending (tiedBeats != 0) and the chord has
    no <endSpanner>.
    """
    startsTie = bool(note.find('Tie'))
    endsTie = bool(note.find('endSpanner'))

    if tiedBeats == 0:
        #No pending tie: an <endSpanner> here belongs to a tie that began
        #in the previous unit, so only <Tie> matters
        if startsTie:
            return beatStr, tiedBeats + thisBeat
        return beatStr + str(thisBeat) + ',1;', tiedBeats

    #A tie is pending, so this chord has to close or continue it.
    #Anything else shouldn't happen. Just for debugging
    if not endsTie:
        if startsTie:
            sys.exit('tiedBeats != 0 and Tie and not endSpanner')
        sys.exit('tiedBeats != 0 and not Tie and not endSpanner')

    accumulated = tiedBeats + thisBeat
    if startsTie:
        #Tie continues
        return beatStr, accumulated
    #Tie ends
    return beatStr + str(accumulated) + ',1;', 0

#Called by countBeats() to handle tag <dots>
def ifDots(note):
    """Return the length multiplier implied by the <dots> tag of a note.

    Each augmentation dot adds half of the previously added length, so n
    dots multiply the base length by 2 - 1/2**n (1 dot -> 1.5, 2 -> 1.75,
    3 -> 1.875, ...). A note without <dots> gets a multiplier of 1.

    The original lookup only knew 1, 2 and 3 dots and failed with an
    UnboundLocalError on any other value; the formula covers every count.

    Raises ValueError if the tag text is not a non-negative integer.
    """
    dotsTag = note.find('dots')
    if not dotsTag:
        return Decimal(1)
    dotCount = int(dotsTag.text)
    if dotCount < 0:
        raise ValueError('Invalid number of dots: ' + str(dotCount))
    return Decimal(2) - Decimal(1) / (2 ** dotCount)

#Called by countBeats to simplify the codes
def ifChordOrRest(note, beatSum, tupletID, tupletRatio):
    print note.name,
    key = note.find('durationType').text
    multi = ifDots(note)
    if note.find('Tuplet') and note.find('Tuplet').text == tupletID:
        multi *= tupletRatio
    thisBeat = beatDic[key] * Decimal(multi)
    beatSum += thisBeat
    return thisBeat, beatSum

#Called by getBeats() to count beats & accumulate the beat-string of a unit
def countBeats(measure, beatStr, unitBeatSum, tiedBeats):
    """Process the direct Chord/Rest/Tuplet children of one measure.

    Chords and rests carrying a <track> tag are skipped. A <Tuplet> element
    sets the id and the normalNotes/actualNotes ratio applied to the notes
    that reference it. The measure's total length must match
    beatDic['measure'] within 1e-9, otherwise the script exits.

    Returns (beatStr, unitBeatSum, tiedBeats) with the measure added.
    """
    measureSum = 0
    currentTupletID = ''
    currentRatio = 1
    for element in measure.find_all(['Chord','Rest','Tuplet'], recursive=False):
        if element.name == 'Tuplet':
            currentTupletID = element['id']
            normalNotes = Decimal(element.find('normalNotes').text)
            actualNotes = Decimal(element.find('actualNotes').text)
            currentRatio = normalNotes / actualNotes
            continue
        #Elements with a <track> tag are left out of the count
        if element.find('track'):
            continue
        thisBeat, measureSum = ifChordOrRest(element, measureSum, currentTupletID, currentRatio)
        if element.name == 'Chord':
            beatStr, tiedBeats = ifTie(beatStr, thisBeat, element, tiedBeats)
        else:
            beatStr = lastRestOrNot(beatStr, thisBeat)

    #Tolerance instead of equality: tuplet ratios do not always sum exactly
    if not (abs(measureSum - beatDic['measure']) < 0.000000001):
        print('\nFalse, beatSum: ' + str(measureSum))
        print(beatStr)
        sys.exit('Error: Incorrect sum of beats of a measure')

    print('\nTrue, beatSum: ' + str(measureSum))
    return beatStr, unitBeatSum + measureSum, tiedBeats

In [4]:
#Called by getTempo() to read the number at the end of a tempo text
def _parseTempo(textTag):
    """Return the last space-separated token of the tag's text as an int."""
    return int(textTag.text.rsplit(' ', 1)[-1])

#Called by main function in the loop for multiple mscx files
def getTempo(dirPath, filename):
    """Fill the module-level tempoDic for one mscx file.

    With a single <Tempo> in staff #1 the result is {'only1': tempo}.
    With several, tempoDic maps the last measure number of every unit
    (each multiple of 4, plus the final measure) to the highest tempo found
    inside that unit; a unit without a tempo mark inherits the previous
    value.

    Exits the script when staff #1 has no <Tempo>, or none in its first
    measure (the tempo of the first unit would be unknown).
    """
    filePath = dirPath + filename
    with open(filePath, 'r') as f:
        mscx = bs(f.read(), 'xml')

    #Select measures of staff#1 because tempos only appear in staff#1
    staff1Measures = mscx.select('Score > Staff:nth-of-type(1) > Measure')
    tempoCount = len(mscx.select('Score > Staff:nth-of-type(1) Tempo'))

    #Check before indexing so a file without tempo gets the intended message
    #instead of an IndexError
    if tempoCount < 1 or not staff1Measures:
        sys.exit('No Tempo found')
    firstMeasureTempos = staff1Measures[0].select('Tempo > text')
    if not firstMeasureTempos:
        sys.exit('No Tempo found in the first measure')

    #Get 1st tempo.
    tempo = _parseTempo(firstMeasureTempos[0])

    #If there is only 1 tempo, no need to go through every measure.
    if tempoCount == 1:
        tempoDic['only1'] = tempo
        return

    lastIndex = len(staff1Measures)
    tmp = []
    for i, measure in enumerate(staff1Measures, 1):
        tmp.extend(_parseTempo(t) for t in measure.select('Tempo > text'))
        #A unit closes every 4 measures and at the very last measure
        if i % 4 == 0 or i == lastIndex:
            if tmp:
                tempo = max(tmp)
            tempoDic[i] = tempo
            del tmp[:]

In [5]:
def doubleCheck(j, dic, mod):
    """Re-verify a finished unit dictionary and print it.

    j   -- unit number, used only in the printed header
    dic -- unit dictionary; 'timesig' is 'N/D' and 'beats' is a
           ';'-separated list of 'length,flag' pairs
    mod -- number of measures in the unit

    The lengths in dic['beats'] must add up to mod measures of the time
    signature (within 1e-9); otherwise the script exits.
    """
    print('[Unit #'+str(j)+']')
    sigParts = dic['timesig'].split('/')
    timeSig = Decimal(sigParts[0]) / Decimal(sigParts[1])
    #value[:-2] drops the trailing ',0' / ',1' flag of each pair
    unitBeatSum2 = sum((Decimal(value[:-2]) for value in dic['beats'].split(';')), 0)
    if abs(unitBeatSum2 - timeSig * mod) < 0.000000001:
        print(dic)
        print('------------------------------------')
    else:
        print('unitBeatSum2: ' + str(unitBeatSum2))
        print('timeSig * mod = %s * %s = %s' % (timeSig, mod, timeSig * mod))
        #The separator print that used to follow sys.exit() was unreachable
        sys.exit('FATAL: Incorrect sum of beats of a unit.\n        The 1st check passed but the 2nd failed.')

In [6]:
#Called by getBeats() to fill instrumentDic and usedInstSrcDic
def _buildInstrumentDics(mscx):
    """Record, per staff id, the instrument id and the first program value.

    For every <Part>, the ids of its <Staff> tags are collected; the text of
    <instrumentId> goes to instrumentDic and the 'value' of the first
    <program> goes to usedInstSrcDic for each of those ids. A Part without
    <instrumentId> gets None in instrumentDic.
    """
    for part in mscx.find_all('Part'):
        staffID = []
        for tag in part.find_all(['Staff', 'instrumentId', 'program']):
            if tag.name == 'Staff':
                staffID.append(int(tag['id']))
            elif tag.name == 'instrumentId':
                for ID in staffID:
                    instrumentDic[ID] = str(tag.text)
            else:
                #Only the first <program> of a Part is recorded
                for ID in staffID:
                    usedInstSrcDic[ID] = str(tag['value'])
                break
        if not part.find('instrumentId'):
            for ID in staffID:
                instrumentDic[ID] = None

#Called by getBeats() to write a tie that is still open when a unit ends
def _flushTie(beatStr, tiedBeats):
    """Return beatStr with the pending tied length appended, if there is one."""
    if tiedBeats != 0:
        beatStr += str(tiedBeats) + ',1;'
    return beatStr

#Called by main function in the loop for multiple mscx files
def getBeats(dirPath, filename, staffList, mainList):
    """Parse one mscx file and append a unit dictionary per 4 measures.

    dirPath, filename -- concatenated to get the file to read; the part of
                         filename before the last '.' becomes 'musicname'
    staffList -- currently unused, every staff of the score is processed
    mainList  -- list receiving the unit dictionaries of full 4-measure units

    Units in which the measure length changes at the 2nd, 3rd or 4th measure
    are discarded. The remaining measures at the end of a staff are handled
    through doLastMeasure(). Exits the script when the beats of a unit do
    not add up.

    Fixes over the previous version:
    - a unit discarded at its 4th measure now also resets unitBeatSum and
      tiedBeats, so the following unit is not checked against a stale sum;
    - the last unit is stored even when no tie is pending and the last
      measure repeats the current time signature;
    - diffTimeSigs is reset for every staff;
    - staffs with one measure no longer depend on an undefined/stale index,
      and staffs without measures are skipped.

    NOTE(review): a change is detected by comparing sigN/sigD, so e.g.
    3/4 -> 6/8 is not treated as a change - confirm this is intended.
    """
    separator = '------------------------------------'
    filePath = dirPath + filename
    with open(filePath, 'r') as f:
        mscx = bs(f.read(), 'xml')
    filename = filename.rsplit('.', 1)[0]
    print('Filename: ' + filename)
    print(separator)

    #Build a represented instrument dictionary
    _buildInstrumentDics(mscx)

    #Get 1st time signature
    staff1TimeSig = mscx.select_one('Score > Staff:nth-of-type(1) TimeSig')
    sigN = Decimal(staff1TimeSig.find('sigN').text)
    sigD = Decimal(staff1TimeSig.find('sigD').text)
    beatDic['measure'] = sigN/sigD

    #staffList is not used for now: every staff of the score is processed
    #Main loop for each staff
    for staff in range(1, len(mscx.select('Score > Staff'))+1):
        measures = mscx.select('Score > Staff:nth-of-type('+str(staff)+') > Measure')
        if not measures:
            continue
        beatStr = ''
        unitBeatSum = 0
        tiedBeats = 0
        diffTimeSigs = False
        for i, measure in enumerate(measures[:-1], 1):
            print('[Staff #'+str(staff)+', Measure #'+str(i)+']')

            #Determine Time Signature & detect if it changes
            if measure.find('TimeSig'):
                sigN = Decimal(measure.find('sigN').text)
                sigD = Decimal(measure.find('sigD').text)
                if sigN/sigD != beatDic['measure']:
                    beatDic['measure'] = sigN/sigD
                    if i % 4 == 2 or i % 4 == 3:
                        #The unit is dropped once its 4th measure is reached
                        diffTimeSigs = True
                        continue
                    elif i % 4 == 0:
                        diffTimeSigs = False
                        beatStr = ''
                        unitBeatSum = 0
                        tiedBeats = 0
                        print('Time Signature changed, ignore this unit.')
                        print(separator)
                        continue

            #Count beats & accumulate the beat-string of a unit
            beatStr, unitBeatSum, tiedBeats = \
            countBeats(measure, beatStr, unitBeatSum, tiedBeats)

            #Generate a unit dictionary per 4 measures
            if i % 4 != 0:
                print('beatStr: ' + beatStr)
                print(separator)
                continue

            if diffTimeSigs == True:
                #Discard the whole unit, including its running sum and any
                #open tie, so the next unit starts clean
                diffTimeSigs = False
                beatStr = ''
                unitBeatSum = 0
                tiedBeats = 0
                print('Time Signature changed, ignore this unit.')
                print(separator)
                continue

            if abs(unitBeatSum - beatDic['measure'] * 4) < 0.000000001:
                timeSig = str(sigN)+'/'+str(sigD)
                beatStr = _flushTie(beatStr, tiedBeats)
                mainList.append(genDataDic(staff, timeSig, beatStr, filename, i-3, i))
                print('beatStr: ' + beatStr)
                print(separator)
                print('True, unitBeatSum: ' + str(unitBeatSum))
                print(separator)
                tiedBeats = 0
                beatStr = ''
                unitBeatSum = 0
            else:
                print('False, unitBeatSum: ' + str(unitBeatSum))
                sys.exit('Error: Incorrect sum of beats of a unit')

        #Handle the last measure, which closes a unit of 1 to 4 measures
        i = len(measures)
        lastMeasure = measures[-1]
        print('[Staff #'+str(staff)+', Measure #'+str(i)+']')
        if lastMeasure.find('TimeSig'):
            sigN = Decimal(lastMeasure.find('sigN').text)
            sigD = Decimal(lastMeasure.find('sigD').text)
            if sigN/sigD != beatDic['measure']:
                print('Time Signature changed, ignore this unit.')
                print(separator)
                continue
        beatStr, unitBeatSum, tiedBeats = \
        countBeats(lastMeasure, beatStr, unitBeatSum, tiedBeats)
        beatStr = _flushTie(beatStr, tiedBeats)
        doLastMeasure(staff, i, diffTimeSigs, sigN, sigD, beatStr, unitBeatSum, filename)

In [7]:
#Main function here
#staffList is not used for now: getBeats() processes every staff itself
staffList = []

#Per-file lookup tables, filled by getTempo()/getBeats() and cleared after
#each file
tempoDic = {}
instrumentDic = {}
usedInstSrcDic = {}
#Unit dictionaries of all files
mainList = []
dirPath = 'C:/Users/BigData/Desktop/mscx/'
for filename in os.listdir(dirPath):
    getTempo(dirPath, filename)
    getBeats(dirPath, filename, staffList, mainList)
    tempoDic.clear()
    instrumentDic.clear()
    usedInstSrcDic.clear()

#Print dicts while double checking
for j, dic in enumerate(mainList, 1):
    mod = dic['end'] - dic['start'] + 1
    doubleCheck(j, dic, mod)

#Copy the list: `mainList2 = mainList` would only alias it, and the
#`del mainList[:]` below would then empty mainList2 as well, leaving
#nothing to store in MongoDB
mainList2 = list(mainList)
del mainList[:]
del staffList[:]

Filename: Basket Case
------------------------------------
[Staff #1, Measure #1]
Rest 
True, beatSum: 1
beatStr: 1,0;
------------------------------------
[Staff #1, Measure #2]
Rest 
True, beatSum: 1
beatStr: 2,0;
------------------------------------
[Staff #1, Measure #3]
Chord Chord Chord Chord Chord Chord Chord Chord 
True, beatSum: 1.000
beatStr: 2,0;0.125,1;0.125,1;0.125,1;0.125,1;0.125,1;0.125,1;0.125,1;0.125,1;
------------------------------------
[Staff #1, Measure #4]
Chord Chord Chord Chord Chord Chord Chord Chord 
True, beatSum: 1.000
beatStr: 2,0;0.125,1;0.125,1;0.125,1;0.125,1;0.125,1;0.125,1;0.125,1;0.125,1;0.125,1;0.125,1;0.125,1;0.125,1;0.125,1;0.125,1;0.125,1;0.125,1;
------------------------------------
True, unitBeatSum: 4.000
------------------------------------
[Staff #1, Measure #5]
Chord Chord Chord Chord Chord Chord Chord Chord 
True, beatSum: 1.000
beatStr: 0.125,1;0.125,1;0.125,1;0.125,1;0.125,1;0.125,1;0.125,1;0.125,1;
------------------------------------
[

In [None]:
#Store data into MongoDB (database 'music', collection 'tempo_beats')
client = MongoClient('mongodb://10.120.30.8:27017')
db = client['music']
collect = db['tempo_beats']
#insert_many() rejects an empty document list, so only call it with data
if mainList2:
    collect.insert_many(mainList2)
    del mainList2[:]
else:
    print('Nothing to store: mainList2 is empty.')

In [None]:
#Query MongoDB: music name and beats of every unit stored for staff 5
cur = collect.find(
    {'staff_id': 5},
    {'_id': 0, 'musicname': 1, 'beats': 1},
)
for i, item in enumerate(cur):
    print('%s %s' % (i, item))
    print('------------------------------------')