## Experimenting with the library

In [None]:
import os 

In [None]:
# Step up one directory; the following cell records the resulting directory as musicPath.
os.chdir("../")

In [None]:
# Show the current working directory and keep it as the base path used by later cells.
print(os.getcwd())
musicPath = os.getcwd()

In [None]:
# Work from the midicsv folder so the relative './midicsv' command used below resolves.
os.chdir(musicPath+"/code/midicsv/")

In [None]:
import subprocess as sp

In [None]:
# Receives the converter's standard output (passed as stdout= to sp.call below).
outputFile =open(musicPath+'/output/test.csv','w+')
# Receives the converter's standard error; the path is relative, so the file is
# created in the current working directory.
# NOTE(review): same file name as the output file but a different directory - confirm this is intended.
errorFile =open('test.csv','w+')

In [None]:
# Sanity check: print the working directory and list its contents.
print(os.getcwd())
os.listdir()

In [None]:
# Run midicsv on a single MAESTRO file, sending stdout and stderr to the two files opened above.
sp.call(['./midicsv', '-v', musicPath+"/data/maestro/2004/MIDI-Unprocessed_SMF_02_R1_2004_01-05_ORIG_MID--AUDIO_02_R1_2004_05_Track05_wav.midi"], stdout=outputFile, stderr=errorFile)
# Close both handles so the written data is flushed to disk.
outputFile.close()
errorFile.close()

## Feed into Pandas DF

In [1]:
import os 
import subprocess
import sys
import pandas as pd
import math
import concurrent.futures
import numpy as np
# Step up one directory and treat the result as the project root.
os.chdir("../")
musicPath = os.getcwd()
dataPath = musicPath + "/data"

# Run from the midicsv folder: the functions below invoke './midicsv' and './csvmidi' by relative path.
os.chdir(musicPath+"/code/midicsv/")

In [2]:
def midiToCsv(restofPath):
    """Convert one MIDI file to a DataFrame by running the midicsv tool.

    restofPath is appended to the module-level ``musicPath`` to form the
    input file path. ``./midicsv -v`` is resolved against the current
    working directory, and its standard output is parsed as comma-separated
    text.

    Returns a DataFrame whose columns are assigned positionally as
    track, time, type, channel, note, velocity, other, encoded.
    """
    # Pick the StringIO implementation that matches the running interpreter.
    if sys.version_info[0] >= 3:
        from io import StringIO
    else:
        from StringIO import StringIO

    columnNames = ["track","time", "type", "channel","note","velocity", "other", "encoded"]
    converter = subprocess.Popen(
        ['./midicsv', '-v', musicPath+restofPath],
        stdout=subprocess.PIPE,
    )
    # communicate() waits for the tool to exit and returns (stdout, stderr).
    rawCsv = converter.communicate()[0]
    csvText = StringIO(rawCsv.decode('utf-8'))
    return pd.read_csv(csvText, names=columnNames, sep=",")

In [None]:
def getQuarterNoteRate(df):
    """Return the 'velocity' value of the first row of df.

    NOTE(review): presumably the first row is the midicsv Header record,
    whose division (ticks per quarter note) lands in the 'velocity' column
    given the column names used when parsing - confirm.
    """
    velocities = df['velocity']
    return velocities.iloc[0]

In [None]:
def setQuarterNoteTime(df, noteRate, sampleRate):
    """Rescale the 'time' column of df in place and return df.

    Every time value x is replaced by int(floor(x / (noteRate / sampleRate))),
    i.e. times are expressed in steps of noteRate / sampleRate.
    """
    def toStep(tick):
        # Computed per element so nothing is evaluated for an empty column.
        ticksPerStep = noteRate/sampleRate
        return int(math.floor(tick/ticksPerStep))

    df['time'] = df['time'].apply(toStep)
    return df

In [None]:
def removeControlC(df):
    """Return a new DataFrame without the rows whose 'type' contains 'Control_c'.

    The rows are dropped by index label; df itself is left untouched.
    """
    isControlChange = df['type'].str.contains('Control_c')
    controlLabels = df.loc[isControlChange].index
    return df.drop(controlLabels)

In [None]:
def getNote(x):
    """Return the second '|'-separated field of x.

    Raises IndexError when x contains no '|'.
    """
    # A pandas-level alternative: df.name.str.extract(r'([\d]+)',expand=False)
    # Only the second field is needed, so splitting stops after two separators.
    return x.split('|', 2)[1]

def reformatStrings(df):
    """Add the text encodings of each event to df (in place) and return df.

    Columns written:
      * 'note'        - missing values replaced by 0, then cast to int.
      * 'encoded'     - "<type>|<note>|<velocity>".
      * 'finalEncode' - "off<note>" when the velocity equals 0, otherwise
                        "on<note>".

    The on/off decision compares the velocity numerically. Matching the text
    suffix "|0.0" only works while the velocity column is float-typed; with
    an integer column a zero velocity is rendered as "0" and every release
    would be labelled "on".
    """
    df['note'] = df['note'].fillna(0).astype(int)
    noteText = df['note'].astype(str)
    df['encoded'] = df['type']+"|"+noteText+"|"+df['velocity'].astype(str)

    # NaN velocities (rows that carry no velocity) compare unequal to 0 and
    # therefore stay "on", exactly as their "nan" text did before.
    isRelease = df['velocity'] == 0
    prefix = isRelease.apply(lambda released: "off" if released else "on")
    df['finalEncode'] = prefix + noteText

    return df


In [None]:
def trackisOnlyPianolike(reformedRows):
    """Decide whether a track's rows are accepted for encoding.

    Returns True only when exactly one row has a 'type' containing
    'Program_c' and the 'channel' value of the first row, read as an int,
    lies in the range 0..7. Otherwise prints the reason and returns False.

    NOTE(review): the range check reads the 'channel' column of the first
    row, not the program number of the Program_c row - confirm this is the
    intended test.
    """
    programChanges = reformedRows.loc[reformedRows['type'].str.contains('Program_c')]
    if len(programChanges) != 1:
        print("changes in program")
        return False

    firstChannel = int(reformedRows.iloc[0]['channel'])
    if firstChannel < 0 or firstChannel >= 8:
        print("not Piano Track")
        return False

    return True


In [None]:
def makeTrueRows(reformedRows):
    """Return only the rows whose 'type' contains 'Note_on_c'."""
    isNoteOn = reformedRows['type'].str.contains('Note_on_c')
    return reformedRows.loc[isNoteOn]

In [None]:
def makeNoteString(trueRows):
    """Join the 'finalEncode' tokens of trueRows into one space-separated string.

    Rows are visited in order. Whenever a row's 'time' (read as an int) is
    later than the time of the previously emitted row, a "wait<k>" token with
    k = rowTime - 1 - prevTime is inserted before the row's own token. Rows
    whose time is earlier than the current time are skipped.

    Returns "" when trueRows has no rows (previously an IndexError).
    """
    if len(trueRows) == 0:
        return ""

    prevTime = int(trueRows['time'].iloc[0])
    outArr = []
    # Only two columns are needed, so walk them directly instead of building
    # a Series per row with iterrows().
    for rawTime, token in zip(trueRows['time'], trueRows['finalEncode']):
        rowTime = int(rawTime)
        if rowTime < prevTime:
            # Out-of-order event: ignored.
            continue
        if rowTime > prevTime:
            outArr.append("wait"+str(rowTime - 1 - prevTime))
            prevTime = rowTime
        outArr.append(token)

    return " ".join(outArr)

In [None]:
def removeTitle(df):
    """Return a new DataFrame without the rows whose 'type' contains 'Title_t'.

    The rows are dropped by index label; df itself is left untouched.
    """
    titleMask = df['type'].str.contains('Title_t')
    return df.drop(index=df[titleMask].index)

In [None]:
def trainingString(diffTime):
    """Build the note string for one parsed file, or None if its track is rejected.

    The index of diffTime is reset in place to 0..n-1. The rows strictly
    between the third and the fourth row whose 'type' contains 'Start_track'
    or 'End_track' are taken as the track to encode. If
    trackisOnlyPianolike rejects those rows, None is returned; otherwise the
    Note_on_c rows are turned into a string by makeNoteString.
    """
    # Renumber rows so index labels can double as positions for iloc below.
    diffTime.index = pd.RangeIndex(len(diffTime.index))
    #TODO deal with multiple tracks?
    boundaryPattern = 'Start_track|End_track'
    boundaryIndex = diffTime[diffTime.type.str.contains(boundaryPattern)].index

    firstRow = boundaryIndex[2]+1
    lastRow = boundaryIndex[3]
    reformedRows = diffTime.iloc[firstRow:lastRow, :]

    if not trackisOnlyPianolike(reformedRows):
        return None

    # Keep only the note-on rows and serialise them.
    return makeNoteString(makeTrueRows(reformedRows))

In [None]:
def processFile(fileName):
    """Run the full MIDI-to-text pipeline on one file.

    Steps: convert with midiToCsv, read the note rate from the first row,
    drop Control_c and Title_t rows, rescale times with a sample rate of 12,
    add the string encodings, and build the training string.

    Returns whatever trainingString returns for the file.
    """
    parsed = midiToCsv(fileName)
    noteRate = getQuarterNoteRate(parsed)

    notesOnly = removeTitle(removeControlC(parsed))
    diffTime = setQuarterNoteTime(notesOnly, noteRate, 12)

    # reformatStrings adds its columns to diffTime in place.
    reformatStrings(diffTime)
    return trainingString(diffTime)

In [None]:
def splitData(musicPath):
    """Load the MAESTRO index CSV and partition it by its 'split' column.

    Reads <musicPath>/data/maestro/maestro-v2.0.0.csv and returns a tuple
    (train, test, validation) of DataFrames holding the rows whose 'split'
    value equals 'train', 'test' and 'validation' respectively.
    """
    catalogue = pd.read_csv(musicPath + '/data/maestro/maestro-v2.0.0.csv')
    partitions = tuple(
        catalogue[catalogue["split"] == label]
        for label in ('train', 'test', 'validation')
    )
    return partitions

In [None]:
# Load the MAESTRO index and keep the three partitions for the cells below.
train, test, validate = splitData(musicPath)

In [None]:
def encodeFile(subDirectory, fileName):
    """Print the combined path subDirectory + fileName and return processFile's result for it."""
    target = subDirectory+fileName
    print(target)
    encoded = processFile(target)
    return encoded
    

In [None]:
def cleanFileAndSave(row, folder):
    """Encode the MIDI file named by row and save the text under cleanMaestro.

    row    - mapping with a 'midi_filename' entry of the form
             "<directory>/<name>.<extension>".
    folder - subdirectory of <musicPath>/data/cleanMaestro/ to write into.

    The output file is named after the second '/'-separated component of
    'midi_filename', cut at its first '.', with ".txt" appended.

    When the encoder returns None (trainingString does so for tracks it
    rejects) nothing is written; previously an empty file was created and
    write(None) then raised TypeError.
    """
    encodedString = encodeFile('/data/maestro/', row['midi_filename'])
    if encodedString is None:
        return

    withEnding = row['midi_filename'].split("/")[1]
    name = withEnding.split(".")[0]+".txt"
    # The context manager closes the handle even if the write fails.
    with open(musicPath+'/data/cleanMaestro/'+folder+"/"+ name, "w") as text_file:
        text_file.write(encodedString)

In [None]:
def processDF(dataFrame, folder):
    """Call cleanFileAndSave(row, folder) for every row of dataFrame, one after another."""
    for entry in dataFrame.iterrows():
        # iterrows() yields (index, row) pairs; only the row is needed.
        cleanFileAndSave(entry[1], folder)

In [None]:
# Inspect the first 'midi_filename' value of the training partition.
train['midi_filename'].iloc[0]
# "2004/MIDI-Unprocessed_SMF_02_R1_2004_01-05_ORIG_MID--AUDIO_02_R1_2004_05_Track05_wav.midi"

In [None]:
# Encode every training file sequentially, writing into the "train" folder.
processDF(train, "train")

In [None]:
def parallelProcessDF(dataFrame, folder):
    """Run cleanFileAndSave(row, folder) for every row in a process pool.

    One task is submitted per row. The function returns once all tasks have
    finished. A task that raised is reported with print() together with the
    index label of its row; the remaining tasks still run. Previously the
    futures were discarded, so every worker exception was lost silently.
    """
    with concurrent.futures.ProcessPoolExecutor() as executor:
        pending = {
            executor.submit(cleanFileAndSave, row, folder): index
            for index, row in dataFrame.iterrows()
        }
        for finished in concurrent.futures.as_completed(pending):
            # exception() returns None for a task that completed normally.
            error = finished.exception()
            if error is not None:
                print("failed on row " + str(pending[finished]) + ": " + repr(error))

In [None]:
# Same encoding job as processDF, but distributed over a process pool.
parallelProcessDF(train, "train")

In [None]:
# Run the pipeline on a single file to inspect the resulting string.
processFile("/data/maestro/2008/MIDI-Unprocessed_03_R2_2008_01-03_ORIG_MID--AUDIO_03_R2_2008_wav--2.midi")

### Encode DF to MIDI

In [3]:
# Parse one MIDI file into a DataFrame; it is the input for the round-trip cells below.
df = midiToCsv("/data/maestro/2008/MIDI-Unprocessed_03_R2_2008_01-03_ORIG_MID--AUDIO_03_R2_2008_wav--2.midi")

In [4]:
def dfToCSV(file_name, df):
    """Write df to dataPath + file_name as CSV, without the index and without a header row."""
    destination = dataPath +  file_name
    df.to_csv(destination, header=False, index=False)

In [14]:
def dfToMIDI(file_name,df,out_file):
    """Write df as CSV and convert that CSV to a MIDI file with csvmidi.

    file_name - path, relative to dataPath, of the intermediate CSV file.
    df        - DataFrame to serialise (written without index or header).
    out_file  - path, relative to musicPath, passed to csvmidi as the output.

    The CSV is written to the same file_name that csvmidi is asked to read;
    previously it always went to "/misc/simpleCSV" regardless of the
    argument. The call returns only after csvmidi has exited.
    """
    dfToCSV(file_name, df)
    cmd = ['./csvmidi', '-v', dataPath+file_name, musicPath + out_file]
    converter = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    # Drain the pipe and wait for the tool to finish, so the output file is
    # complete on return and the child process is reaped.
    converter.communicate()
    

In [8]:
def removeEverything(df):
    """Return a reduced copy of df.

    Rows whose 'type' contains 'Control_c', 'Title_t' or 'Note_on_c' are
    dropped (in that order), the 'other' and 'encoded' columns are removed,
    and every remaining NaN is replaced by 0.
    """
    for marker in ('Control_c', 'Title_t', 'Note_on_c'):
        # Drop by index label, one event kind at a time.
        unwanted = df[df['type'].str.contains(marker)].index
        df = df.drop(unwanted)

    trimmed = df.drop(columns=['other', 'encoded'])
    return trimmed.replace(np.nan,0)

In [11]:
# Strip the parsed file down to its non-note rows.
out = removeEverything(df)

       track    time             type  channel  note  velocity  other  encoded
0          0       0           Header        1   2.0     384.0    NaN      NaN
1          1       0      Start_track      NaN   NaN       NaN    NaN      NaN
2          1       0            Tempo   500000   NaN       NaN    NaN      NaN
3          1       0   Time_signature        4   2.0      24.0    8.0      NaN
4          1       1        End_track      NaN   NaN       NaN    NaN      NaN
5          2       0      Start_track      NaN   NaN       NaN    NaN      NaN
7          2       0        Program_c        0   0.0       NaN    NaN      NaN
24461      2  587303        End_track      NaN   NaN       NaN    NaN      NaN
24462      0       0      End_of_file      NaN   NaN       NaN    NaN      NaN


In [15]:
# Convert the reduced DataFrame back to MIDI via the intermediate CSV.
dfToMIDI("/misc/simpleCSV",out,"/output/firstTry" )