In [1]:
import sys, os
sys.path.append(os.path.abspath('../Code'))
from piece.piece import *
from helpers import *
import re, mmap, csv
import pandas as pd

This cell serves to align all markdown tables to the left.

In [2]:
%%html 
<style>
  table {margin-left: 0 !important;}
</style>

# Prerequisites

Your data needs to be in one separated values file per piece in the corpus and should at least have the columns

|measure|beat|label|
|-------|----|-----|

If you want to link the data to another dataset or to the music scores, the file names (but not the file extensions) need to be identical.

You further need a dataframe that holds the time signatures for all measures of all pieces in the format

|**measure**|piece1|piece2|...|
|-----------|------|------|---|
| 0         | 4/4  | 6/8  |...|
| 1         | 4/4  | 6/8  |...|
|...        | ...  | ...  |...|

Further down you can create such a time signature map from MuseScore2 files.

In [3]:
musescore = 'mscore'

In [11]:
def get_tsms(score_dir):
    """ Extract the Time Signature Maps from all MSCX files in score_dir
    
    Returns
    -------
    
    pd.DataFrame:
        Where the index are measure numbers and every column holds the time signatures for one piece
    """
    all_tsms = {}
    for file in os.listdir(score_dir):
        if file.endswith('.mscx'):
            filename = os.path.splitext(file)[0] 
            path = os.path.join(score_dir,file)
            p = Piece(path,timesig_map_only=True,ms=musescore)
            #measure_count = max(list(timesigs.keys()))
            all_tsms[filename] = p.get_timesig_map()
    return pd.DataFrame.from_dict(all_tsms, orient='index')

# Get time signature maps

### You can either extract them from a set of MuseScore2 files:

In [12]:
musescorefiles = './scores'

tsms = get_tsms(musescorefiles)
tsms.to_csv('time_signature_maps.tsv',sep='\t')
tsms

measure number 36 should be corrected to 38
measure number 101 should be corrected to 100


Unnamed: 0,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,...,312.0,313.0,314.0,315.0,316.0,317.0,318.0,319.0,100.1,100.2
K570-3,4/4,4/4,4/4,4/4,4/4,4/4,4/4,4/4,4/4,4/4,...,,,,,,,,,,
K282-2,3/4,3/4,3/4,3/4,3/4,3/4,3/4,3/4,3/4,3/4,...,,,,,,,,,,
K545-3,2/4,2/4,2/4,2/4,2/4,2/4,2/4,2/4,2/4,2/4,...,,,,,,,,,,
K280-3,3/8,3/8,3/8,3/8,3/8,3/8,3/8,3/8,3/8,3/8,...,,,,,,,,,,
K310-2,3/4,3/4,3/4,3/4,3/4,3/4,3/4,3/4,3/4,3/4,...,,,,,,,,,,
K282-3,2/4,2/4,2/4,2/4,2/4,2/4,2/4,2/4,2/4,2/4,...,,,,,,,,,,
K284-3,2/2,2/2,2/2,2/2,2/2,2/2,2/2,2/2,2/2,2/2,...,,,,,,,,,,
K281-3,2/2,2/2,2/2,2/2,2/2,2/2,2/2,2/2,2/2,2/2,...,,,,,,,,,,
K330-2,3/4,3/4,3/4,3/4,3/4,3/4,3/4,3/4,3/4,3/4,...,,,,,,,,,,
K311-3,6/8,6/8,6/8,6/8,6/8,6/8,6/8,6/8,6/8,6/8,...,,,,,,,,,,


### Or load existing time signature maps

The desired structure is a TSV file with measure numbers as indices and piece names as columns, so as to know a time signature for every measure of every piece.

In [3]:
timesignaturemaps = 'time_signature_maps.tsv'

tsms = pd.read_csv(timesignaturemaps,sep='\t',index_col=0)

# Extract harmony labels from a folder of MSCX (uncompressed MuseScore 2) files

In [10]:
scoredir = './scores'


def extract(dir,goal='',repair=False,files_re=r'.*\.mscx$',recursive=False,remove=None,metadata=False):
    """Extract harmony labels from all uncompressed MuseScore2 files (*mscx) in *dir*, correct them, and save them to a TXT file.

    Optionally, remove the labels from the file and save as an empty one.

    Parameters
    ----------

    dir: str
        path to the directory with MSCX files
    repair: bool, optional
        if you additionally want to save autocorrections to a new MSCX file
    files_re: rString, optional
        regex to select only certain pieces
    recursive: bool, optional
        include sub-directories
    remove: str, optional
        if you want to save an empty file after extraction, pass the suffix for the new file, e.g.
        ``remove='_clean'``
    """
    for subdir, dirs, files in os.walk(dir):
        if not recursive:
            dirs[:] = []
        for file in sorted(files):
            m = re.search(files_re,file)
            if m and re.search(r'.*\.mscx$',file):
                path = os.path.join(subdir,file)
                print("Processing " + path)

                with open(path, 'rb', 0) as f, mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as s:
                    if s.find(b'<Harmony>') == -1:
                        print(f"{file} contains no labels. Skipped.")
                        continue

                p = Piece(path,repair,ms=musescore)
                key = p.key
                erroneous = p.get_harmonies(True)
                if len(erroneous) > 1 or key == '':
                    msg = f"{file}'s syntax contains errors. Skipped."
                    if not repair:
                        msg += " Try option -r for autorepair."
                    print(msg)
                    continue
                all = p.get_harmonies()
                if goal == '':
                    goal = subdir
                n = re.search(r'(.*)(\..*)$',file)
                piece = n.group(1)
                if repair:
                    piece += "_repaired"
                txt = os.path.join(goal, piece + '.tsv')
                with open(txt, 'w') as tsvfile:
                    if metadata:
                        tsvfile.write('@skip: 4\n')
                        tsvfile.write(f'@piece: {piece}\n')
                        tsvfile.write(f'@key: {key}\n')
                        tsvfile.write(f'@meter: {p.timesig}\n')
                    tsvfile.write('measure\tbeat\tlabel\n')
                    writer = csv.writer(tsvfile, delimiter='\t')
                    for l in all:
                        writer.writerow(l)
                print(txt + ' written.')
                if remove:
                    p.remove_harmonies(os.path.join(subdir,piece+remove+n.group(2)))
                    
extract(scoredir, './labels/chords')

Processing ./scores/K279-1.mscx
All good.
./labels/chords/K279-1.tsv written.
Processing ./scores/K279-2.mscx
Captured only 161 out of the 160 harmonies in K279-2.mscx. Use -r to keep changes.
./labels/chords/K279-2.tsv written.
Processing ./scores/K279-3.mscx
All good.
./labels/chords/K279-3.tsv written.
Processing ./scores/K280-1.mscx
Captured only 236 out of the 237 harmonies in K280-1.mscx. Use -r to keep changes.
./labels/chords/K280-1.tsv written.
Processing ./scores/K280-2.mscx
All good.
./labels/chords/K280-2.tsv written.
Processing ./scores/K280-3.mscx
All good.
./labels/chords/K280-3.tsv written.
Processing ./scores/K281-1.mscx
All good.
./labels/chords/K281-1.tsv written.
Processing ./scores/K281-2.mscx
All good.
./labels/chords/K281-2.tsv written.
Processing ./scores/K281-3.mscx
All good.
./labels/chords/K281-3.tsv written.
Processing ./scores/K282-1.mscx
All good.
./labels/chords/K282-1.tsv written.
Processing ./scores/K282-2.mscx
Captured only 121 out of the 122 harmonies

# Add a continuous position to every entry

### Specify the folder where the data lies and the folder for the expanded dataset.

In [9]:
data = './labels/chords'
newset = './labels/chords'

def add_positions(dir, new_dir, time_signatures, files_re=".*", recursive=False, extensions=['tsv', 'txt', 'csv'], sep=['\t', '\t', ',']):
    """ For every data entry, a decimal position is calculated from `measure` and `beat` and added as a column.
    
    For this, we need a dataframe with time signature maps (tsms, see above). The time signatures are added in another column.
    
    Parameters
    ----------
    
    dir: str
        Directory with the files to process.
    new_dir: str
        Directory where to save the processed files.
    time_signatures: pd.DataFrame
        Index are measure numbers, columns are pieces, entries are strings such as '4/4' or '3/8'.
    new_sizes: list or tuple
        Corresponding new beat sizes.
    files_re: str
        In case you want to process only files where this regular expression finds a match.
    recursive : :obj:`bool`, optional
        Scan subdirectories as well? Defaults to `True`.
    extensions : :obj:`list` of :obj:`str`, optional
        File extensions to consider. Defaults to `['tsv','csv']`.
        If the list is shorter than `extensions`, the last element is used.
    sep : :obj:`list` of :obj:`str`, optional
        The separator symbols corresponding to the extensions. Defaults to `['\t',',']`.
    """
    
    for subdir, dirs, files in os.walk(dir):
        dirs.sort()
        if not recursive:
            dirs[:] = []

        exts = '|'.join(extensions)
        for file in files:
      
            m = re.match(f'(.+)\.({exts})$',file)
            if m and re.search(files_re,file):
          
                name = m[1]
                ext  = m[2]
                ind = extensions.index(ext)
                s = sep[-1] if ind >= len(sep) else sep[ind]
                path = os.path.join(subdir,file)
                df = pd.read_csv(path, sep=s, error_bad_lines=False, warn_bad_lines=True)
                print("Working on " + file)
                def add_decimals(r):
                    timesig = time_signatures.loc[r.measure][name]
                    r.loc['position'] =  measure_decimal(r.measure,r.beat,timesig,decimals=2)
                    r.loc['timesig'] = timesig
                    return r
                df = df.apply(add_decimals,axis=1)
                
                df['duration'] = pd.Series([])
                df['duration'].iloc[:-1] =  (df.position[1:].values - df.position[:-1].values).round(2)

                df.to_csv(os.path.join(new_dir,file),sep='\t',index=False)

add_positions(data,newset,tsms,)

Working on K279-1.tsv


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


# Adapt representation of beats


In [9]:
def change_beatsize(beat,orig_beatsize=4,new_beatsize=4):
    """Convert one beat size to another.

    Parameters:
    -----------

    b: str
        Beat in the shape beat.subbeatFraction
    orig_beatsize, new_beatsize: int
        Old and new sizes of the beat, e.g. 8, 4


    Example:
    --------

    >>> change_beatsize('1.1/2',2)
    '2'
    >>> change_beatsize('4',8)
    '2.1/2'

    """
     
    b = beat2float(beat)
    scale = new_beatsize/orig_beatsize
    if scale > 1:
        new_b = b * scale - (scale - 1)
    elif scale < 1:
        new_b = (b + (1 / scale - 1)) * scale
    else:
        return fractionize(beat)
    return fractionize(new_b)

In [10]:
data = './cadences_expanded'
newset = './cadences_expanded'


def adapt_beats(dir, new_dir, orig_sizes = [2,8], new_sizes = [4,4], recursive=False, extensions=['tsv', 'txt', 'csv'], sep=['\t', '\t', ',']):
    """ Apply the function `change_beatsize` to all rows of all files in `dir`
    if the time signature's denominator is included in `orig_sizes`.
    
    For this to work, the dataset has to have been expanded in the section
    "Add a continuous position to every entry" above, so it holds a column
    named "timesig".
    
    As a result, all beats will be converted to strings of the shape beat.subbeatFraction
    
    Parameters
    ----------
    
    dir: str
        Directory with the files to process.
    new_dir: str
        Directory where to save the processed files.
    orig_sizes: list or tuple
        All beat sizes / denominators you want to alter.
    new_sizes: list or tuple
        Corresponding new beat sizes.
    recursive : :obj:`bool`, optional
        Scan subdirectories as well? Defaults to `True`.
    extensions : :obj:`list` of :obj:`str`, optional
        File extensions to consider. Defaults to `['tsv','csv']`.
        If the list is shorter than `extensions`, the last element is used.
    sep : :obj:`list` of :obj:`str`, optional
        The separator symbols corresponding to the extensions. Defaults to `['\t',',']`.
    
    """
    
    for subdir, dirs, files in os.walk(dir):
        dirs.sort()
        if not recursive:
            dirs[:] = []

        exts = '|'.join(extensions)
        for file in files:
            m = re.match(f'(.+)\.({exts})$',file)
            if m:
                name = m[1]
                ext  = m[2]
                ind = extensions.index(ext)
                s = sep[-1] if ind >= len(sep) else sep[ind]
                path = os.path.join(subdir,file)
                df = pd.read_csv(path, sep=s, error_bad_lines=False, warn_bad_lines=True)
                
                def quarter_beats(r):
                    denom = int(r.timesig.split('/')[1])
                    if denom in orig_sizes:
                        r.beat = change_beatsize(r.beat,denom,new_sizes[orig_sizes.index(denom)])
                    else:
                        r.beat = fractionize(r.beat)
                    return r
                df = df.apply(quarter_beats, axis=1)
                
                df.to_csv(os.path.join(new_dir,file),sep='\t',index=False)

#adapt_beats(data,newset,[8])

# Adapt data types

In [11]:
data = './cadences_expanded'
newset = './cadences_expanded'


def adapt_types(dir, new_dir, types={'measure':int}, recursive=False, extensions=['tsv', 'txt', 'csv'], sep=['\t', '\t', ',']):
    """ This function reads the files with the given `extensions`, reading values using the given `sep`arator(s) and applying
    the `types` given as a dictionary. Then the files are stored using the new dtypes.
    
    Parameters
    ----------
    
    dir: str
        Directory with the files to process.
    new_dir: str
        Directory where to save the processed files.
    types: dict, optional
        For every column you want to change, indicate the dtype.
    recursive : :obj:`bool`, optional
        Scan subdirectories as well? Defaults to `True`.
    extensions : :obj:`list` of :obj:`str`, optional
        File extensions to consider. Defaults to `['tsv','csv']`.
        If the list is shorter than `extensions`, the last element is used.
    sep : :obj:`list` of :obj:`str`, optional
        The separator symbols corresponding to the extensions. Defaults to `['\t',',']`.
    
    """
    
    for subdir, dirs, files in os.walk(dir):
        dirs.sort()
        if not recursive:
            dirs[:] = []

        exts = '|'.join(extensions)
        for file in files:
            m = re.match(f'(.+)\.({exts})$',file)
            if m:
                name = m[1]
                ext  = m[2]
                ind = extensions.index(ext)
                s = sep[-1] if ind >= len(sep) else sep[ind]
                path = os.path.join(subdir,file)
                df = pd.read_csv(path, sep=s, error_bad_lines=False, warn_bad_lines=True, dtype=types)
                df.to_csv(os.path.join(new_dir,file),sep='\t',index=False)


#adapt_types(data,newset)

# Split values into two columns

In [7]:
#data = './cadences_expanded'
data = './chords_expanded'
#newset = './cadences_expanded'
newset = './chords_expanded'


def split_values(dir, new_dir, column, splitchar, names, drop=False, recursive=False, extensions=['tsv', 'txt', 'csv'], sep=['\t', '\t', ','], files_re='.*'):
    """ Split the column `column` by the string `splitchar` and create additional columns named `names`.
    
    Parameters
    ----------
    
    dir: str
        Directory with the files to process.
    new_dir: str
        Directory where to save the processed files.
    column: str
        Column to be split.
    splitchar: str
        String used as a separator
    names: tuple or list
        List with names for the new columns. Use existing column names to overwrite.
    drop: bool, optional
        If True, the split column will be dropped.
    recursive : :obj:`bool`, optional
        Scan subdirectories as well? Defaults to `True`.
    extensions : :obj:`list` of :obj:`str`, optional
        File extensions to consider. Defaults to `['tsv', 'txt', 'csv']`.
    sep : :obj:`list` of :obj:`str`, optional
        The separator symbols corresponding to the extensions. Defaults to `['\t', '\t', ',']`.
        If the list is shorter than `extensions`, the last element is used.
    
    """
    
    for subdir, dirs, files in os.walk(dir):
        dirs.sort()
        if not recursive:
            dirs[:] = []

        exts = '|'.join(extensions)
        for file in files:
            m = re.match(f'(.+)\.({exts})$',file)
            if m and re.search(files_re,file):
                name = m[1]
                ext  = m[2]
                ind = extensions.index(ext)
                s = sep[-1] if ind >= len(sep) else sep[ind]
                path = os.path.join(subdir,file)
                df = pd.read_csv(path, sep=s, error_bad_lines=False, warn_bad_lines=True)
                
                spl = df.loc[:,column].str.split(splitchar,len(names),expand=True)
                for i,c in enumerate(spl.columns):
                    df.loc[:, names[i]] = spl.loc[:,c]
                if drop:
                    df = df.drop(columns=column)
                    
                df.to_csv(os.path.join(new_dir,file),sep='\t',index=False)

split_values(data,newset,'label','-',['label','alt_label'])

In [8]:
from module import *
cadencedir = './cadences_expanded/'

chords = dataset(data,scoredir) 
cadences = dataset(cadencedir,scoredir)
merged = merged_dataset(chords,cadences,compute_all=True)
merged.dump(dir='./')

Labels expanded into separate columns.



Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





Extracted all cadence sequences.
Computed cadence stages.
Added stage information to self.df
Added neighbour indices to self.df
Stored ./dataset.tsv
Stored ./sequences.tsv
Stored ./stages.tsv
