In [7]:
import partitura as pt
import torch
import torch.nn as nn
from torch.nn import functional as F
import numpy as np
import os
import pandas as pd
import pytorch_lightning as pl
from torch.utils.data import DataLoader, Dataset

# Pitch Spelling with Partitura

Have you always been bad at spelling bee, do you find that spelling notes makes this even worse. Your time of struggling is over.... Today we going to teach a Model to learn how to *pitch* spell.

### Definition

Spelling a pitch relates to the system of naming notes by letters (A-G) and sharp(#) and flat (♭) signs - and sometimes double sharp and flat signs, resulting in names or 'spellings' like 'A♭', 'D#', 'F♭♭'.

Translating between frequencies in Hz and such names is non-trivial. You need to consider :

-  The 'concert pitch' you are taking as a reference
- The temperament in which the piece is played
- The overall key that the music would be notated in
- Use of the correct enharmonic equivalents for accidentals (Using the correct enharmonic equivalent, Purpose of double-sharps and double-flats?)

If translating between, say, MIDI note numbers and 'spelled' names, the first two steps can be skipped.

Spelled pitch names often have an octave number appended for disambiguation - e.g. 'A♭3', 'D#5'.


### Some Spelling algorithms

Partitura contains an implementation for a standard algorithm for Pitch Spelling. The algorithm in question is called ps13 created by Meredith and al.:

	The ps13 pitch spelling algorithm, D Meredith - Journal of New Music Research, 2006

Some notable algorithms and currect SOTA is PKSpell.

	PKSpell: Data-driven pitch spelling and key signature estimation
	F Foscarin, N Audebert, R Fournier-S'Niehotta, 2021


Let's first download a pitch spelling dataset.

In [16]:
!wget https://github.com/CPJKU/vienna4x22/archive/refs/heads/master.zip

--2022-10-31 10:47:16--  https://github.com/CPJKU/vienna4x22/archive/refs/heads/master.zip
Resolving github.com (github.com)... 140.82.121.3
Connecting to github.com (github.com)|140.82.121.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://codeload.github.com/CPJKU/vienna4x22/zip/refs/heads/master [following]
--2022-10-31 10:47:22--  https://codeload.github.com/CPJKU/vienna4x22/zip/refs/heads/master
Resolving codeload.github.com (codeload.github.com)... 140.82.121.9
Connecting to codeload.github.com (codeload.github.com)|140.82.121.9|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/zip]
Saving to: ‘master.zip’

master.zip              [                <=> ]   6.33M  1.84MB/s    in 3.6s    

2022-10-31 10:47:27 (1.75 MB/s) - ‘master.zip’ saved [6636510]



In [17]:
!unzip ./master.zip

Archive:  ./master.zip
b2c0d7a29c236724868efba005db290c6bc01704
   creating: vienna4x22-master/
 extracting: vienna4x22-master/.gitignore  
  inflating: vienna4x22-master/README.md  
   creating: vienna4x22-master/match/
  inflating: vienna4x22-master/match/Chopin_op10_no3_p01.match  
  inflating: vienna4x22-master/match/Chopin_op10_no3_p02.match  
  inflating: vienna4x22-master/match/Chopin_op10_no3_p03.match  
  inflating: vienna4x22-master/match/Chopin_op10_no3_p04.match  
  inflating: vienna4x22-master/match/Chopin_op10_no3_p05.match  
  inflating: vienna4x22-master/match/Chopin_op10_no3_p06.match  
  inflating: vienna4x22-master/match/Chopin_op10_no3_p07.match  
  inflating: vienna4x22-master/match/Chopin_op10_no3_p08.match  
  inflating: vienna4x22-master/match/Chopin_op10_no3_p09.match  
  inflating: vienna4x22-master/match/Chopin_op10_no3_p10.match  
  inflating: vienna4x22-master/match/Chopin_op10_no3_p11.match  
  inflating: vienna4x22-master/match/Chopin_op1

In [2]:
files = [(os.path.join(root, file), os.path.join(os.path.dirname(root), "musicxml", file[:-10]+".musicxml")) for root, dirs, files in os.walk("vienna4x22-master") for file in files if file.endswith(".match")]

In [38]:
files[0]

('vienna4x22-master/match/Chopin_op10_no3_p14.match',
 'vienna4x22-master/musicxml/Chopin_op10_no3.musicxml')

In [39]:
def produce_match(alignment_fn, mfn, sfn, match_name):
	"""
	Produce and Save Match.

	Parameters
	----------
	mfn : str
		Performance Midi File Path
	sfn : str
		Score musicxml File Path
	alignment_fn : str
		Alignment ".txt" file path
	match_name : str
		Path and Save Name.
	"""
	data = pd.read_csv(alignment_fn, sep="\t")

	alignment = list()
	for x in data[["xml_id", "midi_id"]].to_numpy():
		if x[1] == "deletion":
			dd = dict(label="deletion", score_id=x[0])
		# TODO for asap alignments to contain "n"
		elif x[0] == "insertion":
			dd = dict(label="insertion", performance_id=str(x[1]))
		else:
			dd = dict(label="match", score_id=x[0], performance_id=str(x[1]))
		alignment.append(dd)
	ppart = pt.load_performance_midi(mfn)
	# This may cause re-indexing.
	spart = pt.score.merge_parts(pt.load_musicxml(sfn))
	spart = pt.score.unfold_part_maximal(spart, ignore_leaps=False)
	pt.save_match(alignment, ppart, spart, match_name)

In [14]:
for (afn, mfn, sfn, match_name) in files:
	try:
		produce_match(afn, mfn, sfn, match_name)
	except Exception as e:
		print("Match creation of file {} failed with {}".format(match_name, e))

                            o_map: -- Tuplet start=n1 end=n8 start=None end=None, substituting None

                            o_map: -- Tuplet start=n51 end=n58 start=None end=None, substituting None

                            o_map: -- Tuplet start=n109 end=n116 start=None end=None, substituting None

                            o_map: -- Tuplet start=n572 end=n579 start=None end=None, substituting None

                            o_map: -- Tuplet start=n798 end=n805 start=None end=None, substituting None

                            o_map: -- Tuplet start=n400 end=n407 start=None end=None, substituting None

                            o_map: -- Tuplet start=n625 end=n630 start=None end=None, substituting None

                            o_map: -- Tuplet start=n922 end=n929 start=None end=None, substituting None

                            o_map: -- Tuplet start=n798 end=n805 start=None end=None, substituting None

                            o_map: -- Tuplet start=n1 end=n8 

In [33]:
def tokenize_pitch_spelling(ps_note):
	step = {"A": 0, "B": 1, "C": 2, "D": 3, "E": 4, "F": 5, "G": 6}[ps_note["step"].item()]
	return [step, ps_note["alter"].item(), ps_note["octave"].item()]

def create_data(files):
	X_train = list()
	y_train = list()
	X_test = list()
	y_test = list()
	for match_file, score_file in files:
		performance, alignment = pt.load_match(match_file)
		score = pt.load_score(score_file)
		matched_notes = [alignment[idx] for idx, d in enumerate(alignment) if d["label"] == "match"]
		pna = performance.note_array()
		sna = score.note_array(include_pitch_spelling=True)
		X, y = np.zeros((len(matched_notes), 3), dtype=float), np.zeros((len(matched_notes), 3), dtype=int)
		for idx, match_note in enumerate(matched_notes):
			X[idx] = np.lib.recfunctions.structured_to_unstructured(pna[np.where(pna["id"] == str(match_note["performance_id"]))][["onset_sec", "duration_sec", "pitch"]])
			y[idx] = tokenize_pitch_spelling(sna[np.where(sna["id"] == match_note["score_id"])][["step", "alter", "octave"]])
		if os.path.basename(match_file).startswith("Mozart"):
			X_test.append(X)
			y_test.append(y)
		else:
			X_train.append(X)
			y_train.append(y)
	return X_train, y_train, X_test, y_test

In [34]:
X_train, y_train, X_test, y_test = create_data(files)

Exception in thread Exception in thread QueueFeederThread:
Traceback (most recent call last):
  File "/home/manos/miniconda3/envs/partitura_tutorial/lib/python3.10/multiprocessing/queues.py", line 239, in _feed
QueueFeederThread:
Traceback (most recent call last):
  File "/home/manos/miniconda3/envs/partitura_tutorial/lib/python3.10/multiprocessing/queues.py", line 239, in _feed
    reader_close()
  File "/home/manos/miniconda3/envs/partitura_tutorial/lib/python3.10/multiprocessing/connection.py", line 182, in close
    self._close()
  File "/home/manos/miniconda3/envs/partitura_tutorial/lib/python3.10/multiprocessing/connection.py", line 366, in _close
    reader_close()
  File "/home/manos/miniconda3/envs/partitura_tutorial/lib/python3.10/multiprocessing/connection.py", line 182, in close
    _close(self._handle)
OSError: [Errno 9] Bad file descriptor

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/manos/miniconda

### Model

In [35]:
PAD = "<PAD>"

PITCHES = {
	0: ["C", "B#", "D--"],
	1: ["C#", "B##", "D-"],
	2: ["D", "C##", "E--"],
	3: ["D#", "E-", "F--"],
	4: ["E", "D##", "F-"],
	5: ["F", "E#", "G--"],
	6: ["F#", "E##", "G-"],
	7: ["G", "F##", "A--"],
	8: ["G#", "A-"],
	9: ["A", "G##", "B--"],
	10: ["A#", "B-", "C--"],
	11: ["B", "A##", "C-"],
}

INTERVALS = {
	0: ["P1", "d2", "A7"],
	1: ["m2", "A1"],
	2: ["M2", "d3", "AA1"],
	3: ["m3", "A2"],
	4: ["M3", "d4", "AA2"],
	5: ["P4", "A3"],
	6: ["d5", "A4"],
	7: ["P5", "d6", "AA4"],
	8: ["m6", "A5"],
	9: ["M6", "d7", "AA5"],
	10: ["m7", "A6"],
	11: ["M7", "d1", "AA6"],
}

DIATONIC_PITCHES = ["C", "D", "E", "F", "G", "A", "B"]

KEY_SIGNATURES = list(range(-7, 8))
accepted_pitches = [ii for i in PITCHES.values() for ii in i]
accepted_ks = KEY_SIGNATURES
pitch_to_ix = {p: accepted_pitches.index(p) for p in accepted_pitches}
ks_to_ix = {k: KEY_SIGNATURES.index(k) for k in KEY_SIGNATURES}
# add PADDING TAD
pitch_to_ix[PAD] = len(accepted_pitches)
ks_to_ix[PAD] = len(KEY_SIGNATURES)

In [48]:
class PKSpell(nn.Module):
	"""Models that decouples key signature estimation from pitch spelling by adding a second RNN.
	This model reached state of the art performances for pitch spelling.
	"""

	def __init__(
		self,
		input_dim=3,
		hidden_dim=100,
		pitch_to_ix=pitch_to_ix,
		hidden_dim2=24,
		rnn_depth=1,
		dropout=0.1,
		bidirectional=True
	):
		super(PKSpell, self).__init__()
		self.dropout = nn.Dropout(dropout)
		self.n_out_pitch = len(pitch_to_ix)
		self.hidden_dim = hidden_dim
		self.hidden_dim2 = hidden_dim2

		# RNN layer.
		self.rnn = nn.LSTM(
			input_size=input_dim,
			hidden_size=hidden_dim // 2 if bidirectional else hidden_dim,
			bidirectional=bidirectional,
			num_layers=rnn_depth,
		)
		# Output layers.
		self.top_layer_pitch = nn.Linear(hidden_dim, self.n_out_pitch)
		# Loss function that we will use during training.
		self.loss_pitch = nn.CrossEntropyLoss(
			reduction="mean", ignore_index=pitch_to_ix[PAD]
		)

	def compute_outputs(self, sentences, sentences_len):
		sentences = nn.utils.rnn.pack_padded_sequence(sentences, sentences_len)
		rnn_out, _ = self.rnn(sentences)
		rnn_out, _ = nn.utils.rnn.pad_packed_sequence(rnn_out)
		if self.dropout is not None:
			rnn_out = self.dropout(rnn_out)
		out_pitch = self.top_layer_pitch(rnn_out)
		return out_pitch

	def forward(self, sentences, pitches, sentences_len):
		# First computes the predictions, and then the loss function.

		# Compute the outputs. The shape is (max_len, n_sentences, n_labels).
		scores_pitch = self.compute_outputs(sentences, sentences_len)

		# Flatten the outputs and the gold-standard labels, to compute the loss.
		# The input to this loss needs to be one 2-dimensional and one 1-dimensional tensor.
		scores_pitch = scores_pitch.view(-1, self.n_out_pitch)
		pitches = pitches.view(-1)
		loss = self.loss_pitch(scores_pitch, pitches)
		return loss

	def predict(self, sentences, sentences_len):
		# Compute the outputs from the linear units.
		scores_pitch, scores_ks = self.compute_outputs(sentences, sentences_len)

		# Select the top-scoring labels. The shape is now (max_len, n_sentences).
		predicted_pitch = scores_pitch.argmax(dim=2)
		return [predicted_pitch[: int(l), i].cpu().numpy() for i, l in enumerate(sentences_len)]


class PSDataset(Dataset):
	def __init__(self, x, y):
		super(PSDataset, self).__init__()
		self.x = x
		self.y = y
	def __getitem__(self, idx):
		return torch.tensor(self.x[idx]), torch.tensor(self.y[idx])
	def __len__(self):
		return len(self.x)

def collate_ps(data):
	def merge(sequences):
		lengths = [len(seq) for seq in sequences]
		padded_seqs = torch.zeros(len(sequences), max(lengths)).long()
		for i, seq in enumerate(sequences):
			end = lengths[i]
			padded_seqs[i, :end] = seq[:end]
		return padded_seqs, lengths

	# sort a list by sequence length (descending order) to use pack_padded_sequence
	data.sort(key=lambda x: len(x[0]), reverse=True)

	# seperate source and target sequences
	src_seqs, trg_seqs = zip(*data)

	# merge sequences (from tuple of 1D tensor to 2D tensor)
	src_seqs, src_lengths = merge(src_seqs)
	trg_seqs, trg_lengths = merge(trg_seqs)

	return src_seqs, src_lengths, trg_seqs

class PKSpellPL(pl.LightningModule):
	def __init__(self):
		super(PKSpellPL, self).__init__()
		self.module = PKSpell()
	def training_step(self, batch, batch_idx):
		src_seqs, src_lengths, trg_seqs = batch
		loss = self.module(src_seqs, src_lengths, trg_seqs)
		self.log("train_loss", loss.item(), on_epoch=True)
		return loss
	def val_step(self, batch, batch_idx):
		src_seqs, src_lengths, trg_seqs = batch
		loss = self.module(src_seqs, src_lengths, trg_seqs)
		self.log("val_loss", loss.item(), on_epoch=True)
		return loss
	def configure_optimizers(self):
		optimizer = torch.optim.Adam(self.parameters(), lr=0.001, weight_decay=5e-4)
		return {
			"optimizer": optimizer,
		}


### Train the PKSpell model

In [49]:
model = PKSpellPL()
train_dataloader = DataLoader(PSDataset(X_train, y_train), collate_fn=collate_ps, batch_size=3, num_workers=2)
val_dataloader = DataLoader(PSDataset(X_test, y_test), collate_fn=collate_ps, batch_size=3, num_workers=2)
trainer = pl.Trainer()

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [50]:
trainer.fit(model, train_dataloader, val_dataloader)

  rank_zero_warn(
  rank_zero_warn("You passed in a `val_dataloader` but have no `validation_step`. Skipping val loop.")
Missing logger folder: /home/manos/Desktop/JKU/codes/partitura_tutorial/content/lightning_logs

  | Name   | Type    | Params
-----------------------------------
0 | module | PKSpell | 25.6 K
-----------------------------------
25.6 K    Trainable params
0         Non-trainable params
25.6 K    Total params
0.103     Total estimated model params size (MB)
  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/manos/miniconda3/envs/partitura_tutorial/lib/python3.10/site-packages/torch/utils/data/_utils/worker.py", line 302, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/manos/miniconda3/envs/partitura_tutorial/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 61, in fetch
    return self.collate_fn(data)
  File "/tmp/ipykernel_701914/2432978765.py", line 93, in collate_ps
    src_seqs, src_lengths = merge(src_seqs)
  File "/tmp/ipykernel_701914/2432978765.py", line 83, in merge
    padded_seqs[i, :end] = seq[:end]
RuntimeError: expand(torch.DoubleTensor{[728, 3]}, size=[728]): the number of sizes provided (1) must be greater or equal to the number of dimensions in the tensor (2)


# Voice Separation


Voice separation refers to the problem of assigning voices or distinct melody lines given an unquantized midi. This is usefull for the scorification of a MIDI file, the creation of different parts and analysis of different functional melodic lines.

In this tutorial we will see how to implement a model for Voice Separation from MIDI data and compare it with the build-in Voice Separation algorithm in Partitura based on Chew's elliptical model.

For this task we will use the 370 Bach Chorales Dataset which is comprised from 4 Parts for 4 Voices. We will load the parts and merge them together, eventually using a pianoroll like representation for which we will keep the Voice information as Ground truth.

In [None]:
# ! pip install parittura, plotlyb

In [None]:
! wget https://github.com/craigsapp/bach-370-chorales/archive/refs/heads/master.zip
! unzip bach-370-chorales-master

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import partitura
import numpy as np
import plotly.express as px
import os

In [None]:
def pianorolls_from_part(
    x: Union[Union[partitura.score.Part, partitura.score.PartGroup, str], np.ndarray],
    time_unit: str = "beat",
    time_div: int = 12,
    musical_beat: bool = True,
    path: str = "",
) -> Dict:
    if (
        isinstance(x, partitura.score.Part)
        or isinstance(x, partitura.score.PartGroup)
        or isinstance(x, list)
    ):
        parts = list(partitura.score.iter_parts(x))
        # set musical beat if requested
        [part.use_musical_beat() for part in parts]
        # get the maximum length of all parts to avoid shorter pianorolls
        end_time = max([int(part.beat_map([part._points[-1].t])) for part in parts])
        # define the parameters of the compute_pianoroll function
        get_pianoroll = partial(
            partitura.utils.compute_pianoroll,
            time_unit=time_unit,
            time_div=time_div,
            piano_range=True,
            remove_silence=False,
            end_time=end_time,
        )
        # compute pianorolls for all separated voices
        separated_prs = np.array([get_pianoroll(part) for part in parts])
        if not all([pr.shape == separated_prs[0].shape for pr in separated_prs]):
            raise Exception(f"Pianorolls of different lenght in {path}")
        # compute mixed pianoroll
        part = partitura.score.merge_parts(parts, reassign="voice")
        # mixed_pr = get_pianoroll(part)
        # compute mixed note array
        mixed_notearray = part.note_array()

        ## compute the voice_pianoroll, that will have the number of the voice in the bin where there is a note, and -1 where there is not
        dense_pianoroll = np.array([pr.todense() for pr in separated_prs])
        negative_pianoroll = np.zeros(dense_pianoroll.shape[1:])
        negative_pianoroll[np.sum(dense_pianoroll, axis=0) == 0] = 1
        voice_pianoroll = (
            np.concatenate(
                [np.expand_dims(negative_pianoroll, axis=0), dense_pianoroll]
            ).argmax(axis=0)
            - 1
        )
        # return {"separated_pianorolls": separated_prs, "mixed_pianoroll": mixed_pr, "mixed_notearray": mixed_notearray, "path": path}
        return {
            "voice_pianoroll": voice_pianoroll,
            "notearray_pitch": mixed_notearray["pitch"].astype(int),
            "notearray_onset_beat": mixed_notearray["onset_beat"].astype(float),
            "notearray_duration_beat": mixed_notearray["duration_beat"].astype(float),
            "notearray_voice": mixed_notearray["voice"].astype(int),
            "path": path,
        }
    elif isinstance(x, str):
        # print(f"Processing {x}")
        return pianorolls_from_part(
            partitura.load_score(x),
            time_unit,
            time_div,
            musical_beat,
            x.split(os.path.sep)[-1],
        )
    else:
        raise TypeError(f"x must be a list of Parts, not {type(x)}")


def pr_to_voice_pred(
    pianoroll: np.ndarray,
    onset_beat: np.ndarray,
    duration_beat: np.ndarray,
    pitch: np.ndarray,
    piano_range: bool,
    time_div: int,
):
    """
    Take the predicted voices from a pianoroll and map them into the note_array.
    Returns a list with a voice for each note in the input note array.

    The input pianoroll has dimension Tx88xV where V is the maximum number of voices.
    For a fixed t and note, it should contains V log probabilities that the note belong to each voice.
    WARNING: this does not work with normal or unnormalized probabilities. Only with log probabilities

    Returns a voice array, one for each note in the same order as the input parameters.
    Voices start from 1.
    """

    # shift in case the first time position is negative (pickup measure)
    positive_onset_beats = onset_beat
    if onset_beat[0] < 0:
        positive_onset_beats = positive_onset_beats - positive_onset_beats[0]
    pr_onset_idxs = torch.round(time_div * positive_onset_beats).int()
    pr_durations = torch.clip(
        torch.round(time_div * duration_beat).int(), min=1, max=None
    )
    pr_offset_idxs = pr_onset_idxs + pr_durations

    pitch_idxs = pitch
    if piano_range:
        pitch_idxs = pitch_idxs - 21  # pianorolls are with 88 only notes

    pred_voice = torch.zeros(pitch.shape, dtype=torch.int64)
    for i, (p, ons, offs) in enumerate(zip(pitch_idxs, pr_onset_idxs, pr_offset_idxs)):
        # get predictions from the pianoroll
        voice = pianoroll[:, ons : offs + 1, p]
        # sum the log probs to get a unique probability for the entire note, and take the max
        pred_voice[i] = torch.sum(voice, axis=1).argmax() + 1

    return pred_voice

def show_voice_pr(pitches, onsets, durations, voices, time_unit, time_div, return_figure= False, colors = None):
    unique_voices = np.unique(voices)
    # create the structured arrays
    struct_array = np.zeros(len(voices), dtype={'names':('pitch', 'onset_beat', 'duration_beat'),
                          'formats':('i4', 'f4', 'f4')})
    struct_array["pitch"] = pitches
    struct_array["onset_beat"] = onsets
    struct_array["duration_beat"] = durations

    # create a pianoroll where each voice has a different value
    piano_rolls = []
    end_time = onsets[-1] + durations[-1]
    for i,voice_n in enumerate(unique_voices):
        pr = partitura.utils.compute_pianoroll(struct_array[voices==voice_n],piano_range =True, time_unit = time_unit, time_div=time_div, remove_silence = False, end_time = float(end_time))
        piano_rolls.append(pr.multiply(voice_n).todense())
    # this takes the maximum, meaning that if two voices share the same note, only the highest voice will be shown
    mixed_pr = np.maximum.reduce(piano_rolls)
    if colors is None:
        colors = px.colors.sample_colorscale("turbo", [voice_n/(np.max(unique_voices) -1) for voice_n in range(np.max(unique_voices))])

    separators = [voice_n/np.max(unique_voices) for voice_n in unique_voices]
    epsilon = 0.001
    background_color = "rgba(0,0,0,0.0)"
    color_scale = [(0.0,background_color),(0+epsilon,background_color)]
    last_value = 0
    for i,sep in enumerate(separators):
        color_scale.append((last_value+epsilon,colors[i]))
        color_scale.append((sep,colors[i]))
        last_value = sep

    fig = px.imshow( mixed_pr, origin="lower", color_continuous_scale = color_scale)
    fig.show()
    if return_figure:
        return fig


In [None]:
prs = list()
for root, dirs, files in os.walk("/content/bach-370-chorales-master"):
	for file in files:
		if file.endswith("krn"):
			score = partitura.load_score(file)
			pianoroll = pianorolls_from_part(score)
			prs.append(pianoroll)