## Setup in Local Machine

In [None]:
#Install
%pip install worker
%pip install pandas
%pip install numpy
%pip install networkx
%pip install matplotlib
%pip install music21
%pip install musescore
%pip install tslearn
%pip install sklearn
%pip install multiprocessing
%pip install pyvis

In [28]:
# Import necessary libraries
import os

import worker
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt

from music21 import *
from music21 import converter, corpus, environment, note, chord
from tslearn.metrics import dtw
from multiprocessing import Pool, Manager, cpu_count
from sklearn.neighbors import kneighbors_graph
from sklearn.metrics import pairwise_distances
from networkx.algorithms.cuts import conductance
from pyvis.network import Network
from IPython.display import display, HTML

# Point music21 at the MuseScore executable; both settings use the same binary.
# NOTE(review): this is a machine-specific absolute Windows path - adjust per install.
MUSESCORE_EXE = 'C:\\Program Files\\MuseScore 4\\bin\\MuseScore4.exe'

env = environment.Environment()
env['musicxmlPath'] = MUSESCORE_EXE
env['musescoreDirectPNGPath'] = MUSESCORE_EXE

## Process Sample Music

#### Functions

In [29]:
# midi to score array function (contains all data needed for score visualization)
def midi_to_sarr(midi_parsed):
    """Flatten a parsed score into a single list of its elements.

    Walks ``midi_parsed.parts`` in order and, for each part, collects every
    element yielded by ``part.flatten()``.

    Returns:
        list: all elements of all parts, part after part.
    """
    return [element for part in midi_parsed.parts for element in part.flatten()]

In [30]:
# sarr to narr and nmat function (removes all elements except notes, rests and chords then turn it into a matrix )
def sarr_to_nmat_and_narr(score_array):
  """Split a score array into a note matrix and the matching note array.

  Chords are represented by the MIDI number of their root, rests by pitch 0
  with the symbol 'rest', and any other element is kept only if it exposes
  ``.pitch.midi``. Elements without a pitch are dropped.

  Args:
    score_array: iterable of score elements (as produced by midi_to_sarr).

  Returns:
    (nmat, narr): ``nmat`` is a DataFrame with columns 'onset_beats',
    'duration_beats', 'midi_pitch' and 'ir_symbol', one row per kept element;
    ``narr`` is the list of kept elements in the same order.
  """
  columns = ['onset_beats', 'duration_beats', 'midi_pitch', 'ir_symbol']
  rows = []
  narr = []

  for element in score_array:
    if isinstance(element, chord.Chord):
      row = [element.offset, element.duration.quarterLength, element.root().midi, 'unassigned']
    elif isinstance(element, note.Rest):
      row = [element.offset, element.duration.quarterLength, 0, 'rest']
    else:
      try:
        row = [element.offset, element.duration.quarterLength, element.pitch.midi, 'unassigned']
      except AttributeError:
        # Not a pitched element; only a missing attribute is treated as
        # "skip", so unrelated errors are no longer swallowed.
        continue
    rows.append(row)
    narr.append(element)

  # Build the frame once instead of appending row by row.
  nmat = pd.DataFrame(rows, columns=columns)
  return nmat, narr

#### Usage

In [178]:
# parse the midi and keep score title
midi_file = 'bach_846.mid' # Hardcoded, for multiple songs, make a function to iterate thru folder
# midi_file = ".\\Music Database\\GTTM Database\\Alexander Porfir’evich Borodin\\Quartet for 2 Violins, Viola and Violoncello D major 3 Nocturne.xml"
midi_parsed = converter.parse(midi_file)
# Temporary: the filename without its extension stands in for the score title.
# splitext handles extensions of any length (".mid", ".midi", ".musicxml", ...).
score_title = os.path.splitext(midi_file)[0]

In [179]:
# Verify the paths: echo the two MuseScore-related settings stored in `env`
for _label, _setting in (("MusicXML Path:", 'musicxmlPath'),
                         ("MuseScore Direct PNG Path:", 'musescoreDirectPNGPath')):
    print(_label, env[_setting])

MusicXML Path: C:\Program Files\MuseScore 4\bin\MuseScore4.exe
MuseScore Direct PNG Path: C:\Program Files\MuseScore 4\bin\MuseScore4.exe


In [146]:
# Show the parsed score through music21's show() with the "xml" format.
# NOTE(review): presumably this opens the MuseScore executable configured in `env` - confirm.
midi_parsed.show("xml")

In [180]:
# convert parsed midi into a readable array of elements that forms the score
# (every element of every part, including non-note elements such as clefs and tempo marks)
sarr = midi_to_sarr(midi_parsed) # this midi was cleaned
sarr

[<music21.instrument.Piano 'Piano, Piano right: Piano, Piano right'>,
 <music21.instrument.Piano 'Piano'>,
 <music21.clef.TrebleClef>,
 <music21.tempo.MetronomeMark andante Quarter=74>,
 <music21.key.Key of C major>,
 <music21.meter.TimeSignature 4/4>,
 <music21.note.Rest eighth>,
 <music21.note.Note G>,
 <music21.note.Note C>,
 <music21.tempo.MetronomeMark Quarter=75>,
 <music21.note.Note E>,
 <music21.tempo.MetronomeMark andante Quarter=74>,
 <music21.note.Note G>,
 <music21.note.Note C>,
 <music21.note.Note E>,
 <music21.note.Rest eighth>,
 <music21.note.Note G>,
 <music21.note.Note C>,
 <music21.note.Note E>,
 <music21.tempo.MetronomeMark Quarter=75>,
 <music21.note.Note G>,
 <music21.note.Note C>,
 <music21.note.Note E>,
 <music21.tempo.MetronomeMark andante Quarter=74>,
 <music21.note.Rest eighth>,
 <music21.note.Note A>,
 <music21.tempo.MetronomeMark Quarter=76>,
 <music21.note.Note D>,
 <music21.tempo.MetronomeMark andantino Quarter=78>,
 <music21.note.Note F>,
 <music21.tempo.

In [181]:
# convert score array into a note array and note matrix
# nmat: one DataFrame row per kept element; narr: the kept elements in the same order
nmat, narr = sarr_to_nmat_and_narr(sarr)

In [182]:
# Display the note array (the notes, rests and chords kept by sarr_to_nmat_and_narr)
narr

[<music21.note.Rest eighth>,
 <music21.note.Note G>,
 <music21.note.Note C>,
 <music21.note.Note E>,
 <music21.note.Note G>,
 <music21.note.Note C>,
 <music21.note.Note E>,
 <music21.note.Rest eighth>,
 <music21.note.Note G>,
 <music21.note.Note C>,
 <music21.note.Note E>,
 <music21.note.Note G>,
 <music21.note.Note C>,
 <music21.note.Note E>,
 <music21.note.Rest eighth>,
 <music21.note.Note A>,
 <music21.note.Note D>,
 <music21.note.Note F>,
 <music21.note.Note A>,
 <music21.note.Note D>,
 <music21.note.Note F>,
 <music21.note.Rest eighth>,
 <music21.note.Note A>,
 <music21.note.Note D>,
 <music21.note.Note F>,
 <music21.note.Note A>,
 <music21.note.Note D>,
 <music21.note.Note F>,
 <music21.note.Rest eighth>,
 <music21.note.Note G>,
 <music21.note.Note D>,
 <music21.note.Note F>,
 <music21.note.Note G>,
 <music21.note.Note D>,
 <music21.note.Note F>,
 <music21.note.Rest eighth>,
 <music21.note.Note G>,
 <music21.note.Note D>,
 <music21.note.Note F>,
 <music21.note.Note G>,
 <music21.

## Implication-Realization Ruleset, Assignment and Score Visualization

#### Functions

In [183]:
# IR symbol calculation function
def calculate_ir_symbol(interval1, interval2, threshold=5):
    """Classify a pair of consecutive pitch intervals as an IR symbol.

    Args:
        interval1: signed interval from the first to the second pitch.
        interval2: signed interval from the second to the third pitch.
        threshold: boundary between "small" and "large" intervals. It is
            applied to every branch, including those where one interval is
            zero.

    Returns:
        str: one of 'P', 'D', 'IP', 'ID', 'VP', 'R', 'IR', 'VR'.
    """
    direction = interval1 * interval2           # > 0 same direction, < 0 opposite, 0 if either is a repeat
    size1, size2 = abs(interval1), abs(interval2)
    signed_change = abs(interval2 - interval1)  # change between the signed intervals
    size_change = size2 - size1                 # change between the interval magnitudes

    # IR1: P (Process)
    if direction > 0 and signed_change < threshold:
        return 'P'
    # IR2: D (Duplication)
    elif interval1 == interval2 == 0:
        return 'D'
    # IR3: IP (Intervallic Process)
    elif direction < 0 and (-threshold <= size_change <= threshold) and size2 != size1:
        return 'IP'
    # IR4: ID (Intervallic Duplication)
    elif direction < 0 and size2 == size1:
        return 'ID'
    # IR5: VP (Vector Process)
    elif direction > 0 and signed_change >= threshold and size1 <= threshold:
        return 'VP'
    # IR6: R (Reversal)
    elif direction < 0 and abs(size_change) >= threshold and size1 >= threshold:
        return 'R'
    # IR7: IR (Intervallic Reversal)
    elif direction > 0 and abs(size_change) >= threshold and size1 >= threshold:
        return 'IR'
    # IR8: VR (Vector Reversal)
    elif direction < 0 and signed_change >= threshold and size1 <= threshold:
        return 'VR'
    # From here on exactly one interval is zero (a repeated pitch).
    elif interval2 == 0 and size1 <= threshold:
        return 'IP'
    elif interval2 == 0 and size1 > threshold:
        return 'R'
    elif interval1 == 0 and size2 <= threshold:
        return 'P'
    elif interval1 == 0 and size2 > threshold:
        return 'VR'

In [184]:
# assign IR symbol function (original; modified)
def assign_ir_symbols(score_array):
    """Label every note, chord and rest with an IR symbol and a display color.

    Consecutive notes/chords are buffered into groups of up to three. A full
    group of three is classified by calculate_ir_symbol from its two pitch
    intervals (chords contribute the pitch of their root). A group closed at
    two members is a dyad ('d'), at one member a monad ('M'). A rest, or any
    element that is not a note, chord or rest, closes the group in progress.
    Rests are emitted as (rest, 'rest', 'black'); other elements are not
    emitted.

    Returns:
        list of (element, symbol, color) tuples in score order.
    """
    palette = {
        'P': 'blue',        # IR1: P (Process)
        'D': 'green',       # IR2: D (Duplication)
        'IP': 'red',        # IR3: IP (Intervallic Process)
        'ID': 'orange',     # IR4: ID (Intervallic Duplication)
        'VP': 'purple',     # IR5: VP (Vector Process)
        'R': 'cyan',        # IR6: R (Reversal)
        'IR': 'magenta',    # IR7: IR (Intervallic Reversal)
        'VR': 'yellow',     # IR8: VR (Vector Reversal)
        'M': 'pink',        # IR9: M (Monad)
        'd': 'lime',        # IR10 d (Dyad)
    }

    labelled = []
    pending = []  # (element, pitch) pairs of the group being built

    def flush():
        """Label the buffered group, emit it and empty the buffer."""
        size = len(pending)
        if size == 3:
            (_, low), (_, mid), (_, high) = pending
            tag = calculate_ir_symbol(mid - low, high - mid)
            shade = palette.get(tag, 'black')  # black for a symbol missing from the palette
        elif size == 2:
            tag, shade = 'd', palette['d']
        elif size == 1:
            tag, shade = 'M', palette['M']
        else:
            pending.clear()
            return
        labelled.extend([(member, tag, shade) for member, _ in pending])
        pending.clear()

    for element in score_array:
        if isinstance(element, note.Note):
            pending.append((element, element.pitch.ps))
        elif isinstance(element, chord.Chord):
            pending.append((element, element.root().ps))
        elif isinstance(element, note.Rest):
            # The rest closes the current group and is kept for visualization.
            flush()
            labelled.append((element, 'rest', 'black'))
            continue
        else:
            flush()
            continue

        if len(pending) == 3:
            flush()

    # Handle any remaining notes
    flush()
    return labelled

In [185]:
# Score visualization function
def visualize_notes_with_symbols(notes_with_symbols):
    """Color and annotate labelled elements, then show them as a one-part score.

    Args:
        notes_with_symbols: iterable of (element, symbol, color) tuples. Each
            element is modified in place: its style color is set to ``color``
            and its lyric to ``symbol``.

    Prints every tuple, appends the elements to a new part inside a new score
    and calls ``show()`` on that score.
    """
    part = stream.Part()
    for element, symbol, color in notes_with_symbols:
        print(element, symbol, color)
        element.style.color = color
        element.lyric = symbol
        part.append(element)

    score = stream.Score()
    score.append(part)
    score.show()

In [186]:
# store ir symbol in nmat function, pass narr data to nmat
# NOTE(review): a second definition with the same name follows in the next cell and shadows this one.
def pass_ir_symbol_narr_to_nmat(note_array, note_matrix):
    """Copy the IR symbol of each labelled element into the note matrix.

    Args:
        note_array: sequence of (element, ir_symbol, color) tuples, in the
            same order as the rows of ``note_matrix``.
        note_matrix: DataFrame with an 'ir_symbol' column; modified in place.
            Row labels are assumed to be 0..n-1.

    Returns:
        The same ``note_matrix`` object.
    """
    for pointer, (note_data, ir_symbol, color) in enumerate(note_array):
        # Write to the matrix that was passed in, not to the notebook-global nmat.
        note_matrix.loc[pointer, 'ir_symbol'] = ir_symbol
    return note_matrix

In [187]:
def pass_ir_symbol_narr_to_nmat(note_array, note_matrix):
    """Write each labelled element's IR symbol into ``note_matrix['ir_symbol']``.

    ``note_array`` holds (element, ir_symbol, color) tuples; the k-th tuple is
    written to the row labelled k. The matrix is modified in place and also
    returned.
    """
    row_label = 0
    for _element, symbol, _color in note_array:
        note_matrix.at[row_label, 'ir_symbol'] = symbol
        row_label += 1
    return note_matrix

#### Usage

In [188]:
# Usage of the above functions
# NOTE: this rebinds `narr` from a list of elements to a list of
# (element, symbol, color) tuples, so running the cell a second time feeds
# those tuples back into assign_ir_symbols.
try:
    narr = assign_ir_symbols(narr)
except (NameError, AttributeError): # temporary fix for when error occurs
    # NOTE(review): presumably the error being worked around is a music21 name
    # such as `note` or `chord` having been rebound in the kernel - confirm.
    # Only those failure types are retried; anything else now surfaces.
    print("An erorr occured, re-importing music21")
    from music21 import *
    narr = assign_ir_symbols(narr)

# store ir symbol in nmat, pass narr data to nmat
pass_ir_symbol_narr_to_nmat(narr, nmat)

Unnamed: 0,onset_beats,duration_beats,midi_pitch,ir_symbol
0,0.0,0.5,0,rest
1,0.5,0.25,67,P
2,0.75,0.25,72,P
3,1.0,0.25,76,P
4,1.25,0.25,67,P
...,...,...,...,...
753,132.0,3.75,36,d
754,132.25,3.5,50,d
755,135.75,0.25,0,rest
756,136.0,2.75,36,M


In [189]:
# Display the note matrix after the IR symbols were written into it
nmat

Unnamed: 0,onset_beats,duration_beats,midi_pitch,ir_symbol
0,0.0,0.5,0,rest
1,0.5,0.25,67,P
2,0.75,0.25,72,P
3,1.0,0.25,76,P
4,1.25,0.25,67,P
...,...,...,...,...
753,132.0,3.75,36,d
754,132.25,3.5,50,d
755,135.75,0.25,0,rest
756,136.0,2.75,36,M


In [190]:
def assign_ir_pattern_indices(notematrix):
    """Number the IR groups and store the number in a 'pattern_index' column.

    Rows are consumed from the top. A row whose 'ir_symbol' is 'd' starts a
    group of two rows, 'M' or 'rest' a group of one row, and any other symbol
    a group of three rows. Groups are numbered 0, 1, 2, ... in order.

    The column is added to ``notematrix`` in place; the same frame is returned.
    """
    symbols = notematrix['ir_symbol'].tolist()
    rows_per_group = {'d': 2, 'M': 1, 'rest': 1}  # everything else spans 3 rows

    indices = []
    cursor = 0
    group_id = 0
    while cursor < len(symbols):
        span = rows_per_group.get(symbols[cursor], 3)
        indices.extend([group_id] * span)
        cursor += span
        group_id += 1

    notematrix['pattern_index'] = indices
    return notematrix

In [191]:
# Adds the 'pattern_index' column to nmat in place and displays the returned frame
assign_ir_pattern_indices(nmat)

Unnamed: 0,onset_beats,duration_beats,midi_pitch,ir_symbol,pattern_index
0,0.0,0.5,0,rest,0
1,0.5,0.25,67,P,1
2,0.75,0.25,72,P,1
3,1.0,0.25,76,P,1
4,1.25,0.25,67,P,2
...,...,...,...,...,...
753,132.0,3.75,36,d,392
754,132.25,3.5,50,d,392
755,135.75,0.25,0,rest,393
756,136.0,2.75,36,M,394


## Gestalt Based Segmentation (Functions)

In [192]:
# onset function
def get_onset(notematrix: pd.DataFrame, timetype='beat'):
  """Return the onset column of a note matrix.

  Args:
    notematrix: DataFrame holding 'onset_beats' (and 'onset_sec' when
      ``timetype='sec'`` is requested).
    timetype: 'beat' (default) or 'sec'.

  Returns:
    pd.Series: the selected onset column.

  Raises:
    ValueError: if ``timetype`` is neither 'beat' nor 'sec'.
  """
  if timetype == 'beat':
    return notematrix['onset_beats']
  elif timetype == 'sec':
    return notematrix['onset_sec']
  else:
    raise ValueError(f"Invalid timetype: {timetype}. Choices are only 'beat' and 'sec'")

In [193]:
# duration function
def get_duration(notematrix: pd.DataFrame, timetype='beat') -> pd.Series:
  """Return the duration column of a note matrix.

  Args:
    notematrix: DataFrame holding 'duration_beats' (and 'duration_sec' when
      ``timetype='sec'`` is requested).
    timetype: 'beat' (default) or 'sec'.

  Returns:
    pd.Series: the selected duration column.

  Raises:
    ValueError: if ``timetype`` is neither 'beat' nor 'sec'.
  """
  if timetype == 'beat':
    return notematrix['duration_beats']
  elif timetype == 'sec':
    return notematrix['duration_sec']
  else:
    raise ValueError(f"Invalid timetype: {timetype}. Choices are only 'beat' and 'sec'")

In [194]:
# Calculate Clang Boundaries Function
def calculate_clang_boundaries(notematrix: pd.DataFrame):
    """Locate clang boundaries from a per-row distance measure.

    For every row the measure is
    ``2 * (onset difference to the previous row + duration of the next row)
    + |pitch difference to the previous row|``, with missing neighbours
    counted as 0. A row is flagged when its value is strictly smaller than the
    value of both neighbouring rows (a missing neighbour counts as 0).

    NOTE(review): the flag marks local minima of the measure; confirm that
    minima rather than maxima are intended.

    Returns:
        (clind, clb): ``clind`` is the list of index labels of the flagged
        rows, ``clb`` the boolean Series of flags.
    """
    onset_step = get_onset(notematrix).diff().fillna(0)
    following_duration = get_duration(notematrix).shift(-1).fillna(0)
    pitch_step = abs(notematrix['midi_pitch'].diff().fillna(0))

    cl = 2 * (onset_step + following_duration) + pitch_step
    cl = cl.infer_objects()  # Ensure correct data types

    lower_than_next = cl.shift(-1).fillna(0) > cl
    lower_than_previous = cl.shift(1).fillna(0) > cl
    clb = lower_than_next & lower_than_previous

    clind = clb[clb].index.tolist()
    return clind, clb

In [195]:
# segment boundary calculation function
# NOTE(review): the next cell defines a function with the same name that shadows this one
# and returns a 0/1 Series instead of a list.
def calculate_segment_boundaries(notematrix, clind):
    """Pick segment boundaries among the clang boundaries.

    Clang k spans rows ``first[k]..last[k]`` (inclusive) with
    ``first = [0] + clind`` and ``last = clind + [len(notematrix) - 1]``.
    A distance is computed between each pair of neighbouring clangs from
    their mean pitch, onsets and durations; a clang boundary becomes a
    segment boundary when its distance is strictly larger than both
    neighbouring distances.

    Args:
        notematrix: DataFrame with 'midi_pitch', 'onset_beats' and
            'duration_beats' columns.
        clind: list of clang boundary row positions.

    Returns:
        list: the entries of ``clind`` selected as segment boundaries.
    """
    starts = [0] + clind
    ends = clind + [len(notematrix) - 1]

    mean_pitch = [notematrix.iloc[lo:hi + 1]['midi_pitch'].mean()
                  for lo, hi in zip(starts, ends)]

    segdist = []
    for k in range(1, len(starts)):
        head = notematrix.iloc[starts[k]]
        previous_head = notematrix.iloc[starts[k - 1]]
        previous_tail = notematrix.iloc[ends[k - 1]]
        segdist.append(abs(mean_pitch[k] - mean_pitch[k - 1]) +
                       head['onset_beats'] - previous_tail['onset_beats'] +
                       head['duration_beats'] + previous_head['duration_beats'] +
                       2 * (head['onset_beats'] - previous_tail['onset_beats']))

    # Keep the clang boundaries whose distance is a strict local maximum.
    segind = []
    for k in range(1, len(segdist) - 1):
        if segdist[k] > segdist[k - 1] and segdist[k] > segdist[k + 1]:
            segind.append(clind[k])
    return segind

In [196]:
def calculate_segment_boundaries(notematrix, clind):
    """Turn clang boundaries into a 0/1 segment-boundary vector.

    Clang k spans rows ``first[k]..last[k]`` (inclusive), where each clang
    starts at a clang boundary and ends on the row before the next one. The
    distance between neighbouring clangs combines the difference of their
    duration-weighted mean pitches with onset and duration terms; a clang
    boundary becomes a segment boundary when its distance is strictly larger
    than both neighbouring distances.

    NOTE(review): a clang whose durations sum to zero makes the weighted mean
    a division by zero - confirm such input cannot occur.

    Args:
        notematrix: DataFrame with 'midi_pitch', 'onset_beats' and
            'duration_beats' columns.
        clind: list of clang boundary row positions.

    Returns:
        pd.Series: integer Series indexed 0..len(notematrix)-1 with 1 at the
        segment boundary positions and 0 elsewhere.
    """
    row_count = len(notematrix)
    starts = [0] + clind
    ends = [boundary - 1 for boundary in clind] + [row_count - 1]

    def _weighted_mean_pitch(lo, hi):
        """Mean pitch of rows lo..hi weighted by duration."""
        chunk = notematrix.iloc[lo:hi + 1]
        weighted_pitch_sum = (chunk['midi_pitch'] * chunk['duration_beats']).sum()
        total_duration = chunk['duration_beats'].sum()
        return weighted_pitch_sum / total_duration

    mean_pitch = [_weighted_mean_pitch(lo, hi) for lo, hi in zip(starts, ends)]

    # Distance between clang k-1 and clang k.
    segdist = []
    for k in range(1, len(starts)):
        head = notematrix.iloc[starts[k]]
        previous_tail = notematrix.iloc[ends[k - 1]]
        segdist.append(abs(mean_pitch[k] - mean_pitch[k - 1]) +
                       head['onset_beats'] - previous_tail['onset_beats'] +
                       head['duration_beats'] + previous_tail['duration_beats'] +
                       2 * (head['onset_beats'] - previous_tail['onset_beats']))

    # Strict local maxima of the distances. An earlier variant additionally
    # required the pattern_index at clind[k] to differ from that at both
    # neighbouring clang boundaries; it is not applied here.
    segind = [clind[k] for k in range(1, len(segdist) - 1)
              if segdist[k - 1] < segdist[k] and segdist[k] > segdist[k + 1]]

    # Create binary vector for segment boundaries
    s = pd.Series(0, index=range(row_count))
    s.iloc[segind] = 1
    return s

In [197]:
# NOTE(review): the next cell defines a function with the same name that shadows this one.
def adjust_segment_boundaries(notematrix, s):
    """Shift segment boundaries forward by one row in certain positions.

    For every row flagged in the working copy of ``s``, the number of
    consecutive rows from that row onward sharing its 'pattern_index' is
    counted. When that run is one or two rows long, the row's 'ir_symbol' is
    neither 'd' nor 'M', and a following row exists, the flag moves to the
    next row. Scanning then resumes after the run, so a moved flag can be
    visited again.

    Args:
        notematrix: DataFrame with 'pattern_index' and 'ir_symbol' columns.
        s: 0/1 Series of segment boundaries (not modified).

    Returns:
        pd.Series: adjusted copy of ``s``.
    """
    adjusted = s.copy()
    row_count = len(notematrix)

    pos = 0
    while pos < row_count:
        if adjusted.iloc[pos] != 1:
            pos += 1
            continue

        row = notematrix.iloc[pos]
        pattern_here = row['pattern_index']
        symbol_here = row['ir_symbol']

        # Length of the run of rows, starting here, that share this pattern.
        run = 1
        while pos + run < row_count and notematrix.iloc[pos + run]['pattern_index'] == pattern_here:
            run += 1

        movable = symbol_here not in ('d', 'M')
        if movable and run in (1, 2) and pos + 1 < row_count:
            adjusted.iloc[pos] = 0
            adjusted.iloc[pos + 1] = 1

        pos += run

    return adjusted

In [198]:
def adjust_segment_boundaries(notematrix, s):
    """Move segment boundaries so that they do not split IR patterns.

    Rows are scanned top to bottom; every row flagged in the working copy is
    handled according to its 'ir_symbol':

    * 'M' / 'rest': left untouched.
    * 'd': if the row is an interior row and was a boundary of the *input*
      vector, the flag moves one row forward when the previous input boundary
      is farther away than the next one, otherwise one row backward. Flags
      that only exist because of an earlier shift are left where they are.
    * anything else: the 'pattern_index' of the two preceding rows decides.
      When this row continues the previous row's pattern but not the one
      before it, the flag moves one row forward; when this row starts a new
      pattern and the two preceding rows belong to different patterns, the
      flag moves one row backward.

    Args:
        notematrix: DataFrame with 'pattern_index' and 'ir_symbol' columns.
        s: 0/1 Series of segment boundaries (not modified).

    Returns:
        pd.Series: adjusted copy of ``s``.
    """
    adjusted_s = s.copy()
    row_count = len(notematrix)

    # Boundary positions of the input vector, ascending, plus a position -> rank
    # lookup so neighbours are found without repeated list searches.
    indices_with_ones = np.where(s == 1)[0].tolist()
    rank_of = {position: rank for rank, position in enumerate(indices_with_ones)}
    last_rank = len(indices_with_ones) - 1

    i = 0
    while i < row_count:
        if adjusted_s.iloc[i] != 1:
            i += 1
            continue

        current_pattern = notematrix.iloc[i]['pattern_index']
        ir_symbol = notematrix.iloc[i]['ir_symbol']

        if ir_symbol == 'M' or ir_symbol == 'rest':
            # Skip monads and rests
            i += 1
            continue

        if ir_symbol == 'd':
            rank = rank_of.get(i)
            # rank is None for a flag created by an earlier shift: it has no
            # place in the input boundary list, so it is not moved again.
            if rank is not None and 0 < i < row_count - 1:
                prev_index = indices_with_ones[rank - 1] if rank > 0 else 0
                next_index = indices_with_ones[rank + 1] if rank < last_rank else row_count - 1

                adjusted_s.iloc[i] = 0
                if (i - prev_index) > (next_index - i):
                    adjusted_s.iloc[i + 1] = 1
                else:
                    adjusted_s.iloc[i - 1] = 1
            i += 1
            continue

        # Handle cases for triads and other patterns
        if i > 1:  # Ensure there are at least two previous elements to check
            previous_pattern1 = notematrix.iloc[i - 1]['pattern_index']
            previous_pattern2 = notematrix.iloc[i - 2]['pattern_index']

            if current_pattern == previous_pattern1 == previous_pattern2:
                pass  # last row of a three-row pattern: nothing to move
            elif current_pattern == previous_pattern1 and current_pattern != previous_pattern2:
                if i + 1 < row_count:  # never write past the last row
                    adjusted_s.iloc[i] = 0
                    adjusted_s.iloc[i + 1] = 1
            elif current_pattern != previous_pattern1 and previous_pattern1 != previous_pattern2:
                adjusted_s.iloc[i] = 0
                adjusted_s.iloc[i - 1] = 1
        i += 1

    return adjusted_s

In [199]:
# NOTE(review): the next cell defines a function with the same name that shadows this one.
def segmentgestalt(notematrix):
    """Segment a note matrix into clangs and segments.

    Iterates the value returned by calculate_segment_boundaries as row
    positions. A boundary is kept only when its 'pattern_index' differs from
    that of the previously visited boundary (the first boundary is compared
    with the last row, since the previous position starts at -1).

    Args:
        notematrix: DataFrame with the note-matrix columns; a 'pattern_index'
            column is (re)assigned on it.

    Returns:
        None for an empty matrix, otherwise ``(c, s, segments)``: 0/1 Series
        of clang boundaries, 0/1 Series of kept segment boundaries, and the
        list of DataFrame slices between those boundaries.
    """
    if notematrix.empty:
        return None

    notematrix = assign_ir_pattern_indices(notematrix)
    clind, _ = calculate_clang_boundaries(notematrix)
    segind = calculate_segment_boundaries(notematrix, clind)

    # Ensure segments do not split IR patterns
    patterns = notematrix['pattern_index']
    adjusted_segind = []
    previous = -1
    for position in segind:
        if patterns.iloc[position] != patterns.iloc[previous]:
            adjusted_segind.append(position)
        previous = position

    # Split the note matrix: each kept boundary is the last row of its segment.
    edges = [0] + [position + 1 for position in adjusted_segind] + [None]
    segments = [notematrix.iloc[lo:hi] for lo, hi in zip(edges, edges[1:])]

    row_labels = range(len(notematrix))
    c = pd.Series(0, index=row_labels)
    c.iloc[clind] = 1

    s = pd.Series(0, index=row_labels)
    s.iloc[adjusted_segind] = 1

    return c, s, segments

In [200]:
def segmentgestalt(notematrix):
    """Segment a note matrix into clangs and segments.

    Pipeline: assign IR pattern indices, find clang boundaries, derive the
    segment-boundary vector from them, adjust that vector with
    adjust_segment_boundaries, then cut the matrix after every flagged row.

    Args:
        notematrix: DataFrame with the note-matrix columns; a 'pattern_index'
            column is (re)assigned on it.

    Returns:
        None for an empty matrix, otherwise ``(c, s, segments)``: 0/1 Series
        of clang boundaries, 0/1 Series of adjusted segment boundaries, and
        the list of DataFrame slices; each flagged row is the last row of its
        slice.
    """
    if notematrix.empty:
        return None

    notematrix = assign_ir_pattern_indices(notematrix)
    clind, _ = calculate_clang_boundaries(notematrix)

    # Boundary vector first, then the adjustment that keeps IR patterns whole.
    raw_boundaries = calculate_segment_boundaries(notematrix, clind)
    s = adjust_segment_boundaries(notematrix, raw_boundaries)

    # Create binary vector for clang boundaries
    c = pd.Series(0, index=range(len(notematrix)))
    c.iloc[clind] = 1

    # Slice edges: start of the matrix, the row after each boundary, open end.
    edges = [0] + [end_idx + 1 for end_idx in s[s == 1].index] + [None]
    segments = [notematrix.iloc[lo:hi] for lo, hi in zip(edges, edges[1:])]

    return c, s, segments

In [201]:
# Keep only the boundary positions (element 0 of the returned (clind, clb) pair) and display them
clind = calculate_clang_boundaries(nmat)[0]
clind

[3,
 6,
 10,
 13,
 17,
 20,
 24,
 27,
 31,
 34,
 38,
 41,
 45,
 48,
 52,
 55,
 59,
 62,
 66,
 69,
 72,
 75,
 79,
 82,
 87,
 90,
 94,
 97,
 100,
 103,
 107,
 110,
 114,
 117,
 121,
 124,
 128,
 131,
 135,
 138,
 143,
 146,
 150,
 153,
 156,
 159,
 163,
 166,
 169,
 172,
 175,
 179,
 182,
 185,
 188,
 192,
 195,
 200,
 203,
 207,
 210,
 213,
 216,
 220,
 223,
 227,
 230,
 234,
 237,
 241,
 244,
 248,
 251,
 256,
 259,
 263,
 266,
 269,
 272,
 276,
 279,
 283,
 286,
 290,
 293,
 300,
 307,
 311,
 314,
 318,
 321,
 326,
 329,
 333,
 336,
 340,
 343,
 347,
 350,
 359,
 364,
 368,
 371,
 375,
 378,
 382,
 385,
 389,
 392,
 396,
 399,
 403,
 406,
 413,
 423,
 425,
 428,
 431,
 433,
 436,
 440,
 443,
 447,
 450,
 455,
 480,
 484,
 487,
 489,
 491,
 493,
 495,
 497,
 499,
 502,
 504,
 509,
 511,
 513,
 515,
 517,
 519,
 522,
 525,
 527,
 529,
 533,
 538,
 540,
 542,
 547,
 549,
 551,
 554,
 556,
 558,
 563,
 565,
 568,
 570,
 573,
 579,
 582,
 584,
 586,
 591,
 593,
 595,
 597,
 599,
 602,
 605

In [202]:
# Segment-boundary result for nmat, computed from the clang boundaries above, then displayed
sind = calculate_segment_boundaries(nmat, clind)
sind

0      0
1      0
2      0
3      0
4      0
      ..
753    0
754    0
755    0
756    0
757    0
Length: 758, dtype: int64

In [206]:
# Apply adjust_segment_boundaries to the boundary vector and display the result
adjusted = adjust_segment_boundaries(nmat, sind)
adjusted

0      0
1      0
2      0
3      0
4      0
      ..
753    0
754    0
755    0
756    0
757    0
Length: 758, dtype: int64

In [204]:
# Run the full segmentation. Note that `segments` is bound to the whole value returned by
# segmentgestalt (a (c, s, segments) tuple with the latest definition), not just the segment list.
segments = segmentgestalt(nmat)

In [205]:
# Display everything returned by segmentgestalt
segments

(0      0
 1      0
 2      0
 3      1
 4      0
       ..
 753    0
 754    1
 755    0
 756    1
 757    0
 Length: 758, dtype: int64,
 0      0
 1      0
 2      0
 3      0
 4      0
       ..
 753    0
 754    0
 755    0
 756    0
 757    0
 Length: 758, dtype: int64,
 [   onset_beats duration_beats  midi_pitch ir_symbol  pattern_index
  0          0.0            0.5           0      rest              0
  1          0.5           0.25          67         P              1
  2         0.75           0.25          72         P              1
  3          1.0           0.25          76         P              1
  4         1.25           0.25          67         P              2
  5          1.5           0.25          72         P              2
  6         1.75           0.25          76         P              2
  7          2.0            0.5           0      rest              3
  8          2.5           0.25          67         P              4
  9         2.75           0.25   

In [100]:
# Segment boundaries for nmat from the clang boundary positions in `clind`
segind = calculate_segment_boundaries(nmat, clind)

In [101]:
# NOTE(review): the saved output is a list of row positions, which matches the earlier
# list-returning calculate_segment_boundaries; the later definition returns a 0/1 Series.
# The output looks stale - re-run after a kernel restart.
segind

[17,
 27,
 41,
 59,
 69,
 97,
 124,
 166,
 172,
 182,
 210,
 237,
 266,
 279,
 340,
 350,
 382,
 392,
 413,
 428,
 433,
 484,
 499,
 513,
 525,
 554,
 563,
 582,
 593,
 602,
 622,
 627,
 633,
 651,
 657,
 663,
 669,
 677,
 683,
 689,
 695,
 701,
 709,
 723,
 739,
 745,
 751]

In [None]:
# Keep a boundary only when its pattern_index differs from that of the previously visited
# boundary. last_index starts at -1, so the first boundary is compared with the last row of nmat.
# NOTE(review): this iterates `segind` as row positions, i.e. it expects the list form.
adjusted_segind = []
last_index = -1
for idx in segind:
    pattern_index = nmat.iloc[idx]['pattern_index']
    if pattern_index != nmat.iloc[last_index]['pattern_index']:
        adjusted_segind.append(idx)
    last_index = idx

In [97]:
# pattern_index of the last row of nmat (the highest group number assigned)
nmat.iloc[-1]['pattern_index']

395

## Get Clang Boundaries and Segments

In [89]:
# show clang boundaries
# calculate_clang_boundaries returns a (clind, clb) pair, so the whole pair is printed here
clang_boundaries = calculate_clang_boundaries(nmat)
print("Clang Boundaries:", clang_boundaries)

Clang Boundaries: ([3, 6, 10, 13, 17, 20, 24, 27, 31, 34, 38, 41, 45, 48, 52, 55, 59, 62, 66, 69, 72, 75, 79, 82, 87, 90, 94, 97, 100, 103, 107, 110, 114, 117, 121, 124, 128, 131, 135, 138, 143, 146, 150, 153, 156, 159, 163, 166, 169, 172, 175, 179, 182, 185, 188, 192, 195, 200, 203, 207, 210, 213, 216, 220, 223, 227, 230, 234, 237, 241, 244, 248, 251, 256, 259, 263, 266, 269, 272, 276, 279, 283, 286, 290, 293, 300, 307, 311, 314, 318, 321, 326, 329, 333, 336, 340, 343, 347, 350, 359, 364, 368, 371, 375, 378, 382, 385, 389, 392, 396, 399, 403, 406, 413, 423, 425, 428, 431, 433, 436, 440, 443, 447, 450, 455, 480, 484, 487, 489, 491, 493, 495, 497, 499, 502, 504, 509, 511, 513, 515, 517, 519, 522, 525, 527, 529, 533, 538, 540, 542, 547, 549, 551, 554, 556, 558, 563, 565, 568, 570, 573, 579, 582, 584, 586, 591, 593, 595, 597, 599, 602, 605, 607, 610, 614, 616, 618, 622, 625, 627, 629, 631, 633, 635, 637, 639, 641, 643, 645, 647, 649, 651, 653, 655, 657, 659, 663, 665, 667, 669, 671, 673, 

In [91]:
# Segment boundaries from the clang positions (element 0 of the pair computed above).
# NOTE(review): the saved output is a list, matching the earlier list-returning definition;
# the later definition returns a 0/1 Series - re-run to refresh.
calculate_segment_boundaries(nmat, clang_boundaries[0])

[17,
 27,
 41,
 59,
 69,
 97,
 124,
 166,
 172,
 182,
 210,
 237,
 266,
 279,
 340,
 350,
 382,
 392,
 413,
 428,
 433,
 484,
 499,
 513,
 525,
 554,
 563,
 582,
 593,
 602,
 622,
 627,
 633,
 651,
 657,
 663,
 669,
 677,
 683,
 689,
 695,
 701,
 709,
 723,
 739,
 745,
 751]

In [84]:
# Get segments
# segmentgestalt returns (clang flags, segment flags, list of segment DataFrames);
# only the list of segments is labelled and passed on.
clang_flags, segment_flags, segments = segmentgestalt(nmat)

# Store viz related properties (score title, color), labelling the segment
assigned_color = 'red' # HARDCODED temporary since multiple scores not impremented yet
# color_list = ['blue', 'green', 'red', 'orange', 'purple', 'cyan', 'magenta', 'yellow', 'pink', 'lime']
label = (score_title, assigned_color)
labeled_segments = [(label, segment) for segment in segments]


# output to a readable .txt file
with open("labeled_segments.txt", "w") as f:
    f.write(f"Number of segments: {len(labeled_segments)}\n\n")
    # Unpack in the loop header so the builtin `tuple` is not shadowed.
    for idx, ((title, color), segment) in enumerate(labeled_segments):
        f.write(f"{title} Segment {idx+1} ({color}):\n")
        f.write(f"{segment}\n")
        f.write("--------------------------------------------\n\n")

In [87]:
# Show element 1 of `segments`.
# NOTE(review): the saved output (a list of row positions) does not match what the latest
# segmentgestalt definition returns - likely stale; re-run after a kernel restart.
segments[1]

[17,
 27,
 41,
 59,
 69,
 97,
 124,
 166,
 172,
 182,
 210,
 237,
 266,
 279,
 340,
 350,
 382,
 392,
 413,
 428,
 433,
 484,
 499,
 513,
 525,
 554,
 563,
 582,
 593,
 602,
 622,
 627,
 633,
 651,
 657,
 663,
 669,
 677,
 683,
 689,
 695,
 701,
 709,
 723,
 739,
 745,
 751]

## DTW Distance Using TSLearn

In [None]:
def segments_to_distance_matrix(segments: list[pd.DataFrame], cores=None):
    """Compute the symmetric pairwise distance matrix between segments.

    Every unordered pair (i, j) with i < j is sent to
    ``worker.calculate_distance`` as ``(i, j, segments[i], segments[j])``
    through a process pool. Each result is unpacked as
    ``(i, j, distance, message)``; the distance is written to both [i, j] and
    [j, i], and the messages are printed once all results are in.

    NOTE(review): the contract of ``worker.calculate_distance`` is not visible
    here; confirm it expects DataFrames rather than numpy arrays.

    Args:
        segments: list of segment DataFrames.
        cores: number of worker processes; ``None`` lets the pool choose.

    Returns:
        np.ndarray of shape (len(segments), len(segments)) with a zero
        diagonal. Returns None implicitly when ``__name__`` is not
        '__main__'.

    Raises:
        ValueError: if ``cores`` exceeds ``cpu_count()``.
    """
    if __name__ == '__main__':

        if cores is not None and cores > cpu_count():
            raise ValueError(f"You don't have enough cores! Please specify a value within your system's number of cores. \n Core Count: {cpu_count()}")

        num_segments = len(segments)
        distance_matrix = np.zeros((num_segments, num_segments))

        # One task per unordered pair; the diagonal stays 0.
        args_list = [(i, j, segments[i], segments[j])
                     for i in range(num_segments)
                     for j in range(i + 1, num_segments)]

        # Honour the requested number of worker processes.
        with Pool(processes=cores) as pool:
            results = pool.map(worker.calculate_distance, args_list)

        # Update distance matrix with the results
        messages = []
        for i, j, distance, message in results:
            distance_matrix[i, j] = distance
            distance_matrix[j, i] = distance  # Reflect along the diagonal
            messages.append(message)

        for message in messages:
            print(message)

        return distance_matrix

In [None]:
# segments to distance matrix
# len(dist_mat) is the number of rows of the square matrix, i.e. one per segment passed in
dist_mat = segments_to_distance_matrix(segments)
print(f"there are {len(dist_mat)} elements in dist mat")

## Building the KNN Graph


In [None]:
# building KNN Graph
k = 3
distance_matrix = dist_mat
# dist_mat already holds pairwise distances, so it is passed as a precomputed metric
# instead of being treated as a table of feature vectors.
knn_graph = kneighbors_graph(distance_matrix, n_neighbors=k, mode='connectivity', metric='precomputed')

G = nx.from_scipy_sparse_array(knn_graph)

# Detect if the graph is disjoint
if not nx.is_connected(G):
    print("The KNN graph is disjoint. Ensuring connectivity...")

    # Calculate the connected components
    components = list(nx.connected_components(G))

    # Connect the components: chain component i to component i + 1 through their closest pair
    for i in range(len(components) - 1):
        min_dist = np.inf
        closest_pair = None
        for node1 in components[i]:
            for node2 in components[i + 1]:
                dist = distance_matrix[node1, node2]
                if dist < min_dist:
                    min_dist = dist
                    closest_pair = (node1, node2)

        # Add an edge between the closest pair of nodes from different components
        G.add_edge(closest_pair[0], closest_pair[1])

# Plot the final connected graph
pos = nx.spring_layout(G, seed=42, iterations=50)
pos_dict = {i: pos[i] for i in range(len(pos))}
nx.draw(G, node_size=50, pos=pos_dict)
plt.title('Bach Prelude in C')
plt.show()

In [None]:
# distance matrix to knn graph function
def distance_matrix_to_knn_graph(k: int, distance_matrix: np.ndarray, graph_title: str,
                                 seed: int, iterations: int):
  """Build, connect and draw the k-nearest-neighbour graph of a distance matrix.

  Args:
    k: number of neighbours per node.
    distance_matrix: square matrix of pairwise distances; entry [a, b] is the
      distance between nodes a and b.
    graph_title: plot title; " (K=<k>)" is appended.
    seed: seed for the spring layout.
    iterations: number of spring layout iterations.

  If the k-NN graph is disconnected, consecutive connected components are
  linked through their closest pair of nodes. The graph is drawn with
  matplotlib; nothing is returned.
  """
  # The input already holds distances, so it is passed as a precomputed metric
  # instead of being treated as a table of feature vectors.
  knn_graph = kneighbors_graph(distance_matrix, n_neighbors=k, mode='connectivity', metric='precomputed')

  G = nx.from_scipy_sparse_array(knn_graph)

  # Detect if the graph is disjoint
  if not nx.is_connected(G):
      print("The KNN graph is disjoint. Ensuring connectivity...")

      # Calculate the connected components
      components = list(nx.connected_components(G))

      # Connect the components: chain component i to component i + 1
      for i in range(len(components) - 1):
          min_dist = np.inf
          closest_pair = None
          for node1 in components[i]:
              for node2 in components[i + 1]:
                  dist = distance_matrix[node1, node2]
                  if dist < min_dist:
                      min_dist = dist
                      closest_pair = (node1, node2)

          # Add an edge between the closest pair of nodes from different components
          G.add_edge(closest_pair[0], closest_pair[1])

  # Plot the final connected graph
  pos = nx.spring_layout(G, seed=seed, iterations=iterations)
  nx.draw(G, node_size=50, pos=pos)
  plt.title(graph_title + f" (K={k})")
  plt.show()

In [None]:
# show graph
# arguments: k=3 neighbours, the distance matrix, plot title, layout seed 42, 50 layout iterations
distance_matrix_to_knn_graph(3, dist_mat, "Bach Prelude in C", 42, 50)

### Work in progress: attaching the segment data to each graph node so grouped segments can be analyzed

In [None]:
# segments to graph function
def segments_to_graph(k: int, segments: list[pd.DataFrame], labeled_segments, cores=None):
  """Build a connected k-NN graph of segments with the segment data on each node.

  Args:
    k: number of neighbours per node.
    segments: list of segment DataFrames.
    labeled_segments: sequence aligned with ``segments``; entry i is stored
      as the 'segment' attribute of node i.
    cores: forwarded to segments_to_distance_matrix.

  Returns:
    (G, distance_matrix): the NetworkX graph and the pairwise distance
    matrix it was built from. If the k-NN graph is disconnected, consecutive
    connected components are linked through their closest pair of nodes.
  """
  # Convert segments to a distance matrix
  distance_matrix = segments_to_distance_matrix(segments, cores=cores)

  # Compute the k-NN graph. The matrix already holds pairwise distances, so it
  # is passed as a precomputed metric rather than as feature vectors.
  knn_graph = kneighbors_graph(distance_matrix, n_neighbors=k, mode='connectivity', metric='precomputed')

  # Convert the k-NN graph to a NetworkX graph
  G = nx.from_scipy_sparse_array(knn_graph)

  # Add segment data as attributes to each node
  for i in range(len(segments)):
    G.nodes[i]['segment'] = labeled_segments[i]

  # Detect if the graph is disjoint
  if not nx.is_connected(G):
      print("The KNN graph is disjoint. Ensuring connectivity...")

      # Calculate the connected components
      components = list(nx.connected_components(G))

      # Connect the components: chain component i to component i + 1
      for i in range(len(components) - 1):
          min_dist = np.inf
          closest_pair = None
          for node1 in components[i]:
              for node2 in components[i + 1]:
                  dist = distance_matrix[node1, node2]
                  if dist < min_dist:
                      min_dist = dist
                      closest_pair = (node1, node2)

          # Add an edge between the closest pair of nodes from different components
          G.add_edge(closest_pair[0], closest_pair[1])

  return G, distance_matrix

In [None]:
# Build the 5-nearest-neighbour segment graph and keep its distance matrix
graph, distance_matrix = segments_to_graph(
    k=5,
    segments=segments,
    labeled_segments=labeled_segments,
)

## Trying to find ways to validate "Graph Identity"

Besides the average DTW distance, I'm trying to see if I can build something like a graph "silhouette score".

Basically get the communities in the graph then calculate the following:

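* homogeneity: intra-cluster distance (mean distance from a node to the other nodes of its own community)

* heterogeneity: inter-cluster distance (mean distance from a node to the nodes of the closest other community)

* "Graph Silhouette Score": $\frac{\text{heterogeneity} - \text{homogeneity}}{\max(\text{heterogeneity}, \text{homogeneity})}$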

Also might take a look at clustering coefficients

In [None]:
# graph metrics function
def graph_metrics(graph: nx.classes.graph.Graph, distance_matrix: np.ndarray,
                  seed: int):
  """Print summary metrics for a graph and the distance matrix behind it.

  Printed values:
    * mean of every entry of distance_matrix (diagonal included),
    * average clustering coefficient of the graph,
    * average per-node silhouette score over the Louvain communities,
    * average conductance of the Louvain communities.

  Args:
    graph: graph whose node ids index into distance_matrix.
    distance_matrix: square array of pairwise distances.
    seed: random seed for the Louvain community detection.
  """
  avg_dtw_distance = distance_matrix.mean()
  avg_clustering_coef = nx.average_clustering(graph)

  # The seed has to be passed by keyword: the second positional parameter of
  # louvain_communities is `weight`, so a positional seed never seeds anything.
  communities = nx.community.louvain_communities(graph, seed=seed)

  silhouette_scores = []
  for cluster in communities:
    # Member lists of every other community; identical for all nodes of `cluster`
    other_clusters = [list(other_cluster) for other_cluster in communities
                      if other_cluster != cluster]
    for i in cluster:
      peers = list(cluster - {i})
      if not peers:
        # Singleton community: there is no intra-cluster distance to average,
        # so score it 0 instead of letting a NaN poison the overall mean.
        silhouette_scores.append(0.0)
        continue

      homogeneity = np.mean(distance_matrix[i, peers])
      other_cluster_distances = [np.mean(distance_matrix[i, members])
                                 for members in other_clusters]
      heterogeneity = min(other_cluster_distances) if other_cluster_distances else homogeneity

      denominator = max(heterogeneity, homogeneity)
      if denominator == 0:
        # Both distances are zero: the node is equally close to everything
        silhouette_scores.append(0.0)
      else:
        silhouette_scores.append((heterogeneity - homogeneity) / denominator)

  # Average silhouette score for all nodes
  average_silhouette_score = np.mean(silhouette_scores)

  conductance_scores = [conductance(graph, cluster) for cluster in communities]
  average_conductance = np.mean(conductance_scores)

  print("Average DTW Distance:", avg_dtw_distance)
  print("Average Clustering Coefficient:", avg_clustering_coef)
  print("Average Silhouette Score", average_silhouette_score)
  print("Average Conductance:", average_conductance)

In [None]:
# Report the graph-level metrics, then dump the segment stored on every node
graph_metrics(graph, distance_matrix, 42)
print(f"Graph Length: {len(graph)}")

for node_id, attributes in graph.nodes(data=True):
  print(f"Node {node_id} segment data:")
  print(attributes['segment'])

In [None]:
# plot graph function
def plot_graph(graph: nx.classes.graph.Graph,
               seed: int,
               iterations: int,
               title: str,
               node_size: int):
  """Draw `graph` with a spring layout and print the colour stored on each node.

  Every node's 'segment' attribute is unpacked as (label, matrix) and the
  label as (label_title, color).

  Args:
    graph: graph to draw; each node must carry a 'segment' attribute.
    seed: seed for the spring layout.
    iterations: number of spring-layout iterations.
    title: plot title.
    node_size: size of the drawn nodes.
  """
  pos = nx.spring_layout(graph, seed=seed, iterations=iterations)
  for _node_id, attributes in graph.nodes(data=True):
    label, _matrix = attributes['segment']
    # Use a separate name here: rebinding `title` would replace the plot title
    # with the label of the last node visited.
    _label_title, color = label
    print(color)
    # TODO: colour each node with `color` instead of the fixed 'red' below
  # Draw the graph that was passed in, not a global left over from another cell
  nx.draw(graph, node_size=node_size, node_color='red', pos=pos)
  plt.title(title)
  plt.show()

In [None]:
# Draw the segment graph (spring-layout seed 3, 50 iterations, node size 50)
plot_graph(graph, 3, 50, "Bach Prelude in C (K=5)", 50)

In [None]:
# Export the segment graph as an interactive pyvis page.
# pyvis receives string attributes, so each node's segment is serialised first.
for node_id in graph.nodes:
  segment = graph.nodes[node_id]['segment']
  if isinstance(segment, str):
    # Already serialised by an earlier run of this cell; keeps the cell re-runnable
    continue
  if hasattr(segment, 'to_json'):
    graph.nodes[node_id]['segment'] = segment.to_json(default_handler=str)
  else:
    # NOTE(review): plot_graph unpacks this attribute as a (label, matrix) tuple,
    # which has no to_json; fall back to its string form. Confirm the desired format.
    graph.nodes[node_id]['segment'] = str(segment)

nt = Network('1000px', '1000px', notebook=True, cdn_resources = 'remote')
nt.from_nx(graph)
nt.show('sample_graph.html')   ### Still need to figure out the labels
display(HTML('sample_graph.html'))