In [1]:
import pandas as pd
import numpy as np
import requests
from pathlib import Path
from tqdm import tqdm

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
def process_metadata_template(csv_filename, lyrics=None, audio_links=None) -> pd.DataFrame:

    df = pd.read_csv(csv_filename, header=None, index_col=False)

    # set row 2 as columns
    df.columns = df.iloc[2]
    # remove unnecessary rows
    df = df.drop(index=[0, 1, 2, 3, 4, 5, 6])
    # remove extra column
    df = df.drop(df.columns[0], axis=1)
    # reorder indexes
    df = df.reset_index(drop=True)
    # remove unnecessary cols from template
    cols_to_remove = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 36]
    df = df.drop(df.columns[cols_to_remove], axis=1)
    # add audio links and song lyrics
    if lyrics is None:
        lyrics = np.zeros(shape=(len(df['Identifier'])))
        lyrics[lyrics==0]=['nan']
    if audio_links is None:
        audio_links = np.zeros(shape=(len(df['Identifier'])))
        audio_links[audio_links==0]=['nan']

    df['Lyrics'] = lyrics
    df['AudioLink'] = audio_links
    # save file
    df.to_csv('data.csv', index=False)
    return df

In [3]:
# searches for the column rearding the links of the recordings
def get_audio_links(csv_filename):
    df = pd.read_csv(csv_filename)
    columns = [col.lower() for col in list(df.columns)]
    keywords = ['link', 'links', 'audio', 'recording']
    # print(columns)
    for i in range(len(columns)):
        if any(keyword in columns[i] for keyword in keywords):
            # print(i)
            audio_links_col = df[df.columns[i]]
            # print(audio_links_col)
            return audio_links_col
            
    return None

# searches for a column related to the lyrics of the song
def get_lyrics(csv_filename):
    df = pd.read_csv(csv_filename)
    columns = [col.lower() for col in list(df.columns)]
    keywords = ['word', 'words', 'lyrics', 'lyric']
    # print(columns)
    for i in range(len(columns)):
        if any(keyword in columns[i] for keyword in keywords):
            # print(i)
            lyrics_col = df[df.columns[i]]
            # print(lyrics_col)
            return lyrics_col
            
    return None

In [4]:
audio_links = get_audio_links(csv_filename='IE-2019-D-HLS/song_list_lyrics_audio.csv')
lyrics = get_lyrics(csv_filename='IE-2019-D-HLS/song_list_lyrics_audio.csv')
data = process_metadata_template(csv_filename='IE-2019-D-HLS/metadata.csv', lyrics=lyrics, audio_links=audio_links)

FileNotFoundError: [Errno 2] No such file or directory: 'IE-2019-D-HLS/song_list_lyrics_audio.csv'

In [None]:
def download_audio_files(data, directory):
    audio_files = Path(directory) / 'audio_files'
    # create directory
    print('Creating a new directory...')
    if not audio_files.is_dir():
        audio_files.mkdir(exist_ok=True)

    df = data[['Identifier', 'AudioLink']]
    print('Starting download...')
    for i in tqdm(range(len(data['Identifier']))):
        url = data['AudioLink'][i]
        id = data['Identifier'][i]

        try:
            with open(audio_files / f'{id}.mp3', 'wb') as f:
                r = requests.get(url).content
                f.write(r)
        except Exception as e:
            print(f'An error occurred: {e}')

In [None]:
download_audio_files(data, 'IE-2019-D-HLS')

Creating a new directory...
Starting download...


100%|██████████| 64/64 [00:47<00:00,  1.35it/s]


# Class Score()

In [176]:
from music21 import *
import pandas as pd
from langdetect import detect


class Score():
    def __init__(self, file):
        self.file = file
        self.score = self.load_score()
        self.key = self.get_key()
        self.meter = None
        self.instruments = self.get_instruments()
        self.tempo = None
        self.title = self.get_title()
        self.composer = self.get_composer()
        self.arranger = self.get_arranger()
        self.time_signatures = self.get_time_signature()
        self.intervals = self.get_intervals()
        # self.chord_progression = self.get_chord_progression()
        self.num_measures = self.get_num_measures()
        self.notes = self.get_all_notes()
        self.count = self.get_notes_count()
        self.lyrics = self.search_lyrics()
        self.language = self.get_most_likely_language()

    def load_score(self) -> stream.Score: 
        return converter.parseFile(self.file)

    def get_time_signature(self):
        time_signatures = []
        for ts in self.score.flat.getElementsByClass(meter.TimeSignature):
            if ts.ratioString not in time_signatures:
                time_signatures.append(ts.ratioString)
        return time_signatures
    
    def get_num_measures(self):
        return len(self.score.parts[0].getElementsByClass(stream.Measure))        

    def get_instruments(self):
        instruments = []
        for part in self.score.parts:
            inst = part.getInstrument().instrumentName
            if inst != '' and inst not in instruments:
                instruments.append(inst)
        return instruments
    
    def get_key(self):
        return self.score.analyze('key')
    
    """ def get_chord_progression(self):
        chords = self.score.chordify().recurse().getElementsByClass(chord.Chord)
        progression = []
        for i, c in enumerate(chords):
            rn = roman.romanNumeralFromChord(c, self.key)
            progression.append(rn.figure)
        return progression """
    
    # Metadata
    def get_composer(self):
        return self.score.metadata.composer
    
    def get_arranger(self):
        return self.score.metadata.arranger
    
    def get_title(self):
        return self.score.metadata.title

    # Print
    def attributes(self):
        print(   f'Title: {self.title}\n'
               + f'Composer: {self.composer}\n'
               + f'Arranger: {self.arranger}\n'
               + f'Key: {self.key}\n'
               + f'Instruments: {self.instruments}\n' 
               + f'Time signatures: {self.time_signatures}\n' 
               + f'No. Measures: {self.num_measures}\n'     
               + f'Lyrics: {self.lyrics}\n'
               + f'Language: {self.language}\n'
               + f'Chord Progression: {self.chord_progression}\n'   
              )
    
    def get_all_notes(self):
        notes = {}
        measure = []
        offset = []
        quarterLength = []
        pitch = []
        volume = []

        for note in self.score.recurse().notes:
            if note.isNote:
                measure.append(note.activeSite.measureNumber)
                offset.append(note.offset)
                quarterLength.append(note.duration.quarterLength)
                pitch.append(note.pitches[0])
                volume.append(note.volume.realized)

                n = {'offset': note.offset,
                    'quarter_length': note.duration.quarterLength,
                    'pitch': note.pitches[0],
                    'volume': note.volume.realized}
                
                if f'measure_{note.activeSite.measureNumber}' not in notes.keys():
                    notes[f'measure_{note.activeSite.measureNumber}'] = [n]
                else:
                    notes[f'measure_{note.activeSite.measureNumber}'].append(n)

            df = {'tune': self.file.name,
                'measure': measure,
                'offset': offset,
                'quarterLength': quarterLength,
                'pitch': pitch,
                'volume': volume}
            
            df = pd.DataFrame(df).sort_values(by='measure')
            # print(df.to_string())
               
        # print(df)
        return df
        # print(len(notes['measure_2']))
        # print()

    def search_lyrics(self):
        lyric = ''
        for part in self.score.parts:
            lyric += search.lyrics.LyricSearcher(part).indexText
        
        # print(lyric)
        return lyric
    
    def get_most_likely_language(self):
        if self.lyrics != '':
            return detect(self.lyrics)
        else:
            return None
    
    def get_notes_count(self):
        pitches = self.notes['pitch']
        count = {}
        # print(pitches)
        for pitch in pitches:
            # print(pitch)
            if pitch not in count.keys():
                count[pitch] = 1
            else:
                count[pitch] += 1

        df = pd.DataFrame({'pitch': count.keys(),'count': count.values()})           
        # print(df.sort_values(by='count', ascending=False).to_string())
        return count
    
    def get_intervals(self):
        notes = self.get_all_notes()['pitch']
        
        intervals = {}
        for i in range(len(notes) - 1):
            i = interval.Interval(notes[i], notes[i + 1])
            # print(i.niceName)
            if i.niceName not in intervals.keys():
                intervals[i.niceName] = 1
            else:
                intervals[i.niceName] += 1
        return intervals

In [213]:

from pathlib import Path
tune = Path('mxml/IE-2019-D-HLS-015.xml')
score = Score(tune)
notes = list(score.get_all_notes()['pitch'])
info = score.get_all_notes()
key = score.get_key() 
intervals = score.get_intervals()

  return self.iter().getElementsByClass(classFilterList)


In [178]:
remove_duplicates_and_sort_notes = list(sorted(set(notes)))
remove_duplicates_and_sort_notes

[<music21.pitch.Pitch D4>,
 <music21.pitch.Pitch F#4>,
 <music21.pitch.Pitch G4>,
 <music21.pitch.Pitch A4>,
 <music21.pitch.Pitch B4>,
 <music21.pitch.Pitch D5>]

In [179]:
import re

ids = []
d = {'C': 1, 'D': 2, 'E': 3, 'F': 4, 'G': 5, 'A': 6, 'B': 7}

for n in remove_duplicates_and_sort_notes:
    if n.accidental is not None:
        # print((n.accidental.name))
        if n.accidental.name == 'sharp':
            accidental = 0.5
        else:
            accidental = -0.5
    else:
        accidental = 0

    ids.append((n.octave * 12) + (accidental) + d[re.sub(r"[^a-zA-Z]+", "", n.name)])

ids
    

[50, 52.5, 53, 54, 55, 62]

In [180]:
notes_with_octave = []

for n in remove_duplicates_and_sort_notes:
    notes_with_octave.append(n.nameWithOctave)

notes_with_octave

['D4', 'F#4', 'G4', 'A4', 'B4', 'D5']

In [191]:
octaves = {}
d = {'C': 1, 'D': 2, 'E': 3, 'F': 4, 'G': 5, 'A': 6, 'B': 7}

for i in range(9):
    octaves[f'C{i}'] = 12 * i + d['C']

min_note = min(ids)
max_note = max(ids)

keys = list(octaves.keys())
for i in range(len(keys) - 1):
    if min_note > octaves[keys[i]] and min_note < octaves[keys[i + 1]]:
        start_id = octaves[keys[i]]
        start_octave = i
    if max_note > octaves[keys[i]] and max_note < octaves[keys[i + 1]]:
        end_id = octaves[keys[i + 1]]
        end_octave = i + 1

print(start_octave)
print(end_octave)

4
6


In [192]:
name_ = []
id_ = []
for i in range(end_octave - start_octave + 1):
    print('*')
    octave = start_octave + i
    name_.append(f'C{octave}')
    id_.append((octave * 12) + d['C'])


new_d = {'name': name_, 'id': id_}
new_d

*
*
*


{'name': ['C4', 'C5', 'C6'], 'id': [49, 61, 73]}

In [205]:
count_dict = {}

name_merged_list = []
for item in name_ + notes_with_octave:
    if item not in name_merged_list:
        name_merged_list.append(item)

for n in name_merged_list:
    if n not in count_dict.keys():
        count_dict[n] = 0
    else:
        count_dict[n] +=1


id_merged_list = []
for item in id_ + ids:
    if item not in id_merged_list:
        id_merged_list.append(item)


for note in notes:
    if note.nameWithOctave in count_dict.keys():
        count_dict[note.nameWithOctave] +=1

frequency = list(count_dict.values())
print(count_dict)
print(frequency)

{'C4': 0, 'C5': 0, 'C6': 0, 'D4': 4, 'F#4': 1, 'G4': 9, 'A4': 3, 'B4': 2, 'D5': 2}
[0, 0, 0, 4, 1, 9, 3, 2, 2]


In [206]:
print(name_merged_list)
print(id_merged_list)    

['C4', 'C5', 'C6', 'D4', 'F#4', 'G4', 'A4', 'B4', 'D5']
[49, 61, 73, 50, 52.5, 53, 54, 55, 62]


In [208]:
nodes = {'id': id_merged_list, 'name': name_merged_list, 'frequency': frequency}
nodes = [dict(zip(nodes, i)) for i in zip(*nodes.values())]
nodes = sorted(nodes, key=lambda x: x['id'])


In [209]:
links = []
all_notes = []

for n in notes:
    all_notes.append(n.nameWithOctave)

for i in range(len(all_notes) - 1):
    source = all_notes[i]
    target = all_notes[i + 1]

    for d in nodes:
        if d['name'] == source:
            source = d['id']
        if d['name'] == target:
            target = d['id']

    
    found = False
    for d in links:
        if d['source'] == source and d['target'] == target:
            d['count'] += 1
            found = True
            break

    if not found:
        links.append({'source': source, 'target': target, 'count': 1})

In [304]:
harmonic_intervals_data = {'nodes': nodes, 'links': links, 'note_frequency_by_measure': info_about_measures}
harmonic_intervals_data

{'nodes': [{'id': 49, 'name': 'C4', 'frequency': 0},
  {'id': 50, 'name': 'D4', 'frequency': 4},
  {'id': 52.5, 'name': 'F#4', 'frequency': 1},
  {'id': 53, 'name': 'G4', 'frequency': 9},
  {'id': 54, 'name': 'A4', 'frequency': 3},
  {'id': 55, 'name': 'B4', 'frequency': 2},
  {'id': 61, 'name': 'C5', 'frequency': 0},
  {'id': 62, 'name': 'D5', 'frequency': 2},
  {'id': 73, 'name': 'C6', 'frequency': 0}],
 'links': [{'source': 50, 'target': 53, 'count': 3},
  {'source': 53, 'target': 53, 'count': 4},
  {'source': 53, 'target': 55, 'count': 2},
  {'source': 55, 'target': 62, 'count': 2},
  {'source': 62, 'target': 53, 'count': 2},
  {'source': 53, 'target': 54, 'count': 2},
  {'source': 54, 'target': 54, 'count': 1},
  {'source': 54, 'target': 52.5, 'count': 1},
  {'source': 52.5, 'target': 50, 'count': 1},
  {'source': 50, 'target': 50, 'count': 1},
  {'source': 54, 'target': 50, 'count': 1}],
 'note_frequency_by_measure': [{'id': 50,
   'measures': [{'measure': 1, 'counter': 1},
    {

In [305]:
import json
with open('harmonic_intervals_data.json', 'w') as f:
    json.dump(harmonic_intervals_data, f)

In [240]:
notes_per_measure = {}

for row in info.iterrows():
    measure = row[1][1]
    pitch = row[1][4]

    if measure not in notes_per_measure.keys():
        if pitch is not None:
            notes_per_measure[measure] = [pitch.nameWithOctave]
        else:
            notes_per_measure[measure] = []
    else:
        notes_per_measure[measure].append(pitch.nameWithOctave)

notes_per_measure
    

  measure = row[1][1]
  pitch = row[1][4]


{1: ['D4'],
 2: ['G4', 'G4'],
 3: ['G4', 'B4'],
 4: ['D5'],
 5: ['G4'],
 6: ['A4'],
 7: ['A4'],
 8: ['F#4'],
 9: ['D4', 'D4'],
 10: ['G4', 'G4'],
 11: ['G4', 'B4'],
 12: ['D5'],
 13: ['G4'],
 14: ['A4'],
 15: ['D4'],
 16: ['G4']}

In [303]:
# print(notes_with_octave)

info_about_measures = []
dic = {'C': 1, 'D': 2, 'E': 3, 'F': 4, 'G': 5, 'A': 6, 'B': 7}

# print(info_about_measures)
for k in range(len(notes_with_octave)):
    a = []

    octave = int(re.sub(r"[^0-9]+", "", notes_with_octave[k]))
    acc = re.sub(r"[^#b]", "", notes_with_octave[k])
    note = re.sub(r"[^a-zA-Z]+", "", notes_with_octave[k])

    accidental = 0

    if acc == '#':
        accidental += 0.5
    if acc == 'b':
        accidental -= 0.5

    


    id = (octave * 12) + (accidental) + dic[note]
    # print(id)

    for key,val in notes_per_measure.items():
        d = {'measure': key, 'counter': 0}
        print(key,val)
        # print(notes_per_measure[k])
        print(elem)
        #print('**')
        for elem in val:
            if elem == notes_with_octave[k]:
                d['counter'] += 1
        a.append(d)
    # print(a)
    info_about_measures.append({'id': id, 'measures': a})

# info_about_measures = [dict(zip(info_about_measures, i)) for i in zip(*info_about_measures.values())]   
len(info_about_measures)
info_about_measures

1 ['D4']
G4
2 ['G4', 'G4']
D4
3 ['G4', 'B4']
G4
4 ['D5']
B4
5 ['G4']
D5
6 ['A4']
G4
7 ['A4']
A4
8 ['F#4']
A4
9 ['D4', 'D4']
F#4
10 ['G4', 'G4']
D4
11 ['G4', 'B4']
G4
12 ['D5']
B4
13 ['G4']
D5
14 ['A4']
G4
15 ['D4']
A4
16 ['G4']
D4
1 ['D4']
G4
2 ['G4', 'G4']
D4
3 ['G4', 'B4']
G4
4 ['D5']
B4
5 ['G4']
D5
6 ['A4']
G4
7 ['A4']
A4
8 ['F#4']
A4
9 ['D4', 'D4']
F#4
10 ['G4', 'G4']
D4
11 ['G4', 'B4']
G4
12 ['D5']
B4
13 ['G4']
D5
14 ['A4']
G4
15 ['D4']
A4
16 ['G4']
D4
1 ['D4']
G4
2 ['G4', 'G4']
D4
3 ['G4', 'B4']
G4
4 ['D5']
B4
5 ['G4']
D5
6 ['A4']
G4
7 ['A4']
A4
8 ['F#4']
A4
9 ['D4', 'D4']
F#4
10 ['G4', 'G4']
D4
11 ['G4', 'B4']
G4
12 ['D5']
B4
13 ['G4']
D5
14 ['A4']
G4
15 ['D4']
A4
16 ['G4']
D4
1 ['D4']
G4
2 ['G4', 'G4']
D4
3 ['G4', 'B4']
G4
4 ['D5']
B4
5 ['G4']
D5
6 ['A4']
G4
7 ['A4']
A4
8 ['F#4']
A4
9 ['D4', 'D4']
F#4
10 ['G4', 'G4']
D4
11 ['G4', 'B4']
G4
12 ['D5']
B4
13 ['G4']
D5
14 ['A4']
G4
15 ['D4']
A4
16 ['G4']
D4
1 ['D4']
G4
2 ['G4', 'G4']
D4
3 ['G4', 'B4']
G4
4 ['D5']
B4
5 ['G4']
D5
6 ['

[{'id': 50,
  'measures': [{'measure': 1, 'counter': 1},
   {'measure': 2, 'counter': 0},
   {'measure': 3, 'counter': 0},
   {'measure': 4, 'counter': 0},
   {'measure': 5, 'counter': 0},
   {'measure': 6, 'counter': 0},
   {'measure': 7, 'counter': 0},
   {'measure': 8, 'counter': 0},
   {'measure': 9, 'counter': 2},
   {'measure': 10, 'counter': 0},
   {'measure': 11, 'counter': 0},
   {'measure': 12, 'counter': 0},
   {'measure': 13, 'counter': 0},
   {'measure': 14, 'counter': 0},
   {'measure': 15, 'counter': 1},
   {'measure': 16, 'counter': 0}]},
 {'id': 52.5,
  'measures': [{'measure': 1, 'counter': 0},
   {'measure': 2, 'counter': 0},
   {'measure': 3, 'counter': 0},
   {'measure': 4, 'counter': 0},
   {'measure': 5, 'counter': 0},
   {'measure': 6, 'counter': 0},
   {'measure': 7, 'counter': 0},
   {'measure': 8, 'counter': 1},
   {'measure': 9, 'counter': 0},
   {'measure': 10, 'counter': 0},
   {'measure': 11, 'counter': 0},
   {'measure': 12, 'counter': 0},
   {'measure':