<a href="https://colab.research.google.com/github/EA-Digifolk/EA-Digifolk-Dataset/blob/main/EADigifolk.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# EA-Digifolk Explorer



Links:

* [EA-Digifolk dataset](https://github.com/EA-Digifolk/EA-Digifolk-Dataset.git)
* [Extract Features from MEI Parser](https://github.com/EA-Digifolk/MEIParser_features)
* Presentation (link not yet available)

## Setup

This section covers downloading the [EA-Digifolk dataset](https://github.com/EA-Digifolk/EA-Digifolk-Dataset.git) and the [Parser](https://github.com/EA-Digifolk/MEIParser_features) used to extract features from MEI files, installing the libraries the parser requires, and installing the [Musescore](https://musescore.org) software used to display the musical scores.

In [None]:
%%capture
#@title Download the EA-Digifolk Dataset from Github
%cd /content

import os
if os.path.exists('EA-Digifolk-Dataset'):
  !git -C EA-Digifolk-Dataset pull
else:
  !git clone https://github.com/EA-Digifolk/EA-Digifolk-Dataset.git

In [None]:
%%capture
%cd /content
#@title Download the MEI Parser

import os
if os.path.exists('MEIParser_features'):
  !git -C MEIParser_features pull
else:
  !git clone https://github.com/EA-Digifolk/MEIParser_features


!pip install -r MEIParser_features/requirements.txt -q

import sys
if not '/content/MEIParser_features' in sys.path:
  sys.path.append('/content/MEIParser_features')

In [None]:
%%capture
#@title Install Musescore
!apt-get update -q && apt-get install musescore lilypond -q
%env QT_QPA_PLATFORM=offscreen

In [None]:
#@title Install Music21 and setup Musescore in the Music21 Environment
!pip install music21 -q

import music21
env = music21.environment.Environment()
env['lilypondPath'] = '/usr/local/bin/lilypond'
env['pdfPath'] = '/usr/bin/musescore'
env['graphicsPath'] = '/usr/bin/musescore'
env['musicxmlPath'] = '/usr/bin/musescore'
env['musescoreDirectPNGPath'] = '/usr/bin/musescore'
env['autoDownload'] = 'allow'
env['warnings'] = 0

In [None]:
#@title Install fluidsynth for audio playback in Colab
!apt-get update -qq && apt-get install -qq libfluidsynth1 fluid-soundfont-gm
!pip install fluidsynth

## Extract features from MEI files

This section covers the processing of the dataset: extracting the features from the MEI files and saving them as a pandas dataframe for easy exploration.

This section is optional, as the saved pandas dataframe is provided in the EA-Digifolk dataset folder by default.

In [None]:
%%script false --no-raise-error
# comment above line if you want to process the dataset

# @title Process Dataset

# Import Libs from Python
import importlib
import glob
from fractions import Fraction
from tqdm import tqdm

# Import External Libs
import music21 as m21
import pandas as pd

# Import Parser
import parser_mei_features
from parser_mei_features import MeiParser

# Spanish files that are deliberately left out of the processed dataset.
EXCLUDED_SONGS = {
  f'EA-Digifolk-Dataset/Spanish/{s}'
  for s in ['ES-1948-AS-FP-006.mei', 'ES-1948-CB-CO-376.mei', 'ES-1948-CB-CO-418.mei', 'ES-1991-CL-KS-147.mei']
}

# Collect every MEI file of the three country folders, in ascending path order.
all_songs = []
for folder in ('Spanish', 'Portuguese', 'Mexican'):
  all_songs.extend(glob.glob(f'EA-Digifolk-Dataset/{folder}/*.mei'))
songs = [so for so in sorted(all_songs) if so not in EXCLUDED_SONGS]

errors = []
song_frames = []

for song in tqdm(songs):
  # Best effort: a file that fails to parse is recorded and reported below,
  # it does not abort the whole run.
  try:
    mei_parser = MeiParser()
    song_features = mei_parser.parse_mei(song, verbose=False)
    song_frames.append(pd.DataFrame.from_dict(song_features))
  except Exception as e:
    errors.append((song, e))

print('\n Files with errors:')
for err in errors:
  print(err)

if not song_frames:
  raise RuntimeError('No MEI file could be parsed; see the errors listed above.')

# Concatenate once at the end: growing a dataframe inside the loop copies all
# previously collected data on every iteration.
EADIGIFOLKNT = pd.concat(song_frames, axis=1)

# Transpose Dataframe so songs' IDs are now the index
EADIGIFOLK = EADIGIFOLKNT.T
EADIGIFOLK.set_index('id', inplace=True)

In [None]:
%%script false --no-raise-error
#@title Clean Dataset

# create country column from ID (the text before the first '-')
EADIGIFOLK['country'] = EADIGIFOLK.index.to_series().apply(lambda x: x.split('-')[0])

# (pattern, canonical label) pairs per column. Patterns are matched
# case-insensitively as substrings and applied in order, so a later rule sees
# the labels written by earlier ones.
# NOTE(review): 'irr' (irregular) is folded into 'Polyrhythmic' - confirm this is intended.
RELABEL_RULES = {
  'meter': [
    ('bin', 'Binary'),
    ('ter|tri', 'Ternary'),
    ('com', 'Compound'),
    ('fr', 'Free'),
    ('poly', 'Polyrhythmic'),
    ('irr', 'Polyrhythmic'),
  ],
  'genre': [
    ('lul', 'Lullaby'),
    ('dan', 'Dance'),
    ('corro|canción|child', 'Childhood Music'),
  ],
}

for column, rules in RELABEL_RULES.items():
  for pattern, label in rules:
    # na=False: missing values count as "no match"; without it the mask holds
    # NaN and .loc refuses to index with it.
    matches = EADIGIFOLK[column].str.contains(pattern, case=False, regex=True, na=False)
    EADIGIFOLK.loc[matches, column] = label

In [None]:
%%script false --no-raise-error
#@title Save Dataset
# Write the dataframe to the working directory as a gzip-compressed pickle.
# The fixed mtime is forwarded to the gzip writer - presumably so repeated
# saves of the same data produce identical files (TODO confirm).
gzip_options = {'method': 'gzip', 'compresslevel': 1, 'mtime': 1}
EADIGIFOLK.to_pickle('EADIGIFOLK.gzip', compression=gzip_options)

## Exploring the EA-Digifolk Dataset

This section covers possible ways of exploring the dataset.

In [None]:
#@title Import saved pandas dataframe

import pandas as pd

# The compression is given explicitly because it is not inferred from the
# '.gzip' suffix.
# NOTE(review): unpickling can execute arbitrary code - only load this file
# from a checkout of the dataset repository that you trust.
dataset_pickle = "/content/EA-Digifolk-Dataset/EADIGIFOLK.gzip"
gzip_options = {'method': 'gzip', 'compresslevel': 1, 'mtime': 1}
EADIGIFOLK = pd.read_pickle(dataset_pickle, compression=gzip_options)

In [None]:
#@title List all songs in the dataset

import pandas as pd

# Lift pandas' row and column display limits so the whole table is rendered.
# These options are global: they also affect every dataframe shown afterwards.
for display_option in ('display.max_columns', 'display.max_rows'):
  pd.set_option(display_option, None)

EADIGIFOLK

In [None]:
#@title View Song

#@markdown

import os

from IPython.display import Audio

# Import Parser
import parser_mei_features
from parser_mei_features import MeiParser

ID = "ES-1948-CB-CO-297" # @param {"type":"string","placeholder":"MX-1951-00-VM-00001"}

# Locate the MEI file by looking into every country folder of the dataset,
# rather than guessing the folder from the ID (which only ever distinguished
# Mexican from Spanish songs).
song_path = None
for folder in ('Mexican', 'Spanish', 'Portuguese'):
  candidate = f"EA-Digifolk-Dataset/{folder}/{ID}.mei"
  if os.path.exists(candidate):
    song_path = candidate
    break
if song_path is None:
  raise FileNotFoundError(f"No MEI file named '{ID}.mei' found in the EA-Digifolk-Dataset country folders")
print('Showing: ' + song_path + '\n')

mei_parser = MeiParser()
song_features = mei_parser.parse_mei(song_path, verbose=False)

print('Score: \n')
mei_parser.mtc_extractor.music_stream.show()
#print('\nListen: \n')
#mei_parser.mtc_extractor.music_stream.show('midi')

print('\nMetadata Features: \n')
song_row = EADIGIFOLK.loc[ID]
# Show id, country and title first, followed by the remaining columns.
# NOTE(review): [2:-2] is positional (skips the first two and the last two
# columns), so it depends on the column order of the saved dataframe.
shown_fields = ['id', 'country', 'title'] + song_row.index.to_list()[2:-2]
metadata = song_row.replace('NaN', None).reindex(shown_fields)
# The song ID is the dataframe's index, not a column: fill it in explicitly,
# otherwise the reindexed 'id' entry is empty and dropna() removes it.
metadata['id'] = ID
metadata = metadata.dropna()
display(metadata)

print('\nNote Features: \n')
features = pd.DataFrame(EADIGIFOLK.loc[ID, 'features'])
display(features)

In [None]:
# @title Show Distributions of Feature In Dataset {"run":"auto","vertical-output":true,"display-mode":"form"}

#@markdown This code block generates descriptive statistics and distributions for various musical and metadata features in the EA-Digifolk dataset.
#@markdown >The user can select both a subset of the dataset and a feature whose distribution will be analyzed.

# collections.Counter is the concrete class; typing.Counter is only a
# deprecated alias meant for annotations.
from collections import Counter
import statistics

dataset = "all" # @param ["all", "mexican", "spanish"]
feature = "genre" # @param ["range", "number of phrases", "number of notes per phrase", "key", "mode", "key-mode", "meter", "country", "genre", "textual topics"]

import music21 as m21


def _pitch_or_none(name):
  """Build a music21 Pitch from a stored ambitus name; None stays None."""
  if name is None:
    return None
  # The stored names apparently spell flats out (' flat'); music21 writes '-'.
  return m21.pitch.Pitch(name.replace(' flat','-'))


def _range_name(row):
  """Name of the interval between a song's highest and lowest pitch, or None if either is missing."""
  if row['m21_H'] is None or row['m21_L'] is None:
    return None
  return m21.interval.Interval(row['m21_H'], row['m21_L']).name


# Restrict the analysis to one country when requested.
temp_dataset = EADIGIFOLK.copy()
if dataset == 'spanish':
  temp_dataset = temp_dataset[temp_dataset['country'] == 'ES']
elif dataset == 'mexican':
  temp_dataset = temp_dataset[temp_dataset['country'] == 'MX']

# Each branch only derives the per-song values of the chosen feature; the
# statistics and the plot are produced once, below.
values = None
if feature == 'range':
  temp_df = temp_dataset[['ambitus_highest', 'ambitus_lowest']].copy()
  temp_df['m21_H'] = temp_df['ambitus_highest'].apply(_pitch_or_none)
  temp_df['m21_L'] = temp_df['ambitus_lowest'].apply(_pitch_or_none)
  temp_df['range'] = temp_df.apply(_range_name, axis=1)
  values = temp_df['range']
elif feature == 'number of phrases':
  # number of distinct phrase indices in the song
  values = temp_dataset['features'].apply(lambda x: len(set(x['phrase_ix']))).rename('phrases')
elif feature == 'number of notes per phrase':
  # mean number of entries per phrase index, truncated to an integer
  values = temp_dataset['features'].apply(lambda x: int(statistics.mean(Counter(x['phrase_ix']).values()))).rename('notes_phrases')
elif feature == 'key-mode':
  values = temp_dataset[['key', 'mode']].apply(lambda x: x['key'].capitalize().replace(' ','') + ' ' + x['mode'].capitalize().replace(' ',''), axis=1).rename('key-mode')
elif feature == 'textual topics':
  # one entry per (song, topic) pair
  values = temp_dataset['textual_topics'].explode().str.capitalize()
elif feature in ['key', 'mode', 'meter', 'country', 'genre']:
  values = temp_dataset[feature].str.capitalize()

if values is not None:
  counts = values.value_counts()
  display(values.describe())
  display(counts)
  display(counts.plot(kind='bar'))

In [None]:
# @title Search Dataset per Feature {"run":"auto"}

dataset = "all" # @param ["all", "mexican", "spanish"]
feature = "number of phrases" # @param ["range", "number of phrases", "number of notes per phrase", "key", "mode", "key-mode", "meter", "country", "genre", "textual topics"]
value = "1" # @param {"type":"string","placeholder":"Lullaby"}

# Imported here so the cell does not depend on another cell having run first.
import statistics
from collections import Counter

import music21 as m21


def _pitch_or_none(name):
  """Build a music21 Pitch from a stored ambitus name; None stays None."""
  if name is None:
    return None
  # The stored names apparently spell flats out (' flat'); music21 writes '-'.
  return m21.pitch.Pitch(name.replace(' flat','-'))


def _range_name(row):
  """Name of the interval between a song's highest and lowest pitch, or None if either is missing."""
  if row['m21_H'] is None or row['m21_L'] is None:
    return None
  return m21.interval.Interval(row['m21_H'], row['m21_L']).name


# Restrict the search to one country when requested.
temp_dataset = EADIGIFOLK.copy()
if dataset == 'spanish':
  temp_dataset = temp_dataset[temp_dataset['country'] == 'ES']
elif dataset == 'mexican':
  temp_dataset = temp_dataset[temp_dataset['country'] == 'MX']

# IDs of the songs that satisfy the query; stays empty for an unknown feature
# so the cell shows an empty result instead of failing.
matching_ids = []

if feature == 'range':
  temp_df = temp_dataset[['ambitus_highest', 'ambitus_lowest']].copy()
  temp_df['m21_H'] = temp_df['ambitus_highest'].apply(_pitch_or_none)
  temp_df['m21_L'] = temp_df['ambitus_lowest'].apply(_pitch_or_none)
  temp_df['range'] = temp_df.apply(_range_name, axis=1)
  # `== True` turns the NaN produced for missing values into False
  matching_ids = temp_df[temp_df['range'].str.contains(value) == True].index
elif feature == 'number of phrases':
  phrase_counts = temp_dataset['features'].apply(lambda x: len(set(x['phrase_ix'])))
  matching_ids = phrase_counts[phrase_counts == int(value)].index
elif feature == 'number of notes per phrase':
  notes_per_phrase = temp_dataset['features'].apply(lambda x: int(statistics.mean(Counter(x['phrase_ix']).values())))
  matching_ids = notes_per_phrase[notes_per_phrase == int(value)].index
elif feature == 'key-mode':
  key_mode = temp_dataset[['key', 'mode']].apply(lambda x: x['key'].capitalize().replace(' ','') + ' ' + x['mode'].capitalize().replace(' ',''), axis=1)
  matching_ids = key_mode[key_mode.str.contains(value) == True].index
elif feature == 'textual topics':
  # Several topics may be given separated by ';' (spaces around it are ignored).
  topics = [topic.strip() for topic in value.split(';') if topic.strip()]

  def _mentions_topic(song_topics):
    """True when the song lists at least one of the requested topics."""
    try:
      return any(topic in song_topics for topic in topics)
    except TypeError:
      # missing value (None / NaN) instead of a collection of topics
      return False

  has_topic = temp_dataset['textual_topics'].apply(_mentions_topic).astype(bool)
  # Keep only the songs for which the test is True; taking the index of the
  # unfiltered boolean series would select every song.
  matching_ids = has_topic[has_topic].index
elif feature in ['key', 'mode', 'meter', 'country', 'genre']:
  matching_ids = temp_dataset[temp_dataset[feature].str.contains(value) == True].index

mask = EADIGIFOLK.index.isin(matching_ids)

display(EADIGIFOLK[mask])
print(f"Number of Rows: {EADIGIFOLK[mask].shape[0]}")

### Demos

In [None]:
# Demo: which time signatures and meters do lullabies use?
is_lullaby = EADIGIFOLK['genre'] == 'Lullaby'
total_lullabies = EADIGIFOLK[is_lullaby]

# First time signature stored in each lullaby's note features.
time_signature_lullabies = total_lullabies.apply(lambda song: song['features']['timesignature'][0], axis=1)
print(time_signature_lullabies.value_counts().plot(kind='bar'))

# Share of lullabies whose meter is labelled 'Binary', as a percentage.
binary_lullabies = EADIGIFOLK[is_lullaby & (EADIGIFOLK['meter'] == 'Binary')]
binary_share = (binary_lullabies.shape[0]*100)/total_lullabies.shape[0]
print(f"Number of Binary Lullabies: {binary_share}%")

In [None]:
# Demo: which time signatures and meters does childhood music use?
childhood_rows = EADIGIFOLK['genre'] == 'Childhood Music'
binary_rows = EADIGIFOLK['meter'] == 'Binary'

total_children = EADIGIFOLK[childhood_rows]
binary_children = EADIGIFOLK[childhood_rows & binary_rows]

# First time signature stored in each song's note features.
time_signature_children = total_children.apply(lambda song: song['features']['timesignature'][0], axis=1)
signature_counts = time_signature_children.value_counts()
print(signature_counts.plot(kind='bar'))

# Share of childhood songs whose meter is labelled 'Binary', as a percentage.
print(f"Number of Binary Childhood Music: {(binary_children.shape[0]*100)/total_children.shape[0]}%")

In [None]:
# Demo: frequency of each textual topic, first for lullabies and then for
# childhood music (one count per song/topic pair, capitalised for display).
for demo_genre in ('Lullaby', 'Childhood Music'):
  genre_songs = EADIGIFOLK[EADIGIFOLK['genre'] == demo_genre]
  topic_per_song = genre_songs['textual_topics'].explode().str.capitalize()
  display(topic_per_song.value_counts())

In [None]:
# Demo: frequency of each (normalised) mode, first for lullabies and then for
# childhood music.
for demo_genre in ('Lullaby', 'Childhood Music'):
  genre_modes = EADIGIFOLK[EADIGIFOLK['genre'] == demo_genre]['mode']
  # lower-case the label and strip the punctuation marks - ~ ! ?
  cleaned_modes = genre_modes.str.lower().str.replace(r'[-~!?]', '', regex=True)
  # keep the leading one- or two-word part of the label, as captured by the pattern
  mode_names = cleaned_modes.str.extract(r'(\w+\s\w+[^0-9]|\w+[^0-9])')
  display(mode_names.value_counts())