We have a dataset from Asle, processed with a two-pointer algorithm to find repetitions in Lakh MIDI Clean MIDI files.

In [1]:
import os
from collections import Counter

In [28]:
def analyze_datasets_folder(root_dir='/datasets'):
    """
    Walk a dataset folder and print summary statistics about its contents.

    Prints the number of folders visited (the root itself is counted), the
    total number of files, the number of MIDI files ('.mid' / '.midi', matched
    case-insensitively), the file count per extension, and a percentage
    breakdown of the five most common extensions with everything else grouped
    under "Others".

    Args:
        root_dir (str): The root directory of the dataset. Default is '/datasets'.

    Returns:
        None. All results are written to stdout.
    """
    # os.walk ignores a missing root and simply yields nothing, which would
    # make a mistyped path look like an empty dataset; report it explicitly.
    if not os.path.isdir(root_dir):
        print(f"Warning: '{root_dir}' is not an existing directory; nothing to analyze.")
        return

    midi_extensions = ('.mid', '.midi')
    total_files = 0
    midi_files = 0
    folder_count = 0
    file_extensions = Counter()

    for _root, _dirs, files in os.walk(root_dir):
        folder_count += 1
        total_files += len(files)

        for file in files:
            # Lower-case once so '.MID' and '.mid' land in the same bucket
            ext = os.path.splitext(file)[1].lower()
            file_extensions[ext or '(no extension)'] += 1

            if ext in midi_extensions:
                midi_files += 1

    print(f"Total number of folders: {folder_count}")
    print(f"Total number of files: {total_files}")
    print(f"Number of MIDI files: {midi_files}")

    # most_common() already sorts by descending count; reuse it for both reports
    ranked_extensions = file_extensions.most_common()

    print("\nFile extension distribution:")
    for ext, count in ranked_extensions:
        print(f"{ext}: {count}")

    print("\nPercentage breakdown:")
    if total_files == 0:
        # Percentages are undefined without files (would divide by zero)
        print("No files found.")
        return

    for ext, count in ranked_extensions[:5]:  # Show top 5 extensions
        percentage = (count / total_files) * 100
        print(f"{ext}: {percentage:.2f}%")

    other_count = sum(count for _, count in ranked_extensions[5:])
    other_percentage = (other_count / total_files) * 100
    print(f"Others: {other_percentage:.2f}%")

In [16]:
# list the notebook's current directory to see where the dataset folder sits relative to it
!ls -la

total 0
drwxr-xr-x@ 3 serdegsenere  staff   96 Sep 26 12:56 [34m.[m[m
drwxr-xr-x  7 serdegsenere  staff  224 Sep 26 12:55 [34m..[m[m
-rw-r--r--@ 1 serdegsenere  staff    0 Sep 26 12:56 Clustering_repeated_motifs.ipynb


In [17]:
# show the working directory the notebook kernel resolves relative paths against
os.getcwd()

'/Volumes/C/Algoritmi/SaMuGeD-Algoritmi-DrDreSamplerAI-2024/testing_tools/clustering'

The notebook resolves relative paths against its own working directory (shown above), so let's use the full path instead.

In [30]:
# Summarize the dataset folder.
# NOTE: the folder is git-ignored, so it is not pushed to GitHub.
analyze_datasets_folder("/Volumes/C/Algoritmi/SaMuGeD-Algoritmi-DrDreSamplerAI-2024/datasets/two_pointers_repeats_only") 

Total number of folders: 3484
Total number of files: 7669
Number of MIDI files: 7662

File extension distribution:
.mid: 7662
(no extension): 7

Percentage breakdown:
.mid: 99.91%
(no extension): 0.09%
Others: 0.00%


The plan for this notebook is roughly as follows:

	1.	Preprocessing
			Parsing MIDI files with mido or better pretty_midi
			Extract key features -- note sequences, rhythm, harmony
			Normalize key and tempo
	2.	Feature representation
			Convert MIDI files to fixed-length vectors (bag-of-notes, TF-IDF, autoencoders??)
			Use dimensionality reduction if needed?
	3.	Similarity measurement
			Distance metrics like Euclidean, Cosine, DTW
	4.	Efficient retrieval
			Indexing structures like FAISS or LSH for fast searches?
	5.	Building the program for integration later

(add references)

In [31]:
# NOTE: installing faiss-cpu pulls in numpy<2.0, which replaces an already
# installed numpy 2.x (see the output below) - restart the kernel afterwards
# so the downgraded numpy is the one that gets imported.
!pip install mido
!pip install pretty_midi
!pip install numpy
!pip install pandas
!pip install scikit-learn
!pip install faiss-cpu  # for similarity search

Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0.post1-cp312-cp312-macosx_11_0_arm64.whl.metadata (3.7 kB)
Collecting numpy<2.0,>=1.0 (from faiss-cpu)
  Downloading numpy-1.26.4-cp312-cp312-macosx_11_0_arm64.whl.metadata (61 kB)
Downloading faiss_cpu-1.8.0.post1-cp312-cp312-macosx_11_0_arm64.whl (6.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.0/6.0 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0mm
[?25hDownloading numpy-1.26.4-cp312-cp312-macosx_11_0_arm64.whl (13.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.7/13.7 MB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: numpy, faiss-cpu
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.0
    Uninstalling numpy-2.0.0:
      Successfully uninstalled numpy-2.0.0
Successfully installed faiss-cpu-1.8.0.post1 numpy-1.26.4


In [32]:
import os
import numpy as np
import pretty_midi
import faiss
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [33]:
def extract_features(file_path):
    """
    Extract a flat feature vector from the non-drum notes of a MIDI file.

    The vector is laid out as three consecutive sections of equal length
    (one entry per note, in instrument order and then note order):
    all pitches, then all durations (note.end - note.start; pretty_midi
    reports note times in seconds), then all velocities. Its total length is
    therefore 3 * number_of_notes and varies from file to file.

    Args:
        file_path (str): The path to the MIDI file.

    Returns:
        numpy.ndarray or None: The feature vector, or None when the file
        cannot be parsed or contains no non-drum notes.
    """
    try:
        midi_data = pretty_midi.PrettyMIDI(file_path)
    except Exception as exc:
        # Unreadable files are skipped rather than aborting a whole dataset
        # run. Catch Exception (not a bare except) so that Ctrl-C /
        # SystemExit still stop a long run, and report why loading failed.
        print(f"Error loading {file_path}: {exc}")
        return None

    # Drum tracks are skipped, so their notes never reach the feature vector
    notes = [
        note
        for instrument in midi_data.instruments
        if not instrument.is_drum
        for note in instrument.notes
    ]

    if not notes:
        return None  # Skip files with no notes

    pitches = np.array([note.pitch for note in notes])
    durations = np.array([note.end - note.start for note in notes])
    velocities = np.array([note.velocity for note in notes])

    # Combine features into a single vector: [pitches | durations | velocities]
    return np.concatenate((pitches, durations, velocities))

In [34]:
def process_dataset(dataset_folder):
    """
    Recursively extract feature vectors for every MIDI file under a folder.

    Files are selected by extension ('.mid' / '.midi', case-insensitive).
    Files for which extract_features returns None are reported on stdout and
    left out of both result lists, so the two lists always stay aligned.

    Args:
        dataset_folder (str): The path to the dataset folder.

    Returns:
        tuple: (feature_list, file_names) - the feature vectors and, at the
        same positions, the full paths of the files they came from.
    """
    midi_suffixes = ('.mid', '.midi')
    vectors = []
    paths = []

    for current_dir, _subdirs, entries in os.walk(dataset_folder):
        for file_name in entries:
            # Guard clause: ignore anything that is not a MIDI file
            if not file_name.lower().endswith(midi_suffixes):
                continue

            full_path = os.path.join(current_dir, file_name)
            vector = extract_features(full_path)

            if vector is None:
                print(f"Skipping {file_name} due to extraction issues.")
                continue

            vectors.append(vector)
            paths.append(full_path)

    return vectors, paths

In [35]:
# NOTE: this folder is git-ignored, so it is not pushed to GitHub; the data was taken from Teams
dataset_folder = "/Volumes/C/Algoritmi/SaMuGeD-Algoritmi-DrDreSamplerAI-2024/datasets/two_pointers_repeats_only"

In [None]:
# list the entries directly inside the dataset folder
# NOTE(review): the output recorded below shows full paths of nested track files,
# which is not what os.listdir returns - it looks stale; re-run this cell to confirm
os.listdir(dataset_folder)

['/Volumes/C/Algoritmi/SaMuGeD-Algoritmi-DrDreSamplerAI-2024/datasets/two_pointers_repeats_only/Limp_Bizkit_Patterns/Stuck.mid/track1.mid',
 '/Volumes/C/Algoritmi/SaMuGeD-Algoritmi-DrDreSamplerAI-2024/datasets/two_pointers_repeats_only/Limp_Bizkit_Patterns/Stuck.mid/track0.mid',
 '/Volumes/C/Algoritmi/SaMuGeD-Algoritmi-DrDreSamplerAI-2024/datasets/two_pointers_repeats_only/Limp_Bizkit_Patterns/Faith.mid/track2.mid',
 '/Volumes/C/Algoritmi/SaMuGeD-Algoritmi-DrDreSamplerAI-2024/datasets/two_pointers_repeats_only/Limp_Bizkit_Patterns/Faith.mid/track1.mid',
 '/Volumes/C/Algoritmi/SaMuGeD-Algoritmi-DrDreSamplerAI-2024/datasets/two_pointers_repeats_only/Limp_Bizkit_Patterns/Faith.mid/track0.mid']

In [47]:
# process only one sub-folder (Limp_Bizkit_Patterns) first instead of the whole dataset
feature_list, file_names = process_dataset("/Volumes/C/Algoritmi/SaMuGeD-Algoritmi-DrDreSamplerAI-2024/datasets/two_pointers_repeats_only/Limp_Bizkit_Patterns")


In [50]:
# number of MIDI files for which features were successfully extracted
len(file_names)

22

In [66]:
# print the total count, then show the paths of the first 10 processed files
print("total files itn the folder is " + str(len(file_names)))
file_names[0:10]



total files itn the folder is 22


['/Volumes/C/Algoritmi/SaMuGeD-Algoritmi-DrDreSamplerAI-2024/datasets/two_pointers_repeats_only/Limp_Bizkit_Patterns/Stuck.mid/track1.mid',
 '/Volumes/C/Algoritmi/SaMuGeD-Algoritmi-DrDreSamplerAI-2024/datasets/two_pointers_repeats_only/Limp_Bizkit_Patterns/Stuck.mid/track0.mid',
 '/Volumes/C/Algoritmi/SaMuGeD-Algoritmi-DrDreSamplerAI-2024/datasets/two_pointers_repeats_only/Limp_Bizkit_Patterns/Faith.mid/track2.mid',
 '/Volumes/C/Algoritmi/SaMuGeD-Algoritmi-DrDreSamplerAI-2024/datasets/two_pointers_repeats_only/Limp_Bizkit_Patterns/Faith.mid/track1.mid',
 '/Volumes/C/Algoritmi/SaMuGeD-Algoritmi-DrDreSamplerAI-2024/datasets/two_pointers_repeats_only/Limp_Bizkit_Patterns/Faith.mid/track0.mid',
 '/Volumes/C/Algoritmi/SaMuGeD-Algoritmi-DrDreSamplerAI-2024/datasets/two_pointers_repeats_only/Limp_Bizkit_Patterns/N_2_Gether_Now.mid/track1.mid',
 '/Volumes/C/Algoritmi/SaMuGeD-Algoritmi-DrDreSamplerAI-2024/datasets/two_pointers_repeats_only/Limp_Bizkit_Patterns/N_2_Gether_Now.mid/track0.mid',
 '

In [54]:
# feature vector of the 1st file (a one-element slice, so the result is a list holding that array)
feature_list[0:1]

[array([ 38.        ,  45.        ,  50.        ,  38.        ,
         45.        ,  50.        ,  47.        ,  54.        ,
         59.        ,  46.        ,  53.        ,  58.        ,
         38.        ,  45.        ,  50.        ,  38.        ,
         45.        ,  50.        ,  40.        ,  47.        ,
         52.        ,  38.        ,  45.        ,  41.        ,
         48.        ,  53.        ,  38.        ,  45.        ,
         40.        ,  47.        ,  52.        ,  41.        ,
         48.        ,  53.        ,  49.        ,  50.        ,
         38.        ,  49.        ,  50.        ,  38.        ,
         49.        ,  50.        ,  38.        ,  49.        ,
         50.        ,  38.        ,  49.        ,  50.        ,
         49.        ,  50.        ,  50.        ,  57.        ,
         62.        ,  51.        ,  58.        ,  63.        ,
         50.        ,  57.        ,  62.        ,  51.        ,
         58.        ,  63.        ,  63.

In [70]:
# shapes of the 1st, 2nd and 3rd feature vectors; each one-element slice is a
# list holding a single array, hence the leading 1 in every (1, N) printed below
print(np.shape(feature_list[0:1]))  
print(np.shape(feature_list[1:2]))
print(np.shape(feature_list[2:3]))



(1, 270)
(1, 204)
(1, 108)


Uh-oh: this difference in vector sizes will create problems, as many ML algorithms expect fixed-length inputs.


In [72]:
# Measure every feature vector so that one common length can be chosen
feature_lengths = list(map(len, feature_list))

# Use the longest vector as the common length, capped at an upper limit of 6666
longest_vector = max(feature_lengths)
fixed_length = min(longest_vector, 6666)
print(f"Using fixed length: {fixed_length}")

Using fixed length: 288


In [74]:
def pad_or_truncate(features, fixed_length):
    """
    Force a 1-D feature vector to exactly ``fixed_length`` entries.

    Vectors that are too long keep only their leading entries; vectors that
    are too short get zeros appended at the end.

    Args:
        features (numpy.ndarray): The feature vector to pad or truncate.
        fixed_length (int): The number of entries the result must have.

    Returns:
        numpy.ndarray: The padded or truncated feature vector.
    """
    shortfall = fixed_length - len(features)

    if shortfall > 0:
        # Too short: fill the missing tail with zeros
        return np.concatenate((features, np.zeros(shortfall)))

    # Long enough already: keep only the first fixed_length entries
    return features[:fixed_length]

In [75]:
# Bring every vector to the common length and stack them into one array
processed_features = np.array(
    [pad_or_truncate(vector, fixed_length) for vector in feature_list]
)

# Standardize the features; the fitted scaler stays available as `scaler`
scaler = StandardScaler()
processed_features = scaler.fit(processed_features).transform(processed_features)

### Similarity Search Index
FAISS for efficient similarity searches in high-dimensional spaces

FAISS -- Facebook AI Similarity Search (https://ai.meta.com/tools/faiss/)

In [None]:
# Convert features to float32 as required by FAISS
processed_features = processed_features.astype('float32')

# Use PCA to reduce dimensionality for efficiency.
# PCA cannot return more components than min(n_samples, n_features), so the
# 256 target is capped by the data shape - with the 22-file sample processed
# above, asking for 256 components would raise a ValueError.
pca_dimensions = min(256, *processed_features.shape)  # adjust the 256 target based on performance
pca = PCA(n_components=pca_dimensions)
processed_features_pca = pca.fit_transform(processed_features)

# Make sure the array handed to FAISS is C-contiguous float32,
# whatever dtype/layout the PCA step returned
processed_features_pca = np.ascontiguousarray(processed_features_pca, dtype='float32')

# Initialize the FAISS index (exact L2 / Euclidean search)
dimension = pca_dimensions
index = faiss.IndexFlatL2(dimension)

# Add vectors to the index
index.add(processed_features_pca)