In [60]:
from music21 import converter, instrument, note, chord, stream
import numpy
from matplotlib import pyplot as plt
import keras
from sklearn.preprocessing import OneHotEncoder
from keras import Sequential, Model, regularizers
from keras.layers import LSTM, Dropout, Dense, Activation, Input, concatenate
import pandas as pd
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization, Flatten, Attention
import os
from tensorflow.keras.models import Model
from tensorflow.keras.layers import MultiHeadAttention, LayerNormalization, Embedding
import tensorflow as tf
from tqdm import tqdm
import numpy as np

In [None]:
!unzip dataset.zip

In [2]:
def get_notes(midi):
  """Flatten a parsed score into a list of per-pitch event tuples.

  Each tuple is ``(start, velocity, seconds, pitch, step)``:
    * ``start``    -- ``element.offset`` in the flattened stream,
    * ``velocity`` -- ``element.volume.velocity`` (``0`` for a rest),
    * ``seconds``  -- ``element.seconds``,
    * ``pitch``    -- the pitch as a string, or ``"REST"`` for a rest,
    * ``step``     -- start-time gap to the previously emitted event.

  A chord is expanded into one tuple per pitch. Only the first pitch carries
  the gap to the previous element; the other pitches of the same chord get a
  step of 0 because they start at the same time. This keeps the steps
  consistent with how they are replayed later (each event starts at the
  previous start plus its step).

  Returns an empty list when the score contains no notes or rests.
  """
  events = []

  # Both hands end up in one flattened stream; order everything by onset so
  # that `step` is always measured against the preceding element.
  elements = sorted(midi.flatten().notesAndRests, key=lambda element: element.offset)
  if not elements:
    return events

  prev_start = elements[0].offset

  for element in elements:
    start = float(element.offset)
    step = float(element.offset - prev_start)

    if isinstance(element, note.Note):
      events.append((start, float(element.volume.velocity), float(element.seconds), str(element.pitch), step))
    elif isinstance(element, chord.Chord):
      for position, chord_pitch in enumerate(element.pitches):
        # Pitches after the first one belong to the same onset: no extra gap.
        pitch_step = step if position == 0 else 0.0
        events.append((start, float(element.volume.velocity), float(element.seconds), str(chord_pitch), pitch_step))
    elif isinstance(element, note.Rest):
      # A rest is silent, so its velocity is recorded as 0.
      events.append((start, 0, float(element.seconds), "REST", step))

    prev_start = element.offset
  return events



In [3]:
# Parse every file found under "dataset" into one list of event tuples per piece.
# The paths are collected and sorted first so that the order of `note_all_songs`
# (and of every training sequence derived from it) is the same on every run,
# and so that tqdm knows the total and can show real progress.
midi_paths = sorted(
    os.path.join(root, file_name)
    for root, _subdirs, file_names in os.walk("dataset")
    for file_name in file_names
)

note_all_songs = []
for file_path in tqdm(midi_paths):
    # Convert the file into a music21 score.
    midi = converter.parse(file_path)

    # Extract (start, velocity, seconds, pitch, step) tuples and keep them per piece.
    note_all_songs.append(get_notes(midi))

1it [00:02,  2.36s/it]


In [4]:
# Build a single DataFrame from the events of every piece (pieces are not
# distinguished here). All rows are gathered first and the frame is created
# once: growing it with pd.concat inside a loop copies the whole frame on each
# iteration and starts from an empty, object-typed frame.
event_columns = ['debut_note','volume','durée(s)', 'pitch/chord', 'offset']
df = pd.DataFrame(
    [event for song_events in note_all_songs for event in song_events],
    columns=event_columns,
)

In [5]:
def compter_notes_simultanees(row):
    """Count the rows of the global ``df`` that are active when ``row`` starts.

    A row is counted when ``debut_note <= row['debut_note']`` and
    ``debut_note + durée(s) >= row['debut_note']``; ``row`` itself is included.
    Kept for row-wise use; the column below is computed in bulk instead.
    """
    return df[(df['debut_note'] <= row['debut_note']) & (df['debut_note'] + df['durée(s)'] >= row['debut_note'])].shape[0]


# Sort the events by their start time ('debut_note').
df = df.sort_values(by='debut_note')

# Same count as applying compter_notes_simultanees to every row, without
# rescanning the whole frame per row:
#   count(t) = #(start <= t) - #(end < t)
# which holds because durations are non-negative (end >= start).
# NOTE(review): 'debut_note' comes from element.offset while 'durée(s)' comes
# from element.seconds, so start + duration presumably mixes two time units --
# confirm this is intended.
note_starts = df['debut_note'].to_numpy(dtype=float)
note_ends = note_starts + df['durée(s)'].to_numpy(dtype=float)
started_by_then = np.searchsorted(np.sort(note_starts), note_starts, side='right')
ended_before = np.searchsorted(np.sort(note_ends), note_starts, side='left')
df['nombre_de_notes_simultanees'] = started_by_then - ended_before


In [6]:
df

Unnamed: 0,debut_note,volume,durée(s),pitch/chord,offset,nombre_de_notes_simultanees
0,0.0,22.0,1.250000,E4,0.0,7
517,0.0,55.0,1.250000,E2,0.0,7
818,0.0,56.0,0.416667,A2,0.0,7
516,0.0,55.0,2.500000,G3,0.0,7
1074,0.0,54.0,0.227273,E4,0.0,7
...,...,...,...,...,...,...
1752,228.0,50.0,1.818182,C4,4.0,3
1754,228.0,78.0,1.818182,E2,0.0,3
1756,232.0,78.0,1.818182,E1,0.0,3
1755,232.0,64.0,1.818182,B3,4.0,3


In [7]:
# Fit the one-hot encoders for pitch, volume class and duration class.
# They are fitted on the whole data set so that every category present in any
# piece gets a column.
# TODO: see whether the fitted encoders can be saved to disk.
oh_notes =  OneHotEncoder().fit(df[['pitch/chord']])


# Volume classes: label -> upper bound of the class (thresholds are tunable).
# The upper bound is also the value a predicted class is decoded to.
dict_volume_class = {
    "very low":10,
    "low":30,
    "low medium":50,
    "medium":70,
    "high": 90,
    "very high":128
}
# -0.1 as the lowest edge so that a volume of exactly 0 falls in the first class.
bins_volume = [-0.1, *dict_volume_class.values()]
values_volume = list(dict_volume_class.keys())
df["volume_class"] = pd.cut(df["volume"], bins=bins_volume, labels=values_volume)
oh_volume =  OneHotEncoder().fit(df[['volume_class']])


# Duration classes: label -> upper bound of the class, in the unit of 'durée(s)'.
dict_duration_class = {
"very short" : 0.1,
"short" :0.25,
"medium":0.5,
"medium-long":0.75,
"long":1.0,
"very long": 1.5,
"kilometer long":2.0,
"yearlight distance":3.0,
}
# The open-ended last class stops at the longest duration in the data. It is
# only added when that duration exceeds the previous edge: pd.cut rejects bin
# edges that are duplicated or not strictly increasing.
longest_duration = float(df['durée(s)'].max())
if longest_duration > dict_duration_class["yearlight distance"]:
    dict_duration_class["super mega long"] = longest_duration

bins_duration = [-0.1, *dict_duration_class.values()]
values_duration = list(dict_duration_class.keys())
df["duration_class"] = pd.cut(df["durée(s)"], bins=bins_duration, labels=values_duration)
oh_duration =  OneHotEncoder().fit(df[['duration_class']])


In [8]:
def prepare_sequence(data, sequence_length, network_input, network_output):
  """Append sliding windows of ``data`` and their next element to the given lists.

  For every start index ``i`` with a full window available, the slice
  ``data[i:i + sequence_length]`` is appended to ``network_input`` and the
  element right after it, ``data[i + sequence_length]``, to ``network_output``.
  Both lists are modified in place and returned as a tuple. Nothing is
  appended when ``data`` is not longer than ``sequence_length``.
  """
  window_count = len(data) - sequence_length
  for start in range(window_count):
      stop = start + sequence_length
      network_input.append(data[start:stop])
      network_output.append(data[stop])
  return network_input, network_output

def reshape_array_input(array_tensor, sequence_length):
  """Return ``array_tensor`` as an array of shape ``(n_patterns, sequence_length, -1)``.

  ``n_patterns`` is ``len(array_tensor)``; the last axis is inferred by numpy.
  """
  target_shape = (len(array_tensor), sequence_length, -1)
  return numpy.asarray(array_tensor).reshape(target_shape)

def reshape_array_output(array_tensor, sequence_length):
  """Return ``array_tensor`` as a 2-D array with one row per pattern.

  The result has shape ``(len(array_tensor), -1)``. ``sequence_length`` is
  accepted but not used by this function.
  """
  n_patterns = len(array_tensor)
  targets = numpy.asarray(array_tensor)
  return targets.reshape((n_patterns, -1))


In [9]:
# Number of consecutive events in each input window; the event that follows
# the window is the prediction target (see prepare_sequence).
sequence_length = 50

In [10]:
# Sliding-window inputs and next-event targets, accumulated over every piece.
input_note, output_note = [], []
input_offset, output_offset = [], []
input_volume, output_volume = [], []
input_duration, output_duration = [], []
input_step, output_step = [], []
input_simul, output_simult = [],[]

# Windows are built piece by piece so that no window spans two pieces.
for music in note_all_songs:
  # One frame per piece, with volume and duration binned into their classes.
  df_music = pd.DataFrame(
      music, columns=['debut_note','volume','durée(s)', 'pitch/chord', 'step']
  ).assign(
      volume_class=lambda frame: pd.cut(frame["volume"], bins=bins_volume, labels=values_volume),
      duration_class=lambda frame: pd.cut(frame["durée(s)"], bins=bins_duration, labels=values_duration),
  )

  # One-hot encode with the encoders fitted on the whole data set.
  notes_encoded = oh_notes.transform(df_music[['pitch/chord']]).toarray()
  volume_encoded = oh_volume.transform(df_music[['volume_class']]).toarray()
  duration_encoded = oh_duration.transform(df_music[['duration_class']]).toarray()

  # prepare_sequence appends to the lists it receives, so each feature is
  # simply paired with the two lists it feeds.
  feature_targets = (
      (df_music['debut_note'].values, input_offset, output_offset),
      (notes_encoded, input_note, output_note),
      (volume_encoded, input_volume, output_volume),
      (duration_encoded, input_duration, output_duration),
      (df_music['step'].values, input_step, output_step),
  )
  for feature_values, window_list, target_list in feature_targets:
    prepare_sequence(feature_values, sequence_length, window_list, target_list)

In [11]:
# Reshape every feature: inputs to (n_patterns, sequence_length, n_features),
# targets to (n_patterns, n_features).
input_note_reshaped= reshape_array_input(input_note, sequence_length)
output_note_reshaped = reshape_array_output(output_note, sequence_length)

input_offset_reshaped = reshape_array_input(input_offset, sequence_length)
output_offset_reshaped = reshape_array_output(output_offset, sequence_length)

# NOTE(review): 86 is an unexplained constant, and these shifted arrays are not
# used anywhere else in this notebook -- confirm whether they are still needed.
input_offset_reshaped_d = input_offset_reshaped - 86
output_offset_reshaped_d = output_offset_reshaped - 86

input_volume_reshaped= reshape_array_input(input_volume, sequence_length)
output_volume_reshaped = reshape_array_output(output_volume, sequence_length)

input_duration_reshaped= reshape_array_input(input_duration, sequence_length)
output_duration_reshaped = reshape_array_output(output_duration, sequence_length)


input_step_reshaped= reshape_array_input(input_step, sequence_length)
output_step_reshaped = reshape_array_output(output_step, sequence_length)

# Disabled: the simultaneous-note feature is not built above.
# input_simul = reshape_array_input(input_simul, sequence_length)
# output_simult = reshape_array_output(output_simult, sequence_length)

'input_simul = reshape_array_input(input_simul, sequence_length)\noutput_simult = reshape_array_output(output_simult, sequence_length)\n'

In [75]:
# Model definition (make it lighter if it overfits).
def _lstm_branch(feature_array):
    """Create the input and the per-feature LSTM encoder for one feature.

    ``feature_array`` is one of the reshaped input arrays; its second and third
    dimensions give the input shape. Returns ``(input_layer, encoded)`` where
    ``encoded`` keeps the time axis (``return_sequences=True``) so the branches
    can be concatenated step by step.
    """
    branch_input = Input(shape=(feature_array.shape[1], feature_array.shape[2]))
    branch_encoded = LSTM(32, dropout=0.3, return_sequences=True)(branch_input)
    return branch_input, branch_encoded


inputNotes_layer, inputNotes = _lstm_branch(input_note_reshaped)
inputVolume_layer, inputVolume = _lstm_branch(input_volume_reshaped)
inputDuration_layer, inputDuration = _lstm_branch(input_duration_reshaped)
inputStep_layer, inputStep = _lstm_branch(input_step_reshaped)
# An equivalent branch on the absolute offsets was tried and is disabled.

# NOTE(review): defined but not applied to any layer in this cell.
regularization_strength = 0.01

# Keras' Attention layer reads its inputs as [query, value, key].
# NOTE(review): confirm that step-as-value / duration-as-key is the intended role assignment.
attention_notes = Attention(use_scale=True,dropout=0.1)([inputNotes, inputStep, inputDuration])

inputs = concatenate([inputNotes, inputVolume, inputDuration, inputStep, attention_notes])
x = LSTM(128, return_sequences=True, dropout=0.3)(inputs)
# Stack on the previous LSTM's output. Feeding `inputs` again here would leave
# the layer above disconnected from every model output.
x = LSTM(128)(x)
x = Dropout(0.3)(x)
x = Dense(128)(x)

# Output heads. Extra BatchNormalization/Dropout/Dense stages were tried on the
# volume, duration and offset heads and are disabled.
outputNotes = Dense(16)(x)
outputNotes = Dropout(0.25)(outputNotes)
outputNotes = Dense(output_note_reshaped.shape[1], activation='softmax', name="Note")(outputNotes)

outputVolume = Dense(16)(x)
outputVolume = Dense(output_volume_reshaped.shape[1], activation='softmax', name="Volume")(outputVolume)

outputDuration = Dense(16)(x)
outputDuration = Dense(output_duration_reshaped.shape[1], activation='softmax', name="Duration")(outputDuration)

# Regression head (no activation): predicts the step to the next event.
outputOffset =  Dense(16)(x)
outputOffset = Dense(output_step_reshaped.shape[1], name="Offset")(outputOffset)


# Input order: notes, step, volume, duration. Output order: note, offset (step), volume, duration.
model = Model(inputs=[inputNotes_layer,inputStep_layer,  inputVolume_layer, inputDuration_layer], outputs=[outputNotes, outputOffset, outputVolume, outputDuration])

optimizer = tf.keras.optimizers.Adam(learning_rate=0.005)

def mse_with_positive_pressure(y_true: tf.Tensor, y_pred: tf.Tensor):
  """Mean squared error with an extra penalty on negative predictions.

  Every negative element of ``y_pred`` adds 10 times its magnitude to the
  element-wise squared error before the mean is taken; non-negative elements
  add nothing.
  """
  # max(-y_pred, 0) is |y_pred| where y_pred < 0 and 0 elsewhere.
  negative_penalty = 10 * tf.maximum(-y_pred, 0.0)
  squared_error = (y_true - y_pred) ** 2
  return tf.reduce_mean(squared_error + negative_penalty)


In [72]:
# One weight per model output, in the order the outputs are passed to Model():
# Note, Offset, Volume, Duration.
loss_weights = [1,1.0,1,1.0]
# Alternative weighting that was tried:
#loss_weights = [0.5,2.0,0.3,1.0]

In [76]:
# Adam seems to be faster than RMSProp and learns better too.
# Losses are listed in the same order as the model outputs: categorical
# cross-entropy for the three softmax heads (Note, Volume, Duration) and the
# custom MSE with a penalty on negative values for the Offset head.
model.compile(loss=["categorical_crossentropy", mse_with_positive_pressure, "categorical_crossentropy", "categorical_crossentropy"], optimizer=optimizer, loss_weights=loss_weights)

In [None]:
model.summary()

In [13]:
import os

# Checkpoint callback used during training.
# NOTE(review): recent Keras releases reportedly require the path to end in
# ".weights.h5" when save_weights_only=True -- confirm against the installed version.
cp_callback = keras.callbacks.ModelCheckpoint(
    'model_weights_epoch.h5',  # Fixed file name (no epoch placeholder): the same file is overwritten
    save_best_only=True,  # Only save when the monitored metric improves, not at every epoch
    save_weights_only=True,  # Save the weights only, not the whole model
    verbose=1  # Print a message whenever a checkpoint is written
    )

In [None]:
# Train the model. Inputs and targets are passed in the same order as the
# Model() inputs/outputs (notes, step, volume, duration); 10% of the samples
# are held out for validation, which the checkpoint callback relies on.
model.fit([input_note_reshaped, input_step_reshaped,  input_volume_reshaped, input_duration_reshaped], [output_note_reshaped, output_step_reshaped, output_volume_reshaped, output_duration_reshaped], epochs=400, callbacks=[cp_callback], validation_split=0.1)

In [78]:
# Seed for generation: take the last training window of each feature
# (notes, start offsets, volumes, durations and steps).
pattern_note = input_note[-1]
pattern_offset = input_offset[-1]
pattern_volume = input_volume[-1]
pattern_duration = input_duration[-1]
pattern_step = input_step[-1]


In [79]:
# Generation: predict one event from the current windows, append it to each
# window, drop the oldest entry, and repeat.
def _one_hot_of_argmax(probabilities, width):
    """Return a length-``width`` one-hot vector marking the most probable class."""
    one_hot = np.zeros(width)
    one_hot[np.argmax(probabilities)] = 1
    return one_hot


def _slide_window(window, new_entry):
    """Append ``new_entry`` to ``window`` and drop its first entry (length is preserved)."""
    return np.concatenate([window, [new_entry]])[1:]


prediction_output = []

# Absolute start of the last seed event; predicted steps are accumulated onto it.
prev_start = pattern_offset[-1]
for i in tqdm(range(200)):

    # Add the batch axis expected by the model to each window.
    note_prediction_input = np.reshape(pattern_note, (1, len(pattern_note), -1))
    volume_prediction_input = np.reshape(pattern_volume, (1, len(pattern_volume), -1))
    duration_prediction_input = np.reshape(pattern_duration, (1, len(pattern_duration), -1))
    step_prediction_input = np.reshape(pattern_step, (1, len(pattern_step), 1))

    # Outputs come back in model order: note, step, volume, duration.
    prediction = model.predict([note_prediction_input, step_prediction_input, volume_prediction_input, duration_prediction_input], verbose=0)

    # Note: take the most probable class and map it back to a pitch name with
    # the encoder fitted on the notes.
    notes_pred = _one_hot_of_argmax(prediction[0], len(pattern_note[0]))
    result_note = oh_notes.inverse_transform(notes_pred.reshape(1, -1))
    pattern_note = _slide_window(pattern_note, notes_pred)

    # Step: the head is an unconstrained regression, so clamp at 0 to keep
    # start times from moving backwards.
    step_predict = max(float(prediction[1][0][0]), 0.0)
    pattern_step = _slide_window(pattern_step, step_predict)
    prev_start += step_predict
    pattern_offset = _slide_window(pattern_offset, prev_start)

    # Volume: same approach as for the note.
    volume_pred = _one_hot_of_argmax(prediction[2], len(pattern_volume[0]))
    result_volume = oh_volume.inverse_transform(volume_pred.reshape(1, -1))
    pattern_volume = _slide_window(pattern_volume, volume_pred)

    # Duration: same approach as for the note.
    duration_pred = _one_hot_of_argmax(prediction[3], len(pattern_duration[0]))
    result_duration = oh_duration.inverse_transform(duration_pred.reshape(1, -1))
    pattern_duration = _slide_window(pattern_duration, duration_pred)

    # The model predicts a class label; decode it to the value stored for that
    # label in the class dictionary (the upper bound of the class).
    volume_encoded = result_volume[0][0]
    volume_decoded = dict_volume_class[volume_encoded]

    duration_encoded = result_duration[0][0]
    duration_decoded = dict_duration_class[duration_encoded]

    # Store the generated event: pitch, step, volume and duration.
    prediction_output.append([result_note[0][0], step_predict, volume_decoded, duration_decoded])


100%|██████████| 200/200 [00:33<00:00,  5.89it/s]


In [None]:
prediction_output

In [81]:
from music21.duration import Duration


In [82]:
offset = 0
output_notes = []
# Turn the predictions into music21 objects that can be written out and listened to.

prev_start = 0
# Each prediction is [pattern, step, volume, duration].
for note_p in prediction_output:
    pattern = note_p[0]
    # Every event starts `step` after the start of the previous one.
    event_start = prev_start + note_p[1]

    is_chord = ('+' in pattern) or pattern.isdigit()
    is_rest = (not is_chord) and ('REST' in pattern)

    if is_chord:
        # Chord encoded as '+'-separated integers: one Note per integer.
        notes = []
        for current_note in pattern.split('+'):
            chord_member = note.Note(int(current_note))
            chord_member.storedInstrument = instrument.Piano()
            notes.append(chord_member)
        element = chord.Chord(notes)
    elif is_rest:
        element = note.Rest()
    else:
        # Single pitch given by name.
        element = note.Note(pattern)
        element.storedInstrument = instrument.Piano()

    element.offset = event_start
    if not is_rest:
        # A rest is silent, so only notes and chords get a velocity.
        element.volume.velocity = note_p[2]
    element.duration = Duration(note_p[3])
    output_notes.append(element)

    prev_start = event_start

In [84]:
# Wrap the generated elements in a music21 Stream and export it as a MIDI file.
midi_stream = stream.Stream(output_notes)
midi_stream.write('midi', fp='test_outputminecraft28-attention-sameweights-withL2.mid')

'test_outputminecraft28-attention-sameweights-withL2.mid'