# 1.Data Set & Processing

In [None]:
#Libraries
#FOR Model Training
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adamax
import tensorflow.keras.backend as K

#Data Processing
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

#MIDI parsing and writing
from music21 import converter, instrument, stream, note, chord

#Utilities
import random
from collections import Counter
import os
import pickle
import matplotlib.pyplot as plt

# Seed NumPy's global random generator so NumPy-based random draws repeat.
np.random.seed(seed=4)

In [None]:


import kagglehub
import shutil

# Working copy of the dataset; the loading cell reads MIDI files from here.
DATASET_DIR = "/content/dataset"

# Download the latest version of the dataset (returns the local download path)
path = kagglehub.dataset_download("soumikrakshit/classical-music-midi")

# Copy the dataset to the Colab root directory.
# shutil.move() is not safe to re-run: once /content/dataset exists, the
# download would be moved *inside* it and the expected folder layout breaks.
# copytree(..., dirs_exist_ok=True) gives the same layout on every run.
shutil.copytree(path, DATASET_DIR, dirs_exist_ok=True)
print("Path to dataset files:", DATASET_DIR)


Downloading from https://www.kaggle.com/api/v1/datasets/download/soumikrakshit/classical-music-midi?dataset_version_number=1...


100%|██████████| 2.33M/2.33M [00:00<00:00, 112MB/s]

Extracting files...
Path to dataset files: /content/dataset





In [None]:
# Composer to work with; used to build the dataset folder path and the
# saved-model file name.
composer_name = 'debussy'

In [None]:
#Loading the selected composer's midi files as music21 streams
filepath = os.path.join("/content/dataset/", f"{composer_name}")
#Getting midi files (sorted, because os.listdir() returns names in arbitrary order)
all_midis = []
for i in sorted(os.listdir(filepath)):
    if i.endswith(".mid"):
        # os.path.join inserts the separator between folder and file name;
        # plain string concatenation produced ".../debussy<file>.mid".
        tr = os.path.join(filepath, i)
        midi = converter.parse(tr)
        all_midis.append(midi)

In [None]:
#Helping function
def extract_notes(file):
    """Flatten parsed MIDI streams into one list of note/chord symbols.

    Parameters
    ----------
    file : iterable
        Parsed music21 streams, one per MIDI file.

    Returns
    -------
    list of str
        One symbol per note or chord, in traversal order. A note is encoded
        as ``str(element.pitch)``; a chord as the integers of its
        ``normalOrder`` joined with ".".
    """
    notes = []
    for j in file:
        songs = instrument.partitionByInstrument(j)
        # NOTE(review): depending on the music21 version, partitionByInstrument
        # can return None when no instrument is found. Fall back to walking
        # the stream itself instead of failing on `.parts` or dropping the file.
        parts = list(songs.parts) if songs is not None else []
        if not parts:
            parts = [j]
        for part in parts:
            for element in part.recurse():
                if isinstance(element, note.Note):
                    notes.append(str(element.pitch))
                elif isinstance(element, chord.Chord):
                    notes.append(".".join(str(n) for n in element.normalOrder))

    return notes
#Getting the list of notes as Corpus
# Pass every parsed file: `midi` only holds the last file read by the loader.
Corpus = extract_notes(all_midis)
print(f"Total notes in all the {composer_name} midis in the dataset:", len(Corpus))

Total notes in all the Chopin midis in the dataset: 7551


In [None]:
# Tally how many times each note/chord symbol occurs in the corpus.
count_num = Counter()
count_num.update(Corpus)
print("Total unique notes in the Corpus:", len(count_num))

Total unique notes in the Corpus: 235


In [None]:
#Getting a list of rare chords
RARE_THRESHOLD = 100  # symbols occurring fewer times than this are dropped
rare_note = [key for key, value in count_num.items() if value < RARE_THRESHOLD]

print(f"Total number of notes that occur less than {RARE_THRESHOLD} times:", len(rare_note))

#Eleminating the rare notes
# Build a filtered list instead of calling Corpus.remove() while looping over
# Corpus: removing during iteration makes the loop skip the element that
# follows each removal, so many rare notes were left in the corpus.
# A set keeps every membership test O(1).
rare_set = set(rare_note)
Corpus = [element for element in Corpus if element not in rare_set]

print("Length of Corpus after elemination the rare notes:", len(Corpus))

Total number of notes that occur less than 100 times: 211


# 2.LSTM PARAMETERS

In [None]:
# Vocabulary: every distinct symbol present in the corpus, in sorted order.
symb = sorted(set(Corpus))

L_corpus = len(Corpus)  # number of symbols in the corpus
L_symb = len(symb)  # number of distinct symbols

# Lookup tables: symbol -> index and index -> symbol
mapping = {c: i for i, c in enumerate(symb)}
reverse_mapping = dict(enumerate(symb))

print("Total number of characters:", L_corpus)
print("Number of unique characters:", L_symb)

Total number of characters: 5378
Number of unique characters: 196


In [None]:
# Slide a window over the corpus: each run of `length` symbols is one input
# sequence, and the symbol that follows it is the target to predict.
length = 40
features = []
targets = []
for start in range(L_corpus - length):
    stop = start + length
    features.append([mapping[symbol] for symbol in Corpus[start:stop]])
    targets.append(mapping[Corpus[stop]])


L_datapoints = len(targets)
print("Total number of sequences in the Corpus:", L_datapoints)

Total number of sequences in the Corpus: 5338


In [None]:
# reshape X to (sequences, length, 1) and normalize by the vocabulary size
X = (np.reshape(features, (L_datapoints, length, 1))) / float(L_symb)
# one hot encode the output variable
# TensorFlow is imported under the alias `tf`; the bare name `tensorflow` is
# never bound, so the call has to go through `tf`.
# num_classes fixes the width of y to the vocabulary size even if the
# highest index never occurs as a target.
y = tf.keras.utils.to_categorical(targets, num_classes=L_symb)

In [None]:
# Hold out 20% of the sequences as seeds for generation;
# the remaining 80% are used for training.
X_train, X_seed, y_train, y_seed = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
"""LSTM MODEL"""

#Initialising the Model
model = Sequential()  # layers are stacked in the order they are added


# 512 - The number of units in the first LSTM layer.
# input_shape = (length of each input sequence, number of features per step)
# return_sequences=True hands the whole output sequence to the next LSTM layer.
model.add(LSTM(512, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))


model.add(Dropout(0.1))  # zeroes 10% of its inputs during training to limit overfitting


model.add(LSTM(256))  # second LSTM layer with 256 units
# fewer units than the first layer, reducing complexity deeper in the model.


model.add(Dense(256))  # fully connected layer; no activation argument is given
model.add(Dropout(0.1))

# One output per column of y; softmax turns raw scores into probabilities
model.add(Dense(y.shape[1], activation='softmax'))

#Compiling the model for training
opt = Adamax(learning_rate=0.01)

model.compile(loss='categorical_crossentropy', optimizer=opt)  # multi-class classification

In [None]:
#Training the Model
# The held-out split is passed as validation data so that 'val_loss' is
# recorded in history.history; the loss plot in section 4 reads that key.
history = model.fit(
    X_train,
    y_train,
    batch_size=128,
    epochs=200,
    validation_data=(X_seed, y_seed),
)

# 3.Melody Generation

In [None]:
def chords_n_notes(Snippet):
    """Turn a sequence of note/chord symbols into a music21 stream.

    Parameters
    ----------
    Snippet : iterable of str
        A symbol containing "." or made only of digits is treated as a chord
        whose "."-separated integers are its notes; any other symbol is
        passed to ``note.Note`` as a single note.

    Returns
    -------
    stream.Stream
        One note or chord per symbol, assigned offsets 0, 1, 2, ...
    """
    Melody = []
    offset = 0  # Incremental: grows by 1 per symbol
    for i in Snippet:
        # If it is chord
        if "." in i or i.isdigit():
            # Separating the notes in chord
            notes = [note.Note(int(j)) for j in i.split(".")]
            # Build and append the chord once, after all its notes are
            # collected. Doing this inside the per-note loop appended one
            # partial chord per note, all at the same offset.
            chord_snip = chord.Chord(notes)
            chord_snip.offset = offset
            Melody.append(chord_snip)
        # pattern is a note
        else:
            note_snip = note.Note(i)
            note_snip.offset = offset
            Melody.append(note_snip)
        # increase offset each iteration so that notes do not stack
        offset += 1
    Melody_midi = stream.Stream(Melody)
    return Melody_midi

# Preview: convert the first 100 corpus symbols into a stream.
Melody_Snippet = chords_n_notes(Snippet=Corpus[:100])

In [None]:
def Malody_Generator(Note_Count):
    """Generate `Note_Count` symbols with the trained network.

    Starts from a randomly chosen sequence of ``X_seed`` and repeatedly
    predicts the next symbol, sliding the input window forward by one.

    Reads the notebook globals ``composer_name``, ``X_seed``, ``length``,
    ``L_symb``, ``reverse_mapping`` and, when no saved model file is found,
    ``model``.

    Parameters
    ----------
    Note_Count : int
        Number of symbols to generate.

    Returns
    -------
    tuple
        ``(Music, Melody_midi)``: the list of generated symbols and the
        music21 stream built from them.
    """
    mdl_dir = '/content/drive/MyDrive/MIDI/Model'  # Update this path if needed
    mdl_filename = f"{composer_name}.keras"
    model_path = os.path.join(mdl_dir, mdl_filename)
    # Use the saved model when the file exists. No cell in this notebook
    # writes it, so otherwise fall back to the model trained in this session.
    if os.path.exists(model_path):
        generator = load_model(model_path)
    else:
        generator = model

    # randint's upper bound is exclusive: len(X_seed) makes every seed
    # selectable (len(X_seed)-1 could never pick the last one).
    seed = X_seed[np.random.randint(0, len(X_seed))]
    Notes_Generated = []
    for _ in range(Note_Count):
        seed = seed.reshape(1, length, 1)
        prediction = generator.predict(seed, verbose=0)[0]
        # Greedy choice: take the most probable symbol. The former
        # log/exp re-normalisation (diversity 1.0) did not change which
        # index argmax selects, so it is omitted.
        index = int(np.argmax(prediction))
        # Scale the index the same way the training inputs were scaled.
        index_N = index / float(L_symb)
        Notes_Generated.append(index)
        # Slide the window: append the new value, drop the oldest one.
        seed = np.append(seed[0], index_N)[1:]

    # Translate indices to symbols once, after generation; an index missing
    # from reverse_mapping is replaced by a random known symbol.
    Music = [
        reverse_mapping[char] if char in reverse_mapping else random.choice(list(reverse_mapping.values()))
        for char in Notes_Generated
    ]
    #Now, we have music in form or a list of chords and notes and we want to be a midi file.
    Melody = chords_n_notes(Music)
    Melody_midi = stream.Stream(Melody)
    return Music, Melody_midi


# Generate 100 symbols; keep both the symbol list and the resulting stream.
Music_notes, Melody = Malody_Generator(Note_Count=100)

In [None]:
# Export the generated stream as a MIDI file; the cell output is the value
# returned by write().
Melody.write('midi', fp='Melody_Generated.mid')

'Melody_Generated.mid'

# 4.Model Validation

In [None]:


# Extract metrics
train_loss = history.history['loss']
# 'val_loss' is only present when fit() was given validation data, so read it
# with .get() instead of indexing, which would raise KeyError.
val_loss = history.history.get('val_loss')

# Plot training vs validation loss
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(train_loss, label='Training Loss')
if val_loss is not None:
    ax.plot(val_loss, label='Validation Loss')
ax.set_title('Training and Validation Loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()
plt.show()
