In [1]:
from music21 import converter, instrument, note, chord, stream, midi, instrument
from scipy import sparse
import time
import tqdm.auto
import glob
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import math

In [2]:
def DataToStream(data_mat, time_signature = 0.25):
    melody_stream = stream.Stream()
    melody_stream.append(instrument.Piano())
    t,k = data_mat.shape
    old_element = None
    counter = 1
    for i in range(t):
        arr = data_mat[i,:]
        r = np.int(np.sum(arr))
        
        if r == 1:
            #print(np.argmax(arr))
            new_element = note.Note(np.argmax(arr))
        elif r > 1:
            #print(arr)
            arr = arr.todense()
            #print(np.where(arr == 1)[1])
            pitches = np.where(arr == 1)[1]
            all_notes = np.array([note.Note(i) for i in pitches])
            new_element = chord.Chord(all_notes)
        else:
            #print(r)
            new_element = note.Rest()
        
        if new_element == old_element and counter < 6:
            counter += 1
        else:
            if old_element:
                old_element.quarterLength = time_signature*counter
                melody_stream.append(old_element)
            counter = 1
            
        old_element = new_element
            
    return melody_stream

In [3]:
classes = ['bach','backstreetboys','beatles','beethoven','brahms','britneyspears',
             'chopin','coldplay','debussy','haydn','liszt','mendelssohn',
            'mozart','nirvana','paganini','queen','rachmaninow','schubert',
            'schumann','tchaikovsky']
datasets = [folder + '_dataset.npz' for folder in classes]
labels = np.arange(20)

In [4]:
song_length = 1*60*6
n_classes = len(classes)
device = "cuda:0"

In [5]:
data = []
for file in datasets:
    d = []
    subset = np.load(file,allow_pickle=True)
    for item in subset.files:
        d.append(subset[item])
    data.append(d[0])
    

In [6]:
len(data)

20

In [7]:
data[19][5]

<509x128 sparse matrix of type '<class 'numpy.float64'>'
	with 735 stored elements in Compressed Sparse Row format>

In [8]:
def random_snippet(c):
    n_songs = len(data[c])
    song_no = np.random.randint(n_songs)
    song = data[c][song_no]
    L,D = song.shape
    start_max = L-song_length
    if start_max > 0:
        song_start = np.random.randint(start_max)
    else:
        #print(c,song_no)
        return random_snippet(c)
    
    return song[song_start:song_start+song_length,:]

In [9]:
random_snippet(3)

<360x128 sparse matrix of type '<class 'numpy.float64'>'
	with 1222 stored elements in Compressed Sparse Row format>

In [10]:
def standard_batch():
    batch = []
    for i in range(n_classes):
        snippet = random_snippet(i)
        batch.append(snippet)
    return batch

In [11]:
batch1 = standard_batch()

In [12]:
stream1 = DataToStream(batch1[19], time_signature = 0.25)

In [14]:
stream1.show('midi')

In [15]:
batch2 = np.array([a.todense() for a in batch1])
batch2.shape

(20, 360, 128)

In [16]:
labels = torch.arange(20)
labels

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19])

In [17]:
def sample():
    batch1 = standard_batch()
    batch2 = np.array([a.todense() for a in batch1])
    device = "cuda:0"
    batch3 = torch.from_numpy(batch2).to(device).float()
    return batch3
    

In [18]:
def torch_convert(song):
    batch2 = song.todense()
    device = "cuda:0"
    batch3 = torch.from_numpy(batch2).to(device).float()
    return batch3.view(1,song.shape[0],song.shape[1])

In [19]:
batch = sample()
batch.shape

torch.Size([20, 360, 128])

In [227]:
#encoder_layer = nn.TransformerEncoderLayer(d_model=128, nhead=8)
#transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=6)
#transformer_encoder.to(device)

TransformerEncoder(
  (layers): ModuleList(
    (0): TransformerEncoderLayer(
      (self_attn): MultiheadAttention(
        (out_proj): Linear(in_features=128, out_features=128, bias=True)
      )
      (linear1): Linear(in_features=128, out_features=2048, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (linear2): Linear(in_features=2048, out_features=128, bias=True)
      (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (dropout1): Dropout(p=0.1, inplace=False)
      (dropout2): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerEncoderLayer(
      (self_attn): MultiheadAttention(
        (out_proj): Linear(in_features=128, out_features=128, bias=True)
      )
      (linear1): Linear(in_features=128, out_features=2048, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (linear2): Linear(in_features=2048, out_features=128, bias=True)
      (norm1): LayerNorm((128,

In [229]:
#out = transformer_encoder(batch)
#out.shape

torch.Size([20, 360, 128])

In [20]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model = 128, dropout=0.00, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        #print(self.pe.shape)
        return self.dropout(x)


In [21]:
class LingLing_Encoder(nn.Module):
    def __init__(self, dim = 128, nheads = 8, nlayers = 6,n_classes = 20):
        super(LingLing_Encoder, self).__init__()
        
        device = "cuda:0"
        self.dim = dim
        
        #self.pos_encoder = PositionalEncoding()
        self.positional_encodings = nn.Embedding(song_length, dim)
        
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=dim, nhead=nheads)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=nlayers)
        self.fc_layer = nn.Linear(dim*song_length,n_classes)
        
        #self.pos_encoder.to(device)
        self.positional_encodings.to(device)
        self.transformer_encoder.to(device)
        self.fc_layer.to(device)
        
        

    def forward(self, song):
        #print(song.size(0))
        #pos_song = self.pos_encoder(song)
        pos = torch.arange(0, song_length).unsqueeze(0).repeat(20, 1).to(device)
        pos_embeddings = self.positional_encodings(pos)
        pos_song = song + pos_embeddings
        #print(pos_song[0,:,127])
        feature_rep = self.transformer_encoder(pos_song)
        scores = self.fc_layer(feature_rep.view(-1,self.dim*song_length))
        
        return scores

In [22]:
model = LingLing_Encoder()
optimizer = optim.Adam(model.parameters(), lr=1e-5, betas =(0.9,0.999), weight_decay = 0.0005, eps=1e-08)
#optimizer = optim.SGD(model.parameters(), lr=1e-5, momentum=0.9)

In [23]:
model.load_state_dict(torch.load('Ling_ling_AI.pt'))

<All keys matched successfully>

In [266]:
optimizer.zero_grad()
labels = torch.arange(20)
labels = labels.to(device)
iterations = 10000
total_loss = 0
for i in range(iterations):
    model.zero_grad()
    sample_batch = sample()

    scores = model(sample_batch)
    
    loss_fn = nn.CrossEntropyLoss()
    loss = loss_fn(scores, labels)
    total_loss += loss.item()
    
    loss.backward()
    optimizer.step()
    
    if i%250 == 249:
        preds = torch.argmax(scores, axis=1)
        accuracy = (preds == labels).sum().item()/n_classes
        print("Iteration :",i, " Loss :", loss.item(), " Accuracy :", accuracy, " Avergae Loss :", total_loss/(250))
        total_loss = 0

Iteration : 249  Loss : 0.7408860325813293  Accuracy : 0.65  Avergae Loss : 0.707398054599762
Iteration : 499  Loss : 0.7324044704437256  Accuracy : 0.9  Avergae Loss : 0.7241087056994439
Iteration : 749  Loss : 0.6242073178291321  Accuracy : 0.8  Avergae Loss : 0.700646186709404
Iteration : 999  Loss : 0.6013311147689819  Accuracy : 0.75  Avergae Loss : 0.6868517787456513
Iteration : 1249  Loss : 0.8134172558784485  Accuracy : 0.8  Avergae Loss : 0.7247982162237168
Iteration : 1499  Loss : 0.8199619054794312  Accuracy : 0.7  Avergae Loss : 0.6998545570373536
Iteration : 1749  Loss : 0.656814455986023  Accuracy : 0.75  Avergae Loss : 0.6708198999166488
Iteration : 1999  Loss : 0.24602322280406952  Accuracy : 0.9  Avergae Loss : 0.69312797665596
Iteration : 2249  Loss : 0.47412458062171936  Accuracy : 0.7  Avergae Loss : 0.6815001353621483
Iteration : 2499  Loss : 0.9102722406387329  Accuracy : 0.7  Avergae Loss : 0.7112409090995788
Iteration : 2749  Loss : 0.5534290075302124  Accuracy 

In [267]:
torch.save(model.state_dict(), 'Ling_ling_AI.pt')

In [31]:
batch1 = standard_batch()
song = batch1[18]
stream1 = DataToStream(song, time_signature = 0.25)

In [32]:
stream1.show('midi')

In [79]:
new_song = torch_convert(song)
new_song = new_song.to(device)
new_song = new_song.requires_grad_()

In [80]:
#new_song = torch.rand(1,song_length,128)
#new_song = new_song.to(device)
#new_song = new_song.requires_grad_()

In [81]:
new_song_optimizer = torch.optim.Adam([new_song], lr=1e-4)

In [82]:
target_y = 0
l2_reg = 0.05
continuity_constant = 10

for i in range(3000):
    model.zero_grad() 
    score = model(new_song) 
    
    target_score = score[0,target_y] 
    
    reg_loss = l2_reg*torch.sum((new_song*new_song))/song_length
    
    continuity_drops = new_song[0,1:,:] - new_song[0,:-1,:]
    continuity_loss = continuity_constant*torch.sum((continuity_drops*continuity_drops))/(song_length-1)
    loss = -target_score + continuity_loss + reg_loss
    if i%99 == 0:
        print(-reg_loss.item(), target_score.item(), -continuity_loss.item()) 
    loss.backward() 
    new_song_optimizer.step()

-0.16583333909511566 2.2906670570373535 -16.740947723388672
-0.16480804979801178 3.498722553253174 -16.131704330444336
-0.16538195312023163 6.828791618347168 -15.80954647064209
-0.16762611269950867 8.030191421508789 -15.500617027282715
-0.1714262217283249 10.73816204071045 -15.130468368530273
-0.1768830120563507 12.46987533569336 -14.726530075073242
-0.18410250544548035 15.507140159606934 -14.311625480651855
-0.19312381744384766 17.108675003051758 -13.910296440124512
-0.20397017896175385 20.016637802124023 -13.535079956054688
-0.21669165790081024 22.954137802124023 -13.194001197814941
-0.2313285619020462 26.027236938476562 -12.89670467376709
-0.24780625104904175 28.03739356994629 -12.643054962158203
-0.2660655379295349 31.247072219848633 -12.429404258728027
-0.2860461473464966 33.71356964111328 -12.248661041259766
-0.30765679478645325 36.25697326660156 -12.098265647888184
-0.33073073625564575 38.55687713623047 -11.974254608154297
-0.3551429510116577 40.48818588256836 -11.86494350433349

In [83]:
test = new_song.clone()

In [84]:
test = test[0]
test.shape

torch.Size([360, 128])

In [85]:
a, b = test.max(axis = 1)

In [86]:
def make_it_music(not_a_song, max_notes = 3, song_length = 360):
    threshold = -1
    avg_notes = 100
    while avg_notes > max_notes:
        threshold = threshold + 0.01
        s = (not_a_song>threshold).sum()
        avg_notes = s.item()/song_length

    print(avg_notes, threshold)
    return (not_a_song>threshold).int()

In [87]:
test_song = make_it_music(test, max_notes=3.5)

2.408333333333333 0.540000000000001


In [88]:
test_song.shape

torch.Size([360, 128])

In [89]:
test_song = test_song.detach().cpu().numpy()

In [90]:
test_song = sparse.csr_matrix(test_song)

In [91]:
test_song.shape

(360, 128)

In [92]:
test_stream = DataToStream(test_song, time_signature = 0.25)

In [93]:
test_stream.show('midi')

In [94]:
test_stream.show('text')

{0.0} <music21.instrument.Piano 'Piano'>
{0.0} <music21.note.Note E->
{0.25} <music21.note.Note C>
{0.5} <music21.chord.Chord C1 E1 A2 G3 C4 D4 D5>
{0.75} <music21.chord.Chord A2 C4 D4 D5>
{1.0} <music21.chord.Chord A2 F#3 C4 D4 D5>
{1.25} <music21.chord.Chord E-1 A2 F#3 C4 D5>
{1.5} <music21.chord.Chord C0 E-1 E1 A2 C4 D5>
{1.75} <music21.note.Note D>
{2.0} <music21.chord.Chord F4 G4 D5>
{2.5} <music21.chord.Chord C0 F4 G4 D5 C8>
{2.75} <music21.chord.Chord E3 G3 G4 D5>
{3.0} <music21.chord.Chord G4 C#5 B5>
{3.25} <music21.chord.Chord G#4 B8>
{3.5} <music21.chord.Chord C4 E-4 G4>
{3.75} <music21.chord.Chord C3 C4 G4 C5>
{4.0} <music21.chord.Chord C3 G#3 C4 G#4 B8>
{4.25} <music21.chord.Chord C4 E-4 G4 B8>
{4.5} <music21.chord.Chord C4 E-4 G4 C5 B8>
{4.75} <music21.chord.Chord C4 E-4>
{5.0} <music21.chord.Chord E-4 C5 C8>
{5.25} <music21.chord.Chord C4 E-4>
{5.5} <music21.chord.Chord C1 C2 E-4 G4 E-5>
{5.75} <music21.chord.Chord C1 C#3 E-4 G#4 E-5>
{6.0} <music21.chord.Chord F#2 G2 E-5

In [65]:
stream1.show('text')

{0.0} <music21.instrument.Piano 'Piano'>
{0.0} <music21.chord.Chord G2 B-2 E-3 E-4 E-5>
{0.25} <music21.chord.Chord A2 D3 F#3 C4 D4 D5>
{1.75} <music21.chord.Chord B-2 G3 F4 G4 D5>
{3.25} <music21.chord.Chord C3 G#3 C4 E-4 G#4 C5>
{4.5} <music21.chord.Chord C3 C4 E-4 C5>
{5.5} <music21.chord.Chord C3 B-3 E-4 B-4>
{5.75} <music21.chord.Chord C3 G#3 E-4 G#4>
{6.25} <music21.chord.Chord D3 G3 B3 F4 G4>
{7.0} <music21.chord.Chord E-3 G3 B3 F4 G4>
{7.25} <music21.chord.Chord F3 G3 B3 F4 G4>
{7.5} <music21.chord.Chord C3 G3 B3 F4 G4>
{7.75} <music21.chord.Chord C3 F3 B-3 C4 G4>
{8.5} <music21.chord.Chord C3 E3 B-3 C4 G4>
{8.75} <music21.chord.Chord C3 B-3 C4 D4 G4>
{9.0} <music21.chord.Chord F2 C3 B-3 C4 D4 G4>
{9.25} <music21.chord.Chord F2 E-3 A3 C4 F4>
{10.25} <music21.chord.Chord F2 E-3 A3 D4 F4>
{10.5} <music21.chord.Chord F2 E-3 A3 C4 D4 F4>
{10.75} <music21.chord.Chord F2 E-3 A3 C4 D4 E-4>
{11.5} <music21.chord.Chord F2 E-3 A3 D4>
{11.75} <music21.chord.Chord F2 E-3 A3 C4 F4>
{12.25} 