In [19]:
from __future__ import print_function
from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding
from keras.layers import LSTM
from keras.optimizers import RMSprop, Adam
from keras.utils import to_categorical
from unidecode import unidecode
import numpy as np
import matplotlib.pyplot as plt
import random, sys, io, re, csv

In [2]:
csv_reader = csv.reader(open('data\\lyrics.csv', encoding='utf-8'))

In [3]:
# return a dictionary of song: lyrics
def get_tokenized_lines(csv):
    lyrics = {}
    for r in csv:
        words = []
        row = str(r[2]).lower()
        for line in row.split('|-|'):
            new_words = re.findall(r"\b[a-z']+\b", unidecode(line))
            words = words + new_words
        lyrics[r[1]] = words
    return lyrics

In [4]:
all_lyric_words = get_tokenized_lines(csv_reader)

In [73]:
# total word number: 173631, total lines of lyrics: 22688, average word per line: 7.652988
SEQ_LENGTH = 32 + 1 # this one will be deleted during prediction
sequences = list()

def get_all_sequences():
    for song in all_lyric_words:
        if len(all_lyric_words[song]) < SEQ_LENGTH:
            sequences.append(all_lyric_words[song])
        else:
            for i in range(SEQ_LENGTH, len(all_lyric_words[song])):
                seq = all_lyric_words[song][i - SEQ_LENGTH: i]
                sequences.append(seq)
    
    return sequences

In [74]:
sequences = get_all_sequences()
print('Total Sequences: %d' % len(sequences))

Total Sequences: 162985


In [75]:
# store all the unique words and match them with indices
all_words = []
for song in all_lyric_words:
    for word in all_lyric_words[song]:
        all_words.append(word)
unique_word = set(all_words)
word_to_index = {w: i for i, w in enumerate(unique_word)}
index_to_word = {i: w for w, i in word_to_index.items()}
word_indices = [word_to_index[word] for word in unique_word]
word_size = len(unique_word)

print('vocabulary size: {}'.format(word_size))

vocabulary size: 9681


In [76]:
# this funtion change the words into matrix wise data, and each position of the matrix stands for the index of the word in index_to_word

def data_to_matrix(lines, seq_len):
    matrix = np.zeros((len(lines), seq_len))
    
    for r, line in enumerate(lines):
        for c, word in enumerate(line):
            matrix[r, c] = word_to_index[word]

    return matrix

In [77]:
matrix_data = data_to_matrix(sequences, SEQ_LENGTH)

In [78]:
matrix_data[:,-1].shape

(162985,)

In [79]:
# get the y = Wx data, X is the input data, and y is the target data
X, y = matrix_data[:, :-1], matrix_data[:, -1]
y = to_categorical(y, num_classes=word_size) # to_categorical: for categorical_crossentropy optimiser
seq_length = len(X[0]) # 32 in our case, stands for sequence length

In [80]:
print("X_shape", X.shape)
print("y_shape", y.shape)

X_shape (162985, 32)
y_shape (162985, 9681)


In [82]:
# establish the network, using LSTM and compile it
model = Sequential()
model.add(Embedding(word_size, 32, input_length=seq_length)) # Embedding(input_dim, output_dim, input_length), which respecively stands for: the possible value of the word, the output vector size, sequence length
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100, activation='relu'))
model.add(Dense(word_size, activation='softmax'))
print(model.summary())

adam = Adam(0.002)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
history = model.fit(X, y, batch_size=128, epochs=50)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 32, 32)            309792    
_________________________________________________________________
lstm_11 (LSTM)               (None, 32, 100)           53200     
_________________________________________________________________
lstm_12 (LSTM)               (None, 100)               80400     
_________________________________________________________________
dense_11 (Dense)             (None, 100)               10100     
_________________________________________________________________
dense_12 (Dense)             (None, 9681)              977781    
Total params: 1,431,273
Trainable params: 1,431,273
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoc

In [83]:
model.save('model.h5')

In [84]:
# transfer text to a numpy matrix
def text_to_matrix(texts, word_to_index):
    indices = np.zeros((1, len(texts)), dtype=int)
    
    for i, text in enumerate(texts):
        indices[:, i] = word_to_index[text]
        
    return indices

In [85]:
# ensure seach sequence is no longer than max length
def my_pad_sequences(seq, maxlen):
    start = seq.shape[1] - maxlen
    
    return seq[:, start: start + maxlen]

In [86]:
# generate lyrics
def generate_text(model, word_to_index, seq_length, seed_text, num_of_words):
    result = list()[1:]
    input_text = seed_text.lower()

    for num in range(num_of_words):
        encoded = text_to_matrix(input_text.split(), word_to_index)
        encoded = my_pad_sequences(encoded, maxlen=seq_length)
        
        yhat = model.predict_classes(encoded, verbose=0)
        out_word = ''
    
        for word, index in word_to_index.items():
            if index == yhat:
                out_word = word
                break
        
        input_text += ' ' + out_word
        result.append(out_word)
        
    return ' '.join(result)

In [88]:
seed_text = "All I wanted was somebody to hear me and all I wanted was somebody to feel me and everybody wanna tell me that I'm out of my mind When I'm on the mic that's fine"
generated = generate_text(model, word_to_index, SEQ_LENGTH, seed_text, 50)
print(generated)

fine fine fine fine fine fine fine fine fine fine fine fine fine fine fine fine fine fine fine fine fine fine fine fine fine fine fine fine fine fore awards have to to to to to to to to to to to to to to to to to to


In [34]:
print(word_to_index)

{"america's": 0, 'opposite': 1, 'playas': 2, 'reception': 3, 'horsepower': 4, 'wedding': 5, 'featured': 6, 'cushion': 7, 'cursed': 8, "amar'e": 9, 'robocop': 10, 'eyes': 11, 'toyin': 12, 'worst': 13, 'stamina': 14, 'suitless': 15, 'remains': 16, 'buried': 17, 'spoken': 18, 'monkey': 19, 'liftoff': 20, 'blue': 21, 'bs': 22, 'were': 23, 'stiflin': 24, 'painters': 25, "metoo'd": 26, 'oeur': 27, 'fiiine': 28, 'tweet': 29, 'ethnicity': 30, 'brethren': 31, 'cafe': 32, 'balcony': 33, 'scars': 34, 'theirs': 35, 'serena': 36, 'sippy': 37, 'survey': 38, 'bullheaded': 39, 'lyrically': 40, 'farrakhan': 41, 'smtih': 42, 'dons': 43, 'will': 44, "i'ma": 45, 'donnie': 46, 'zonin': 47, 'grams': 48, 'candyman': 49, 'dreamers': 50, 'geniuses': 51, 'father': 52, 'sled': 53, 'hall': 54, "don's": 55, 'tanned': 56, 'buscemi': 57, 'unthawed': 58, 'unwelcome': 59, 'electro': 60, 'added': 61, 'sluts': 62, 'prove': 63, 'shay': 64, 'uglies': 65, 'blouses': 66, 'coats': 67, 'crunchy': 68, 'beemer': 69, 'route': 70