# 2 - Word-level LSTM French rap lyrics generator

We start with a simple word-level LSTM French rap lyrics generator.

In [None]:
# Mount Google Drive at /content/drive/ (Colab only); the lyrics and the
# model checkpoint paths used in this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


In [None]:
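'''
The first step is to read the lyrics corpus, lowercase it
and split it into words (line breaks are kept as separate newline tokens).
Note: punctuation is not actually stripped here, it stays attached to the words.

We used a reduced list of artists (the "lyricsbis" folder) because we had too many lyrics for our computing power...
'''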

import glob
from tqdm import tqdm 
import os

lyrics_path = '/content/drive/MyDrive/ProjetDL/lyrics/*.txt'
lyrics_path_bis = '/content/drive/MyDrive/ProjetDL/lyricsbis/*.txt'

# glob returns files in arbitrary order; sorting keeps the corpus (and
# therefore the extracted sequences) identical from one run to the next.
files = sorted(glob.glob(lyrics_path_bis))
corpus = []
for file in tqdm(files, position=0, leave=True) :
    # Explicit UTF-8 so accented French characters are decoded the same way
    # on every platform (this is already the default locale on Colab).
    with open(file, encoding='utf-8') as f :
        # Surround line breaks with spaces so that '\n' becomes its own token.
        text = f.read().lower().replace('\n', ' \n ')
    # Drop empty / whitespace-only tokens, but keep the '\n' tokens.
    text_in_words = [w for w in text.split(' ') if w.strip() != '' or w == '\n']
    # extend() appends in place; `corpus = corpus + ...` re-copied the whole
    # list for every file, which is quadratic in the corpus size.
    corpus.extend(text_in_words)


100%|██████████| 54/54 [00:32<00:00,  1.64it/s]


In [None]:
# Report the total number of tokens read from the lyrics files.
print(f"There are {len(corpus)} words in the corpus")

There are 4309886 words in the corpus


In [None]:
'''
Now we compute the frequency of every word in the corpus
and we drop the words that appear fewer than a chosen
number of times: they are too rare to be useful
for training the model.
'''

# word -> number of occurrences in the corpus
word_freq = {}
for word in corpus:
    word_freq[word] = word_freq.get(word, 0) + 1

MIN_WORD_FREQUENCY = 10 # this is already huge, but we have a lot of words...

# Words that are too rare to be kept in the vocabulary.
ignored_words = {w for w, count in word_freq.items() if count < MIN_WORD_FREQUENCY}

# The keys of word_freq are exactly the distinct words of the corpus, so
# there is no need for a second pass over the corpus with set(corpus).
print('Unique words before ignoring:', len(word_freq))
print('Ignoring words with frequency <', MIN_WORD_FREQUENCY)
words = sorted(set(word_freq) - ignored_words)
print('Unique words after ignoring:', len(words))

# Vocabulary lookups in both directions (word <-> integer index).
word_indices = {w: i for i, w in enumerate(words)}
indices_word = dict(enumerate(words))

Unique words before ignoring: 215562
Ignoring words with frequency < 10
Unique words after ignoring: 25680


In [None]:
'''
We will teach our neural network to predict the next word
given a sequence of previous words of a chosen length.
'''
STEP = 4 # the sliding window advances by four words between two samples
SEQUENCE_LEN = 15

sentences = []
next_words = []
ignored = 0

for start in tqdm(range(0, len(corpus) - SEQUENCE_LEN, STEP), position=0, leave=True):
    # SEQUENCE_LEN input words followed by the word to predict.
    window = corpus[start: start + SEQUENCE_LEN + 1]
    # Keep the sample only if none of its words was removed from the vocabulary.
    if ignored_words.isdisjoint(window):
        sentences.append(window[:SEQUENCE_LEN])
        next_words.append(window[SEQUENCE_LEN])
    else:
        ignored += 1
print('Ignored sequences:', ignored)
print('Remaining sequences:', len(sentences))

100%|██████████| 1077468/1077468 [00:03<00:00, 284537.84it/s]

Ignored sequences: 736988
Remaining sequences: 340480





In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

def shuffle_and_split_training_set(sentences_original, next_original, percentage_test=2):
    """Shuffle the samples and split them into a training and a test set.

    The same random permutation is applied to both lists, so each sequence
    stays aligned with its next word.

    Args:
        sentences_original: list of input sequences.
        next_original: list of target words, aligned with sentences_original.
        percentage_test: percentage of the samples held out for the test set.

    Returns:
        ((x_train, y_train), (x_test, y_test)) as plain Python lists.
    """
    print('Shuffling sentences')

    # One permutation, reused for both lists, keeps inputs and targets in unison.
    order = np.random.permutation(len(sentences_original))
    shuffled_sentences = [sentences_original[k] for k in order]
    shuffled_next = [next_original[k] for k in order]

    # Everything before the cut goes to training, the rest to the test set.
    cut_index = int(len(sentences_original) * (1.-(percentage_test/100.)))
    train_set = (shuffled_sentences[:cut_index], shuffled_next[:cut_index])
    test_set = (shuffled_sentences[cut_index:], shuffled_next[cut_index:])

    print("Size of training set = %d" % len(train_set[0]))
    print("Size of test set = %d" % len(test_set[1]))
    return train_set, test_set

# Shuffle the (sequence, next word) pairs and hold out the default 2% as a test set.
train_split, test_split = shuffle_and_split_training_set(sentences, next_words)
sentences_train, next_words_train = train_split
sentences_test, next_words_test = test_split


Shuffling sentences
Size of training set = 333670
Size of test set = 6810


In [None]:
from keras.callbacks import LambdaCallback, ModelCheckpoint, EarlyStopping           
from keras.models import Sequential                                                  
from keras.layers import Dense, Dropout, Activation, LSTM, Bidirectional, Embedding  

dropout = 0.2

# Embedding -> bidirectional LSTM -> (optional) dropout -> softmax over the vocabulary.
layers = [
    Embedding(input_dim=len(words), output_dim=512),
    Bidirectional(LSTM(128)),
]
if dropout > 0:
    layers.append(Dropout(dropout))
layers += [Dense(len(words)), Activation('softmax')]

model = Sequential(layers)
# The targets are integer word indices, hence the sparse variant of the loss.
model.compile(loss='sparse_categorical_crossentropy',
              optimizer="adam",
              metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 512)         13148160  
_________________________________________________________________
bidirectional (Bidirectional (None, 256)               656384    
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense (Dense)                (None, 25680)             6599760   
_________________________________________________________________
activation (Activation)      (None, 25680)             0         
Total params: 20,404,304
Trainable params: 20,404,304
Non-trainable params: 0
_________________________________________________________________


In [None]:
# we need data generators to feed the model
# otherwise it would cause a memory error
import numpy as np
from keras.callbacks import LambdaCallback, ModelCheckpoint, EarlyStopping
from operator import itemgetter

BATCH_SIZE = 32

def generator(sentence_list, next_word_list, batch_size):
    """Yield (x, y) batches of vocabulary indices forever.

    x is an int32 array of shape (batch_size, SEQUENCE_LEN) holding the word
    indices of each input sequence; y is an int32 array of shape (batch_size,)
    holding the index of the word that follows each sequence. The samples are
    read in order and the position wraps around at the end of the list.
    """
    cursor = 0
    while True:
        x = np.zeros((batch_size, SEQUENCE_LEN), dtype=np.int32)
        y = np.zeros((batch_size), dtype=np.int32)
        for row in range(batch_size):
            # Wrap around so the generator never runs out of samples.
            position = cursor % len(sentence_list)
            for col, token in enumerate(sentence_list[position]):
                x[row, col] = word_indices[token]
            y[row] = word_indices[next_word_list[position]]
            cursor += 1
        yield x, y


In [None]:
# We train from a generator and use an integer ("sparse") representation of
# the labels, which saves a lot of memory and time compared with one-hot targets.
# Sparse labels => sparse_categorical_crossentropy as the loss function.

# Checkpoint location (without extension); change the name before starting a
# new experiment. It is defined here so that this cell also runs on a fresh
# kernel, without having to execute the weight-loading cell first.
file_path = '/content/drive/MyDrive/ProjetDL/Modelbis'

early_stopping = EarlyStopping(monitor='val_accuracy', patience=5)
model_checkpoint = ModelCheckpoint(filepath=file_path + '.hdf5', monitor='val_accuracy',
                                   save_best_only=True)

callbacks_list = [early_stopping, model_checkpoint]

# Train on the training split only. Feeding the full `sentences` list would
# include the held-out test sequences in the training data, which makes
# val_accuracy (and therefore early stopping and checkpoint selection)
# meaningless.
# The result is bound to a name so that the History object is not echoed as
# the cell output.
history = model.fit(generator(sentences_train, next_words_train, BATCH_SIZE),
                    steps_per_epoch=int(len(sentences_train)/BATCH_SIZE) + 1,
                    epochs=100,
                    callbacks=callbacks_list,
                    validation_data=generator(sentences_test, next_words_test, BATCH_SIZE),
                    validation_steps=int(len(sentences_test)/BATCH_SIZE) + 1)

# In practice it is usually the Colab session that decides when training stops...
# but thanks to the checkpoint callback we can reload the saved weights and
# resume training from them.

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
  556/10641 [>.............................] - ETA: 24:43 - loss: 1.3644 - accuracy: 0.7099

KeyboardInterrupt: ignored

In [None]:
# If a trained model already exists, its weights can be loaded here; the
# training cell above can then be executed to resume training.
# The model-creation cell further up must be executed first so that `model` exists.
file_path = '/content/drive/MyDrive/ProjetDL/Modelbis'
model.load_weights(f'{file_path}.hdf5')

In [None]:
# Here we generate lyrics for different sampling temperatures of the LSTM.

# Index of a random extracted sequence, for use with the alternative seed below.
seed_index = np.random.randint(len(sentences))
# Hand-written seed.
seed = [
    'mes', 'textes', 'sont', 'écrits', 'par', 'une', 'machine', '\n',
    'mon', 'sons', 'vient', 'de', 'la', 'rue', 'gros',
]
# Alternative: seed = sentences[seed_index]
print(seed)
print()

def sample(preds, temperature=1.0):
    """Sample an index from a probability array, reweighted by a temperature.

    Args:
        preds: 1-D array-like of probabilities (e.g. a softmax output).
        temperature: values below 1 sharpen the distribution, values above 1
            flatten it. A temperature <= 0 is treated as greedy decoding and
            returns the index of the largest probability.

    Returns:
        The sampled index (as returned by np.argmax).
    """
    preds = np.asarray(preds).astype('float64')
    if temperature <= 0:
        # The limit of the reweighted distribution when the temperature goes
        # to zero; also avoids dividing by zero below.
        return np.argmax(preds)
    # Zero probabilities legitimately map to -inf (and back to 0 after exp),
    # so the divide-by-zero warning raised by log(0) is silenced.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Shift by the maximum before exponentiating: the normalised result is
    # mathematically unchanged, but exp() can no longer underflow to 0 for
    # every entry at low temperatures (which produced NaN probabilities).
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)
    # Draw a single one-hot sample and return the position of the 1.
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

# Generate 75 words from the same seed for each sampling temperature.
for diversity in [0.1,0.2,0.3, 0.4, 0.5, 0.6, 0.7,0.9]:
    sentence = seed
    lyric = []

    for _ in range(75):
        # Encode the current window of words as one row of vocabulary indices.
        x_pred = np.zeros((1, SEQUENCE_LEN))
        for t, word in enumerate(sentence):
            x_pred[0, t] = word_indices[word]

        preds = model.predict(x_pred, verbose=0)[0] # reconstructed_model or model
        next_word = indices_word[sample(preds, diversity)]

        # Slide the window: drop the oldest word, add the new one
        # (a new list is built, so `seed` itself is never modified).
        sentence = sentence[1:] + [next_word]
        lyric.append(next_word)

    print('generation pour température', diversity, ':', end='\n\n')
    print(' '.join(seed))
    print(' '.join(lyric), end='\n\n')


['mes', 'textes', 'sont', 'écrits', 'par', 'une', 'machine', '\n', 'mon', 'sons', 'vient', 'de', 'la', 'rue', 'gros']

generation pour température 0.1 :

mes textes sont écrits par une machine 
 mon sons vient de la rue gros
en un peu de moi 
 temps de temps pour les gens 
 en moi je sais que je me sens beaucoup 
 mais quand mon équipe est bonne si je me veux 
 là où je me sens seul seul au pire 
 si tu peux faire plus de respect, que dieu nous a seul 
 a plus de gens qui me check 
 les gens sont pas les gens les gens sont trop

generation pour température 0.2 :

mes textes sont écrits par une machine 
 mon sons vient de la rue gros
un peu de ce qui est en 
 a quoi le choix de la vie 
 en moins de la vie des de rêve 
 et le mal à dire 
 et on est très mal et on fait mal 
 on est tout pour ça pour tout on fait tout 
 on fait tout seul, tout le monde 
 tout le monde est mort 
 tout le monde est prêt à la maison 


generation pour température 0.3 :

mes textes sont écrits par une machine 