In [1]:
import os
import json
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import tokenizer_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model

In [2]:
#Number of tokens in every model input sequence
SEQUENCE_LENGTH = 16

#Test splits stored as NumPy arrays in the data directory
X_test_unseen = np.load(os.path.join('data', 'unseen_sequences.npy'))
y_test_unseen = np.load(os.path.join('data', 'unseen_labels.npy'))
X_test_eval = np.load(os.path.join('data', 'eval_sequences.npy'))
y_test_eval = np.load(os.path.join('data', 'eval_labels.npy'))

#Restore the tokenizer from its JSON dump
with open("data/tokenizer.json") as tokenizer_file:
    data = json.load(tokenizer_file)
tokenizer = tokenizer_from_json(data)

#Inverse vocabulary: word index -> word, used to turn sequences back into text
reverse_word_index = {index: word for word, index in tokenizer.word_index.items()}

#Location of the saved model to analyse
filepath = "saved_model_cfg_8"

print("Unseen:", X_test_unseen.shape, y_test_unseen.shape)

Unseen: (44895, 16) (44895,)


In [3]:
#Load the trained model from the location stored in `filepath`
model = load_model(filepath)

## Final predictions
Evaluate the model on the unseen data and compare its accuracy with the accuracy on the evaluation data to see how well the model generalizes across these splits.

In [18]:
#Evaluate on the unseen split first, then on the evaluation split.
#model.evaluate returns (loss, accuracy); only the accuracy is reported, as a percentage.
_, accuracy_unseen = model.evaluate(X_test_unseen, y_test_unseen)
print(f'Unseen accuracy: {accuracy_unseen * 100:.2f}')
_, accuracy_eval = model.evaluate(X_test_eval, y_test_eval)
print(f'Evaluation accuracy: {accuracy_eval * 100:.2f}')

Unseen accuracy: 30.53
Evaluation accuracy: 30.56


In [5]:
#Get predictions (one probability vector per sample) for the unseen data
predictions = model.predict(X_test_unseen)

#Probability the model assigned to the TRUE label of every sample.
#Despite its name, accuracy_array holds per-sample probabilities, not accuracies.
#A single fancy-indexing lookup replaces the former np.append loop, which
#re-allocated the whole array on every iteration (quadratic in the sample count).
#The cast keeps the float64 dtype the np.append version produced.
n_samples = predictions.shape[0]
accuracy_array = predictions[np.arange(n_samples), y_test_unseen[:n_samples]].astype(np.float64)

In [6]:
#Find the 10 samples whose true label received the highest / lowest probability.
#The array is sorted once and both ends of the ordering are reused.
ascending_order = accuracy_array.argsort()
max_10_index = ascending_order[-10:][::-1]  #highest first
min_10_index = ascending_order[:10]         #lowest first

#Input sequences and true labels belonging to those samples
max_10_sequences = X_test_unseen[max_10_index]
max_10_labels = y_test_unseen[max_10_index]
min_10_sequences = X_test_unseen[min_10_index]
min_10_labels = y_test_unseen[min_10_index]

In [7]:
#Show the 10 samples where the true label got the highest probability
print("Highest accuracy predictions:\n")
print("Index. Sequence -> Label. Accuracy")

for rank in range(10):
  #Decode the input sequence and its true label back into words
  sequence_text = ' '.join(reverse_word_index[token] for token in max_10_sequences[rank])
  label_text = reverse_word_index[max_10_labels[rank]]
  probability = accuracy_array[max_10_index[rank]]
  print(f"{rank}. {sequence_text} ->  {label_text}. {probability}")

Highest accuracy predictions:

Index. Sequence -> Label. Accuracy
0. <br> like a children <br> im running out of my soul with all my best sensations ->  <br>. 0.9965533018112183
1. this city go <br> if you want flying <br> energy is <OOV> for the new sensations ->  <br>. 0.9940819144248962
2. i will be alone <br> in new york city <br> dead or alive breakin the ice ->  <br>. 0.9931383728981018
3. running like a nightmare <br> lonely boy <br> your heart is full of passion <br> full ->  of. 0.9924903512001038
4. for a lovely game <br> sweet and sensual <br> now im ready waiting for your extasy ->  <br>. 0.9921045303344727
5. the ocean <br> deep inside your love <br> deep inside this romantic feeling of real devotion ->  <br>. 0.9903891682624817
6. you <br> just around the corner <br> along the way i meet your love hot fire ->  <br>. 0.989725649356842
7. the way <br> tell you something <br> im falling down from grace <br> its a kind ->  of. 0.9879541397094727
8. till tomorrow free <br> lov

In [8]:
#Show the 10 samples where the true label got the lowest probability
print("Lowest accuracy predictions:\n")
print("Index. Sequence -> Label. Accuracy")

for rank in range(10):
  #Decode the input sequence and its true label back into words
  sequence_text = ' '.join(reverse_word_index[token] for token in min_10_sequences[rank])
  label_text = reverse_word_index[min_10_labels[rank]]
  probability = accuracy_array[min_10_index[rank]]
  print(f"{rank}. {sequence_text} ->  {label_text}. {probability}")

Lowest accuracy predictions:

Index. Sequence -> Label. Accuracy
0. see youre born to be wild <br> baby try <br> usa <br> be my lady danger ->  usa. 4.4633467386444303e-13
1. fly to you my darlin <br> let me keep my dream believe me <br> i love ->  america. 1.8869521830472413e-12
2. <OOV> ready to believe <br> tell me that my dream can be real <br> i love ->  america. 3.1947281192018506e-12
3. power of life <br> its the game of the time <br> for us to believe in ->  someones. 1.994679421635226e-11
4. and me <br> forever and ever <br> forever and ever <br> is a very long time ->  pooh. 2.1991136342491835e-11
5. meant youre ready to be tough <br> id like to play your game <br> come on ->  instead. 2.470177361268977e-11
6. all over the line my babe <br> bad cow bad cow <br> wont you be my ->  cow. 2.537836954474848e-11
7. <br> to feel my affection <br> i wanna come back to the <OOV> <br> sexy im ->  clever. 3.511544874323924e-11
8. new reason to live <br> but i know that you can blow my m

### Additional approaches

In [9]:
#Probability vectors for the unseen data and, per sample, the index of the most probable word
predictions = model.predict(X_test_unseen)
predicted_labels = predictions.argmax(axis=1)

In [10]:
#(sample index, highest predicted probability, predicted label) for every correctly predicted sample
correct_idx_acc = [
    (idx, predictions[idx].max(), predicted_labels[idx])
    for idx, hit in enumerate(predicted_labels == y_test_unseen)
    if hit
]

#Same entries, most confident prediction first
correct_idx_acc_max = list(correct_idx_acc)
correct_idx_acc_max.sort(key=lambda entry: entry[1], reverse=True)

#Print the highest accuracy predictions (correct)
print("Highest accuracy, correct predictions:\n")
print("Index. Sequence -> Label. Accuracy")

for sample_idx, confidence, _label in correct_idx_acc_max[:10]:
    sequence_text = ' '.join(reverse_word_index[token] for token in X_test_unseen[sample_idx])
    true_word = reverse_word_index[y_test_unseen[sample_idx]]
    print(f"{sample_idx}. {sequence_text} ->  {true_word}. {confidence}")

Highest accuracy, correct predictions:

Index. Sequence -> Label. Accuracy
17520. <br> like a children <br> im running out of my soul with all my best sensations ->  <br>. 0.9965533018112183
40196. this city go <br> if you want flying <br> energy is <OOV> for the new sensations ->  <br>. 0.9940819144248962
21897. i will be alone <br> in new york city <br> dead or alive breakin the ice ->  <br>. 0.9931383728981018
11195. running like a nightmare <br> lonely boy <br> your heart is full of passion <br> full ->  of. 0.9924903512001038
5005. for a lovely game <br> sweet and sensual <br> now im ready waiting for your extasy ->  <br>. 0.9921045303344727
22350. the ocean <br> deep inside your love <br> deep inside this romantic feeling of real devotion ->  <br>. 0.9903891682624817
33924. you <br> just around the corner <br> along the way i meet your love hot fire ->  <br>. 0.989725649356842
27884. the way <br> tell you something <br> im falling down from grace <br> its a kind ->  of. 0.9879541

In [11]:
#(sample index, highest predicted probability, predicted label) for every wrongly predicted sample
wrong_idx_acc = [
    (idx, predictions[idx].max(), predicted_labels[idx])
    for idx, hit in enumerate(predicted_labels == y_test_unseen)
    if not hit
]

#Same entries, most confident (but wrong) prediction first
wrong_idx_acc_max = list(wrong_idx_acc)
wrong_idx_acc_max.sort(key=lambda entry: entry[1], reverse=True)

#Print the highest accuracy predictions (wrong)
print("Highest accuracy, wrong predictions:\n")
print("Index. Sequence -> Label. Accuracy")

for sample_idx, confidence, predicted in wrong_idx_acc_max[:10]:
    sequence_text = ' '.join(reverse_word_index[token] for token in X_test_unseen[sample_idx])
    predicted_word = reverse_word_index[predicted]
    true_label = y_test_unseen[sample_idx]
    print(f"{sample_idx}. {sequence_text} ->  {predicted_word}. {confidence}.\nCorrect label was: {reverse_word_index[true_label]} ({true_label})\n")

Highest accuracy, wrong predictions:

Index. Sequence -> Label. Accuracy
17503. my secret feeling <br> like a children <br> some little kisses of you my best desire ->  <br>. 0.9911227226257324.
Correct label was: babe (76)

18243. the <OOV> <br> then take to me <OOV> show <br> or we will be the part ->  of. 0.9890502691268921.
Correct label was: <br> (2)

16878. to the right <br> heres somebody with the track <br> to the back to the front ->  of. 0.9885883331298828.
Correct label was: <br> (2)

20786. from my weapon tonight <br> get right get right <br> pistol man pistol man pistol pistol ->  <br>. 0.9875919222831726.
Correct label was: man (143)

443. your love <br> wonderful night <br> wonderful night <br> never ending passion <br> a magic full ->  of. 0.9865700602531433.
Correct label was: obsession (1024)

10083. heart <br> my sensations reactions i need you <br> tonite <br> what the <OOV> of reaction ->  <br>. 0.9851915836334229.
Correct label was: to (8)

17531. with all my best

In [12]:
from collections import Counter

#The 20 most frequent true labels of the unseen data as (index, word, count)
true_label_counts = Counter(y_test_unseen)
[
    (label, reverse_word_index[label], count)
    for label, count in true_label_counts.most_common()[:20]
]

[(2, '<br>', 6589),
 (3, 'you', 2055),
 (6, 'me', 1441),
 (5, 'i', 1364),
 (4, 'the', 1332),
 (7, 'my', 1037),
 (8, 'to', 935),
 (10, 'your', 853),
 (12, 'love', 805),
 (9, 'and', 760),
 (17, 'baby', 504),
 (11, 'a', 500),
 (13, 'in', 464),
 (18, 'be', 451),
 (15, 'for', 447),
 (14, 'is', 438),
 (21, 'dont', 365),
 (24, 'can', 350),
 (20, 'im', 348),
 (19, 'on', 344)]

In [13]:
#The 20 most frequently predicted labels as (index, word, count)
predicted_label_counts = Counter(predicted_labels)
[
    (label, reverse_word_index[label], count)
    for label, count in predicted_label_counts.most_common()[:20]
]

[(2, '<br>', 11676),
 (5, 'i', 6407),
 (3, 'you', 3025),
 (4, 'the', 2273),
 (6, 'me', 2068),
 (18, 'be', 1323),
 (12, 'love', 1152),
 (8, 'to', 1044),
 (7, 'my', 738),
 (24, 'can', 703),
 (29, 'wanna', 700),
 (11, 'a', 631),
 (14, 'is', 538),
 (26, 'night', 476),
 (10, 'your', 469),
 (9, 'and', 431),
 (25, 'feel', 397),
 (36, 'heart', 385),
 (16, 'of', 380),
 (19, 'on', 334)]

In [14]:
#The 10 most over-predicted labels: how many more times a label was predicted
#than it actually occurs in the unseen data. Counter subtraction drops
#labels whose difference is zero or negative.
over_predicted = Counter(predicted_labels) - Counter(y_test_unseen)
[
    (label, reverse_word_index[label], count)
    for label, count in over_predicted.most_common()[:10]
]

[(2, '<br>', 5087),
 (5, 'i', 5043),
 (3, 'you', 970),
 (4, 'the', 941),
 (18, 'be', 872),
 (6, 'me', 627),
 (29, 'wanna', 436),
 (24, 'can', 353),
 (12, 'love', 347),
 (59, 'gonna', 197)]

In [15]:
#The 10 most under-predicted labels: how many more times a label occurs in
#the unseen data than it was predicted. Counter subtraction drops
#labels whose difference is zero or negative.
under_predicted = Counter(y_test_unseen) - Counter(predicted_labels)
[
    (label, reverse_word_index[label], count)
    for label, count in under_predicted.most_common()[:10]
]

[(10, 'your', 384),
 (20, 'im', 336),
 (9, 'and', 329),
 (7, 'my', 299),
 (17, 'baby', 298),
 (27, 'now', 241),
 (23, 'all', 239),
 (1, '<OOV>', 236),
 (42, 'tonight', 220),
 (32, 'just', 202)]

In [16]:
#Average length of the current sentence (tokens from the last <br> to the end
#of the input) for the 50 most confident wrong predictions.
#NOTE(review): the slice starts AT the last <br>, so the <br> token itself is
#counted; subtract 1 if only the words after it should count - confirm intent.
LINE_BREAK_INDEX = 2  #word index of "<br>" in the tokenizer vocabulary

top_wrong_rows = [i for i, _, _ in wrong_idx_acc_max[:50]]
sentence_lengths = []
for item in X_test_unseen[top_wrong_rows]:
    break_positions = np.where(item == LINE_BREAK_INDEX)[0]
    if break_positions.size:
        sentence_lengths.append(item[break_positions[-1]:].size)
    else:
        #No <br> inside the window: the sentence spans the whole sequence
        #(previously this case raised IndexError)
        sentence_lengths.append(item.size)

if sentence_lengths:
    print(f"Average length of a sentence before false prediction: {sum(sentence_lengths) / len(sentence_lengths)}")
else:
    #Avoid a ZeroDivisionError when the model made no wrong predictions
    print("No false predictions to analyse.")

Average length of a sentence before false prediction: 7.02


## Generate lyrics using the trained model
Generate new lyrics using the trained model.
1. Load the saved model.
2. Give the model a seed text of at least 16 words (`\n` counts as a word). Only the last 16 words affect the generation; the generator truncates any earlier words.

The `temperature` parameter controls the randomness of the generation: lower values make the output more predictable, higher values make it more varied.

In [None]:
def generate_lyrics(model, seed_text, n_words=1, temperature=1.0):
  """Generate lyrics by repeatedly sampling the next word from a model.

  Relies on the notebook-level names tokenizer, reverse_word_index,
  SEQUENCE_LENGTH and pad_sequences.

  Keyword arguments:
  model -- trained generator; model.predict must return one probability
           vector over word indices per input sequence
  seed_text -- sample text from which to predict the next words. Newlines are
               turned into the "<br>" token. Only the last SEQUENCE_LENGTH
               tokens are fed to the model; shorter seeds are padded by
               pad_sequences.
  n_words -- how many words to generate (defaults to 1)
  temperature -- randomness of the new text, must be greater than 0
                 (defaults to 1.0)

  Returns the generated words only (the seed is not included), with "<br>"
  rendered as a newline and "<OOV>" rendered as " ? ".

  Raises ValueError if temperature is not positive, or if the model assigns
  no probability to any word known to reverse_word_index.
  """

  if temperature <= 0:
    raise ValueError("temperature must be greater than 0")

  #str.replace returns a new string; the result has to be assigned, otherwise
  #the newlines of the seed never become <br> tokens
  seed_text = seed_text.replace("\n", " <br> ")
  result = []
  known_index = None
  for _ in range(n_words):
    #Encode the seed_text
    encoded = tokenizer.texts_to_sequences([seed_text])[0]
    encoded = pad_sequences([encoded], maxlen=SEQUENCE_LENGTH, truncating='pre')

    #Get the prediction vector
    predictions = model.predict(encoded, verbose=0)[0]
    predictions = np.asarray(predictions).astype('float64')

    #Output positions that map to a word. Index 0 has no entry in
    #reverse_word_index, so sampling it would raise a KeyError.
    if known_index is None:
      known_index = np.array(
        [i in reverse_word_index for i in range(predictions.size)], dtype=bool)

    #Reweight the distribution. log(0) is intentionally -inf (weight 0 after
    #exp), so the divide warning is silenced.
    with np.errstate(divide='ignore'):
      scaled = np.log(predictions) / temperature
    scaled[~known_index] = -np.inf

    #Subtracting the maximum leaves the normalised result unchanged but keeps
    #exp() from underflowing to all zeros at low temperatures
    peak = np.max(scaled)
    if not np.isfinite(peak):
      raise ValueError("model assigned no probability to any known word")
    exp_predictions = np.exp(scaled - peak)
    predictions = exp_predictions / np.sum(exp_predictions)
    probabilities = np.random.multinomial(1, predictions, 1)

    #Predict the next word
    index = int(np.argmax(probabilities))

    #Add the predicted word to the result and to the seed of the next step
    predicted_word = reverse_word_index[index]
    seed_text += " " + predicted_word
    result.append(predicted_word)

  #Strip the spaces around tags, then render the tags
  return (" ".join(result)).replace(" <", "<").replace("> ", ">").replace("<br>", "\n").replace("<OOV>", " ? ")

In [None]:
#Seed lyrics; the generated continuation is printed below the seed
test_seed = "I found the rest of me in what I thought was fantasy\nWith the last of doubt I can freely shout "
continuation = generate_lyrics(model, test_seed, n_words=64, temperature=1.0)
generated_text = f"{test_seed}\n{continuation}"
print(generated_text)

I found the rest of me in what I thought was fantasy
With the last of doubt I can freely shout 

im gonna tell fall in love
ready for my love
looking time forever this time
day by day tonight you know so long
you choose the game of destiny
you try to take me higher
lonely kind of dreams and ever
when you stay into the mirror right
magic night i feel lonely
this
