In [91]:
import io

import os

import sys

import string

import numpy as np

import pandas as pd

from tensorflow import keras

from __future__ import print_function

from tensorflow.keras.models import Sequential

from sklearn.model_selection import train_test_split

from tensorflow.keras.callbacks import LambdaCallback, ModelCheckpoint, EarlyStopping

from tensorflow.keras.layers import Dense, Dropout, Activation, LSTM, Bidirectional, Embedding

In [93]:
translator = str.maketrans('', '', string.punctuation)
df = pd.read_csv("lyrics.csv", sep="\t")
df.head()

Unnamed: 0,song_id,lyrics
0,3e9HZxeyfWwjeyPAMmWSSQ,['[Verse 1]\nThought I\'d end up with Sean\nBu...
1,5p7ujcrUXASCNwRaWNHR1C,"[""[Verse 1]\nFound you when your heart was bro..."
2,2xLMifQCjDGFmkHkpNLD9h,"['[Part I]\n\n[Intro: Drake]\nAstro, yeah\nSun..."
3,3KkXRkHbMCARz0aVfEt68P,
4,1rqqCSm0Qe4I9rUvWncaom,"[""[Intro]\nHigh, high hopes\n\n[Chorus]\nHad t..."


In [95]:
def split_text(x):
    text = x['lyrics']
    sections = text.split('\\n\\n')
    
    keys = {'Verse 1': np.nan,'Verse 2':np.nan,'Verse 3':np.nan,'Verse 4':np.nan, 'Chorus':np.nan}
    
    lyrics = str()
    single_text = []
    res = {}
    
    for s in sections:
        key = s[s.find('[') + 1:s.find(']')].strip()
        if ':' in key:
            key = key[:key.find(':')]
        if key in keys:
            single_text += [x.lower().replace('(','').replace(')','').translate(translator) for x in s[s.find(']')+1:].split('\\n') if len(x) > 1]
          
        res['single_text'] =  ' \n '.join(single_text)
    return pd.Series(res)

In [96]:
df = df.dropna()

In [97]:
df = df.join(df.apply(split_text, axis=1))

In [98]:
df.head()

Unnamed: 0,song_id,lyrics,single_text
0,3e9HZxeyfWwjeyPAMmWSSQ,['[Verse 1]\nThought I\'d end up with Sean\nBu...,thank you next next \n thank you next next \n ...
1,5p7ujcrUXASCNwRaWNHR1C,"[""[Verse 1]\nFound you when your heart was bro...",tell me hows it feel sittin up there \n feelin...
2,2xLMifQCjDGFmkHkpNLD9h,"['[Part I]\n\n[Intro: Drake]\nAstro, yeah\nSun...",woo made this here with all the ice on in the ...
4,1rqqCSm0Qe4I9rUvWncaom,"[""[Intro]\nHigh, high hopes\n\n[Chorus]\nHad t...",had to have high high hopes for a living \n sh...
5,0bYg9bo50gSsH3LtXe2SQn,"[""[Intro]\nI-I-I don't want a lot for Christma...",i dont want a lot for christmas \n there is ju...


In [100]:
sum_df = pd.DataFrame( df['single_text'] )
sum_df.dropna(inplace=True)

In [101]:
sum_df.head()

Unnamed: 0,single_text
0,thank you next next \n thank you next next \n ...
1,tell me hows it feel sittin up there \n feelin...
2,woo made this here with all the ice on in the ...
4,had to have high high hopes for a living \n sh...
5,i dont want a lot for christmas \n there is ju...


In [102]:
text_as_list = []

frequencies = {}

uncommon_words = set()

MIN_FREQUENCY = 7

MIN_SEQ = 5

BATCH_SIZE =  32

In [103]:
def extract_text(text):
    global text_as_list
    text_as_list += [w for w in text.split(' ') if w.strip() != '' or w == '\n']

In [104]:
sum_df['single_text'].apply( extract_text )

print('Total words: ', len(text_as_list))

Total words:  2339637


In [105]:
for w in text_as_list:
    frequencies[w] = frequencies.get(w, 0) + 1

In [106]:
uncommon_words = set([key for key in frequencies.keys() if frequencies[key] < MIN_FREQUENCY])

words = sorted(set([key for key in frequencies.keys() if frequencies[key] >= MIN_FREQUENCY]))


num_words = len(words)

word_indices = dict((w, i) for i, w in enumerate(words))

indices_word = dict((i, w) for i, w in enumerate(words))

print('Words with less than {} appearances: {}'.format( MIN_FREQUENCY, len(uncommon_words)))

print('Words with more than {} appearances: {}'.format( MIN_FREQUENCY, len(words)))

Words with less than 7 appearances: 26272
Words with more than 7 appearances: 9163


In [107]:
valid_seqs = []

end_seq_words = []

for i in range(len(text_as_list) - MIN_SEQ ):
    end_slice = i + MIN_SEQ + 1

    if len( set(text_as_list[i:end_slice]).intersection(uncommon_words) ) == 0:
        valid_seqs.append(text_as_list[i: i + MIN_SEQ])
        end_seq_words.append(text_as_list[i + MIN_SEQ])

In [108]:
print('Valid sequences of size {}: {}'.format(MIN_SEQ, len(valid_seqs)))


X_train, X_test, y_train, y_test = train_test_split(valid_seqs, end_seq_words, test_size=0.02, random_state=42)

print(X_train[2:5])

Valid sequences of size 5: 2087702
[['minds', 'my', 'nine', 'my', 'pens'], ['pass', 'me', 'by', '\n', 'do'], ['money', 'burnin', 'up', '\n', 'they']]


In [109]:
def sample(preds, temperature=1.0):

   # helper function to sample an index from a probability array

    preds = np.asarray(preds).astype('float64')

    preds = np.log(preds) / temperature

    exp_preds = np.exp(preds)

    preds = exp_preds / np.sum(exp_preds)

    probas = np.random.multinomial(1, preds, 1)

    return np.argmax(probas)

In [110]:
def get_model():

    print('Build model...')

    model = Sequential()

    model.add(Embedding(input_dim=len(words), output_dim=1024))

    model.add(Bidirectional(LSTM(128)))
 
    model.add(Dense(len(words)))

    model.add(Activation('softmax'))

    return model

In [111]:
model = get_model()
model.compile(loss='sparse_categorical_crossentropy', optimizer="adam", metrics=['accuracy'])

Build model...


In [112]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, None, 1024)        9382912   
                                                                 
 bidirectional_2 (Bidirectio  (None, 256)              1180672   
 nal)                                                            
                                                                 
 dense_2 (Dense)             (None, 9163)              2354891   
                                                                 
 activation_2 (Activation)   (None, 9163)              0         
                                                                 
Total params: 12,918,475
Trainable params: 12,918,475
Non-trainable params: 0
_________________________________________________________________


In [113]:
model.load_weights('my_model_weights.h5')

In [114]:
seed_index = np.random.randint(len(X_train+X_test))
seed = (X_train+X_test)[seed_index]

In [117]:
examples_file = open('examples.txt', "w")
for diversity in [0.3, 0.4, 0.5, 0.6, 0.7]:

    sentence = seed

    examples_file.write('----- Diversity:' + str(diversity) + '\n')

    examples_file.write('----- Generating with seed:\n"' + ' '.join(sentence) + '"\n')

    examples_file.write(' '.join(sentence))


    for i in range(50):

        x_pred = np.zeros((1, MIN_SEQ))

        for t, word in enumerate(sentence):

            x_pred[0, t] = word_indices[word]


        preds = model.predict(x_pred, verbose=0)[0]

        next_index = sample(preds, diversity)

        next_word = indices_word[next_index]


        sentence = sentence[1:]

        sentence.append(next_word)


        examples_file.write(" "+next_word)

    examples_file.write('\n')

examples_file.write('='*80 + '\n')

examples_file.flush()

  preds = np.log(preds) / temperature
