In [1]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import LSTM, Bidirectional, Dense,Dropout,Embedding
from sklearn.model_selection import train_test_split
import pickle 
from tensorflow.keras.utils import to_categorical
import numpy as np

### Load preprocessed data 

In [2]:
# Restore the pickled list of training sequences (each one a list of words).
with open('/content/drive/MyDrive/lyrics-generator/Data/sequences.txt', 'rb') as pickle_file:
  sequences = pickle.load(pickle_file)

In [3]:
# Restore the pickled list of target words (the word following each sequence).
with open('/content/drive/MyDrive/lyrics-generator/Data/next_word.txt', 'rb') as pickle_file:
  next_word = pickle.load(pickle_file)

In [4]:
# Restore the pickled lookup used to turn a word into its integer index.
with open('/content/drive/MyDrive/lyrics-generator/Data/word_indices.txt', 'rb') as pickle_file:
  word_indices = pickle.load(pickle_file)

In [5]:
# Restore the pickled reverse lookup used to turn an integer index back into a word.
with open('/content/drive/MyDrive/lyrics-generator/Data/indices_word.txt', 'rb') as pickle_file:
  indices_word = pickle.load(pickle_file)

In [6]:
# Preview the first ten input sequences.
sequences[:10]

[['but', 'he', 'wasnt', 'a', 'match'],
 ['he', 'wasnt', 'a', 'match', 'wrote'],
 ['wasnt', 'a', 'match', 'wrote', 'some'],
 ['a', 'match', 'wrote', 'some', 'songs'],
 ['match', 'wrote', 'some', 'songs', 'about'],
 ['wrote', 'some', 'songs', 'about', 'ricky'],
 ['some', 'songs', 'about', 'ricky', 'now'],
 ['songs', 'about', 'ricky', 'now', 'i'],
 ['about', 'ricky', 'now', 'i', 'listen'],
 ['ricky', 'now', 'i', 'listen', 'and']]

In [7]:
# Preview the first ten target words.
next_word[:10]

['wrote',
 'some',
 'songs',
 'about',
 'ricky',
 'now',
 'i',
 'listen',
 'and',
 'laugh']

### ***Important note*** - Before we build our model we need to create a data generator function that builds batches on the fly, to avoid an out-of-memory error.

In [8]:
# Hyperparameters shared by the batch generator and both models.
MIN_SEQ = 5        # width of the input array: word slots per sequence
VOCAB_SIZE = 7233  # embedding input size and number of softmax classes
BATCH_SIZE = 128   # samples per batch yielded by the generator

In [9]:
def generator(sequences_list, next_word_list, batch_size=BATCH_SIZE):
  """Yield an endless stream of (x, y) batches encoded as word indices.

  Args:
    sequences_list: list of word sequences; each word is looked up in the
      module-level ``word_indices`` mapping.
    next_word_list: target words, indexed in parallel with ``sequences_list``.
    batch_size: number of samples per yielded batch.

  Yields:
    Tuple ``(x, y)`` where ``x`` is an int32 array of shape
    ``(batch_size, MIN_SEQ)`` and ``y`` an int32 array of shape
    ``(batch_size,)`` holding the target word indices.
  """
  cursor = 0
  while True:
      features = np.zeros((batch_size, MIN_SEQ), dtype=np.int32)
      targets = np.zeros((batch_size), dtype=np.int32)

      for row in range(batch_size):
        # Wrap around the data so the stream never runs dry.
        position = cursor % len(sequences_list)
        for col, token in enumerate(sequences_list[position]):
            features[row, col] = word_indices[token]
        targets[row] = word_indices[next_word_list[position]]
        cursor += 1

      yield features, targets


   

### Split data into train and test set

In [10]:
# Hold out 2% of the (sequence, next word) pairs for validation; the fixed
# random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    sequences,
    next_word,
    test_size=0.02,
    random_state=0,
)

In [11]:
# Show the size of each split; inputs and targets should match pairwise.
tuple(len(part) for part in (X_train, X_test, y_train, y_test))

(1438147, 29350, 1438147, 29350)

## Model1

In [None]:
# Baseline network: embedding -> bidirectional LSTM -> dropout -> LSTM -> softmax.
model = Sequential([
    # Embedding layer: one 100-dimensional vector per vocabulary entry.
    Embedding(VOCAB_SIZE, 100),
    # Bidirectional LSTM that returns the full sequence for the next recurrent layer.
    Bidirectional(LSTM(150, return_sequences=True)),
    # Dropout regularization.
    Dropout(0.2),
    # Second LSTM collapses the sequence into a single vector.
    LSTM(100),
    # Output layer: probability distribution over the vocabulary.
    Dense(VOCAB_SIZE, activation='softmax'),
])

In [None]:
# Print the layer-by-layer architecture and parameter counts of the baseline model.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 100)         723300    
_________________________________________________________________
bidirectional (Bidirectional (None, None, 300)         301200    
_________________________________________________________________
dropout (Dropout)            (None, None, 300)         0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               160400    
_________________________________________________________________
dense (Dense)                (None, 7233)              730533    
Total params: 1,915,433
Trainable params: 1,915,433
Non-trainable params: 0
_________________________________________________________________


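***IMP note: if `categorical_crossentropy` is used as the loss, an "incompatible shapes" error will occur, because the generator yields the labels as integer class indices. `sparse_categorical_crossentropy` is the loss for integer labels, while `categorical_crossentropy` expects one-hot encoded labels; both are multi-class losses.***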


In [None]:

# Targets are integer word indices, hence the sparse variant of cross-entropy.
model.compile(
    optimizer="adam",
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)


In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, LambdaCallback

In [None]:
# Save the model only when validation accuracy improves.
checkpoint = ModelCheckpoint(
    "/content/drive/MyDrive/Projects/lyrics_generation/model_checkpoints/",
    monitor='val_accuracy',
    save_best_only=True,
)

# Stop training once validation accuracy has not improved for 10 epochs.
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=10,
)

callbacks_list = [checkpoint, early_stopping]

In [None]:
# Upper bound on training epochs; early stopping may end training sooner.
# The trailing #@param annotation appears to be a Colab form-field directive — keep it on this line.
EPOCHS= 30 #@param {type:'slider', max:100, min:10, step:10}

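### **IMP NOTE** - the generator yields batches forever, so we need to pass `steps_per_epoch` (and `validation_steps`); otherwise Keras cannot tell where an epoch ends and would keep fitting on an endless stream of samples.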

In [12]:
# The generators never terminate, so Keras needs an explicit batch count per epoch.
# np.ceil returns a float, while Keras documents these arguments as integers,
# so convert explicitly. Rounding up keeps the final partial batch in each pass.
steps_per_epoch = int(np.ceil(len(X_train) / BATCH_SIZE))
steps_per_epoch_val = int(np.ceil(len(X_test) / BATCH_SIZE))

In [13]:
# Display the number of training batches that make up one epoch.
steps_per_epoch

11236.0

In [None]:
# Train the baseline model on endlessly cycling generator batches.
# The validation generator must pair X_test with y_test: pairing it with
# y_train mismatches inputs and labels, which makes val_accuracy (the metric
# driving checkpointing and early stopping) meaningless.
history1 = model.fit(
    generator(X_train, y_train, batch_size=BATCH_SIZE),
    epochs=EPOCHS,
    callbacks=callbacks_list,
    steps_per_epoch=steps_per_epoch,
    validation_data=generator(X_test, y_test, BATCH_SIZE),
    validation_steps=steps_per_epoch_val,
)

## Hypertune model
* changed the output dimension of the embedding layer
* added one dense layer 

### The following functions are taken from keras-team/keras/blob/master/examples/lstm_text_generation.py


In [14]:
def sample(preds, temperature=1.0):
    """Sample an index from a probability array after temperature scaling.

    The probabilities are re-weighted as ``softmax(log(preds) / temperature)``:
    temperatures below 1 sharpen the distribution towards the most likely
    entries, temperatures above 1 flatten it.

    Args:
        preds: 1-D array-like of probabilities.
        temperature: non-zero scaling factor applied to the log-probabilities.

    Returns:
        The sampled index (as returned by ``np.argmax``).

    Raises:
        ValueError: if ``temperature`` is zero.
    """
    if temperature == 0:
        raise ValueError("temperature must be non-zero")

    preds = np.asarray(preds).astype('float64')

    # log(0) is -inf, which correctly turns into probability 0 after exp();
    # only the divide-by-zero warning is silenced here.
    with np.errstate(divide='ignore'):
        scaled = np.log(preds) / temperature

    # Shift by the maximum before exponentiating. The normalized result is
    # mathematically unchanged, but at low temperatures this keeps every
    # exp() from underflowing to 0 (which would give 0/0 = NaN probabilities).
    scaled = scaled - np.max(scaled)

    exp_preds = np.exp(scaled)
    preds = exp_preds / np.sum(exp_preds)

    # Draw one sample; the result is one-hot, so argmax recovers the index.
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)



In [15]:

def on_epoch_end(epoch, logs):
    """Write lyrics generated by ``model2`` to ``examples_file`` after an epoch.

    Picks one random seed sequence from the train and test splits, then for
    each diversity (sampling temperature) generates 50 further words and
    appends the text to the module-level ``examples_file``.

    Args:
        epoch: epoch number passed in by the caller; written to the header line.
        logs: unused; accepted so the function matches the callback signature.
    """
    examples_file.write('\n----- Generating text after Epoch: %d\n' % epoch)

    # Randomly pick a seed sequence from train + test. The two lists are
    # indexed directly instead of being concatenated, which would copy every
    # entry just to read one of them.
    n_train = len(X_train)
    seed_index = np.random.randint(n_train + len(X_test))
    if seed_index < n_train:
        seed = X_train[seed_index]
    else:
        seed = X_test[seed_index - n_train]

    for diversity in [0.3, 0.4, 0.5, 0.6, 0.7]:
        # Work on a copy so the dataset entry itself is never modified.
        sentence = list(seed)

        examples_file.write('----- Diversity:' + str(diversity) + '\n')
        examples_file.write('----- Generating with seed:\n"' + ' '.join(sentence) + '"\n')
        examples_file.write(' '.join(sentence))

        for _ in range(50):
            # Encode the current window of words as a (1, MIN_SEQ) index array.
            x_pred = np.zeros((1, MIN_SEQ))
            for t, word in enumerate(sentence):
                x_pred[0, t] = word_indices[word]

            preds = model2.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            # Named differently from the global `next_word` list to avoid confusion.
            predicted_word = indices_word[next_index]

            # Slide the window: drop the oldest word, append the new one.
            sentence = sentence[1:] + [predicted_word]

            examples_file.write(" " + predicted_word)

        examples_file.write('\n')

    examples_file.write('='*80 + '\n')
    # Flush so the examples are on disk even if training is interrupted.
    examples_file.flush()

### Read more about embedding [here](https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/#:~:text=The%20Embedding%20has%20a%20vocabulary,an%20input%20length%20of%204.&text=Importantly%2C%20the%20output%20from%20the,to%20the%20Dense%20output%20layer.)

In [16]:
# Tuned network: wider embedding plus an extra fully connected layer before the output.
model2 = Sequential()
# Embedding layer: one 1000-dimensional vector per vocabulary entry.
model2.add(Embedding(VOCAB_SIZE, 1000))
# Bidirectional LSTM that returns the full sequence for the next recurrent layer.
model2.add(Bidirectional(LSTM(150, return_sequences=True)))
# Dropout regularization.
model2.add(Dropout(0.2))
# Second LSTM collapses the sequence into a single vector.
model2.add(LSTM(100))
# Fully connected layer. Dense expects an integer unit count, so use integer
# division: VOCAB_SIZE/2 would pass the float 3616.5.
model2.add(Dense(VOCAB_SIZE // 2, activation='relu'))
# Output layer: probability distribution over the vocabulary.
model2.add(Dense(VOCAB_SIZE, activation='softmax'))

In [17]:
# Print the layer-by-layer architecture and parameter counts of the tuned model.
model2.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 1000)        7233000   
_________________________________________________________________
bidirectional (Bidirectional (None, None, 300)         1381200   
_________________________________________________________________
dropout (Dropout)            (None, None, 300)         0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               160400    
_________________________________________________________________
dense (Dense)                (None, 3616)              365216    
_________________________________________________________________
dense_1 (Dense)              (None, 7233)              26161761  
Total params: 35,301,577
Trainable params: 35,301,577
Non-trainable params: 0
____________________________________________

In [18]:
# Targets are integer word indices, hence the sparse variant of cross-entropy.
model2.compile(
    optimizer="adam",
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [19]:
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping,LambdaCallback

In [20]:
# Save the model only when validation accuracy improves.
checkpoint = ModelCheckpoint(
    "/content/drive/MyDrive/lyrics-generator/model_checkpoints/model2",
    monitor='val_accuracy',
    save_best_only=True,
)

# Stop training once validation accuracy has not improved for 10 epochs.
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=10,
)

# Write sample lyrics to the examples file after every epoch.
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

callbacks_list = [checkpoint, early_stopping, print_callback]

In [21]:
# Upper bound on training epochs; early stopping may end training sooner.
# The trailing #@param annotation appears to be a Colab form-field directive — keep it on this line.
EPOCHS= 30 #@param {type:'slider', max:100, min:10, step:10}

In [22]:
# The generators never terminate, so Keras needs an explicit batch count per epoch.
# np.ceil returns a float, while Keras documents these arguments as integers,
# so convert explicitly. Rounding up keeps the final partial batch in each pass.
steps_per_epoch = int(np.ceil(len(X_train) / BATCH_SIZE))
steps_per_epoch_val = int(np.ceil(len(X_test) / BATCH_SIZE))

In [23]:
# before fitting wee need to create examples file
examples_file = open('examples.txt', "w")

In [None]:
# Train the tuned model on endlessly cycling generator batches.
# The validation generator must pair X_test with y_test: pairing it with
# y_train mismatches inputs and labels, which makes val_accuracy (the metric
# driving checkpointing and early stopping) meaningless.
history2 = model2.fit(
    generator(X_train, y_train, batch_size=BATCH_SIZE),
    epochs=EPOCHS,
    callbacks=callbacks_list,
    steps_per_epoch=steps_per_epoch,
    validation_data=generator(X_test, y_test, BATCH_SIZE),
    validation_steps=steps_per_epoch_val,
)

In [43]:
# Read the generated-lyrics log back in, one list entry per line of the file.
with open('/content/drive/MyDrive/lyrics-generator/Data/examples.txt') as examples_log:
    outputs = list(examples_log)

### Here are the lyrics generated after each epoch

In [44]:
# Display every line read from the examples file.
outputs


['\n',
 '----- Generating text after Epoch: 0\n',
 '----- Diversity:0.3\n',
 '----- Generating with seed:\n',
 '"is there no place for"\n',
 'is there no place for a little bit of love and i cant get enough of your heart i dont know if i could never change you and i know i dont know what i want to do i do i do i do do do do do do do do do do do do\n',
 '----- Diversity:0.4\n',
 '----- Generating with seed:\n',
 '"is there no place for"\n',
 'is there no place for me to be the same and i know that im not a little bit of love and i know you know how to do youre the only one that i had to make you go and i just cant get no need to go and i dont know what to\n',
 '----- Diversity:0.5\n',
 '----- Generating with seed:\n',
 '"is there no place for"\n',
 'is there no place for a good time for a little bit of love and i swear i could be your sweet girl now that you dont know me like a little too late to take my hand for a girl and i got a little time for me to say to you and i\n',
 '----- Dive