In [14]:
import pandas as pd
import os
import string
import re
import numpy as np
from google.colab import drive

drive.mount('/content/gdrive')

!git clone https://github.com/davordavidovic/NLP-lyrics-generator.git
  
!sudo pip install h5py


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
fatal: destination path 'NLP-lyrics-generator' already exists and is not an empty directory.


In [0]:
def load_songs(genre, max_tokens):
  df1 = pd.read_csv('./NLP-lyrics-generator/data/lyrics_part1.csv')
  df2 = pd.read_csv('./NLP-lyrics-generator/data/lyrics_part2.csv')
  df3 = pd.read_csv('./NLP-lyrics-generator/data/lyrics_part3.csv')
  df4 = pd.read_csv('./NLP-lyrics-generator/data/lyrics_part4.csv')

  df_part_1 = pd.concat([df1, df2])
  df_part_2 = pd.concat([df3, df4])

  df = pd.concat([df_part_1, df_part_2])
  df.drop(columns=['index','Unnamed: 0'], inplace=True) #we dont need these columns

  df = df.dropna() #there were around 10000 rows with no lyrics so drop them
  
  df_songs = df[df.genre==genre]
  
  df_songs['preprocessed'] = df_songs['lyrics'].map(prepare_text)
  
  songs = df_songs.preprocessed.values
  
  count = 0
  cut = 0
  for i,song in enumerate(songs):
      tokens = song.split()
      count += len(tokens) 
      if count >= max_tokens:
        cut = i - 1
        break
  return songs[:cut]


In [0]:
def prepare_text(text):
    text = text.lower()
    text = text.replace('\n', 'newline')
  
    text = text.split()
  
    for index, word in enumerate(text):
        #remove non alphabetic characters at the end or beginning of a word
        word = word.strip(string.punctuation)
    
        #replace non alhpanumeric chars with space
        word = re.sub(r"[\W]",' ',word)
        text[index] = word 
   
    #concatenate again
    text = " ".join(text)
    return text

In [0]:
from sklearn.feature_extraction.text import CountVectorizer
from collections import OrderedDict

def build_vocab(songs, min_frq):
  #token pattern to also count one-character words
  vectorizer = CountVectorizer(stop_words=[],min_df=min_frq,token_pattern=u"(?u)\\b\\w+\\b")
  X = vectorizer.fit_transform(songs)


  vocab_dict = vectorizer.vocabulary_
  vocab_list =  list(vocab_dict)

  return vocab_list, vocab_dict


In [0]:
def index2sen(seq,vocab):
    tokens = [vocab[int(t)] for t in seq]
    sen = " ".join(tokens)
    return sen

In [0]:
from keras.utils import np_utils
import numpy as np

def songs_to_supervised(seq_len, songs, vocab_dict, vocab_list):
  data_x = []
  data_y = []
  seq_words = []

  for song in songs:
      tokens = song.split()
      for i in range(0, len(tokens) - seq_len):
          seq_in = tokens[i:i+seq_len]
          seq_out = tokens[i + seq_len]
          seq_data = []
          
          for word in seq_in:
              if word in vocab_dict:
                  seq_data.append(vocab_list.index(word))
              else:
                  break
                  
          #check if all words in sequence are in dict
          if len(seq_data) == seq_len and seq_out in vocab_dict:
              data_x.append(seq_data)
              data_y.append(vocab_list.index(seq_out))
              seq_words.append((seq_in,seq_out))
              
          '''
          #return if enough sequences were created
          if len(data_x) == n_seq:
            return data_x, data_y 
          '''
          
  return data_x, data_y, seq_words

In [0]:

from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense,Dropout, CuDNNLSTM
from keras.preprocessing.text import Tokenizer

from keras.models import Sequential
import keras.utils as ku 

def create_model(layers, units, inp_shape, out_shape):
  #lstm sequence to categoriemodel
  model = Sequential()
  
  for l in range(layers-1):
    model.add(CuDNNLSTM(units,return_sequences=True, input_shape = inp_shape))
    model.add(Dropout(0.2))
    
  model.add(CuDNNLSTM(units,return_sequences=False))
  model.add(Dropout(0.2)) 
  model.add(Dense(out_shape, activation='softmax'))
  model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])
  
  return model


In [0]:
def generate_text(seed_text, next_words, model, vocab_list):
    n_vocab = len(vocab_list)
    seq_in = prepare_text(seed_text).split()
    x = np.array([vocab_list.index(word) for word in seq_in])/n_vocab
    
    output_word = ""
    predictions = []
    for i in range(next_words):
        input_seq = np.reshape(np.append(x[i:],predictions),(1,len(x),1))
        predicted = model.predict_classes(input_seq, verbose=0)
        predictions.append(predicted[0])
        output_word = vocab_list[predicted[0]]
        seed_text += " " + output_word
        #print(output_word, vocab_list[np.argmax(model.predict(input_seq,verbose=0))])
        
    return seed_text



In [0]:
from keras.utils import np_utils
import numpy as np 
from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
import pickle
 
  
def run_experiment(n_sequences, n_epochs, genre, seq_len, n_layers, max_vocab_size, directory):

  print("Running", n_sequences,"sequences", n_epochs,"epochs",genre, seq_len,"sequence length", n_layers, "layers", max_vocab_size, "vocab size", directory, "directory") 
  
  #load lyrics with this many tokens
  max_tokens = n_sequences-seq_len
  
  #load song lyrics
  songs = load_songs(genre, max_tokens)
  
  #create the right-sized vocabulary from the songs 
  min_frq = 1
  n_vocab = np.inf
  while n_vocab > max_vocab_size:
    vocab_list, vocab_dict = build_vocab(songs, min_frq)
    n_vocab = len(vocab_dict)
    min_frq += 1
  
  #songs to sequences and labels
  data_x, data_y, seq_words = songs_to_supervised(seq_len, songs, vocab_dict, vocab_list)
  
  #reshape input to samples, timesteps, features
  X = np.reshape(data_x, (len(data_x), seq_len, 1))
  #normalize input
  X = X/float(n_vocab)
  #categorical labels 
  y = np_utils.to_categorical(data_y)

  inp_shape = X[0].shape
  out_shape = y[0].shape[0]
  print("X shape",X.shape)
  #create the lstm model
  model = create_model(n_layers, units=400, inp_shape =inp_shape, out_shape=out_shape)
  
  # checkpoint
  #TODO adapt filepath
  filepath = directory + "weights-improvement-{epoch:02d}-{acc:.2f}.hdf5"
  checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='min')

  #early stopping 
  es = EarlyStopping(monitor='val_acc', mode='min', verbose=1, patience=100)

  callbacks_list = [es]
  
  #train model
  history = model.fit(X, y, epochs=n_epochs, verbose=1,batch_size=1024,callbacks=callbacks_list, validation_split=0.1)
  
  #save model TODO namin
  model.save(directory +"model.h5")
  
  #save history
  with open(directory+"hist", 'wb') as file_pi:
        pickle.dump(history.history, file_pi)
  
  #generate validation texts and training texts
  val_words = seq_words[:-10]
  for t in val_words:
    sentence = " ".join(t[0])
    label = t[1]
    output = generate_text(sentence, next_words = seq_len, model = model, vocab_list = vocab_list)
    with open(directory + "generated.txt","w") as file:
      file.write(sentence + " out: " + output + "\n")
      #also save the actual number of sequences that were used
      file.write(str(len(data_x)))
  
  
  #TODO save plot on training curve
  plt.plot(history.history['acc'])
  plt.title('model accuracy')
  plt.ylabel('accuracy')
  plt.xlabel('epoch')
  plt.legend(['training', 'test'], loc='upper left')
  plot_path = directory + "plot.png"
  plt.savefig(plot_path, bbox_inches='tight', format='png')

In [0]:
data_sizes = [10000, 100000, 500000] #num of sequences
epochs = [50,200]
genres = ['Pop', 'Hip-Hop', 'Metal', 'Country']
seq_lens = [5,20]
layers = [4, 8] #400 units each
max_vocab_size = [300, 800] #change min document frq until size fits

experiments = []

#big dataset on all genres with different vocabulary sizes
for g in genres:
  for m in max_vocab_size:
    exp = {"seqs" : data_sizes[2],
           "epochs" : epochs[1],
           "genre" : g,
           "seq_lens" : seq_lens[1],
           "layers" : layers[1],
           "vocab" : m,
           "dir" : "./gdrive/My Drive/Colab Notebooks/exps2/_" + str(m) + g
          }
    experiments.append(exp)

#different data sizes on hip hop
for d in data_sizes:
  exp = {"seqs" : d,
         "epochs" : epochs[1],
         "genre" : genres[1],
         "seq_lens" : seq_lens[1],
         "layers" : layers[1],
         "vocab" : max_vocab_size[1],
         "dir" : "./gdrive/My Drive/Colab Notebooks/exps2/_" + str(d) + "sequences"
          }
  experiments.append(exp)

#different sequence lengths on hip hop
for s in seq_lens:
  exp = {"seqs" : data_sizes[2],
         "epochs" : epochs[1],
         "genre" : genres[1],
         "seq_lens" : seq_lens[1],
         "layers" : layers[1],
         "vocab" : max_vocab_size[1],
         "dir" : "./gdrive/My Drive/Colab Notebooks/exps2/_" + str(d) + "sequences"
          }
  experiments.append(exp)
  
        
print("Running", len(experiments), "experiments")
            
for e in experiments:
  try:
    n_seqs = e["seqs"]
    n_epochs = e["epochs"]
    genre = e["genre"]
    seq_len = e["seq_lens"]
    n_layers = e["layers"]
    max_vocab_size = e["vocab"]
    dir_ = e["dir"]
    run_experiment(n_sequences = n_seqs, n_epochs = n_epochs, genre = genre, seq_len = seq_len, n_layers = n_layers, max_vocab_size = max_vocab_size, directory = dir_)
  except Exception as ex:
    print(ex)
    pass

Running 13 experiments
Running 500000 sequences 200 epochs Pop 20 sequence length 8 layers 300 vocab size ./gdrive/My Drive/Colab Notebooks/exps2/_300Pop directory


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


X shape (617, 20, 1)
Train on 555 samples, validate on 62 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 7

In [0]:
#load from last checkpoint
model.load_weights('content/gdrive/My Drive/Colab Notebooks/weights-improvement-77-0.41.hdf5')

In [0]:
#train and save model
history=model.fit(X, y, epochs=200, verbose=1,batch_size=1024,callbacks=callbacks_list, validation_split=0.1)
model.save("./gdrive/My Drive/Colab Notebooks/200ep_4_lay_model_10000_pop_15seq.h5")

In [0]:
print(generate_text("Oh baby, baby, how was I supposed to know That something wasn't right here",10,model))


In [0]:
from keras.models import load_model
model = load_model('./gdrive/My Drive/Colab Notebooks/bigmodel.h5')

In [0]:
input_words