In [1]:
import sys
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
from time import time

Using TensorFlow backend.


In [2]:
# Wall-clock timestamp captured right after the imports.
# NOTE(review): not read by any of the cells visible here — confirm it is still needed.
start_time_0 = time()

### Load file

In [3]:
# Read the whole lyrics corpus and lower-case it so the character
# vocabulary stays small. The `with` block guarantees the file handle is
# closed as soon as the text has been read.
with open('BSB.txt') as lyrics_file:
    lyrics = lyrics_file.read().lower()

In [8]:
# Corpus size in characters.
len(lyrics)

36518

### Convert characters to numbers

In [4]:
# Vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(lyrics))

In [5]:
# Two-way lookup between characters and their integer ids
# (an id is the character's position in `chars`).
char_num = {char: num for num, char in enumerate(chars)}
num_char = dict(enumerate(chars))

In [6]:
# Encode the whole corpus as a list of integer character ids.
lyrics_num = [char_num[ch] for ch in lyrics]


In [7]:
# Vocabulary size (number of distinct characters). Used below to scale
# character ids into [0, 1) and as the unit count of the LSTM layers.
alphabet = len(char_num)

### Create sequences

In [10]:
def chop_to_sequence(seq, lyrics_num):
    """
    Chop lyrics_num into overlapping training windows of length seq.

    Every run of `seq` consecutive character ids becomes one input sample;
    the id that immediately follows the run is that sample's target.

    Parameters
    ----------
    seq : int
        Window length (number of characters per input sample).
    lyrics_num : sequence of int
        The corpus encoded as integer character ids.

    Returns
    -------
    segments : np.ndarray, shape (n_segments, seq, 1)
        Input windows, with ids divided by the module-level `alphabet`.
    next_char : np.ndarray, shape (n_segments, alphabet)
        One-hot encoded target character for each window.

    Notes
    -----
    Reads the module-level `alphabet` (vocabulary size).
    """
    lyrics_arr = np.asarray(lyrics_num) / float(alphabet)

    # One window per start position that still has a following character.
    n_segments = max(len(lyrics_num) - seq, 0)
    segments = [lyrics_arr[i:i + seq] for i in range(n_segments)]
    next_char = lyrics_num[seq:seq + n_segments]

    print("segment length:", seq)
    print('number of segments:', len(segments))
    # Report the size of the sequence actually passed in, not a global.
    print("chars in lyrics:", len(lyrics_num))
    print("")

    segments = np.reshape(segments, (len(segments), seq, 1))
    # Fix the one-hot width to the vocabulary size; otherwise it is inferred
    # from the largest id among the targets and can come out narrower than
    # `alphabet`, breaking the id -> char lookup on predictions.
    next_char = np_utils.to_categorical(next_char, num_classes=alphabet)

    return segments, next_char
    

In [11]:
def print_time(start_time):
    """Print the number of minutes elapsed since `start_time` (a `time()` timestamp)."""
    elapsed_seconds = time() - start_time
    print(elapsed_seconds / 60)

In [12]:
# Build the training set from 30-character windows and report how long
# the chopping took (print_time prints minutes).
start_time = time()
X_all, y_all = chop_to_sequence(30, lyrics_num)
print_time(start_time)


segment length: 30
number of segments: 36488
chars in lyrics: 36518

0.001679245630900065


### LSTM model

In [13]:
def predict_next_n(n):
    """
    Generate and print the next n characters from a randomly chosen seed.

    A random training window from the global `X` is used as the seed. At
    each step the most probable next character (argmax of the model's
    softmax output) is appended and the window slides forward by one.

    Parameters
    ----------
    n : int
        Number of characters to generate.

    Notes
    -----
    Reads the module-level `X`, `lyrics`, `model`, `num_char` and
    `alphabet`. Prints the seed text and the generated text; returns None.
    """
    seq_len = X.shape[1]
    start = np.random.randint(0, len(X))
    seed = lyrics[start:start + seq_len]
    # Flat copy-on-append working window; X itself is never modified.
    pattern = X[start].reshape(-1)

    chars = []
    for _step in range(n):
        # `predict_classes` no longer exists in recent Keras releases;
        # argmax over `predict` gives the same class index for a softmax
        # output and works on old and new versions alike.
        probs = model.predict(pattern.reshape(1, seq_len, 1), verbose=0)
        pred_num = int(np.argmax(probs, axis=-1)[0])

        chars.append(num_char[pred_num])
        # Slide the window: drop the oldest id, append the new one using
        # the same scaling that was applied to the training data.
        pattern = np.append(pattern[1:], pred_num / float(alphabet))

    print("Seed:", seed)
    print("Generated:", "".join(chars))
    

### All data

In [14]:
# Train on the full set of windows. predict_next_n reads `X` as a global.
X, y = X_all, y_all

In [15]:
# Two stacked LSTM layers (each `alphabet` units wide) followed by a softmax
# over the one-hot target width. The first LSTM returns the full sequence so
# the second one receives one vector per time step. No dropout is applied.
model = Sequential([
    LSTM(alphabet, input_shape=(X.shape[1], X.shape[2]), return_sequences=True),
    LSTM(alphabet),
    Dense(y.shape[1], activation='softmax'),
])

In [16]:
# Checkpoint callback: monitors the training loss and, with
# save_best_only=True, writes a weights file only when it reaches a new
# minimum. Epoch number and loss value are embedded in the file name.
filepath = "weights-improvement-{epoch:02d}-{loss:.4f}-bigger.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)
callbacks_list = [checkpoint]


In [18]:
# Compile, then train for 10 epochs with 20% of the samples held out for
# validation. `start_time` is reset so the run can be timed afterwards.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
start_time = time()
model.fit(
    X,
    y,
    epochs=10,
    batch_size=64,
    callbacks=callbacks_list,
    verbose=True,
    validation_split=0.2,
)

Train on 29190 samples, validate on 7298 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x126fb3b00>

In [22]:
# Minutes elapsed since `start_time` was last set.
# NOTE(review): the execution counts suggest this cell (In [22]) ran after the
# 50-epoch cell (In [21]), which also resets `start_time` — the recorded figure
# may therefore time that later run rather than the 10-epoch fit above; confirm.
print_time(start_time)

40.12507123549779


In [23]:
# Generate 100 characters from one random seed window.
predict_next_n(100)

Seed: o worlds apart
can't reach to 
Generated: bot ie the the the why aut ie she the the why aut ie she the the why aut ie she the the why aut ie s


In [25]:
# Draw ten independent samples of 30 characters, each from a fresh random seed.
for i in range(10):
    predict_next_n(30)


Seed: jam

jam on 'cause backstreets
Generated:  to mene i lane to toat you ao
Seed:  than life

yeah, every time w
Generated: ou aod i want you back toar yo
Seed: history
as long as you're here
Generated:  i lane it toer you aod i lane
Seed: you)
but still no (still no wo
Generated: u aod ioer to mene it toer you
Seed: 
(phone hang-up)

let me tell 
Generated: you loet you aack the the touh
Seed:  wish that i could believe
tha
Generated: t you aod i want you back toar
Seed: 

ain't nothin' but a heartach
Generated:  toat you aod i want you back 
Seed:  would blend 'cause we stayed 
Generated: back
that i lane it toer you a
Seed: mistake
tell me why
i never wa
Generated: an touh me the the wour aut me
Seed: h) (rock your body)
rock your 
Generated: bod i lane to toat you aod i w


In [21]:
# Call fit again on the same `model` object for up to 50 more epochs
# (the model is not rebuilt or recompiled in this cell).
# NOTE(review): unlike the earlier fit call, no checkpoint callback is passed
# here, so this run does not write weight files — confirm that is intended.
start_time = time()
model.fit(X, y, epochs=50, batch_size=64, verbose= True, validation_split=0.2)

Train on 29190 samples, validate on 7298 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50

KeyboardInterrupt: 

In [None]:
# Minutes elapsed since `start_time` was last set (cell not yet executed).
print_time(start_time)

In [None]:
# Generate 100 characters with the model in its current state (cell not yet executed).
predict_next_n(100)