In [3]:
import codecs
import sys

In [40]:
# Load the lyrics corpus (first 1,000,000 characters) and lower-case it.
filename = "rap_lyrics.txt"
# The displayed vocabulary contains non-ASCII (Polish) characters, so decode
# explicitly instead of relying on the platform's default encoding.
# NOTE(review): assumes the file is UTF-8 — confirm against the data source.
# The context manager closes the handle, which was previously left open.
with open(filename, encoding="utf-8") as corpus_file:
    # In text mode read(n) returns at most n characters, so the rest of a
    # larger file is never loaded into memory.
    raw_text = corpus_file.read(1000000)
raw_text = raw_text.lower()

In [41]:

# Build the vocabulary: every distinct character of the corpus, in sorted
# order, is assigned an integer code equal to its position in `chars`.
chars = sorted(set(raw_text))
char_to_int = {symbol: code for code, symbol in enumerate(chars)}

In [42]:
# Inspect the vocabulary (at most the first 100 entries).
chars[:100]

['\n',
 ' ',
 '!',
 '"',
 "'",
 '(',
 ')',
 '*',
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '8',
 ':',
 ';',
 '?',
 '[',
 '\\',
 ']',
 '`',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 'ó',
 'ą',
 'ć',
 'ę',
 'ł',
 'ń',
 'ś',
 'ź',
 'ż',
 '–',
 '‘',
 '’',
 '…']

In [43]:
# Corpus statistics: number of characters read and number of distinct symbols.
n_chars, n_vocab = len(raw_text), len(chars)
for label, count in (("Total Characters: ", n_chars), ("Total Vocab: ", n_vocab)):
    print(label, count)

Total Characters:  1000000
Total Vocab:  66


In [44]:
# prepare the dataset of input to output pairs encoded as integers
seq_length = 100
# Encode the whole corpus once. Previously every character was looked up in
# char_to_int again for each of the windows that contain it.
encoded_text = [char_to_int[char] for char in raw_text]
dataX = []
dataY = []
for i in range(n_chars - seq_length):
    # Input: seq_length consecutive characters (the slice is a fresh list per
    # window); target: the single character that follows them.
    dataX.append(encoded_text[i:i + seq_length])
    dataY.append(encoded_text[i + seq_length])
n_patterns = len(dataX)
print("Total Patterns: ", n_patterns)

Total Patterns:  999900


In [45]:
# reshape X to be [samples, time steps, features]
# NOTE(review): `numpy` and `np_utils` are not imported in any visible cell —
# confirm an earlier cell provides them.
X = numpy.reshape(dataX, (n_patterns, seq_length, 1))
# normalize: integer codes run 0..n_vocab-1, so this scales them into [0, 1)
X = X / float(n_vocab)
# one hot encode the output variable
# Pin the width to n_vocab: without num_classes, to_categorical sizes the
# matrix from max(dataY) + 1, which is narrower than the vocabulary whenever
# the highest-indexed character never occurs as a target.
y = np_utils.to_categorical(dataY, num_classes=n_vocab)

In [46]:
# Network: a single 256-unit LSTM over the (time steps, features) input,
# 20% dropout, then a softmax layer with one output per column of y.
model = Sequential([
    LSTM(256, input_shape=X.shape[1:]),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy')


In [47]:
# Checkpointing: save the model whenever the monitored training loss reaches
# a new minimum; the epoch number and loss are embedded in the file name.
filepath = "weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]

In [53]:
# Train for 20 epochs in mini-batches of 128, checkpointing via callbacks_list.
# Keep the returned History object (previously discarded and only echoed as a
# repr) so the per-epoch loss can be inspected after training.
history = model.fit(X, y, epochs=20, batch_size=128, callbacks=callbacks_list)

Epoch 1/20

Epoch 00001: loss improved from 2.98728 to 2.74304, saving model to weights-improvement-01-2.7430.hdf5
Epoch 2/20

Epoch 00002: loss improved from 2.74304 to 2.42455, saving model to weights-improvement-02-2.4246.hdf5
Epoch 3/20

Epoch 00003: loss improved from 2.42455 to 2.26671, saving model to weights-improvement-03-2.2667.hdf5
Epoch 4/20

Epoch 00004: loss did not improve from 2.26671
Epoch 5/20

Epoch 00005: loss did not improve from 2.26671
Epoch 6/20

Epoch 00006: loss did not improve from 2.26671
Epoch 7/20

Epoch 00007: loss improved from 2.26671 to 2.12349, saving model to weights-improvement-07-2.1235.hdf5
Epoch 8/20

Epoch 00008: loss did not improve from 2.12349
Epoch 9/20

Epoch 00009: loss did not improve from 2.12349
Epoch 10/20

Epoch 00010: loss did not improve from 2.12349
Epoch 11/20

Epoch 00011: loss did not improve from 2.12349
Epoch 12/20

Epoch 00012: loss improved from 2.12349 to 2.04692, saving model to weights-improvement-12-2.0469.hdf5
Epoch 13/

<keras.callbacks.History at 0x7f89cf9dc7f0>

In [54]:
# Restore the network weights from a saved checkpoint, then recompile.
# NOTE(review): the name is hard-coded and must match a file written by the
# checkpoint callback during training — confirm it exists before running.
filename = "weights-improvement-19-1.7681.hdf5"
model.load_weights(filename)
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [55]:
# Reverse lookup: integer code -> character (position in `chars` is the code).
int_to_char = dict(enumerate(chars))

In [58]:
# Generate text: seed the network with a random training window, then
# repeatedly predict the next character and slide the window forward by one.
# NOTE(review): `numpy` is not imported in any visible cell — confirm an
# earlier cell provides it.
# pick a random seed (randint's upper bound is exclusive, so len(dataX) makes
# every pattern, including the last one, selectable)
start = numpy.random.randint(0, len(dataX))
# Work on a copy: appending to dataX[start] itself would lengthen that
# training pattern in place and corrupt dataX for later cells and re-runs.
pattern = list(dataX[start])
print ("Seed:")
print ("\"", ''.join([int_to_char[value] for value in pattern]), "\"")
# generate characters
for i in range(1000):
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab)  # same scaling that was applied to the training inputs
    prediction = model.predict(x, verbose=0)
    # Greedy decoding: always take the highest-scoring character.
    index = int(numpy.argmax(prediction))
    result = int_to_char[index]
    sys.stdout.write(result)
    # Slide the window: add the predicted code, drop the oldest one.
    pattern.append(index)
    pattern = pattern[1:]
print("\nDone.")

Seed:
"  będziesz mieć powody oto co robił tata jak rapował gdy był młody a tu kolejny tato, kolejny baton o "
 ci ciewy so ca bi niemie a tozadz na na nie otpor do wiamie  tie paradz  mak troo no die zniem z sisowoke sie za co zię śminne siesz co mo poradie sie postod nie zatmani na sie iiety pa no bi pe pownaty ta sar ta ciesae so jakt wie cada ranom sarazyj pa sarie ta balaky pako pozysajan sarie poai sie casi raje poyta my iietto to jast tak jie po pa sobr  bi ta tie toja ta sierie  c pak jas po pap to siesze  a ta myłaa miem sc mierar ta coerie sora to jest niedy mie wi moee mozesiea ta swam cetyjasz ti so sabazą azy ca kizanolajy zap tam do baje jak sezrz  taraza słana  rzej stapa to zięt nak za mociekty nak pa talawie za to mom tam to mas ta scły nas ta star pazy za ocr m mam ja dsitam s pal wa ciedze sie ta calda, ch ma tae tem ter zo nas ca ba nie nomcę na sa mie oopocz sa po sie mozaro zabacik  c ta moły my wamię, toraj be mi zilmc   mu wamaszo jast nasze, to za sie wes ma ta