In [1]:
%load_ext autoreload
%autoreload 2

import pickle
from char_based_text_generator import *

# Collect Wikipedia summaries of artists/bands

In [2]:
# Seed artist; passed to get_collection() below to select the stored collection.
artist = 'Queen'

# Option A -- build the MongoDB collection from scratch (uncomment both lines to run):
#linked_artists = get_top_n_linked_artists(artist, n_min=500, match_min=0.1, n_per_artist=15)  # Thanks to last.fm
#collection = dump_wikipedia_summary_to_mongodb(artist, linked_artists, annotate=False)  # Skip artist not in wikipedia

# Option B -- reuse an existing MongoDB collection:
collection = get_collection(artist)

# Print the number of summaries, the total character count and the artist list.
print_collection_summary(collection)

# of Wikipedia summaries collected: 500
# of characters in all summaries: 622,305

Artists/bands: ['Queen', 'Freddie Mercury', 'Brian May', 'Queen + Paul Rodgers', 'Roger Taylor', 'Freddie Mercury & Montserrat Caballé', 'Queen & David Bowie', 'Led Zeppelin', 'Deep Purple', 'The Cross', 'Aerosmith', 'Pink Floyd', 'Scorpions', 'The Who', 'Electric Light Orchestra', 'Foreigner', 'Larry Lurex', 'Paul McCartney', 'Elton John', 'Smokie', 'John Lennon', 'Mick Jagger', 'Phil Collins', 'Status Quo', 'Ian Gillan', 'Paul Rodgers', 'Thunder', 'Robert Plant', 'Golden Earring', 'Uriah Heep', 'Whitesnake', 'Free', 'Bad Company', 'Nazareth', 'Roger Daltrey', 'George Michael and Queen', 'David Bowie & Mick Jagger', 'David Bowie', 'Robert Palmer', 'Mike & The Mechanics', 'Don Henley', 'Mott the Hoople', 'Starship', 'Huey Lewis & The News', 'Rainbow', 'Black Sabbath', 'UFO', 'Dio', 'Judas Priest', 'Elf', 'Thin Lizzy', 'Glenn Hughes', 'Van Halen', "Guns N' Roses", 'Bon Jovi', 'AC/DC', 'Kiss', 'Def Leppard

# Prep training text

In [3]:
# Windowing / batching hyper-parameters handed to Text.prep_for_training().
batch_size = 300
# After reading this many characters, the model guesses the next character.
n_char_per_memory = 50
# At every training step the start of the text can be randomly shifted;
# this is the maximum offset.
n_char_random_offset_max = 10

# Concatenate all stored summaries into one corpus and wrap it for training.
text = get_concat_summary(collection)
train = Text(text)
train.prep_for_training(batch_size, n_char_per_memory, n_char_random_offset_max)

# Report the vocabulary and the sizes of the parts the text was split into.
vocab = train.chars
part_lengths = train.len_part_list
n_leading_parts = len(train.part_list) - 1
print('{:,} unique characters: {}'.format(len(vocab), vocab))
print('length of first {:,} parts: {:,}'.format(n_leading_parts, part_lengths[0]))
# The last part usually contains some extra characters.
print('length of last part: {:,}'.format(part_lengths[-1]))

140 unique characters: ['\n', ' ', '!', '"', '#', '$', '&', "'", '(', ')', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '£', '´', 'Ä', 'Å', 'Ç', 'É', 'Ö', '×', 'ß', 'à', 'á', 'ä', 'å', 'ç', 'è', 'é', 'í', 'ï', 'ñ', 'ö', 'ø', 'ü', 'ı', 'ń', 'ɔ', 'ə', 'ʎ', 'ˈ', 'ː', '͡', 'ά', 'β', 'κ', 'ο', 'ς', 'τ', 'М', 'а', 'в', 'е', 'и', 'м', 'н', 'р', 'ш', 'ו', 'ח', 'י', 'ם', 'ץ', '\u200e', '–', '—', '‘', '’', '“', '”', '•']
length of first 299 parts: 2,076
length of last part: 2,081


# Build LSTM graph

In [4]:
# Gather everything the model needs into a single Config object.
cnfg = Config()
cnfg.data_collection_name = collection.name
# Copy the vocabulary and lookup tables from the prepared training text
# (presumably char -> id and id -> char mappings -- TODO confirm in Text).
cnfg.chars = train.chars
cnfg.char2id = train.char2id
cnfg.id2char = train.id2char
cnfg.n_char_per_memory = train.n_char_per_memory
cnfg.n_char = len(train.chars)  # Number of unique characters

# Model hyper-parameters.
cnfg.lstm_state_sizes = [100, 200]  # presumably one state size per stacked LSTM layer -- TODO confirm
cnfg.lr = 0.1  # presumably the learning rate -- TODO confirm

# Build the generator from the config assembled above.
lstm_text_generator = LSTMTextGenerator(cnfg)

# Train!

In [5]:
# Training-run settings (added to the same Config the model was built from).
cnfg.max_step = 1000000
cnfg.dropout_keep_prob = 0.75
cnfg.generate_every = 10000  # presumably: emit a text sample every this many steps -- TODO confirm
cnfg.n_generate = 200  # presumably: number of characters per generated sample -- TODO confirm
cnfg.save_model_every_n_min = 30
cnfg.log_collection_name = 'queen_based'
cnfg.ckp_dir = 'ckp/queen_based'

# Save config. The context manager guarantees the file is flushed and closed
# before the long-running training starts, instead of leaking the handle.
with open('configs/queen_based.cnfg', 'wb') as cnfg_file:
    pickle.dump(cnfg, cnfg_file)

# Seed text handed to train_model() as the `test` argument; the generated
# samples in the training log start with it.
test_text = 'Hello Kitty is '
test = Text(test_text)

lstm_text_generator.train_model(cnfg, train, test)

Training starts @ 12/29/2017 04:24:00
Step 1 ends @ 12/29/2017 04:24:06 [Logloss] 4.953 [Accuracy] 0.7%
--------------------------------------------------------------------------------------------------
Hello Kitty is d‘u´Lø6TßoÖéFufץVוiÇ

FU5J×1нdwYGːl‘;w[Cu3xåOHGκf.ÉIнhÖn£-s+əY0j1cBE"x´ÄhHеLWńיрïм7öàÖнqrfחםÉXHuץbvFX•2'eτcL3c×•&ˈiÖE7?TDQ0VfTnKиʎ([Ö—äLAHFнם?!é5á—oöU3Mä:zZm?øjmмu"äoZв‘YW7 '2év,yˈSGZ4Ö0нi•i-8lyäqɔçÉем
Step 10,000 ends @ 12/29/2017 04:41:19 [Logloss] 2.228 [Accuracy] 38.7%
--------------------------------------------------------------------------------------------------
Hello Kitty is 1980). His ern-canger Gremp an 1999 recossed dathe k of Jmineyfuareit anducted vevies, the band fost tem-lece-on muntics son musical sesuns us accesatibur afstiss, a fown Qweer, snngrice (buss, exupsi
Step 20,000 ends @ 12/29/2017 04:58:09 [Logloss] 2.231 [Accuracy] 38.0%
--------------------------------------------------------------------------------------------------
Hello Kitty is abpanri

Step 170,000 ends @ 12/29/2017 09:08:36 [Logloss] 1.536 [Accuracy] 54.3%
--------------------------------------------------------------------------------------------------
Hello Kitty is lave player of album, a drummer Workeyt incertse riitation, to the Ur. Hus success. They released five sour with Lecin Atalst Hight success, known sound at hir ovical single, "Diy albums He band's "19
Step 180,000 ends @ 12/29/2017 09:25:16 [Logloss] 1.695 [Accuracy] 52.3%
--------------------------------------------------------------------------------------------------
Hello Kitty is make are a plass from one of the mid-1980s and serts-selling artists in also one of the "nlummer Gest has dead mored aboshance on the enve. Filln Mestr fham in 2018.

The arh accicted his seven oun if 
Step 190,000 ends @ 12/29/2017 09:41:54 [Logloss] 1.829 [Accuracy] 45.3%
--------------------------------------------------------------------------------------------------
Hello Kitty is an English writer, songwriter, bassi

Step 340,000 ends @ 12/29/2017 13:50:58 [Logloss] 1.660 [Accuracy] 49.3%
--------------------------------------------------------------------------------------------------
Hello Kitty is known as the folk guitar. The fame released two single "Ring Love Sones" (1992),

US  often Englonds (2012), "On Rodes", "Chanic, a lead vocals and "Ian Lood" (all overse). Stewart in Lade's Mjazz appr
Step 350,000 ends @ 12/29/2017 14:07:33 [Logloss] 1.817 [Accuracy] 46.0%
--------------------------------------------------------------------------------------------------
Hello Kitty is an inglrch but remuined  new Bands, "Don't Me"Wa List was Deven" and "Rock My Rower" and one of the pop of 100 million.

Jinivis'qunny member, Nirst Doreligni, Miger by mukically and contented for the 
Step 360,000 ends @ 12/29/2017 14:24:08 [Logloss] 1.606 [Accuracy] 56.3%
--------------------------------------------------------------------------------------------------
Hello Kitty is the oung Bonhord that nem the group

KeyboardInterrupt: 