In [1]:
# Packages for scrapping, cleaning and preprocessing data
import os
import re
import random as rd
import requests
from bs4 import BeautifulSoup

# Packages for LSTM building
import numpy
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
from keras.callbacks import CSVLogger
from keras.callbacks import LambdaCallback
import time
import sys

Using TensorFlow backend.


### Define functions for scrapping 
### Works only for amalgama-lab ~ structure type: site/first_letter_of_artist_name/artist_name

In [None]:
def get_soup_by_artist(site, artist):
    base = site + artist[0] + '/' + artist #get address in needed structure
    site = requests.get(base) #get from base
    return BeautifulSoup(site.text, 'lxml'), base #return FULL HTML code of got from base

def get_raw_lyrics(soup):
    songs_list = soup.find("div", {"id":"songs_nav"})
    s = str(list(songs_list)[9])
    found_songs = re.findall(r'(?<=<a href=")[^"]*', s)
    return found_songs

def parse_raw_lyrics(retrsongs, base):
    songs = {}
    for songname in retrsongs:
        songraw = requests.get(base + '/' + songname)
        parrallel_songs = BeautifulSoup(songraw.text, 'lxml').find("div", {"id": "click_area"})
        if parrallel_songs:
            lines = []
            for p in parrallel_songs.findAll("div", {"class": "original"}):
                lines.append(str(p.get_text()).replace('\n',''))
            songs[songname.replace('.html', '')] = lines
    return songs

def write_lyrics_to_file(artist, songs):
    newpath =  "lyrics/" + artist
    if not os.path.exists(newpath):
        os.makedirs(newpath)
    i = 0
    for s in songs.keys():
        with open(newpath + '//' + artist + '_{0}.txt'.format(s), 'w', encoding='UTF-8') as ffff:
            for l in songs[s]:
                ffff.write(l + "\n")
        ffff.close()
        i += 1

### Getting all songs in separate files and in separate folders for each artist in folder .../lyrics

In [None]:
site = 'http://www.amalgama-lab.com/songs/'

artists = ['a_rocket_to_the_moon', 'abba', 'ac_dc', 'adam_lambert', 'adele', 'aerosmith', 'anti_flag',
           'arctic_monkeys', 'bring_me_the_horizon', 'britney_spears', 'bon_jovi', 'bob_dylan',
          'backstreet_boys', 'blink_182', 'black_sabbath', 'depeche_mode', 'david_bowie', 'doors', 'evanescence',
          'elvis_presley', 'elton_john', 'elton_john', 'frank_sinatra', 'foo_fighters', 'green_day', 'gorillaz',
          'hurts', 'justin_bieber', 'justin_timberlake', 'korn', 'kasabian', 'kiss', 'linkin_park', 'lana_del_rey',
          'limp_bizkit', 'metallica', 'maroon_5', 'michael_jackson', 'marilyn_manson', 'nirvana', 'nickelback',
          'nightwish', 'onerepublic', 'placebo', 'papa_roach', 'red_hot_chili_peppers', 'rasmus', 'scorpions',
          'system_of_a_down', 'three_days_grace', 'u2', 'whitney_houston', 'weeknd']

for artist in artists:
    print("now handling: {0}... ".format(artist), end='')
    try:
        soup, base = get_soup_by_artist(site, artist)
        raw_lyrics = get_raw_lyrics(soup)
        lyrics_dict = parse_raw_lyrics(raw_lyrics, base)
        write_lyrics_to_file(artist, lyrics_dict)
        print('done.')
    except Exception as e:
        print('failed with {0}.'.format(e))

### Getting one big corpus in 1 file from all song texts

In [52]:
os.chdir('/home/nikolay/Python/NLP lyrics and coffee/lyrics')

# Corpus that constists of songs of ALL artists (~70) is too large
# I got random sample of 5 artists and got:
# ['scorpions', 'anti_flag', 'bob_dylan', 'green_day', 'bruno_mars']
# I will attach full corpus before and after cleaning 


# list_dir = rd.sample(os.listdir(), 5)
k = 0
with open('/home/nikolay/Python/NLP lyrics and coffee/result_corpus_large', 'w') as outfile:
    for folder in list_dir:
        os.chdir('.../NLP lyrics and coffee/lyrics')
        filenames = os.listdir(folder)
        os.chdir(folder)
        for fname in filenames:
            infile = open(fname, 'r')
            outfile.write(infile.read())
outfile.close()
os.chdir('/home/nikolay/Python/NLP lyrics and coffee')

In [53]:
# Cleaning result corpus from Russian letters, digits and special symbols

test_text = open('/home/nikolay/Python/NLP lyrics and coffee/result_corpus_large', 'r', encoding='UTF-8')

with open('/home/nikolay/Python/NLP lyrics and coffee/result_corpus_large_cleaned', 'w') as outfile:
    for line in test_text:
        line = re.sub(r'[а-яА-Я]|\d|[\?\.\_\,\:\;\^\$\#\@\&\(\)\*\!\<\>\"\'\{\}\[\]\~\+\-\=\/ñ|à|é|ó|ß|á|â|ä|ç|è|é|ê|ì|í|î|ô|ö|ù|ú|ü|œ|ё|ґ|ṗ|‒|“|”|„|き|げ|し|ち|て|上|持|放|\'|\`|\…|\—|\–|\‘|\’\¦\«\°\´\»\×]', '', line)
        line = re.sub(r'  ', ' ', line)
        outfile.writelines(line)
outfile.close()    

In [54]:
# Loading the dataset

filename = "result_corpus_large_cleaned"
raw_text = open(filename, encoding='utf-8').read()
raw_text = raw_text.lower()
csv_logger = CSVLogger('log.csv', append=True, separator=';')

In [55]:
# create mapping of unique chars to integers
chars = sorted(list(set(raw_text)))
char_to_int = dict((c, i) for i, c in enumerate(chars))

n_chars = len(raw_text)
n_vocab = len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)


Total Characters:  650161
Total Vocab:  30


In [56]:
# prepare the dataset of input to output pairs encoded as integers
seq_length = 100
dataX = []
dataY = []
for i in range(0, n_chars - seq_length, 1):
    seq_in = raw_text[i:i + seq_length]
    seq_out = raw_text[i + seq_length]
    dataX.append([char_to_int[char] for char in seq_in])
    dataY.append(char_to_int[seq_out])
n_patterns = len(dataX)
print("Total Patterns: ", n_patterns)

# reshape X to be [samples, time steps, features]
X = numpy.reshape(dataX, (n_patterns, seq_length, 1))
# normalize
X = X / float(n_vocab)
# one hot encode the output variable
y = np_utils.to_categorical(dataY)

Total Patterns:  650061


In [67]:
# define the LSTM model
# 2-layers LSTM Neural Network

model = Sequential()
model.add(LSTM(128, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [68]:
# define the checkpoint
# filepath="weights-improvement-1.hdf5"
filepath="weights-improvement-bigger.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')

#keras.callbacks.TensorBoard(log_dir='./logs', histogram_freq=0, batch_size=32, write_graph=True, write_grads=False, write_images=False, embeddings_freq=0, embeddings_layer_names=None, embeddings_metadata=None)

import json
json_log = open('loss_log.json', mode='wt', buffering=1)
json_logging_callback = LambdaCallback(
    on_batch_begin=lambda epoch, logs: json_log.write(
        json.dumps({'time' : time.ctime(), 'epoch': epoch, 'loss': logs}) + '\n'),
    on_epoch_end=lambda epoch, logs: json_log.write(
        json.dumps({'epoch': epoch, 'loss': logs['loss']}) + '\n'),
    on_train_end=lambda logs: json_log.close()
)

callbacks_list = [checkpoint, csv_logger, json_logging_callback]

# Number of epochs and batch_size
model.fit(X, y, batch_size=64, callbacks=callbacks_list, epochs=5)

print('done');

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
done


In [80]:
# load the network weights
filepath="weights-improvement-bigger.hdf5"
model.load_weights(filepath)
model.compile(loss='categorical_crossentropy', optimizer='adam')

int_to_char = dict((i, c) for i, c in enumerate(chars))
# pick a random seed
start = numpy.random.randint(0, len(dataX)-1)
pattern = dataX[start]
print("Seed:")
print("\"", ''.join([int_to_char[value] for value in pattern]), "\"")

# generate characters
for i in range(200):
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab)
    prediction = model.predict(x, verbose=0)
    index = numpy.argmax(prediction)
    result = int_to_char[index]
    seq_in = [int_to_char[value] for value in pattern]
    sys.stdout.write(result)
    pattern.append(index)
    pattern = pattern[1:len(pattern)]
print("\nDone.")

Seed:
" ow and bow
sara oh sara
dont ever leave me dont ever go
one by one they followed the sun
one by one  "
thing i wanna be the sain

i wanna be the wind 

i wanna be the wind 

i wanna be the wind 

i wanna be the wind 

i wanna be the wind 

i wanna be the wind 

i wanna be the wind 

i wanna be the wind
Done.
