In [132]:
import filereader

from sklearn.feature_extraction import DictVectorizer
import pandas as pd
from nltk import word_tokenize
import re
import html
from functools import reduce
from nltk.corpus import stopwords
import numbers

import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer

In [40]:
def custom_scaler(X:np.ndarray) -> np.ndarray:
    """Scale ``X`` in two chained steps and return the result.

    First a ``QuantileTransformer`` with default settings is fitted on ``X``
    and applied to it; the transformed values are then passed through a
    ``MinMaxScaler`` configured with ``feature_range=[-1,1]``. Both
    transformers are fitted on the data they transform, so nothing is
    reusable for new data afterwards.

    :param X: 2-d array to scale.
    :return: the scaled array produced by the second transformer.
    """
    quantile_mapped = QuantileTransformer().fit_transform(X)
    return MinMaxScaler(feature_range=[-1,1]).fit_transform(quantile_mapped)

In [2]:
# Parse a lyric into a segment -> line structure.
# Lines are assumed to be separated by a single <br> (the standard on
# lyrics.wikia.com); segments are separated by two or more consecutive
# <br> occurrences. Spaces directly around a <br> are dropped.
def tree_structure(text):
    """Return ``text`` as a list of segments, each a list of line strings."""
    segment_marker = '<segmentborder>'
    line_marker = '<lineborder>'
    # Runs of <br> must be rewritten first; otherwise the single-<br> rule
    # below would consume them one by one and segment borders would be lost.
    encoded = re.sub('(( )*<br>( )*){2,}', segment_marker, text)
    encoded = re.sub('( )*<br>( )*', line_marker, encoded)
    # split on the normalized markers: segments first, then lines per segment
    return [segment.split(line_marker) for segment in encoded.split(segment_marker)]

# flattened tree structure, does not differentiate between segment and line border
def line_structure(lyric_tree):
    """Flatten a segment -> line tree into one list of lines.

    :param lyric_tree: iterable of segments, each an iterable of lines
        (e.g. the result of ``tree_structure``).
    :return: a new list holding every line of every segment, in order.

    A single-pass comprehension is used instead of repeatedly concatenating
    lists, which copied the accumulated result once per segment (quadratic
    in the number of lines). Segments no longer have to be ``list`` objects.
    """
    return [line for segment in lyric_tree for line in segment]

# flattened line structure: all tokens of all lines in one list
def token_structure(lyric_tree, tokenizer=word_tokenize):
    """Tokenize every line of ``lyric_tree`` and return all tokens in order.

    :param lyric_tree: segment -> line tree; it is flattened with
        ``line_structure`` before tokenizing.
    :param tokenizer: callable applied to each line; it must return an
        iterable of tokens.
    :return: one flat list with the tokens of all lines.
    """
    tokens = []
    for line in line_structure(lyric_tree):
        tokens.extend(tokenizer(line))
    return tokens

def extend_with_return(some_list, other_list):
    """Extend ``some_list`` in place with ``other_list`` and return it.

    ``list.extend`` itself returns ``None``; handing back the mutated list
    makes the call usable inside expressions such as a ``reduce`` step.
    """
    append_all = some_list.extend
    append_all(other_list)
    return some_list

def normalize_lyric(lyric):
    """Return ``lyric`` with HTML character references decoded, in lower case."""
    return html.unescape(lyric).lower()

In [55]:
def parse_document(document: list, parser):
    """Apply ``parser`` to every sentence and concatenate the results.

    :param document: iterable of sentences.
    :param parser: callable returning an iterable of items for one sentence.
    :return: flat list of all items, in sentence order.
    """
    return [item for sentence in document for item in parser(sentence)]

def doc_to_dict(doc: list, parser) -> dict:
    """Count how often each parsed item occurs in ``doc``.

    :param doc: iterable of sentences handed to ``parse_document``.
    :param parser: per-sentence parser forwarded to ``parse_document``;
        the items it yields must be hashable.
    :return: dictionary mapping each item to its number of occurrences.
    """
    counts = {}
    for rel in parse_document(doc, parser):
        counts[rel] = counts.get(rel, 0) + 1
    return counts

def document_to_dictionary(doc_atoms: list) -> dict:
    """Return a dictionary mapping each atom in ``doc_atoms`` to its count."""
    counts = {}
    for atom in doc_atoms:
        counts[atom] = counts.get(atom, 0) + 1
    return counts

def documents_to_dictionaries(documents: list, min_doc_freq=1, max_doc_freq=1., limit=None) -> list:
    """Turn tokenized documents into per-document count dictionaries.

    Atoms are filtered by document frequency, i.e. the number of documents
    an atom occurs in.

    :param documents: sized collection of documents, each an iterable of
        hashable atoms.
    :param min_doc_freq: lower document-frequency bound. An integer is an
        absolute document count, any other number is a fraction of the
        number of documents.
    :param max_doc_freq: upper document-frequency bound, interpreted like
        ``min_doc_freq``.
    :param limit: if given, keep only the ``limit`` most document-frequent
        atoms among those that passed the min/max bounds. Atoms tied with
        the last kept atom are kept as well, so slightly more than
        ``limit`` atoms may remain. ``None`` or a non-positive value
        disables this step.
    :return: list with one ``{atom: count}`` dictionary per document,
        restricted to the retained atoms.

    Progress is printed every 10000 documents.
    """
    n_doc = len(documents)
    dicts = []
    atom_doc_freq = {}
    for i, doc_atoms in enumerate(documents):
        counts = {}
        for atom in doc_atoms:
            counts[atom] = counts.get(atom, 0) + 1
        dicts.append(counts)
        # increase doc_freq for atoms: every distinct atom counts once per document
        for atom in counts:
            atom_doc_freq[atom] = atom_doc_freq.get(atom, 0) + 1
        if i % 10000 == 0:
            print('Progress: ' + str(round(i / n_doc * 100, 1)) + ' % of a total ' + str(n_doc))

    # from CountVectorizer: integers are absolute counts, other numbers are fractions
    max_doc_count = (max_doc_freq if isinstance(max_doc_freq, numbers.Integral) else max_doc_freq * n_doc)
    min_doc_count = (min_doc_freq if isinstance(min_doc_freq, numbers.Integral) else min_doc_freq * n_doc)
    kept_freq = {atom: freq for atom, freq in atom_doc_freq.items()
                 if min_doc_count <= freq <= max_doc_count}

    # The cut-off for `limit` is ranked over the atoms that survived the bounds above.
    # Ranking over all atoms would let atoms already removed by max_doc_freq use up
    # the limit and could push the threshold above every remaining atom.
    if limit is not None and 0 < limit <= len(kept_freq):
        threshold = sorted(kept_freq.values(), reverse=True)[limit - 1]
        kept_freq = {atom: freq for atom, freq in kept_freq.items() if freq >= threshold}

    return [{atom: count for atom, count in counts.items() if atom in kept_freq}
            for counts in dicts]

def filter_dict(d: dict, item_property) -> dict:
    """Return a new dictionary with the items of ``d`` accepted by ``item_property``.

    :param d: dictionary to filter; it is not modified.
    :param item_property: predicate called with each ``(key, value)`` tuple.
    :return: dictionary of the items for which the predicate is truthy.
    """
    return {item[0]: item[1] for item in d.items() if item_property(item)}

In [207]:
# Load the tab-separated track table and keep only the rows whose `langdetect`
# column equals 'en'. The filtered frame keeps the row labels assigned by read_csv.
mpd = pd.read_csv('../resources/recsys_challenge_2018/mpd_wasabi_aligned_langdetect.csv', sep='\t', encoding='utf-8')
mpd = mpd[mpd['langdetect'] == 'en']
print('Total song count:', len(mpd))
# last expression of the cell: show the final rows of the filtered frame
mpd.tail()

Total song count: 367229


Unnamed: 0.1,Unnamed: 0,spotify_track_uri,artist,title,urlSong,lyrics,langdetect
416116,416116,71zm5sWvDc3222XEvCvkXF,50 Cent,Rowdy Rowdy,http://lyrics.wikia.com/50_Cent:Rowdy_Rowdy,[50 Cent]<br>Yo LA niggaz are the rowdy niggaz...,en
416117,416117,6xGPZJmMpwLc6XcOGFygzd,50 Cent,The Enforcer,http://lyrics.wikia.com/50_Cent:The_Enforcer,Who wanna play with the enforcer?<br>Who wanna...,en
416118,416118,6bxvZDiuXOAwz5Fgb2a2Je,50 Cent,Wait Until Tonight,http://lyrics.wikia.com/50_Cent:Wait_Until_Ton...,If you think you&apos;re lonely now<br>Wait un...,en
416119,416119,6fGjvkhTXsV5II8ynoSSfG,50 Cent,When It Rains It Pours,http://lyrics.wikia.com/50_Cent:When_It_Rains_...,"Intro]<br>Yeah..<br>Its 50<br>One shot, One ki...",en
416120,416120,590LENpdOHTNGyYaytpy4S,50 Cent,You Should Be Here,http://lyrics.wikia.com/50_Cent:You_Should_Be_...,DJ Whoo Kid:Yea (?) Lloyd Banks 50 Cent G-Unit...,en


In [7]:
# Per song: normalize the raw lyric, parse it into the segment/line tree and
# flatten that tree into a plain list of lines.
all_lyrics = [line_structure(tree_structure(normalize_lyric(raw_lyric)))
              for raw_lyric in mpd['lyrics'].values]

In [336]:
# Tokenize every lyric line by line; each document becomes one flat token list.
# Stopword removal (e.g. with set(stopwords.words('english'))) or any further
# parsing of a tokenized line would go inside the comprehension below.
documents_as_atoms = []
n_lyrics = len(all_lyrics)
for i, lyric in enumerate(all_lyrics, start=1):
    if i % 1000 == 0:
        print('Progress:', round(i/n_lyrics*100, 1), '% of a total', n_lyrics)
    lyric_atoms = [token for line in lyric for token in word_tokenize(line)]
    documents_as_atoms.append(lyric_atoms)

Progress: 0.3 % of a total 367229
Progress: 0.5 % of a total 367229
Progress: 0.8 % of a total 367229
Progress: 1.1 % of a total 367229
Progress: 1.4 % of a total 367229
Progress: 1.6 % of a total 367229
Progress: 1.9 % of a total 367229
Progress: 2.2 % of a total 367229
Progress: 2.5 % of a total 367229
Progress: 2.7 % of a total 367229
Progress: 3.0 % of a total 367229
Progress: 3.3 % of a total 367229
Progress: 3.5 % of a total 367229
Progress: 3.8 % of a total 367229
Progress: 4.1 % of a total 367229
Progress: 4.4 % of a total 367229
Progress: 4.6 % of a total 367229
Progress: 4.9 % of a total 367229
Progress: 5.2 % of a total 367229
Progress: 5.4 % of a total 367229
Progress: 5.7 % of a total 367229
Progress: 6.0 % of a total 367229
Progress: 6.3 % of a total 367229
Progress: 6.5 % of a total 367229
Progress: 6.8 % of a total 367229
Progress: 7.1 % of a total 367229
Progress: 7.4 % of a total 367229
Progress: 7.6 % of a total 367229
Progress: 7.9 % of a total 367229
Progress: 8.2 

Progress: 64.5 % of a total 367229
Progress: 64.8 % of a total 367229
Progress: 65.1 % of a total 367229
Progress: 65.4 % of a total 367229
Progress: 65.6 % of a total 367229
Progress: 65.9 % of a total 367229
Progress: 66.2 % of a total 367229
Progress: 66.4 % of a total 367229
Progress: 66.7 % of a total 367229
Progress: 67.0 % of a total 367229
Progress: 67.3 % of a total 367229
Progress: 67.5 % of a total 367229
Progress: 67.8 % of a total 367229
Progress: 68.1 % of a total 367229
Progress: 68.3 % of a total 367229
Progress: 68.6 % of a total 367229
Progress: 68.9 % of a total 367229
Progress: 69.2 % of a total 367229
Progress: 69.4 % of a total 367229
Progress: 69.7 % of a total 367229
Progress: 70.0 % of a total 367229
Progress: 70.3 % of a total 367229
Progress: 70.5 % of a total 367229
Progress: 70.8 % of a total 367229
Progress: 71.1 % of a total 367229
Progress: 71.3 % of a total 367229
Progress: 71.6 % of a total 367229
Progress: 71.9 % of a total 367229
Progress: 72.2 % of 

In [151]:
# sanity check: number of token lists in documents_as_atoms (shown as the cell output)
len(documents_as_atoms)

367229

In [197]:
# parameters are comparable to those in sklearn's CountVectorizer
atom_dictionaries = documents_to_dictionaries(documents_as_atoms, min_doc_freq=2, max_doc_freq=0.95, limit=6000)

# no split is made here: the training set is simply all dictionaries
atom_dictionaries_train = atom_dictionaries

# turn the per-document {atom: count} dictionaries into a feature matrix
dvect = DictVectorizer()
X_train = dvect.fit_transform(atom_dictionaries_train)
print(X_train.shape)

Progress: 0.0 % of a total 367229
Progress: 2.7 % of a total 367229
Progress: 5.4 % of a total 367229
Progress: 8.2 % of a total 367229
Progress: 10.9 % of a total 367229
Progress: 13.6 % of a total 367229
Progress: 16.3 % of a total 367229
Progress: 19.1 % of a total 367229
Progress: 21.8 % of a total 367229
Progress: 24.5 % of a total 367229
Progress: 27.2 % of a total 367229
Progress: 30.0 % of a total 367229
Progress: 32.7 % of a total 367229
Progress: 35.4 % of a total 367229
Progress: 38.1 % of a total 367229
Progress: 40.8 % of a total 367229
Progress: 43.6 % of a total 367229
Progress: 46.3 % of a total 367229
Progress: 49.0 % of a total 367229
Progress: 51.7 % of a total 367229
Progress: 54.5 % of a total 367229
Progress: 57.2 % of a total 367229
Progress: 59.9 % of a total 367229
Progress: 62.6 % of a total 367229
Progress: 65.4 % of a total 367229
Progress: 68.1 % of a total 367229
Progress: 70.8 % of a total 367229
Progress: 73.5 % of a total 367229
Progress: 76.2 % of a to

In [203]:
from __future__ import print_function
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

# matrix dimensions (used in the fitting message) and number of words shown per topic
n_samples, n_features = X_train.shape
n_top_words = 20


def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted feature names of every topic of ``model``.

    :param model: object with a ``components_`` sequence holding one weight
        vector per topic; each vector must provide ``argsort()``.
    :param feature_names: sequence mapping a feature index to its name.
    :param n_top_words: number of names printed per topic.

    One line is printed per topic, followed by a single empty line.
    """
    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending; the reversed tail slice yields the indices of the
        # n_top_words largest weights, biggest first
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        top_words = " ".join(feature_names[i] for i in top_indices)
        print("Topic #%d: " % topic_idx + top_words)
    print()

    

# Fit an online LDA model on the term-count matrix and report timing,
# perplexity, log-likelihood score and the top words of every topic.
print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
lda = LatentDirichletAllocation(n_components=60, max_iter=100,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0,
                                n_jobs=1)
t0 = time()
# fit_transform returns the document-topic matrix (one row per document)
X_train_lda = lda.fit_transform(X_train)
print("done in %0.3fs." % (time() - t0))

print('\nPerplexity:', lda.perplexity(X_train))
print('Score     :', lda.score(X_train))

print("\nTopics in LDA model:")
# DictVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and removed
# in 1.2; prefer get_feature_names_out() and fall back only on old versions.
if hasattr(dvect, 'get_feature_names_out'):
    tf_feature_names = list(dvect.get_feature_names_out())
else:
    tf_feature_names = dvect.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Fitting LDA models with tf features, n_samples=367229 and n_features=6003...
done in 29917.949s.

Perplexity: 336.4649236484019
Score     : -558654639.4800696

Topics in LDA model:
Topic #0: ’ i s t m it you don re ll can my ve that ‘ know but d won me
Topic #1: dead head alive wake inside hate fucking face sick killing up tired waste brain enemy troubles awake wasted bury grave
Topic #2: up get the out it to and off on down now in make take this beat stop so put 'em
Topic #3: rock play hear music roll sound soul listen oooh loud rhythm guitar playing 'n bass louder crush lion ow hardest
Topic #4: be if could would should must or might rather free hero have used fine world meet careful by string honest
Topic #5: n't do ca wo no ai know i you but just stop what care say 'cause it if why get
Topic #6: heaven hope rise carry angel peace shall earth grace welcome angels faith above strong cross mother wings prayer thee father
Topic #7: i my 'm me and am know can in when see but not with so

In [204]:
# Scale the document-topic matrix and time the step. custom_scaler is applied to the
# transpose and the result is transposed back, so whatever it does per column of
# its input happens per row (document) of X_train_lda here.
t0 = time()
X_train_lda_scaled = custom_scaler(X_train_lda.T).T
print("done in %0.3fs." % (time() - t0))

done in 259.603s.


In [205]:
# feature scaling example
# Print the raw and the scaled topic vector of one document side by side;
# the index 22 is an arbitrary pick.
peek_document = 22
print(X_train_lda.shape)
print()
print(X_train_lda[peek_document])
print()
print(X_train_lda_scaled[peek_document])

(367229, 60)

[5.06585613e-05 5.06585613e-05 5.06585613e-05 5.06585613e-05
 5.06585613e-05 5.06585613e-05 5.06585613e-05 1.98706074e-01
 5.06585613e-05 5.06585613e-05 4.94756343e-02 5.06585613e-05
 5.47357133e-02 3.09017224e-03 5.06585613e-05 5.06585613e-05
 5.06585613e-05 9.31727519e-02 5.06585613e-05 5.06585613e-05
 5.06585613e-05 3.56382417e-02 3.31279043e-01 5.06585613e-05
 5.06585613e-05 5.06585613e-05 5.06585613e-05 5.06585613e-05
 5.06585613e-05 5.06585613e-05 5.06585613e-05 5.06585613e-05
 5.06585613e-05 5.06585613e-05 5.06899647e-02 5.06585613e-05
 5.06585613e-05 5.06585613e-05 5.06585613e-05 3.85559796e-02
 5.06585613e-05 5.06585613e-05 5.06585613e-05 5.06585613e-05
 5.06585613e-05 5.06585613e-05 5.06585613e-05 5.06585613e-05
 5.06585613e-05 5.06585613e-05 5.06585613e-05 5.06585613e-05
 5.06585613e-05 5.06585613e-05 5.06585613e-05 5.06585613e-05
 1.42123497e-01 5.06585613e-05 5.06585613e-05 5.06585613e-05]

[-1.         -1.         -1.         -1.         -1.         -1.
 -1.

In [210]:
# Map each track URI to its scaled topic vector: row i of X_train_lda_scaled is paired
# with the i-th URI of mpd. This assumes the matrix rows are still in mpd's row
# order — TODO confirm. A URI that occurs more than once keeps only its last row.
d = {}
uris = mpd['spotify_track_uri'].values
for i in range(len(mpd)):
    d[uris[i]] = X_train_lda_scaled[i]

In [228]:
def print_best_topic(model, feature_names, n_top_words, best_topic_id):
    """Print the top feature names of the single topic ``best_topic_id``.

    :param model: object with a ``components_`` sequence holding one weight
        vector per topic; each vector must provide ``argsort()``.
    :param feature_names: sequence mapping a feature index to its name.
    :param n_top_words: number of names to print.
    :param best_topic_id: index of the topic to print; nothing is printed
        when no topic has this index.
    """
    for topic_idx, topic in enumerate(model.components_):
        if topic_idx != best_topic_id:
            continue
        # indices of the n_top_words largest weights, biggest first
        ranked = topic.argsort()[:-n_top_words - 1:-1]
        print("Best Topic #%d: " % topic_idx
              + " ".join(feature_names[i] for i in ranked))

In [241]:
# topic modelling example
# Pick one track, find the topic with the largest scaled weight, print that topic's
# top words and display the parsed lyric for comparison.
uri = list(d.keys())[1969]
topics = d[uri]
best_topic = np.argmax(topics)
# print_best_topic is the helper defined in this notebook; `print_top` is not defined
# anywhere in it and would raise NameError on a fresh kernel.
print_best_topic(lda, tf_feature_names, 20, best_topic)
print()
# mpd still carries the integer row labels from read_csv, so the track has to be
# selected through its URI column. The last match is taken because d keeps the
# vector of the last row when a URI occurs more than once.
lyric_html = mpd.loc[mpd['spotify_track_uri'] == uri, 'lyrics'].iloc[-1]
tree_structure(normalize_lyric(lyric_html))

Topic #56: i to the and you that but so it me just a of this all n't for 's not ,



[['i admire your perseverance!',
  'everytime my back is turned your falling closer into my world,',
  "i've told you so many times you need to back down, need to back down.",
  'i told you you need to walk away.',
  'all the countless efforts i have made.',
  "one day i'll get you back, i swear i'll get you back.",
  'will this ever end or will time stand still forever?',
  'will this ever end or will time stand still forever?',
  "everytime my back is turned you're falling closer to my world.",
  "how can you sleep when you know what you're doing to me?",
  "everytime my back is turned you're falling closer to my world.",
  "how can you sleep when you know what you're doing to me?",
  'i told you you need to walk away',
  'fuck all the efforts that i have made.',
  "i've got you right where i want you.",
  "after all the years i've known you when did you decide to fuck this up?",
  "after all the years i've known you when did you decide to try and fuck this up?",
  'i still admire yo