In [15]:
import pandas as pd
import html
from functools import reduce
import re
import numpy as np
from nltk import word_tokenize

import spacy
import pronouncing
from textblob import TextBlob

from sklearn.preprocessing import StandardScaler
from keras.models import load_model

nlp = spacy.load('en_core_web_lg')

In [16]:
mpd_with_lyrics = pd.read_csv('resources/mpd to wasabi alignment/mpd_wasabi_aligned.csv', sep='\t', encoding='utf8')
mpd_with_lyrics = mpd_with_lyrics.drop(['Unnamed: 0'], axis=1)
mpd_with_lyrics.head()

Unnamed: 0,spotify_track_uri,artist,title,urlSong,lyrics
0,6Z32g3TxhI9KOEDxkF5whx,A Broken Silence,What Are We Waiting For (Life Is Wonderful),http://lyrics.wikia.com/A_Broken_Silence:What_...,(Cactus)<br>What are we waiting for? It&apos;s...
1,1fTNpl2mxqHVlLqRNbyDhR,A Day To Remember,"I'm Made of Wax, Larry, What Are You Made Of?",http://lyrics.wikia.com/A_Day_To_Remember:I%27...,"Don&apos;t blink, they won&apos;t even miss yo..."
2,7BQk0o7TxM3WRFTPCuA4e4,A Fine Frenzy,Almost Lover (Live),http://lyrics.wikia.com/A_Fine_Frenzy:Almost_L...,Your fingertips across my skin<br>The palm tre...
3,33VihH9UNQMxiQS4wcPIKL,A Flock Of Seagulls,I Ran,http://lyrics.wikia.com/A_Flock_Of_Seagulls:I_...,I walk along the avenue<br>I never thought I&a...
4,5VNW7zhvsqo5UD0kUiRTYr,A Flock Of Seagulls,I Ran (So Far Away) (Re-Recorded / Remastered),http://lyrics.wikia.com/A_Flock_Of_Seagulls:I_...,I walk along the avenue<br>I never thought I&a...


In [17]:
print('aligned indices    :', len(mpd_with_lyrics))
print('unique Spotify URIs:', len(set(mpd_with_lyrics['spotify_track_uri'])))
print('unique lyrics      :', len(set(mpd_with_lyrics['lyrics'])))

aligned indices    : 416121
unique Spotify URIs: 416121
unique lyrics      : 358334


In [18]:
all_lyrics = mpd_with_lyrics.head()['lyrics']
all_lyrics

0    (Cactus)<br>What are we waiting for? It&apos;s...
1    Don&apos;t blink, they won&apos;t even miss yo...
2    Your fingertips across my skin<br>The palm tre...
3    I walk along the avenue<br>I never thought I&a...
4    I walk along the avenue<br>I never thought I&a...
Name: lyrics, dtype: object

In [19]:
all_titles = mpd_with_lyrics.head()['title']
all_titles

0       What Are We Waiting For (Life Is Wonderful)
1     I'm Made of Wax, Larry, What Are You Made Of?
2                               Almost Lover (Live)
3                                             I Ran
4    I Ran (So Far Away) (Re-Recorded / Remastered)
Name: title, dtype: object

In [20]:
# parse lyrics to segment-line-structure, assuming lines are separated by line_border_indicator and
# segments are separated by multiple consecutive line_border_indicator occurences
# assuming line_border_indicator is <br> (standard in lyrics.wikia.com)
def tree_structure(text):
    #normalize segment border encoding
    segment_border_encoder = '<segmentborder>'
    line_border_encoder = '<lineborder>'
    tree_string = re.sub('(( )*<br>( )*){2,}', segment_border_encoder, text)
    tree_string = re.sub('( )*<br>( )*', line_border_encoder, tree_string)
    #parse tree_string
    segment_structure = tree_string.split(segment_border_encoder)
    tree_structure = list(map(lambda segment: segment.split(line_border_encoder), segment_structure))
    return tree_structure

#flattened tree structure, does not differentiate between segment and line border
def line_structure(lyric_tree):
    return reduce(lambda x, segment: x + segment, lyric_tree, [])

# flattened line_structure
def token_structure(lyric_tree, tokenizer=word_tokenize):
    return reduce(lambda x, line: extend_with_return(x, tokenizer(line)), line_structure(lyric_tree), [])

def extend_with_return(some_list, other_list):
    some_list.extend(other_list)
    return some_list

# normalizations we want to apply to all lyrics go here
def normalize_lyric(lyric):
    lyric = html.unescape(lyric)
    lyric = lyric.lower()
    return lyric

# Reduce a list of numbers to single number / feature (cf. np.average, np.std, ...)
def list_span(some_list):
    return min(some_list) / max(some_list) if max(some_list) > 0 else 1e-10

In [21]:
######################################
########Stylometric features##########
######################################

def type_token_ratio(lyric_tokens):
    return len(set(lyric_tokens)) / len(lyric_tokens)

def line_lengths_in_chars(lyric_lines):
    return list(map(len, lyric_lines))

def line_lengths_in_tokens(lyric_lines):
    return list(map(lambda line: len(word_tokenize(line)), lyric_lines))

def pos_tag_distribution(lyric_lines):
    # Look at https://spacy.io/api/annotation for a better description of each tag
    tags = ['ADJ', 'ADP', 'ADV', 'AUX', 'CONJ', 'CCONJ', 'DET', 'INTJ', 'NOUN', 
        'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X', 'SPACE']
    freq = dict()
    for tag in tags:
        freq[tag] = 0
    for line in lyric_lines:
        doc = nlp(line)
        for word in doc:
            if word.pos_ in tags:
                freq[word.pos_] += 1
    wc = sum(line_lengths_in_tokens(lyric_lines))
    for key in freq:
        freq[key] /= wc
    return freq

def get_rhymes(lyric_lines):
    count = 0
    for i in range(len(lyric_lines)-1):
        words = lyric_lines[i].split()
        if len(words) < 1:
            continue
        rhymes = pronouncing.rhymes(words[-1])
        next_line_words = lyric_lines[i+1].split()
        if next_line_words is not None and len(next_line_words) > 0 and  next_line_words[-1] in rhymes:
            count += 1 
    return count / ( len(lyric_lines) if len(lyric_lines) > 0 else 1 )

def get_echoisms(lyric_lines):
    vowels = ['a', 'e', 'i', 'o', 'u']
    # Do echoism count on a word level
    echoism_count = 0
    for line in lyric_lines:
        doc = nlp(line)
        for i in range(len(doc) - 1):
            echoism_count += doc[i].text.lower() == doc[i+1].text.lower()
        # Count echoisms inside words e.g. yeeeeeeah
        for tk in doc:     
            for i in range(len(tk.text) - 1):
                if tk.text[i] == tk.text[i+1] and tk.text in vowels:
                    echoism_count += 1
                    break
    return echoism_count / sum(line_lengths_in_tokens(lyric_lines))

def is_title_in_lyrics(title, lyric_lines):
    for line in lyric_lines:
        if title in line:
            return True
    return False

def count_duplicate_lines(lyric_lines):
    wc = sum(line_lengths_in_tokens(lyric_lines))
    wc = wc if wc > 0 else 1
    return sum([lyric_lines.count(x) for x in list(set(lyric_lines)) if lyric_lines.count(x) > 1]) / wc

In [22]:
##################################
########Segment features##########
##################################

#The indices of lines that end a segment
def segment_borders(lyric_tree):
    segment_lengths = reduce(lambda x, block: x + [len(block)], lyric_tree, [])
    segment_indices = []
    running_sum = -1
    for i in range(len(segment_lengths)):
        running_sum += segment_lengths[i]
        segment_indices.append(running_sum)
    return segment_indices[:-1]

# lengths of the segments
def segment_lengths(lyric_tree):
    return reduce(lambda x, block: x + [len(block)], lyric_tree, [])

In [23]:
##################################
########Orientation feature#######
##################################
def get_verb_tense_frequencies(lyric_lines):
    freq = dict()
    freq['present'] = 0
    freq['future'] = 0
    freq['past'] = 0
    verbs_no = 0

    for line in lyric_lines:
        doc = nlp(line)
        for i in range(len(doc)):
            token = doc[i]
            if token.pos_ == 'VERB' and token.tag_ != 'MD': 
                verbs_no += 1
                if 'present' in spacy.explain(token.tag_):
                    freq['present'] += 1
                elif 'past' in spacy.explain(token.tag_):
                    freq['past'] += 1 
            elif token.pos_ == 'VERB' and token.tag_ == 'MD' and token.text.lower() == 'will':
                if i < len(doc) - 1:
                    i += 1
                    next_token = doc[i]
                    if next_token is not None and next_token.text == 'VB':
                        verbs_no += 1
                        freq['future'] += 1

    if verbs_no > 0:
        for key, value in freq.items():
            freq[key] = value/verbs_no

    return freq

def get_polarity_and_subjectivity(lyric_lines):
    text = '\n'.join(lyric_lines)
    opinion = TextBlob(text)
    sentiment = opinion.sentiment
    return (sentiment.polarity, sentiment.subjectivity)

In [36]:
##################################
########Emotion features##########
##################################

import emoclassify as clf

def get_emotion_vector(lyric, title, modelpath='emodetect.h5'):
    return clf.classify(0, '', title, lyric_content = html.unescape(lyric).replace('<br>', '\n'))

In [37]:
def representations_from(lyric):
    """Compute different representations of lyric: tree (with paragraphs), lines, tokens"""
    lyric_tree = tree_structure(lyric)
    lyric_lines = line_structure(lyric_tree)
    lyric_tokens = token_structure(lyric_tree)
    return lyric_tree, lyric_lines, lyric_tokens

def feat_vect_from(feature_list):
    """Assuming a list of features of the lyric"""
    feat_vect = []
    feat_vect.append(np.median(feature_list))
    feat_vect.append(np.std(feature_list))
    feat_vect.append(list_span(feature_list))
    return feat_vect

def extend_feat_vect(feat_vect, feature_list):
    feat_vect.extend(feat_vect_from(feature_list))
    return feat_vect

def feature_vector_from(lyric, title):
    lyric_tree, lyric_lines, lyric_tokens = representations_from(lyric)
    
    # lump everything in a single feature vector
    feat_vect = []
    
    # segmentation features
    feat_vect = extend_feat_vect(feat_vect, segment_lengths(lyric_tree))
    
    # stylometric features
    ln_lengths_chars = line_lengths_in_chars(lyric_lines)
    feat_vect = extend_feat_vect(feat_vect, ln_lengths_chars)
    feat_vect = extend_feat_vect(feat_vect, line_lengths_in_tokens(lyric_lines))
    feat_vect = extend_feat_vect(feat_vect, list(pos_tag_distribution(lyric_lines).values()))
    feat_vect = extend_feat_vect(feat_vect, [get_rhymes(lyric_lines)])
    feat_vect = extend_feat_vect(feat_vect, [get_echoisms(lyric_lines)])
    
    # orientation features
    feat_vect = extend_feat_vect(feat_vect, list(get_verb_tense_frequencies(lyric_lines).values()))
    feat_vect = extend_feat_vect(feat_vect, get_polarity_and_subjectivity(lyric_lines))
    
    # emotion features
    emo = get_emotion_vector(lyric, title)
    feat_vect = extend_feat_vect(feat_vect, get_emotion_vector(lyric, title).reshape(4))
    
    feat_vect.append(len(ln_lengths_chars))
    feat_vect.append(type_token_ratio(lyric_tokens))
    return feat_vect

def feature_vectors_from(many_lyrics: list, many_titles: list) -> np.ndarray:
    many_count = len(many_lyrics)
    first_feat_vect = feature_vector_from(many_lyrics[0], many_titles[0])
    feat_vects = np.empty((many_count, len(first_feat_vect)), dtype=object)
    feat_vects[0] = first_feat_vect
    for i in range(1, many_count):
        feat_vects[i] = feature_vector_from(many_lyrics[i], many_titles[i])
    return feat_vects

def min_max_scaler(elems: list) -> list:
    min_elem = min(elems)
    max_elem = max(elems)
    min_max_range = max_elem - min_elem
    if not min_max_range:
        min_max_range = 1
    return list(map(lambda x: (x - min_elem) / min_max_range, elems))

def apply_to_columns(f, matrix: np.ndarray) -> np.ndarray:
    """Apply a function f to each column of the matrix"""
    f_matrix = np.empty((matrix.shape[0], matrix.shape[1]))
    for j in range(matrix.shape[1]):
        f_matrix[:, j] = f(matrix[:, j])
    return f_matrix

In [38]:
matrix = feature_vectors_from(all_lyrics, all_titles)
print(matrix)
print()

scaled_matrix = apply_to_columns(min_max_scaler, matrix)
scaled_matrix

[[4.0 0.44221663871405326 0.6 47.0 18.434890911053575 0.11594202898550725
  9.0 4.632242725574456 0.125 0.0136986301369863 0.06072027577139217 0.0
  0.03278688524590164 0.0 1.0 0.0 0.0 1e-10 0.1926605504587156
  0.21191579375009684 0.0 0.4233585858585859 0.19426767676767678
  0.37092157985117347 0.21151292 0.120185494 0.2839179 61
  0.3167808219178082]
 [4.0 1.8652854850741754 0.14285714285714285 29.0 14.56410855583545
  0.12698412698412698 9.5 4.621622396850757 0.16666666666666666
  0.022530329289428077 0.042523834986257364 0.0 0.0 0.0 1e-10 0.0 0.0
  1e-10 0.19318181818181818 0.15773229404285616 0.0 0.09881613756613754
  0.24393518518518514 -0.42339456622414334 0.17179891 0.18457119
  0.17931825 58 0.2045060658578856]
 [3.0 2.638181191654584 0.25 29.0 9.160509295642658 0.13636363636363635
  6.5 3.186819099387699 0.07142857142857142 0.02865329512893983
  0.05108768717321954 0.0 0.0 0.0 1e-10 0.0 0.0 1e-10 0.2631578947368421
  0.16100467716884448 0.0 0.18452380952380953 0.1875
  -0.007

array([[1.        , 0.        , 1.        , 1.        , 1.        ,
        0.        , 0.83333333, 1.        , 0.22674419, 0.        ,
        1.        , 0.        , 0.15300546, 0.        , 1.        ,
        0.        , 0.        , 0.        , 0.        , 0.85912624,
        0.        , 1.        , 0.08166582, 1.        , 1.        ,
        0.        , 1.        , 1.        , 0.66742332],
       [1.        , 0.64803817, 0.        , 0.        , 0.64509264,
        0.03605228, 1.        , 0.99494465, 0.40310078, 0.59056483,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.0022096 , 0.        ,
        0.        , 0.        , 0.68100559, 0.        , 0.50311154,
        0.37472895, 0.62481409, 0.90909091, 0.        ],
       [0.        , 1.        , 0.234375  , 0.        , 0.14964315,
        0.06667623, 0.        , 0.31196767, 0.        , 1.        ,
        0.47063337, 0.        , 0.        , 0.        , 0.        ,
  