In [7]:
import pandas as pd
import html
from functools import reduce
import re
import numpy as np
from nltk import word_tokenize

In [9]:
mpd_with_lyrics = pd.read_csv('resources/mpd to wasabi alignment/mpd_wasabi_aligned.csv', sep='\t', encoding='utf8')
mpd_with_lyrics = mpd_with_lyrics.drop(['Unnamed: 0'], axis=1)
mpd_with_lyrics.head()

Unnamed: 0,spotify_track_uri,artist,title,urlSong,lyrics
0,6Z32g3TxhI9KOEDxkF5whx,A Broken Silence,What Are We Waiting For (Life Is Wonderful),http://lyrics.wikia.com/A_Broken_Silence:What_...,(Cactus)<br>What are we waiting for? It&apos;s...
1,1fTNpl2mxqHVlLqRNbyDhR,A Day To Remember,"I'm Made of Wax, Larry, What Are You Made Of?",http://lyrics.wikia.com/A_Day_To_Remember:I%27...,"Don&apos;t blink, they won&apos;t even miss yo..."
2,7BQk0o7TxM3WRFTPCuA4e4,A Fine Frenzy,Almost Lover (Live),http://lyrics.wikia.com/A_Fine_Frenzy:Almost_L...,Your fingertips across my skin<br>The palm tre...
3,33VihH9UNQMxiQS4wcPIKL,A Flock Of Seagulls,I Ran,http://lyrics.wikia.com/A_Flock_Of_Seagulls:I_...,I walk along the avenue<br>I never thought I&a...
4,5VNW7zhvsqo5UD0kUiRTYr,A Flock Of Seagulls,I Ran (So Far Away) (Re-Recorded / Remastered),http://lyrics.wikia.com/A_Flock_Of_Seagulls:I_...,I walk along the avenue<br>I never thought I&a...


In [10]:
print('aligned indices    :', len(mpd_with_lyrics))
print('unique Spotify URIs:', len(set(mpd_with_lyrics['spotify_track_uri'])))
print('unique lyrics      :', len(set(mpd_with_lyrics['lyrics'])))

aligned indices    : 416121
unique Spotify URIs: 416121
unique lyrics      : 358334


In [11]:
all_lyrics = mpd_with_lyrics.head()['lyrics']
all_lyrics

0    (Cactus)<br>What are we waiting for? It&apos;s...
1    Don&apos;t blink, they won&apos;t even miss yo...
2    Your fingertips across my skin<br>The palm tre...
3    I walk along the avenue<br>I never thought I&a...
4    I walk along the avenue<br>I never thought I&a...
Name: lyrics, dtype: object

In [12]:
# parse lyrics to segment-line-structure, assuming lines are separated by line_border_indicator and
# segments are separated by multiple consecutive line_border_indicator occurences
# assuming line_border_indicator is <br> (standard in lyrics.wikia.com)
def tree_structure(text):
    #normalize segment border encoding
    segment_border_encoder = '<segmentborder>'
    line_border_encoder = '<lineborder>'
    tree_string = re.sub('(( )*<br>( )*){2,}', segment_border_encoder, text)
    tree_string = re.sub('( )*<br>( )*', line_border_encoder, tree_string)
    #parse tree_string
    segment_structure = tree_string.split(segment_border_encoder)
    tree_structure = list(map(lambda segment: segment.split(line_border_encoder), segment_structure))
    return tree_structure

#flattened tree structure, does not differentiate between segment and line border
def line_structure(lyric_tree):
    return reduce(lambda x, segment: x + segment, lyric_tree, [])

# flattened line_structure
def token_structure(lyric_tree, tokenizer=word_tokenize):
    return reduce(lambda x, line: extend_with_return(x, tokenizer(line)), line_structure(lyric_tree), [])

def extend_with_return(some_list, other_list):
    some_list.extend(other_list)
    return some_list

# normalizations we want to apply to all lyrics go here
def normalize_lyric(lyric):
    lyric = html.unescape(lyric)
    lyric = lyric.lower()
    return lyric

# Reduce a list of numbers to single number / feature (cf. np.average, np.std, ...)
def list_span(some_list):
    return min(some_list) / max(some_list)

In [13]:
######################################
########Stylometric features##########
######################################

def type_token_ratio(lyric_tokens):
    return len(set(lyric_tokens)) / len(lyric_tokens)

def line_lengths_in_chars(lyric_lines):
    return list(map(len, lyric_lines))

def line_lengths_in_tokens(lyric_lines):
    return list(map(lambda line: len(word_tokenize(line)), lyric_lines))

In [14]:
##################################
########Segment features##########
##################################

#The indices of lines that end a segment
def segment_borders(lyric_tree):
    segment_lengths = reduce(lambda x, block: x + [len(block)], lyric_tree, [])
    segment_indices = []
    running_sum = -1
    for i in range(len(segment_lengths)):
        running_sum += segment_lengths[i]
        segment_indices.append(running_sum)
    return segment_indices[:-1]

# lengths of the segments
def segment_lengths(lyric_tree):
    return reduce(lambda x, block: x + [len(block)], lyric_tree, [])

In [49]:
def representations_from(lyric):
    """Compute different representations of lyric: tree (with paragraphs), lines, tokens"""
    lyric_tree = tree_structure(lyric)
    lyric_lines = line_structure(lyric_tree)
    lyric_tokens = token_structure(lyric_tree)
    return lyric_tree, lyric_lines, lyric_tokens

def feat_vect_from(feature_list):
    """Assuming a list of features of the lyric"""
    feat_vect = []
    feat_vect.append(np.median(feature_list))
    feat_vect.append(np.std(feature_list))
    feat_vect.append(list_span(feature_list))
    return feat_vect

def extend_feat_vect(feat_vect, feature_list):
    feat_vect.extend(feat_vect_from(feature_list))
    return feat_vect

def feature_vector_from(lyric):
    lyric_tree, lyric_lines, lyric_tokens = representations_from(lyric)
    
    # lump everything in a single feature vector
    feat_vect = []
    
    # segmentation features
    feat_vect = extend_feat_vect(feat_vect, segment_lengths(lyric_tree))
    
    # stylometric features
    feat_vect = extend_feat_vect(feat_vect, line_lengths_in_chars(lyric_lines))
    feat_vect = extend_feat_vect(feat_vect, line_lengths_in_tokens(lyric_lines))
    
    feat_vect.append(len(ln_lengths_chars))
    feat_vect.append(type_token_ratio(lyric_tokens))
    return feat_vect

def feature_vectors_from(many_lyrics: list) -> np.ndarray:
    many_count = len(many_lyrics)
    first_feat_vect = feature_vector_from(many_lyrics[0])
    feat_vects = np.empty((many_count, len(first_feat_vect)), dtype=object)
    feat_vects[0] = first_feat_vect
    for i in range(1, many_count):
        feat_vects[i] = feature_vector_from(many_lyrics[i])
    return feat_vects

def min_max_scaler(elems: list) -> list:
    min_elem = min(elems)
    max_elem = max(elems)
    min_max_range = max_elem - min_elem
    if not min_max_range:
        min_max_range = 1
    return list(map(lambda x: (x - min_elem) / min_max_range, elems))

def apply_to_columns(f, matrix: np.ndarray) -> np.ndarray:
    """Apply a function f to each column of the matrix"""
    f_matrix = np.empty((matrix.shape[0], matrix.shape[1]))
    for j in range(matrix.shape[1]):
        f_matrix[:, j] = f(matrix[:, j])
    return f_matrix

In [56]:
matrix = feature_vectors_from(all_lyrics)
print(matrix)
print()

scaled_matrix = apply_to_columns(min_max_scaler, matrix)
scaled_matrix

[[4.0 0.44221663871405326 0.6 47.0 18.434890911053575 0.11594202898550725
  9.0 4.6322427255744563 0.125 28 0.3167808219178082]
 [4.0 1.8652854850741754 0.14285714285714285 29.0 14.56410855583545
  0.12698412698412698 9.5 4.621622396850757 0.16666666666666666 28
  0.2045060658578856]
 [3.0 2.6381811916545841 0.25 29.0 9.1605092956426581 0.13636363636363635
  6.5 3.1868190993876988 0.07142857142857142 28 0.3495702005730659]
 [3.0 1.7728105208558367 0.3333333333333333 30.0 7.5284325007512249
  0.4222222222222222 8.0 2.5314350209527641 0.3076923076923077 28
  0.37272727272727274]
 [3.0 1.7728105208558367 0.3333333333333333 30.0 7.5284325007512249
  0.4222222222222222 8.0 2.5314350209527641 0.3076923076923077 28
  0.37272727272727274]]



array([[ 1.        ,  0.        ,  1.        ,  1.        ,  1.        ,
         0.        ,  0.83333333,  1.        ,  0.22674419,  0.        ,
         0.66742332],
       [ 1.        ,  0.64803817,  0.        ,  0.        ,  0.64509264,
         0.03605228,  1.        ,  0.99494465,  0.40310078,  0.        ,
         0.        ],
       [ 0.        ,  1.        ,  0.234375  ,  0.        ,  0.14964315,
         0.06667623,  0.        ,  0.31196767,  0.        ,  0.        ,
         0.86234154],
       [ 0.        ,  0.60592685,  0.41666667,  0.05555556,  0.        ,
         1.        ,  0.5       ,  0.        ,  1.        ,  0.        ,
         1.        ],
       [ 0.        ,  0.60592685,  0.41666667,  0.05555556,  0.        ,
         1.        ,  0.5       ,  0.        ,  1.        ,  0.        ,
         1.        ]])