In [1]:
import pandas as pd
import html
from functools import reduce
import re
import numpy as np
from nltk import word_tokenize

In [18]:
# Read the tab-separated, UTF-8 encoded alignment table and discard the
# 'Unnamed: 0' column (presumably a row index written on export - TODO confirm).
mpd_with_lyrics = (
    pd.read_csv(
        'resources/mpd to wasabi alignment/mpd_wasabi_aligned_uri_at_regex.csv',
        sep='\t',
        encoding='utf8',
    )
    .drop(columns=['Unnamed: 0'])
)
mpd_with_lyrics.head()

Unnamed: 0,spotify_track_uri,artist,title,urlSong,lyrics
0,6Z32g3TxhI9KOEDxkF5whx,A Broken Silence,What Are We Waiting For (Life Is Wonderful),http://lyrics.wikia.com/A_Broken_Silence:What_...,(Cactus)<br>What are we waiting for? It&apos;s...
1,1fTNpl2mxqHVlLqRNbyDhR,A Day To Remember,"I'm Made of Wax, Larry, What Are You Made Of?",http://lyrics.wikia.com/A_Day_To_Remember:I%27...,"Don&apos;t blink, they won&apos;t even miss yo..."
2,7BQk0o7TxM3WRFTPCuA4e4,A Fine Frenzy,Almost Lover (Live),http://lyrics.wikia.com/A_Fine_Frenzy:Almost_L...,Your fingertips across my skin<br>The palm tre...
3,33VihH9UNQMxiQS4wcPIKL,A Flock Of Seagulls,I Ran,http://lyrics.wikia.com/A_Flock_Of_Seagulls:I_...,I walk along the avenue<br>I never thought I&a...
4,5VNW7zhvsqo5UD0kUiRTYr,A Flock Of Seagulls,I Ran (So Far Away) (Re-Recorded / Remastered),http://lyrics.wikia.com/A_Flock_Of_Seagulls:I_...,I walk along the avenue<br>I never thought I&a...


In [24]:
# Row count of the aligned table, followed by the number of distinct values
# in the URI column and in the lyrics column.
print('aligned indices    :', len(mpd_with_lyrics))
for label, column in (
    ('unique Spotify URIs:', 'spotify_track_uri'),
    ('unique lyrics      :', 'lyrics'),
):
    print(label, len(set(mpd_with_lyrics[column])))

aligned indices    : 416121
unique Spotify URIs: 416121
unique lyrics      : 358334


In [23]:
# Read the tab-separated table of MPD entries without an alignment and
# report how many rows it holds.
mpd_unaligned = pd.read_csv(
    'resources/mpd to wasabi alignment/mpd_unaligned.csv',
    sep='\t',
    encoding='utf8',
)
print('Not aligned MPD indices:', mpd_unaligned.shape[0])

Not aligned MPD indices: 1846171


In [21]:
# NOTE: despite its name, all_lyrics only holds the lyrics of the first rows
# returned by head() (five by default) - a small sample to try the features on.
all_lyrics = mpd_with_lyrics['lyrics'].head()
all_lyrics

0    (Cactus)<br>What are we waiting for? It&apos;s...
1    Don&apos;t blink, they won&apos;t even miss yo...
2    Your fingertips across my skin<br>The palm tre...
3    I walk along the avenue<br>I never thought I&a...
4    I walk along the avenue<br>I never thought I&a...
Name: lyrics, dtype: object

In [8]:
def tree_structure(text, line_border_indicator='<br>'):
    """Parse a lyric string into a segment-line structure.

    Lines are separated by a single ``line_border_indicator``; segments are
    separated by two or more consecutive occurrences of it. Space characters
    directly before or after an indicator are discarded together with it.

    Args:
        text: the lyric as one string.
        line_border_indicator: literal marker that ends a line. Defaults to
            '<br>' (standard in lyrics.wikia.com).

    Returns:
        A list of segments, each segment being a list of line strings.
        Text without any indicator yields ``[[text]]``.

    Raises:
        ValueError: if ``line_border_indicator`` is empty.
    """
    if not line_border_indicator:
        raise ValueError('line_border_indicator must not be empty')
    # The marker is matched literally, optionally padded with spaces.
    border = ' *' + re.escape(line_border_indicator) + ' *'
    # Split directly on the borders instead of substituting placeholder
    # strings first, so lyrics that happen to contain a placeholder-like
    # substring are not split by accident.
    segments = re.split('(?:' + border + '){2,}', text)
    return [re.split(border, segment) for segment in segments]

#flattened tree structure, does not differentiate between segment and line border
def line_structure(lyric_tree):
    """Flatten a segment-line tree into one list of lines.

    Segment boundaries are dropped; the order of the lines is kept.
    """
    return [line for segment in lyric_tree for line in segment]

def extend_with_return(some_list, other_list):
    """Append all items of ``other_list`` to ``some_list`` in place.

    Returns the very same ``some_list`` object, so the call can be used
    as an expression (``list.extend`` itself returns ``None``).
    """
    some_list.extend(other_list)
    return some_list

# flattened line_structure
def token_structure(lyric_tree, tokenizer=word_tokenize):
    """Flatten a segment-line tree into one list of tokens.

    Every line of the flattened tree is passed to ``tokenizer`` and the
    resulting tokens are collected in line order.
    """
    tokens = []
    for line in line_structure(lyric_tree):
        tokens.extend(tokenizer(line))
    return tokens

def pretty_print_tree(text_tree):
    """Render a segment-line tree as indexed, human-readable text.

    Each non-blank line becomes
    ``<gap><block>.<line in block><gap><overall line><gap><text>`` followed by
    a newline; every non-empty block is followed by one extra newline.
    Empty blocks are skipped without consuming a block number. Lines that are
    blank after stripping are skipped without consuming a line number, but
    their block still counts and still emits its trailing newline.
    """
    gap = '    '
    newline = '\n'
    parts = []
    block_no = 0
    overall_no = 0
    for block in text_tree:
        if not block:
            continue
        # Strip every line first, then keep only the ones with content.
        visible = [text for text in (raw.strip() for raw in block) if text]
        for local_no, text in enumerate(visible):
            fields = ['', str(block_no) + '.' + str(local_no), str(overall_no + local_no), text]
            parts.append(gap.join(fields) + newline)
        overall_no += len(visible)
        block_no += 1
        parts.append(newline)
    return ''.join(parts)

# normalizations we want to apply to all lyrics go here
def normalize_lyric(lyric):
    """Decode HTML character references in ``lyric`` and lower-case the result."""
    return html.unescape(lyric).lower()

# Reduce a list of numbers to single number / feature (cf. np.average, np.std, ...)
def list_span(some_list):
    """Return the ratio of the smallest to the largest value of ``some_list``.

    Args:
        some_list: iterable of numbers.

    Returns:
        ``min / max`` as a float, or NaN when the ratio is undefined: the
        input is empty, or its largest value is 0 (for example the line
        lengths of an empty lyric). Returning NaN instead of raising keeps a
        single degenerate lyric from aborting a whole feature-extraction run.
    """
    values = list(some_list)
    if not values:
        return float('nan')
    smallest, largest = min(values), max(values)
    if largest == 0:
        # Division by zero: the ratio has no meaningful value.
        return float('nan')
    return smallest / largest

# TODO: consistency: difference between consecutive segments

In [9]:
######################################
########Stylometric features##########
######################################

def type_token_ratio(lyric_tokens):
    """Return the number of distinct tokens divided by the number of tokens.

    Args:
        lyric_tokens: sequence of hashable tokens.

    Returns:
        The ratio as a float, or NaN for an empty sequence (an empty lyric
        has no tokens, so the ratio is undefined and would otherwise raise
        ZeroDivisionError).
    """
    token_count = len(lyric_tokens)
    if token_count == 0:
        return float('nan')
    return len(set(lyric_tokens)) / token_count

def line_lengths_in_chars(lyric_lines):
    """Return the length of every line, in characters, in line order."""
    return [len(line) for line in lyric_lines]

def line_lengths_in_tokens(lyric_lines, tokenizer=None):
    """Return the number of tokens of every line, in line order.

    Args:
        lyric_lines: iterable of line strings.
        tokenizer: callable mapping a line to a sequence of tokens. When
            omitted, ``word_tokenize`` is used, so existing calls behave as
            before; passing the tokenizer given to ``token_structure`` keeps
            both token-based features on the same tokenization.

    Returns:
        A list with one token count per line.
    """
    if tokenizer is None:
        tokenizer = word_tokenize
    return [len(tokenizer(line)) for line in lyric_lines]

In [10]:
##################################
########Segment features##########
##################################

#The indices of lines that end a segment
def segment_borders(lyric_tree):
    """Return the overall line index of the last line of every segment but the last.

    Line indices are zero-based and count the lines of all segments in order.
    """
    borders = []
    last_line_index = -1
    for block in lyric_tree:
        last_line_index += len(block)
        borders.append(last_line_index)
    # The end of the final segment is the end of the lyric, not a border.
    return borders[:-1]

########################################
######Here are the actual features######
########################################
# lengths of the segments
def segment_lengths(lyric_tree):
    """Return the number of lines of every segment, in segment order."""
    return [len(block) for block in lyric_tree]

In [22]:
def _print_length_stats(name, lengths, unit=None):
    """Print median, standard deviation and span of ``lengths``.

    ``name`` prefixes every label; ``unit``, when given, is appended in
    square brackets. The central value is computed with ``np.median`` and is
    labelled accordingly.
    """
    suffix = ' [' + unit + ']:' if unit else ':'
    print(name + ' median' + suffix, np.median(lengths))
    print(name + ' std.' + suffix, np.std(lengths))
    print(name + ' span' + suffix, list_span(lengths))


for lyric in all_lyrics:
    # See segments and lines of lyric (entities decoded, original casing kept)
    print(pretty_print_tree(tree_structure(html.unescape(lyric))))

    # compute different representations of the normalized lyric
    lyric = normalize_lyric(lyric)
    lyric_tree = tree_structure(lyric)
    lyric_lines = line_structure(lyric_tree)
    lyric_tokens = token_structure(lyric_tree)

    # segmentation features
    seg_lengths = segment_lengths(lyric_tree)
    print('segment count:', len(seg_lengths))
    _print_length_stats('segment length', seg_lengths)

    # stylometric features
    ln_lengths_chars = line_lengths_in_chars(lyric_lines)
    ln_lengths_tokens = line_lengths_in_tokens(lyric_lines)
    print('line count:', len(ln_lengths_chars))
    _print_length_stats('line length', ln_lengths_chars, unit='chars')
    _print_length_stats('line length', ln_lengths_tokens, unit='tokens')

    print('type-token ratio:', type_token_ratio(lyric_tokens))

    # visual gap between consecutive lyrics
    print('\n\n\n\n\n\n\n\n\n\n')

    0.0    0    (Cactus)
    0.1    1    What are we waiting for? It's only slipping away
    0.2    2    (Torcha)
    0.3    3    What can be done at all? My god, life is so wonderful

    1.0    4    (Cactus)
    1.1    5    What are we waiting for, it's only slipping away
    1.2    6    (Torcha)
    1.3    7    What can be done at all? My god life is so wonderful

    2.0    8    Some claiming it's insane
    2.1    9    To be driven for a change, should redirect aim
    2.2    10    (Everything stays the same)
    2.3    11    Living is great, fuck switching up lanes, perfect picture in my frame
    2.4    12    (So sic and so vein)

    3.0    13    Got many trying to cope
    3.1    14    We ain't throwing no ropes, spent our pennies on dope
    3.2    15    (Got plenty but we broke)
    3.3    16    So soles choked in confusion, overdosed in potent illusions

    4.0    17    Got people trying to climb that be teaching to the blind
    4.1    18    The defeated in the mind, rea

In [8]:
# TODO: normalize feature-wise