In [1]:
import pandas as pd
import html
from functools import reduce
import re
import numpy as np
from nltk import word_tokenize

In [2]:
# Example lyric in raw form: apostrophes are HTML-escaped (&apos;), lines are
# separated by <br> and stanzas by <br><br>.
# Source: http://lyrics.wikia.com/wiki/Eartha:Love_Jones
some_lyric = 'I&apos;ve got a love Jones for you Jesus<br>Something in my heart is going on<br>I&apos;ve got a love song for you Jesus<br>And I&apos;ll sing it all day long<br><br>It is so amazing the joy I feel inside<br>And it&apos;s fascinating just to have you in my life<br>Everyone keeps wondering what&apos;s wrong with me these days<br>Ain&apos;t got a clue about me and you; they just think I&apos;m in a daze<br><br>When my heart was broken, it was you who mended me<br>Bound it up with chords of love your tender remedy<br>If I could repay you I would anoint your feet <br>And this girl would tell the world how much you mean to me<br><br>I must be in love, you&apos;re all I think of <br>When I wake in the morning, <br>You&apos;re the only one that&apos;s on my mind <br>In the noonday Lord I feel your embrace <br><br>When the world surrounding me is so very cold and unkind<br>In the evening I hear you speaking to me <br>Through your holy Word calling me your beloved<br>I&apos;ve been bought with a price<br>Lord that&apos;s sure mighty nice that you love me so much<br><br>Everyday is jubilee because of what you&apos;ve done in me<br>Shown me how sweet life could be, you&apos;re the only remedy  <br>Everyday is a holiday since you came into my life to stay<br>Hear me Jesus when I say I never thought I&apos;d feel this way<br><br>I&apos;ve got nothin? but love for you Jesus<br>Got a love Jones for your love<br>Out of all the people in the world, <br>You&apos;re the only one that I&apos;m constantly thinking of.'

# Second example lyric, using the same raw encoding (&apos; entities, <br> separators).
# Source: http://lyrics.wikia.com/wiki/The_Rolling_Stones:Keys_To_Your_Love
other_lyric = 'Keys to your love<br>I&apos;ve got the keys to your love<br><br>No matter where I go baby, no matter what I do<br>I spend my whole life honey, just thinking of you<br>Yeah it don&apos;t seem to matter, who&apos;s right and who&apos;s wrong<br>I want to tell you how I feel now, in the words of this song<br><br>What now baby, put a spell on you<br>There ain&apos;t nothing that you&apos;re going to do<br><br>I&apos;ve got the keys to your love<br>I&apos;ve got the secret of your heart<br>No matter where you I go baby<br>We&apos;re not too far apart<br><br>I&apos;ve got the keys to your love<br>I&apos;ve got the secret of your heart<br>No matter where you I go honey<br>We&apos;re not too far apart<br><br>Now you try to lock me out, I can always get in<br>I&apos;ve got every permutation, every code and every pin<br>I&apos;m so crazy about you, yeah that&apos;s the word on the street<br>Yeah I bet you heard all about it from the people you meet<br><br>You know baby, I put a spell on you<br>Can&apos;t take it off now, there ain&apos;t nothing you can do<br><br>I&apos;ve got the keys to your love<br>I play the keys of your heart<br>It sounds so sweet baby<br>Won&apos;t stop once it starts<br><br>I&apos;ve got the keys to your love<br>I&apos;ve got the secret of your heart<br>No matter where you go baby<br>We&apos;re not too far apart<br><br>You&apos;re all that I&apos;ve got<br>Without you baby, my life&apos;s kind of shot<br>I look for your face<br>&apos;Cause no one else could fill this space<br><br>Watch out baby, I put a spell on you<br>You can&apos;t resist it, I just hoodoo you<br><br>I&apos;ve got the keys to your love<br>I&apos;ve got the secret of your heart<br>No matter where I go baby<br>We&apos;re not too far apart<br>No matter where I go baby<br>No matter where I go sugar<br><br>I&apos;ve got the keys to your love, baby<br>Keys to your love<br>Keys to your love<br>Keys to your love<br>Keys to your love<br>Keys to your love'

# List of all lyrics we want to compute features from; the feature loop
# iterates over this list in order.
all_lyrics = [some_lyric, other_lyric]

In [3]:
# Parse lyrics into a segment -> line structure. Lines are separated by one
# line_border_indicator; segments are separated by two or more consecutive
# occurrences of it. The default indicator is <br> (standard on lyrics.wikia.com).
def tree_structure(text, line_border_indicator='<br>'):
    """Split ``text`` into a list of segments, each a list of line strings.

    Args:
        text: The lyric as one string.
        line_border_indicator: Literal marker that separates lines. Two or
            more consecutive markers (optionally padded with spaces) separate
            segments. Defaults to ``'<br>'``.

    Returns:
        A list of lists of strings. Space characters directly adjacent to a
        marker are dropped; lines are otherwise left untouched, so empty
        lines (e.g. from a leading or trailing marker) are kept.

    Raises:
        ValueError: If ``line_border_indicator`` is empty.
    """
    if not line_border_indicator:
        raise ValueError('line_border_indicator must not be empty')
    # re.escape: the indicator is matched literally even if it contains regex
    # metacharacters.
    border = ' *' + re.escape(line_border_indicator) + ' *'
    # Split directly on the borders instead of substituting placeholder tokens
    # first, so lyric text that happens to contain such a token is not split.
    # The group is non-capturing because re.split would otherwise insert the
    # captured text into the result list.
    segments = re.split('(?:' + border + '){2,}', text)
    return [re.split(border, segment) for segment in segments]

# Flattened tree structure: one list of lines, with no distinction between
# segment borders and line borders.
def line_structure(lyric_tree):
    """Return all lines of ``lyric_tree`` (a list of line lists) as one new list, in order."""
    return [line for segment in lyric_tree for line in segment]

def extend_with_return(some_list, other_list):
    """Append every item of ``other_list`` to ``some_list`` in place and return ``some_list``.

    Unlike ``list.extend`` (which returns ``None``), the extended list itself
    is returned, so the call can be used as the step function of a fold.
    """
    # In-place concatenation on a list mutates and yields the same object.
    some_list += other_list
    return some_list

# Flattened line structure: all tokens of all lines in one list.
def token_structure(lyric_tree, tokenizer=word_tokenize):
    """Tokenize every line of ``lyric_tree`` and return the tokens as one flat list.

    Args:
        lyric_tree: Segment/line tree (list of lists of line strings).
        tokenizer: Callable applied to each line; it must return an iterable
            of tokens. Defaults to ``word_tokenize``.

    Returns:
        A new list holding the tokens of all lines, in line order.
    """
    tokens = []
    for line in line_structure(lyric_tree):
        tokens.extend(tokenizer(line))
    return tokens

def pretty_print_tree(text_tree):
    """Render a segment/line tree as indented, numbered text.

    Every printed line has the form
    ``<block index>.<line index within block>    <running line index>    <line>``.
    Each printed block is followed by one blank line.

    Lines are stripped before printing. Blank lines are skipped and do not
    consume an index. Blocks without any printable line (an empty block, or a
    block holding only blank lines) are skipped entirely, so they neither
    consume a block index nor emit a separator.

    Args:
        text_tree: List of blocks, each a list of line strings.

    Returns:
        The rendered text as a single string (empty for an empty tree).
    """
    space_between = '    '
    output_separator = '\n'
    pieces = []
    block_index = 0
    line_index = 0
    for block in text_tree:
        # Strip first and drop blank lines, so that "is there anything to
        # print in this block?" is decided before an index is handed out.
        visible_lines = [stripped for stripped in (line.strip() for line in block) if stripped]
        if not visible_lines:
            continue
        for line_in_block_index, line in enumerate(visible_lines):
            pieces.append(space_between + str(block_index) + '.' + str(line_in_block_index)
                          + space_between + str(line_index) + space_between + line + output_separator)
            line_index += 1
        pieces.append(output_separator)
        block_index += 1
    return ''.join(pieces)

# All normalizations that should be applied to every lyric live here.
def normalize_lyric(lyric):
    """Return ``lyric`` with HTML entities decoded and all characters lower-cased."""
    return html.unescape(lyric).lower()

# Reduce a list of numbers to single number / feature (cf. np.average, np.std, ...)
def list_span(some_list):
    """Return the ratio ``min / max`` of the given numbers.

    Args:
        some_list: Iterable of numbers (e.g. segment or line lengths).

    Returns:
        ``min(some_list) / max(some_list)``. If the ratio is undefined, that
        is, the input is empty or its maximum is 0 (e.g. only empty lines),
        NaN is returned instead of raising, so an empty lyric yields a missing
        feature value rather than aborting the feature computation.
    """
    # Materialize once so generators work and can be emptiness-checked.
    values = list(some_list)
    if not values:
        return float('nan')
    smallest, largest = min(values), max(values)
    if largest == 0:
        return float('nan')
    return smallest / largest

# TODO: consistency: difference between consecutive segments

In [4]:
######################################
########Stylometric features##########
######################################

def type_token_ratio(lyric_tokens):
    """Return the share of distinct tokens among all tokens.

    Args:
        lyric_tokens: Iterable of hashable tokens (e.g. strings).

    Returns:
        ``number of distinct tokens / number of tokens``, a value in (0, 1].
        For an empty token sequence the ratio is undefined and NaN is
        returned instead of raising ``ZeroDivisionError``.
    """
    tokens = list(lyric_tokens)
    if not tokens:
        return float('nan')
    return len(set(tokens)) / len(tokens)

def line_lengths_in_chars(lyric_lines):
    """Return the length of each line in ``lyric_lines`` as a list, in order."""
    return [len(line) for line in lyric_lines]

def line_lengths_in_tokens(lyric_lines, tokenizer=None):
    """Return the number of tokens of each line in ``lyric_lines``.

    Args:
        lyric_lines: Iterable of line strings.
        tokenizer: Optional callable mapping a line to a sized collection of
            tokens. When omitted, ``word_tokenize`` (imported at the top of
            the file) is used, as before. Pass the same tokenizer that was
            given to ``token_structure`` to keep both features consistent.

    Returns:
        A list with one token count per line, in order.
    """
    if tokenizer is None:
        # Resolved at call time so the default stays the file-level tokenizer.
        tokenizer = word_tokenize
    return [len(tokenizer(line)) for line in lyric_lines]

In [5]:
##################################
########Segment features##########
##################################

# Indices (into the flattened line list) of the lines that end a segment.
def segment_borders(lyric_tree):
    """Return the 0-based index of the last line of every segment except the final one.

    The indices refer to positions in the flattened line list. The final
    segment is left out, so a tree with zero or one segment yields ``[]``.
    """
    borders = []
    last_line_index = -1  # so that adding a segment's length lands on its last line
    for block in lyric_tree:
        last_line_index += len(block)
        borders.append(last_line_index)
    return borders[:-1]

########################################
######Here are the actual features######
########################################
# Lengths of the segments, measured in lines.
def segment_lengths(lyric_tree):
    """Return the number of lines of each segment in ``lyric_tree`` as a list, in order."""
    return [len(block) for block in lyric_tree]

In [6]:
# DEBUG: printing lyrics
# Given a text tree structure, print it nicely (see pretty_print_tree, defined in an earlier cell).




In [7]:
# Print the parsed structure and all features for every lyric.
# The central-tendency values are computed with np.median, so they are
# labelled "median" (they were previously labelled "avg.").
for lyric in all_lyrics:
    # See segments and lines of lyric (entities decoded, original casing kept)
    print(pretty_print_tree(tree_structure(html.unescape(lyric))))

    # compute different representations of lyric; a separate name is used so
    # the loop variable keeps referring to the raw lyric
    normalized_lyric = normalize_lyric(lyric)
    lyric_tree = tree_structure(normalized_lyric)
    lyric_lines = line_structure(lyric_tree)
    lyric_tokens = token_structure(lyric_tree)

    # segmentation features
    seg_lengths = segment_lengths(lyric_tree)
    #print('segment lengths [lines]:', seg_lengths)
    print('segment count:', len(seg_lengths))
    print('segment length median:', np.median(seg_lengths))
    print('segment length std.:', np.std(seg_lengths))
    print('segment length span:', list_span(seg_lengths))

    # stylometric features
    ln_lengths_chars = line_lengths_in_chars(lyric_lines)
    ln_lengths_tokens = line_lengths_in_tokens(lyric_lines)
    #print('line lengths [chars]:', ln_lengths_chars)
    print('line count:', len(ln_lengths_chars))
    print('line length median [chars]:', np.median(ln_lengths_chars))
    print('line length std. [chars]:', np.std(ln_lengths_chars))
    print('line length span [chars]:', list_span(ln_lengths_chars))

    #print('line lengths [tokens]:', ln_lengths_tokens)
    print('line length median [tokens]:', np.median(ln_lengths_tokens))
    print('line length std. [tokens]:', np.std(ln_lengths_tokens))
    print('line length span [tokens]:', list_span(ln_lengths_tokens))

    print('type-token ratio:', type_token_ratio(lyric_tokens))

    # visual gap between lyrics
    print('\n\n\n\n\n\n\n\n\n\n')

    0.0    0    I've got a love Jones for you Jesus
    0.1    1    Something in my heart is going on
    0.2    2    I've got a love song for you Jesus
    0.3    3    And I'll sing it all day long

    1.0    4    It is so amazing the joy I feel inside
    1.1    5    And it's fascinating just to have you in my life
    1.2    6    Everyone keeps wondering what's wrong with me these days
    1.3    7    Ain't got a clue about me and you; they just think I'm in a daze

    2.0    8    When my heart was broken, it was you who mended me
    2.1    9    Bound it up with chords of love your tender remedy
    2.2    10    If I could repay you I would anoint your feet
    2.3    11    And this girl would tell the world how much you mean to me

    3.0    12    I must be in love, you're all I think of
    3.1    13    When I wake in the morning,
    3.2    14    You're the only one that's on my mind
    3.3    15    In the noonday Lord I feel your embrace

    4.0    16    When the world sur

In [8]:
# TODO: normalize feature-wise