In [70]:
import sys
sys.path.append('/Users/mic.fell/Documents/venvs/jupyter/lib/python3.6/site-packages')

import pandas as pd
import html
from functools import reduce
import re
import numpy as np
from nltk import word_tokenize

import spacy
import pronouncing
from textblob import TextBlob

from sklearn.preprocessing import StandardScaler
from keras.models import load_model

import _pickle
from sklearn import preprocessing

# Load the spaCy 'en_core_web_lg' pipeline once at import time; the spaCy-based
# feature functions in this notebook call this shared module-level `nlp` object.
nlp = spacy.load('en_core_web_lg')

In [2]:
# Load the track/lyrics table (tab-separated, UTF-8 encoded).
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
mpd_with_lyrics = pd.read_csv('/Users/mic.fell/Documents/Jupyter Orbit/resources/recsys_challenge_2018/mpd_wasabi_aligned_langdetect.csv', sep='\t', encoding='utf8')
# Drop the 'Unnamed: 0' column (presumably a row index that was saved into the file -- TODO confirm).
mpd_with_lyrics = mpd_with_lyrics.drop(['Unnamed: 0'], axis=1)
mpd_with_lyrics.head()

Unnamed: 0,spotify_track_uri,artist,title,urlSong,lyrics,langdetect
0,6Z32g3TxhI9KOEDxkF5whx,A Broken Silence,What Are We Waiting For (Life Is Wonderful),http://lyrics.wikia.com/A_Broken_Silence:What_...,(Cactus)<br>What are we waiting for? It&apos;s...,en
1,1fTNpl2mxqHVlLqRNbyDhR,A Day To Remember,"I'm Made of Wax, Larry, What Are You Made Of?",http://lyrics.wikia.com/A_Day_To_Remember:I%27...,"Don&apos;t blink, they won&apos;t even miss yo...",en
2,7BQk0o7TxM3WRFTPCuA4e4,A Fine Frenzy,Almost Lover (Live),http://lyrics.wikia.com/A_Fine_Frenzy:Almost_L...,Your fingertips across my skin<br>The palm tre...,en
3,33VihH9UNQMxiQS4wcPIKL,A Flock Of Seagulls,I Ran,http://lyrics.wikia.com/A_Flock_Of_Seagulls:I_...,I walk along the avenue<br>I never thought I&a...,en
4,5VNW7zhvsqo5UD0kUiRTYr,A Flock Of Seagulls,I Ran (So Far Away) (Re-Recorded / Remastered),http://lyrics.wikia.com/A_Flock_Of_Seagulls:I_...,I walk along the avenue<br>I never thought I&a...,en


In [171]:
# Load a previously pickled feature table via read_file.
# NOTE(review): read_file is defined in a later cell of this notebook, so on a fresh
# kernel this cell raises NameError unless that cell has been executed first.
features = read_file('/Users/mic.fell/Documents/Jupyter Orbit/resources/recsys_challenge_2018/mpd_ids_english_features.pickle')
features.head()

Unnamed: 0,spotify_track_uri,feature_vector
0,6Z32g3TxhI9KOEDxkF5whx,"[4.0, 0.442216638714, 1.6666666666666667, 0.31..."
1,1fTNpl2mxqHVlLqRNbyDhR,"[4.0, 1.86528548507, 7.0, 0.2045060658578856, ..."
2,7BQk0o7TxM3WRFTPCuA4e4,"[3.0, 2.63818119165, 4.0, 0.3495702005730659, ..."
3,33VihH9UNQMxiQS4wcPIKL,"[3.0, 1.77281052086, 3.0, 0.37272727272727274,..."
4,5VNW7zhvsqo5UD0kUiRTYr,"[3.0, 1.77281052086, 3.0, 0.37272727272727274,..."


In [3]:
# Use only English lyrics: keep the rows whose 'langdetect' value is 'en'.
# NOTE(review): this rebinds mpd_with_lyrics, so every later cell sees the filtered frame only.
mpd_with_lyrics = mpd_with_lyrics[mpd_with_lyrics['langdetect'] == 'en']

In [4]:
# Compare the row count with the number of distinct track URIs and distinct lyric
# strings; fewer distinct lyrics than rows means several tracks share the same lyric text.
print('aligned indices    :', len(mpd_with_lyrics))
print('unique Spotify URIs:', len(set(mpd_with_lyrics['spotify_track_uri'])))
print('unique lyrics      :', len(set(mpd_with_lyrics['lyrics'])))

aligned indices    : 367229
unique Spotify URIs: 367229
unique lyrics      : 314383


In [172]:
def read_file(name):
    """Unpickle and return the object stored in the file at path ``name``.

    The file is opened in binary mode and closed again before returning.
    Unpickling can execute arbitrary code, so only load files from a
    trusted source.
    """
    with open(name, 'rb') as source:
        return _pickle.load(source)

def write_file(content, name):
    """Pickle ``content`` into the file at path ``name``.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten.
    """
    with open(name, 'wb') as sink:
        _pickle.dump(content, sink)
        
def robust_min_max_scaler(matrix):
    """Scale ``matrix`` with a RobustScaler followed by a MinMaxScaler.

    Both scalers are freshly constructed with default settings and fitted on
    the data they transform; the result of the second transform is returned.
    """
    robust = preprocessing.RobustScaler()
    min_max = preprocessing.MinMaxScaler()
    return min_max.fit_transform(robust.fit_transform(matrix))

In [6]:
# parse lyrics to segment-line-structure, assuming lines are separated by line_border_indicator and
# segments are separated by multiple consecutive line_border_indicator occurrences
# assuming line_border_indicator is <br> (standard in lyrics.wikia.com)
def tree_structure(text):
    """Split raw lyric text into a list of segments, each a list of lines.

    A run of two or more ``<br>`` tags (optionally padded with spaces) ends a
    segment; a single ``<br>`` ends a line. Text without any ``<br>`` yields
    one segment holding one line.
    """
    segment_mark = '<segmentborder>'
    line_mark = '<lineborder>'
    # Segment borders are rewritten first so the single-<br> pass below
    # cannot break a multi-<br> run into separate line borders.
    with_segments = re.sub('(( )*<br>( )*){2,}', segment_mark, text)
    fully_marked = re.sub('( )*<br>( )*', line_mark, with_segments)
    return [segment.split(line_mark) for segment in fully_marked.split(segment_mark)]

#flattened tree structure, does not differentiate between segment and line border
def line_structure(lyric_tree):
    """Return all lines of ``lyric_tree`` as one new flat list, in order.

    ``lyric_tree`` is a list of segments, each a list of lines; the segment
    boundaries are discarded.
    """
    return [line for segment in lyric_tree for line in segment]

# flattened line_structure
def token_structure(lyric_tree, tokenizer=word_tokenize):
    """Return the tokens of every line of ``lyric_tree`` as one flat list.

    Lines are taken in the order given by ``line_structure`` and each one is
    passed to ``tokenizer`` (``word_tokenize`` by default).
    """
    tokens = []
    for line in line_structure(lyric_tree):
        tokens.extend(tokenizer(line))
    return tokens

def extend_with_return(some_list, other_list):
    """Append the items of ``other_list`` to ``some_list`` in place and return ``some_list``.

    Unlike ``list.extend`` this returns the extended list, which makes it
    usable inside expressions such as a ``reduce`` step.
    """
    some_list += other_list  # in-place for lists: the caller's object is extended
    return some_list

# normalizations we want to apply to all lyrics go here
def normalize_lyric(lyric):
    """Return ``lyric`` with HTML character references unescaped, then lower-cased."""
    return html.unescape(lyric).lower()

# Reduce a list of numbers to single number / feature (cf. np.average, np.std, ...)
def list_span(some_list):
    """Return the ratio ``max / min`` of ``some_list``.

    When the minimum is not greater than 0 the constant ``1e-10`` is returned
    instead of a ratio. An empty list raises ``ValueError`` (from ``min``).

    NOTE(review): the fallback yields a near-zero "span" rather than a large
    one; possibly ``max / 1e-10`` was intended -- confirm before changing,
    since it alters the feature values.
    """
    smallest = min(some_list)
    if smallest > 0:
        return max(some_list) / smallest
    return 1e-10

In [7]:
######################################
########Stylometric features##########
######################################

def type_token_ratio(lyric_tokens):
    """Return the number of distinct tokens divided by the total number of tokens.

    An empty token list (for example, from an empty lyric) yields ``0.0``
    instead of raising ``ZeroDivisionError``, so one empty lyric does not
    abort a whole feature-extraction run.
    """
    token_count = len(lyric_tokens)
    if token_count == 0:
        return 0.0
    return len(set(lyric_tokens)) / token_count

def line_lengths_in_chars(lyric_lines):
    """Return a list with the character length of each line, in order."""
    return [len(line) for line in lyric_lines]

def line_lengths_in_tokens(lyric_lines):
    """Return a list with the number of ``word_tokenize`` tokens of each line, in order."""
    return [len(word_tokenize(line)) for line in lyric_lines]

def pos_tag_distribution(lyric_lines):
    """Return a dict mapping each coarse POS tag to its relative frequency.

    Every line is parsed with the shared spaCy ``nlp`` pipeline and the
    ``pos_`` of each token is counted. Counts are divided by the total token
    count reported by ``line_lengths_in_tokens``. The returned dict always
    has the same keys in the same order (the order of ``tags``), so its
    ``.values()`` can be used as a fixed-length feature slice.

    When the token count is 0 (e.g. only empty lines) the divisor falls back
    to 1 instead of raising ``ZeroDivisionError``.

    NOTE(review): the counts come from spaCy tokens while the divisor comes
    from ``word_tokenize``; the two tokenizers may disagree, so the values
    need not sum to 1 -- confirm whether that is intended.
    """
    # Look at https://spacy.io/api/annotation for a better description of each tag
    tags = ['ADJ', 'ADP', 'ADV', 'AUX', 'CONJ', 'CCONJ', 'DET', 'INTJ', 'NOUN',
            'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X', 'SPACE']
    freq = dict.fromkeys(tags, 0)
    for line in lyric_lines:
        for word in nlp(line):
            # Tags outside the fixed list are ignored to keep the vector length stable.
            if word.pos_ in freq:
                freq[word.pos_] += 1
    wc = sum(line_lengths_in_tokens(lyric_lines))
    wc = wc if wc > 0 else 1  # same zero guard as count_duplicate_lines
    for key in freq:
        freq[key] /= wc
    return freq

def get_rhymes(lyric_lines):
    """Return the share of lines whose last word rhymes with the next line's last word.

    For every pair of consecutive lines, the last whitespace-separated word
    of the second line is looked up in ``pronouncing.rhymes`` of the last
    word of the first line. The number of matching pairs is divided by the
    total number of lines; an empty input yields 0.
    """
    total_lines = len(lyric_lines)
    if total_lines <= 0:
        return 0

    rhyming_pairs = 0
    for current_line, following_line in zip(lyric_lines, lyric_lines[1:]):
        current_words = current_line.split()
        if not current_words:
            # A line without words has nothing to rhyme with.
            continue
        candidates = pronouncing.rhymes(current_words[-1])
        following_words = following_line.split()
        if following_words and following_words[-1] in candidates:
            rhyming_pairs += 1
    return rhyming_pairs / total_lines

def get_echoisms(lyric_lines):
    """Return the number of echoisms per token in ``lyric_lines``.

    Two kinds of echoism are counted on the spaCy tokens of each line:

    * word level: a token that equals the following token, ignoring case;
    * in-word: a token containing the same vowel twice in a row
      (e.g. "yeeeeeeah"), counted at most once per token.

    The total is divided by the token count from ``line_lengths_in_tokens``;
    0 is returned when that count is 0 instead of raising
    ``ZeroDivisionError``.

    The in-word check previously tested whether the whole token was a single
    vowel, which can never hold for a token of two or more characters, so
    in-word echoisms were never counted. It now tests the repeated character.

    NOTE(review): ordinary doubled vowels ("see", "good") also count as
    in-word echoisms under this rule -- confirm this is acceptable.
    """
    vowels = ['a', 'e', 'i', 'o', 'u']
    echoism_count = 0
    for line in lyric_lines:
        doc = nlp(line)
        # Do echoism count on a word level
        for i in range(len(doc) - 1):
            echoism_count += doc[i].text.lower() == doc[i + 1].text.lower()
        # Count echoisms inside words e.g. yeeeeeeah
        for tk in doc:
            # Lower-case so upper-case vowels match too, as in the word-level check.
            text = tk.text.lower()
            if any(first == second and first in vowels
                   for first, second in zip(text, text[1:])):
                echoism_count += 1
    token_count = sum(line_lengths_in_tokens(lyric_lines))
    return echoism_count / token_count if token_count > 0 else 0

def is_title_in_lyrics(title, lyric_lines):
    """Return True if ``title`` occurs as a substring of at least one line, else False.

    The comparison is case-sensitive and stops at the first matching line.
    """
    return any(title in line for line in lyric_lines)

def count_duplicate_lines(lyric_lines):
    """Return the number of lines that occur more than once, divided by the token count.

    Every occurrence of a repeated line is counted (a line appearing three
    times contributes 3). The divisor is the token count from
    ``line_lengths_in_tokens``, replaced by 1 when it is 0.
    """
    token_total = max(sum(line_lengths_in_tokens(lyric_lines)), 1)
    repeated_occurrences = 0
    for distinct_line in set(lyric_lines):
        occurrences = lyric_lines.count(distinct_line)
        if occurrences > 1:
            repeated_occurrences += occurrences
    return repeated_occurrences / token_total

In [8]:
##################################
########Segment features##########
##################################

#The indices of lines that end a segment
def segment_borders(lyric_tree):
    """Return the flat, 0-based line index of the last line of every segment but the final one.

    ``lyric_tree`` is a list of segments, each a list of lines. The end of
    the last segment is not reported, so a tree with fewer than two segments
    yields an empty list.
    """
    borders = []
    last_line_index = -1  # so the first segment of length n ends at index n - 1
    for block in lyric_tree:
        last_line_index += len(block)
        borders.append(last_line_index)
    return borders[:-1]

# lengths of the segments
def segment_lengths(lyric_tree):
    """Return a list with the number of lines in each segment of ``lyric_tree``, in order."""
    return [len(block) for block in lyric_tree]

In [9]:
##################################
########Orientation feature#######
##################################
def get_verb_tense_frequencies(lyric_lines):
    """Return the relative frequency of present, future and past verbs.

    Each line is parsed with the shared spaCy ``nlp`` pipeline. For tokens
    whose ``pos_`` is 'VERB':

    * a non-modal verb is counted as 'present' or 'past' when the
      ``spacy.explain`` text of its fine-grained tag contains that word;
      other verbs only increase the verb total;
    * the modal "will" directly followed by a base-form verb (tag 'VB')
      counts as one 'future' verb.

    The result is a dict with the keys 'present', 'future', 'past' (in that
    order). Counts are divided by the number of verbs seen; when no verb was
    seen the zero counts are returned unchanged.

    Fixes over the earlier version:

    * the token after "will" is checked via ``tag_`` rather than ``text``
      (the text is never literally 'VB', so 'future' was always 0);
    * the verb consumed by a "will + VB" match is skipped, so it is not
      counted a second time in the verb total (incrementing the loop index
      inside a ``for`` loop did not skip it);
    * a tag unknown to ``spacy.explain`` (which then returns None) no longer
      raises ``TypeError``.

    NOTE(review): depending on the spaCy model version, modals/auxiliaries
    may carry ``pos_ == 'AUX'`` instead of 'VERB', in which case the "will"
    branch would not fire -- verify against the installed model.
    """
    freq = {'present': 0, 'future': 0, 'past': 0}
    verbs_no = 0

    for line in lyric_lines:
        doc = nlp(line)
        skip_next = False
        for i, token in enumerate(doc):
            if skip_next:
                # This token was already counted as the verb of a "will + VB" future.
                skip_next = False
                continue
            if token.pos_ != 'VERB':
                continue
            if token.tag_ != 'MD':
                verbs_no += 1
                description = spacy.explain(token.tag_) or ''
                if 'present' in description:
                    freq['present'] += 1
                elif 'past' in description:
                    freq['past'] += 1
            elif token.text.lower() == 'will' and i < len(doc) - 1:
                if doc[i + 1].tag_ == 'VB':
                    verbs_no += 1
                    freq['future'] += 1
                    skip_next = True

    if verbs_no > 0:
        for key, value in freq.items():
            freq[key] = value / verbs_no

    return freq

def get_polarity_and_subjectivity(lyric_lines):
    """Return ``(polarity, subjectivity)`` of the lyric as reported by TextBlob.

    The lines are joined with newlines into one text before analysis.
    """
    sentiment = TextBlob('\n'.join(lyric_lines)).sentiment
    return (sentiment.polarity, sentiment.subjectivity)

In [10]:
##################################
########Emotion features##########
##################################

#import emoclassify as clf
#
#def get_emotion_vector(lyric, title, modelpath='emodetect.h5'):
#    return clf.classify(0, '', title, lyric_content = html.unescape(lyric).replace('<br>', '\n'))

In [13]:
def representations_from(lyric):
    """Return the ``(tree, lines, tokens)`` representations of ``lyric``.

    The tree (segments of lines) comes from ``tree_structure``; the flat
    line list and the flat token list are both derived from that tree.
    """
    tree = tree_structure(lyric)
    return tree, line_structure(tree), token_structure(tree)

def feat_vect_from(feature_list):
    """Summarise a list of numbers as ``[median, standard deviation, span]``.

    Median and standard deviation come from numpy; the span comes from
    ``list_span``.
    """
    return [
        np.median(feature_list),
        np.std(feature_list),
        list_span(feature_list),
    ]

def extend_feat_vect(feat_vect, feature_list):
    """Append the ``feat_vect_from`` summary of ``feature_list`` to ``feat_vect``.

    ``feat_vect`` is extended in place and also returned.
    """
    feat_vect += feat_vect_from(feature_list)
    return feat_vect

def feature_vector_from(lyric, title):
    """Compute the flat feature vector (a list of 14 numbers) of one lyric.

    Layout, in order:
      * median / std / span of segment lengths (3)
      * type-token ratio (1)
      * line count (1)
      * median / std / span of line lengths in characters (3)
      * median / std / span of line lengths in tokens (3)
      * rhyme share (1)
      * polarity and subjectivity (2)

    NOTE(review): ``title`` is currently unused (only the disabled emotion
    features refer to it), and ``normalize_lyric`` is not applied here, so
    the lyric is processed exactly as passed in -- confirm that is intended.
    """
    lyric_tree, lyric_lines, lyric_tokens = representations_from(lyric)

    # segmentation features
    feat_vect = feat_vect_from(segment_lengths(lyric_tree))

    # stylometric features
    feat_vect.append(type_token_ratio(lyric_tokens))
    chars_per_line = line_lengths_in_chars(lyric_lines)
    feat_vect.append(len(chars_per_line))  # line count
    feat_vect += feat_vect_from(chars_per_line)
    feat_vect += feat_vect_from(line_lengths_in_tokens(lyric_lines))
    feat_vect.append(get_rhymes(lyric_lines))

    # orientation features
    feat_vect += get_polarity_and_subjectivity(lyric_lines)

    # emotion features (disabled)
    #emo = get_emotion_vector(lyric, title)
    #feat_vect.extend(get_emotion_vector(lyric, title))

    # features using expensive nlp(.) calls (disabled)
    #feat_vect.extend(pos_tag_distribution(lyric_lines).values())
    #feat_vect.extend(get_verb_tense_frequencies(lyric_lines).values())
    #feat_vect.append(get_echoisms(lyric_lines))

    return feat_vect

def feature_vectors_from(many_lyrics: list, many_titles: list) -> np.ndarray:
    """Compute the feature matrix for a collection of lyrics.

    Parameters
    ----------
    many_lyrics, many_titles
        Equally long sequences of lyric texts and their titles, paired by
        position. Plain lists and pandas Series both work; earlier only
        objects offering ``.iloc`` were accepted, despite the ``list``
        annotation.

    Returns
    -------
    np.ndarray
        Object-dtype array of shape ``(len(many_lyrics), n_features)`` whose
        row ``i`` is ``feature_vector_from`` of the i-th lyric/title pair.
        Empty input yields an empty ``(0, 0)`` array instead of raising.

    Progress is printed every 100 lyrics.
    """
    # Materialise both inputs so that access is positional whatever index they carry.
    lyrics = list(many_lyrics)
    titles = list(many_titles)
    many_count = len(lyrics)
    if many_count == 0:
        return np.empty((0, 0), dtype=object)

    # The first vector fixes the number of columns of the matrix.
    first_feat_vect = feature_vector_from(lyrics[0], titles[0])
    feat_vects = np.empty((many_count, len(first_feat_vect)), dtype=object)
    feat_vects[0] = first_feat_vect
    for i in range(1, many_count):
        feat_vects[i] = feature_vector_from(lyrics[i], titles[i])
        if i % 100 == 0:
            print('Progress:', i, 'of', many_count, '(' + str(round(i/many_count*100, 1)) + '%)')
    return feat_vects

In [174]:
# Define subset to compute features on
# (the first 10 rows by position; widen the slice to process more lyrics).
mpd_with_lyrics_subset = mpd_with_lyrics.iloc[:10]
all_uris = mpd_with_lyrics_subset['spotify_track_uri']
all_lyrics = mpd_with_lyrics_subset['lyrics']
all_titles = mpd_with_lyrics_subset['title']
# The three columns are paired by position further down, so they must be equally long.
assert len(all_uris) == len(all_lyrics) == len(all_titles)

In [177]:
# Compute one feature row per lyric of the subset and show the matrix shape (rows, features).
matrix = feature_vectors_from(all_lyrics, all_titles)
print(matrix.shape)

(10, 14)


In [179]:
#Put back into dataframe
# Map each track URI to its row of the feature matrix (paired by position).
# A URI occurring more than once would keep only its last row, because dict keys are unique.
fvec = {}
for row in range(matrix.shape[0]):
    fvec[all_uris.iloc[row]] = matrix[row, :]
    
# One DataFrame row per URI, with the whole feature vector stored in a single column.
xf = pd.DataFrame(list(fvec.items()), columns=['spotify_track_uri', 'feature_vector'])
xf.head()

Unnamed: 0,spotify_track_uri,feature_vector
0,6Z32g3TxhI9KOEDxkF5whx,"[4.0, 0.442216638714, 1.6666666666666667, 0.31..."
1,1fTNpl2mxqHVlLqRNbyDhR,"[4.0, 1.86528548507, 7.0, 0.2045060658578856, ..."
2,7BQk0o7TxM3WRFTPCuA4e4,"[3.0, 2.63818119165, 4.0, 0.3495702005730659, ..."
3,33VihH9UNQMxiQS4wcPIKL,"[3.0, 1.77281052086, 3.0, 0.37272727272727274,..."
4,5VNW7zhvsqo5UD0kUiRTYr,"[3.0, 1.77281052086, 3.0, 0.37272727272727274,..."


In [123]:
# Persist the URI -> feature-vector frame with write_file (pickle format).
# The keyword must be `name` to match write_file(content, name); `fl` raised TypeError.
# NOTE(review): the target keeps its '.csv' suffix although the payload is a pickle, and
# other cells read '...features.pickle' -- confirm the intended file name before relying on it.
write_file(content=xf, name='/Users/mic.fell/Documents/Jupyter Orbit/resources/recsys_challenge_2018/mpd_ids_english_features.csv')

In [167]:
# Reload the pickled feature table to inspect what is stored on disk.
# NOTE(review): this reads the '.pickle' file, whereas the write cell above targets a
# '.csv' path -- presumably the pickle was produced by an earlier run; confirm.
xx = read_file('/Users/mic.fell/Documents/Jupyter Orbit/resources/recsys_challenge_2018/mpd_ids_english_features.pickle')
xx

Unnamed: 0,spotify_track_uri,feature_vector
0,6Z32g3TxhI9KOEDxkF5whx,"[4.0, 0.442216638714, 1.6666666666666667, 0.31..."
1,1fTNpl2mxqHVlLqRNbyDhR,"[4.0, 1.86528548507, 7.0, 0.2045060658578856, ..."
2,7BQk0o7TxM3WRFTPCuA4e4,"[3.0, 2.63818119165, 4.0, 0.3495702005730659, ..."
3,33VihH9UNQMxiQS4wcPIKL,"[3.0, 1.77281052086, 3.0, 0.37272727272727274,..."
4,5VNW7zhvsqo5UD0kUiRTYr,"[3.0, 1.77281052086, 3.0, 0.37272727272727274,..."
5,5U2ieKAUXX052cndu0X39U,"[3.0, 1.77281052086, 3.0, 0.37272727272727274,..."
6,5Dee1RUQptEKvX5m9TNGGZ,"[3.0, 1.77281052086, 3.0, 0.37272727272727274,..."
7,7vJtcvKr3ujLUSatW3IBiE,"[3.0, 1.77281052086, 3.0, 0.37272727272727274,..."
8,2HsYqme4TK0EYKkrd6Mmue,"[3.0, 1.77281052086, 3.0, 0.37272727272727274,..."
9,6A3dh1AhCPasfBQexBaTgQ,"[4.0, 1.24721912892, 5.0, 0.24347826086956523,..."


In [169]:
# Spot-check the length of one stored feature vector.
# NOTE(review): position 567 requires at least 568 rows, more than the 10-row subset
# defined in this notebook -- presumably the pickle stems from a larger run; confirm.
xx.iloc[567].feature_vector.shape

(14,)

In [125]:
# Time the feature extraction on the current subset and extrapolate to 416,000 lyrics.
# NOTE(review): this import sits mid-notebook; the other imports are in the first cell.
from time import time

mpd_limited = mpd_with_lyrics_subset
all_lyrics, all_titles = (mpd_limited['lyrics'], mpd_limited['title'])

t = time()
matrix = feature_vectors_from(all_lyrics, all_titles)
print('Lyrics count      :', len(all_lyrics))
time_taken = time()-t
print('Time taken [s]    :', round(time_taken, 2))
print('Time / lyric [s]  :', round(time_taken / len(all_lyrics), 3))
# Extrapolation: seconds per lyric * 416000, shown in hours and in minutes.
print('Time / 416k [h/m] :', round(time_taken / len(all_lyrics) * 416000 / 60 / 60, 1), '/', round(time_taken / len(all_lyrics) * 416000 / 60, 1))

# NOTE(review): apply_to_columns and min_max_scaler are not defined in the visible part of
# this notebook (only robust_min_max_scaler is); on a fresh kernel this line presumably
# raises NameError -- confirm where these helpers come from.
t = time()
scaled_matrix = apply_to_columns(min_max_scaler, matrix)
print('Scaling', matrix.shape, ':', round(time() - t, 3))

Lyrics count      : 100
Time taken [s]    : 1.34
Time / lyric [s]  : 0.013
Time / 416k [h/m] : 1.6 / 93.0
Scaling (100, 14) : 0.001
