In [3]:
from selenium.webdriver import Chrome 
from bs4 import BeautifulSoup
import pymongo
import datetime
import time 
import pandas as pd
import re

In [4]:
mc = pymongo.MongoClient()

db = mc['chordify']

raw_html = db['raw_html']

In [119]:
raw_html.find().count()

4511

In [6]:
html_docs = list(raw_html.find())

In [120]:
import warnings 

def get_lines_from_song(html_doc):
    if 'html' in html_doc:
        html = html_doc['html']
    elif 'song_html' in html_doc:
        html = html_doc['song_html']
    else:
        raise KeyError (f"Html not found for {html_doc.get('_id')}")
    soup = BeautifulSoup(html, 'html.parser')
    song_body = soup.select_one('pre._1YgOS')
    song_lines = str(song_body).split('\n')
    return song_lines


def strip_html(text):
    result = []
    in_tag = False
    for char in text:
        if char == '<':
            in_tag = True
        if not in_tag:
            result.append(char)
        if char == '>':
            in_tag = False
    return ''.join(result)


def separate_lines(html_doc):
    song_lines = get_lines_from_song(html_doc)
    lines = []
    for i, song_line in enumerate(song_lines):
        if '_3L0Da' in song_line:
            lines.append({'chords': strip_html(song_line)})
        elif (('_3L0Da' in song_lines[i-1]) and (song_line == song_line) and (strip_html(song_line) == song_line)):
            lines[-1]['words'] = song_line
    return lines


def get_chords(line):
    chord_idxs = []
    chords = []
    c_string = line['chords']
    for chord in re.finditer('\w+', c_string):
        chord_idxs.append(chord.start())
        chords.append(chord.group())
    chord_tups = list(zip(chord_idxs, chords))
    return chord_idxs, chords, chord_tups


def get_words(line):
    word_idxs = []
    words = []
    if 'words'in line:
        w_string = line['words']
        for word in re.finditer(r"\w[\w']*", w_string):
            word_idxs.append(word.start())
            words.append(word.group())
    word_tups = list(zip(word_idxs, words))
    return word_idxs, words, word_tups


def merge_chord_word(line):
    chord_tups = get_chords(line)[2]
    word_tups = get_words(line)[2]
    word_list = get_words(line)[1]
    chord_idx_list = []
    for chord_tup in chord_tups:
        for i, word_tup in enumerate(word_tups):
            if word_tup[0] > chord_tup[0]:
                chord_idx_list.append((i-1, chord_tup[1]))
                break
    return (chord_idx_list, word_list)


def combine_ch_wd_lists(merged_line_1, merged_line_2):
    chord_idx_list_1, word_list_1 = merged_line_1
    chord_idx_list_2, word_list_2 = merged_line_2
    new_chord_tups = []
    for chord_idx_tup in chord_idx_list_2:
        new_chord_tups.append( ( ( (chord_idx_tup)[0] + len(word_list_1) ), chord_idx_tup[1] ) )
    all_chords_tups = chord_idx_list_1 + new_chord_tups
    all_words_list = word_list_1 + word_list_2
    return all_chords_tups, all_words_list


def parse_lines(lines):
    if len(lines) == 0:
        raise ValueError("Bad song.")
    for i, line in enumerate(lines):
        if i == 0:
            parsed_line = merge_chord_word(line)
        else:
            parsed_line_next = merge_chord_word(line)
            parsed_line = combine_ch_wd_lists(parsed_line, parsed_line_next)
    return parsed_line 

def parse_song(html_doc):
    lines = separate_lines(html_doc)
    parsed_song = parse_lines(lines)
    return parsed_song 

def parse_many(html_docs):
    parsed_songs = []
    for html_doc in html_docs:
        try:
            parsed_song = parse_song(html_doc)
            parsed_songs.append(parsed_song)
        except KeyError as e:
            warnings.warn(e.message)
            continue 
        except ValueError as e:
            warnings.warn(str(e))
            continue
    return parsed_songs 

In [10]:
def get_all_lyrics_chords(song):
    song_lyrics = ' '.join(song[1])
    song_chords = []
    for ch_tup in song[0]:
        song_chords.append(ch_tup[1])
    return (song_lyrics, song_chords) 

In [122]:
parse_many(html_docs[555:559])

[([(0, 'D'), (2, 'G'), (8, 'D'), (12, 'D'), (13, 'G'), (20, 'D')],
  ["I'm",
   'a',
   'man',
   'of',
   'constant',
   'sorrow',
   'Ive',
   'seen',
   'trouble',
   'all',
   'my',
   'days',
   "I'll",
   'say',
   'goodbye',
   'to',
   'Colorado',
   'Where',
   'I',
   'was',
   'born',
   'and',
   'partly',
   'raised']),
 ([(0, 'D'),
   (3, 'G'),
   (10, 'D'),
   (15, 'D'),
   (17, 'G'),
   (23, 'D'),
   (27, 'D'),
   (29, 'G'),
   (36, 'D'),
   (41, 'D'),
   (43, 'G'),
   (52, 'D'),
   (56, 'D'),
   (59, 'G'),
   (66, 'D'),
   (70, 'D'),
   (73, 'G'),
   (79, 'D'),
   (84, 'D'),
   (87, 'G'),
   (91, 'D'),
   (96, 'D'),
   (100, 'G'),
   (105, 'D')],
  ['I',
   'am',
   'a',
   'man',
   'of',
   'constant',
   'sorrow',
   'And',
   'I',
   'dont',
   'drugs',
   'for',
   'all',
   'my',
   'days',
   'I',
   'say',
   'goodbye',
   'to',
   'Colorado',
   'Where',
   'I',
   'was',
   'born',
   'and',
   'partly',
   'raised',
   'Your',
   'mother',
   'says',
   'im'

In [121]:
one_song = parse_song(html_docs[129])
one_song

([(1, 'Em'),
  (10, 'Em'),
  (19, 'Am7'),
  (25, 'Em'),
  (32, 'Em'),
  (35, 'Em'),
  (37, 'Am7'),
  (42, 'Em'),
  (46, 'Bsus4'),
  (48, 'Dsus2'),
  (54, 'Em'),
  (67, 'Em'),
  (77, 'Am7'),
  (87, 'Em'),
  (93, 'Em'),
  (97, 'Em'),
  (100, 'Am7'),
  (105, 'Em'),
  (110, 'Bsus4'),
  (113, 'Dsus2'),
  (119, 'Em'),
  (125, 'Em'),
  (136, 'Am7'),
  (143, 'Em'),
  (149, 'Em'),
  (152, 'Em'),
  (155, 'Am7'),
  (159, 'Em'),
  (160, 'Em7'),
  (162, 'Bsus4'),
  (165, 'Dsus2'),
  (171, 'Em'),
  (178, 'Em'),
  (188, 'Am7'),
  (196, 'Em'),
  (204, 'Em'),
  (208, 'Em'),
  (210, 'Am7'),
  (217, 'Em'),
  (220, 'Bsus4'),
  (223, 'Dsus2'),
  (230, 'Em'),
  (237, 'Em'),
  (245, 'Am7'),
  (252, 'Em'),
  (261, 'Em'),
  (264, 'Em'),
  (266, 'Am7'),
  (271, 'Em'),
  (276, 'Bsus4'),
  (281, 'Dsus2'),
  (285, 'Em'),
  (292, 'Em'),
  (300, 'Am7'),
  (309, 'Em'),
  (317, 'Em'),
  (321, 'Em'),
  (329, 'Em'),
  (332, 'Bsus4'),
  (335, 'Dsus2'),
  (341, 'Em'),
  (350, 'Em'),
  (358, 'Am7'),
  (369, 'Em'),
  (375, 

In [118]:
paragraph, ch_lst = get_all_lyrics_chords(one_song)
paragraph

"As I walked out tonight in the mystic garden The wounded flowers were dangling from the vines I was passing by yon cool and crystal fountain Someone hit me from behind Ain't talkin' just walkin' Through this weary world of woe Heart burnin' still yearnin' No one on earth would ever know They say prayer has the power to help So pray from the mother In the human heart an evil spirit can dwell I'm trying to love my neighbor and do good unto others But oh mother things ain't going well Ain't talkin' just walkin' I'll burn that bridge before you get across Heart burnin' still yearnin' There'll be no mercy for you once you've lost Now I'm all worn out by weepin' My eyes are full of tears my lips are dry If I catch my opponents ever sleepin' I'll just slaughter them where they lie Ain't talkin' just walkin' Through the world mysterious and vague Heart burnin' still yearnin' Walking through the cities of the plague Well the whole world is filled with speculation The whole wide world which peo

In [13]:
import unicodedata

def remove_accents(input_str):
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    only_ascii = nfkd_form.encode('ASCII', 'ignore')
    return only_ascii.decode()

input_string = remove_accents(paragraph)

In [14]:
from nltk.tokenize import sent_tokenize

sent_tokens = sent_tokenize(input_string)

sent_tokens

["As I walked out tonight in the mystic garden The wounded flowers were dangling from the vines I was passing by yon cool and crystal fountain Someone hit me from behind Ain't talkin' just walkin' Through this weary world of woe Heart burnin' still yearnin' No one on earth would ever know They say prayer has the power to help So pray from the mother In the human heart an evil spirit can dwell I'm trying to love my neighbor and do good unto others But oh mother things ain't going well Ain't talkin' just walkin' I'll burn that bridge before you get across Heart burnin' still yearnin' There'll be no mercy for you once you've lost Now I'm all worn out by weepin' My eyes are full of tears my lips are dry If I catch my opponents ever sleepin' I'll just slaughter them where they lie Ain't talkin' just walkin' Through the world mysterious and vague Heart burnin' still yearnin' Walking through the cities of the plague Well the whole world is filled with speculation The whole wide world which pe

In [15]:
from nltk.tokenize import word_tokenize

tokens = [sent for sent in map(word_tokenize, sent_tokens)]

list(enumerate(tokens))

[(0,
  ['As',
   'I',
   'walked',
   'out',
   'tonight',
   'in',
   'the',
   'mystic',
   'garden',
   'The',
   'wounded',
   'flowers',
   'were',
   'dangling',
   'from',
   'the',
   'vines',
   'I',
   'was',
   'passing',
   'by',
   'yon',
   'cool',
   'and',
   'crystal',
   'fountain',
   'Someone',
   'hit',
   'me',
   'from',
   'behind',
   'Ai',
   "n't",
   'talkin',
   "'",
   'just',
   'walkin',
   "'",
   'Through',
   'this',
   'weary',
   'world',
   'of',
   'woe',
   'Heart',
   'burnin',
   "'",
   'still',
   'yearnin',
   "'",
   'No',
   'one',
   'on',
   'earth',
   'would',
   'ever',
   'know',
   'They',
   'say',
   'prayer',
   'has',
   'the',
   'power',
   'to',
   'help',
   'So',
   'pray',
   'from',
   'the',
   'mother',
   'In',
   'the',
   'human',
   'heart',
   'an',
   'evil',
   'spirit',
   'can',
   'dwell',
   'I',
   "'m",
   'trying',
   'to',
   'love',
   'my',
   'neighbor',
   'and',
   'do',
   'good',
   'unto',
   'oth

In [16]:
import string

tokens_lower = [[word.lower() for word in sent]
                 for sent in tokens]

In [17]:
from nltk.corpus import stopwords

stopwords_ = "a,able,about,across,after,all,almost,also,am,among,an,and,any,\
are,as,at,be,because,been,but,by,can,could,dear,did,do,does,either,\
else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,\
how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,\
me,might,most,must,my,neither,no,of,off,often,on,only,or,other,our,\
own,rather,said,say,says,she,should,since,so,some,than,that,the,their,\
them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,\
what,when,where,which,while,who,whom,why,will,with,would,yet,you,your]".split(',')

print("--- stopwords in english: {}".format(stopwords_))

--- stopwords in english: ['a', 'able', 'about', 'across', 'after', 'all', 'almost', 'also', 'am', 'among', 'an', 'and', 'any', 'are', 'as', 'at', 'be', 'because', 'been', 'but', 'by', 'can', 'could', 'dear', 'did', 'do', 'does', 'either', 'else', 'ever', 'every', 'for', 'from', 'get', 'got', 'had', 'has', 'have', 'he', 'her', 'hers', 'him', 'his', 'how', 'however', 'i', 'if', 'in', 'into', 'is', 'it', 'its', 'just', 'least', 'let', 'like', 'likely', 'may', 'me', 'might', 'most', 'must', 'my', 'neither', 'no', 'of', 'off', 'often', 'on', 'only', 'or', 'other', 'our', 'own', 'rather', 'said', 'say', 'says', 'she', 'should', 'since', 'so', 'some', 'than', 'that', 'the', 'their', 'them', 'then', 'there', 'these', 'they', 'this', 'tis', 'to', 'too', 'twas', 'us', 'wants', 'was', 'we', 'were', 'what', 'when', 'where', 'which', 'while', 'who', 'whom', 'why', 'will', 'with', 'would', 'yet', 'you', 'your]']


In [18]:
import string

punctuation_ = set(string.punctuation)
print("--- punctuation: {}".format(string.punctuation))

def filter_tokens(sent):
    return([w for w in sent if not w in stopwords_ and not w in punctuation_])

tokens_filtered = list(map(filter_tokens, tokens_lower))

for sent in tokens_filtered:
    print("--- sentence tokens: {}".format(sent))

--- punctuation: !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
--- sentence tokens: ['walked', 'out', 'tonight', 'mystic', 'garden', 'wounded', 'flowers', 'dangling', 'vines', 'passing', 'yon', 'cool', 'crystal', 'fountain', 'someone', 'hit', 'behind', 'ai', "n't", 'talkin', 'walkin', 'through', 'weary', 'world', 'woe', 'heart', 'burnin', 'still', 'yearnin', 'one', 'earth', 'know', 'prayer', 'power', 'help', 'pray', 'mother', 'human', 'heart', 'evil', 'spirit', 'dwell', "'m", 'trying', 'love', 'neighbor', 'good', 'unto', 'others', 'oh', 'mother', 'things', 'ai', "n't", 'going', 'well', 'ai', "n't", 'talkin', 'walkin', "'ll", 'burn', 'bridge', 'before', 'heart', 'burnin', 'still', 'yearnin', "'ll", 'mercy', 'once', "'ve", 'lost', 'now', "'m", 'worn', 'out', 'weepin', 'eyes', 'full', 'tears', 'lips', 'dry', 'catch', 'opponents', 'sleepin', "'ll", 'slaughter', 'lie', 'ai', "n't", 'talkin', 'walkin', 'through', 'world', 'mysterious', 'vague', 'heart', 'burnin', 'still', 'yearnin', 'walking', 'through',

In [19]:
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import PorterStemmer

stemmer_porter = PorterStemmer()
tokens_stemporter = [list(map(stemmer_porter.stem, sent)) for sent in tokens_filtered]
print("--- sentence tokens (porter): {}".format(tokens_stemporter[0]))

stemmer_snowball = SnowballStemmer('english')
tokens_stemsnowball = [list(map(stemmer_snowball.stem, sent)) for sent in tokens_filtered]
print("--- sentence tokens (snowball): {}".format(tokens_stemsnowball[0]))

--- sentence tokens (porter): ['walk', 'out', 'tonight', 'mystic', 'garden', 'wound', 'flower', 'dangl', 'vine', 'pass', 'yon', 'cool', 'crystal', 'fountain', 'someon', 'hit', 'behind', 'ai', "n't", 'talkin', 'walkin', 'through', 'weari', 'world', 'woe', 'heart', 'burnin', 'still', 'yearnin', 'one', 'earth', 'know', 'prayer', 'power', 'help', 'pray', 'mother', 'human', 'heart', 'evil', 'spirit', 'dwell', "'m", 'tri', 'love', 'neighbor', 'good', 'unto', 'other', 'oh', 'mother', 'thing', 'ai', "n't", 'go', 'well', 'ai', "n't", 'talkin', 'walkin', "'ll", 'burn', 'bridg', 'befor', 'heart', 'burnin', 'still', 'yearnin', "'ll", 'merci', 'onc', "'ve", 'lost', 'now', "'m", 'worn', 'out', 'weepin', 'eye', 'full', 'tear', 'lip', 'dri', 'catch', 'oppon', 'sleepin', "'ll", 'slaughter', 'lie', 'ai', "n't", 'talkin', 'walkin', 'through', 'world', 'mysteri', 'vagu', 'heart', 'burnin', 'still', 'yearnin', 'walk', 'through', 'citi', 'plagu', 'well', 'whole', 'world', 'fill', 'specul', 'whole', 'wide', 

In [20]:
from nltk.util import ngrams

list(ngrams(tokens_stemsnowball[0],4))

[('walk', 'out', 'tonight', 'mystic'),
 ('out', 'tonight', 'mystic', 'garden'),
 ('tonight', 'mystic', 'garden', 'wound'),
 ('mystic', 'garden', 'wound', 'flower'),
 ('garden', 'wound', 'flower', 'dangl'),
 ('wound', 'flower', 'dangl', 'vine'),
 ('flower', 'dangl', 'vine', 'pass'),
 ('dangl', 'vine', 'pass', 'yon'),
 ('vine', 'pass', 'yon', 'cool'),
 ('pass', 'yon', 'cool', 'crystal'),
 ('yon', 'cool', 'crystal', 'fountain'),
 ('cool', 'crystal', 'fountain', 'someon'),
 ('crystal', 'fountain', 'someon', 'hit'),
 ('fountain', 'someon', 'hit', 'behind'),
 ('someon', 'hit', 'behind', 'ai'),
 ('hit', 'behind', 'ai', "n't"),
 ('behind', 'ai', "n't", 'talkin'),
 ('ai', "n't", 'talkin', 'walkin'),
 ("n't", 'talkin', 'walkin', 'through'),
 ('talkin', 'walkin', 'through', 'weari'),
 ('walkin', 'through', 'weari', 'world'),
 ('through', 'weari', 'world', 'woe'),
 ('weari', 'world', 'woe', 'heart'),
 ('world', 'woe', 'heart', 'burnin'),
 ('woe', 'heart', 'burnin', 'still'),
 ('heart', 'burnin', '

In [21]:
from nltk.util import ngrams

def join_sent_ngrams(input_tokens, n):
    # first add the 1-gram tokens
    ret_list = list(input_tokens)
    
    #then for each n
    for i in range(2,n+1):
        # add each n-grams to the list
        ret_list.extend(['-'.join(tgram) for tgram in ngrams(input_tokens, i)])
    
    return(ret_list)

tokens_ngrams = list(map(lambda x : join_sent_ngrams(x, 3), tokens_stemsnowball))

print("--- sentence tokens: {}".format(tokens_ngrams[0]))

--- sentence tokens: ['walk', 'out', 'tonight', 'mystic', 'garden', 'wound', 'flower', 'dangl', 'vine', 'pass', 'yon', 'cool', 'crystal', 'fountain', 'someon', 'hit', 'behind', 'ai', "n't", 'talkin', 'walkin', 'through', 'weari', 'world', 'woe', 'heart', 'burnin', 'still', 'yearnin', 'one', 'earth', 'know', 'prayer', 'power', 'help', 'pray', 'mother', 'human', 'heart', 'evil', 'spirit', 'dwell', "'m", 'tri', 'love', 'neighbor', 'good', 'unto', 'other', 'oh', 'mother', 'thing', 'ai', "n't", 'go', 'well', 'ai', "n't", 'talkin', 'walkin', 'll', 'burn', 'bridg', 'befor', 'heart', 'burnin', 'still', 'yearnin', 'll', 'merci', 'onc', 've', 'lost', 'now', "'m", 'worn', 'out', 'weepin', 'eye', 'full', 'tear', 'lip', 'dri', 'catch', 'oppon', 'sleepin', 'll', 'slaughter', 'lie', 'ai', "n't", 'talkin', 'walkin', 'through', 'world', 'mysteri', 'vagu', 'heart', 'burnin', 'still', 'yearnin', 'walk', 'through', 'citi', 'plagu', 'well', 'whole', 'world', 'fill', 'specul', 'whole', 'wide', 'world', 'peo

In [25]:
cd ..

/Users/emilynaftalin/galvanize/dsi/capstone/Guitar-Chord-Generator


In [26]:
import os               # for environ variables in Part 3
from src.nlp_pipeline import extract_bow_from_raw_text

In [27]:
parsed_songs = parse_many(html_docs[305:316])
parsed_songs

[([(2, 'F'),
   (9, 'Am'),
   (15, 'Bb'),
   (21, 'C'),
   (27, 'F'),
   (34, 'Am'),
   (40, 'Bb'),
   (41, 'C7'),
   (44, 'C'),
   (49, 'F'),
   (50, 'C'),
   (54, 'F'),
   (61, 'Am'),
   (68, 'Bb'),
   (75, 'C'),
   (80, 'F'),
   (84, 'Am'),
   (90, 'Bb'),
   (90, 'C'),
   (94, 'C'),
   (99, 'F'),
   (99, 'C'),
   (104, 'F'),
   (110, 'Am'),
   (116, 'Bb'),
   (122, 'C'),
   (128, 'F'),
   (135, 'Am'),
   (141, 'Bb'),
   (141, 'C'),
   (145, 'C'),
   (150, 'F'),
   (150, 'C')],
  ['May',
   'God',
   'bless',
   'and',
   'keep',
   'you',
   'always',
   'May',
   'your',
   'wishes',
   'all',
   'come',
   'true',
   'May',
   'you',
   'always',
   'do',
   'for',
   'others',
   'And',
   'let',
   'others',
   'do',
   'for',
   'you',
   'May',
   'you',
   'build',
   'a',
   'ladder',
   'to',
   'the',
   'stars',
   'And',
   'climb',
   'on',
   'every',
   'rung',
   'May',
   'you',
   'stay',
   'forever',
   'young',
   'Forever',
   'young',
   'forever',
   'young',

In [28]:
parsed_songs[6]

([(0, 'E'),
  (2, 'A'),
  (7, 'E'),
  (13, 'A'),
  (16, 'Bm'),
  (22, 'A'),
  (29, 'E'),
  (35, 'Bm'),
  (39, 'D'),
  (43, 'A'),
  (51, 'E'),
  (59, 'A'),
  (63, 'Bm'),
  (68, 'A'),
  (75, 'E'),
  (82, 'Bm'),
  (86, 'D'),
  (88, 'E'),
  (90, 'E'),
  (92, 'A'),
  (97, 'E'),
  (103, 'A'),
  (106, 'Bm'),
  (112, 'A'),
  (119, 'E'),
  (125, 'Bm'),
  (130, 'D'),
  (137, 'A'),
  (140, 'Bm'),
  (144, 'E'),
  (150, 'A'),
  (154, 'Bm'),
  (161, 'A'),
  (167, 'E'),
  (175, 'Bm'),
  (179, 'D'),
  (183, 'E'),
  (185, 'A'),
  (190, 'E'),
  (196, 'A'),
  (199, 'Bm'),
  (205, 'A'),
  (212, 'E')],
 ['Four',
  'strong',
  'winds',
  'that',
  'blow',
  'lonely',
  'Seven',
  'seas',
  'that',
  'run',
  'high',
  'All',
  'those',
  'things',
  'that',
  "don't",
  'change',
  'come',
  'what',
  'may',
  'But',
  'the',
  'good',
  'times',
  'are',
  'all',
  'gone',
  'And',
  "I'm",
  'bound',
  'for',
  'moving',
  'on',
  "I'll",
  'look',
  'for',
  'you',
  'if',
  'you',
  'ever',
  'come',
  

In [29]:
song_dict_list = []
for song in parsed_songs:
    song_dict  = {}
    song_dict['lyrics'] = get_all_lyrics_chords(song)[0]
    song_dict['chords'] = get_all_lyrics_chords(song)[1]
    song_dict_list.append(song_dict)
# extracting bows
    bows = list(map(lambda song_dict: extract_bow_from_raw_text(song_dict['lyrics']), song_dict_list))

# displaying bows
for i in range(len(song_dict_list)):
    print("\n--- chords: {}".format(song_dict_list[i]['chords']))
    print("--- lyrics: {}".format(song_dict_list[i]['lyrics']))
    print("--- bow: {}".format(bows[i])) 


--- chords: ['F', 'Am', 'Bb', 'C', 'F', 'Am', 'Bb', 'C7', 'C', 'F', 'C', 'F', 'Am', 'Bb', 'C', 'F', 'Am', 'Bb', 'C', 'C', 'F', 'C', 'F', 'Am', 'Bb', 'C', 'F', 'Am', 'Bb', 'C', 'C', 'F', 'C']
--- lyrics: May God bless and keep you always May your wishes all come true May you always do for others And let others do for you May you build a ladder to the stars And climb on every rung May you stay forever young Forever young forever young May you stay forever young May you grow up to be righteous May you grow up to be true May you always know the truth And see the light surrounding you May you always be courageous Stand upright and be strong May you stay forever young Forever young forever young May you stay forever young May your hands always be busy May your feet always be swift May you have a strong foundation When the winds are changing shift May your heart always be joyful And may your song always be sung May you stay forever young Forever young forever young May you stay forever young

In [30]:
from collections import Counter

# term occurence = counting distinct words in each bag
term_occ = list(map(lambda bow : Counter(bow), bows))

# term frequency = occurences over length of bag
term_freq = list()
for i in range(len(song_dict_list)):
    term_freq.append( {k: (v / float(len(bows[i])))
                       for k, v in term_occ[i].items()} )

# displaying occurences
for i in range(len(song_dict_list)):
    print("\n--- lyrics: {}".format(song_dict_list[i]['lyrics']))
    print("--- bow: {}".format(bows[i]))
    print("--- term_occ: {}".format(term_occ[i]))
    print("--- term_freq: {}".format(term_freq[i]))


--- lyrics: May God bless and keep you always May your wishes all come true May you always do for others And let others do for you May you build a ladder to the stars And climb on every rung May you stay forever young Forever young forever young May you stay forever young May you grow up to be righteous May you grow up to be true May you always know the truth And see the light surrounding you May you always be courageous Stand upright and be strong May you stay forever young Forever young forever young May you stay forever young May your hands always be busy May your feet always be swift May you have a strong foundation When the winds are changing shift May your heart always be joyful And may your song always be sung May you stay forever young Forever young forever young May you stay forever young
--- bow: ["b'may", 'god', 'bless', 'may', 'wish', 'true', 'may', 'other', 'other', 'may', 'ladder', 'star', 'climb', 'rung', 'may', 'young', 'forev', 'young', 'young', 'may', 'young', 'may',

In [31]:
# document occurence = number of documents having this word
# term frequency = occurences over length of bag

doc_occ = Counter( [word for bow in bows for word in set(bow)] )

# document frequency = occurences over length of corpus
doc_freq = {k: (v / float(len(song_dict_list)))
            for k, v in doc_occ.items()}

# displaying vocabulary
print("\n--- full vocabulary: {}".format(doc_occ))
print("\n--- doc freq: {}".format(doc_freq))


--- full vocabulary: Counter({'god': 8, 'other': 7, 'rung': 6, 'ladder': 6, 'truth': 6, 'hand': 6, 'climb': 6, 'young': 6, 'strong': 6, 'true': 6, 'wish': 6, 'star': 6, 'may': 6, 'bless': 6, 'b': 6, 'song': 5, 'feet': 5, 'foundat': 5, 'courag': 5, 'heart': 5, 'upright': 5, 'busi': 5, 'stand': 5, 'swift': 5, 'righteous': 5, 'sung': 5, 'joy': 5, 'wind': 5, 'forev': 4, 'light': 3, "b'may": 3, 'chang': 3, 'lie': 3, "ev'ri": 2, 'good': 2, 'high': 2, 'mind': 2, 'time': 2, 'way': 2, 'thing': 2, 'suit': 2, 'eye': 2, 'shoe': 2, 'floor': 2, 'lord': 2, 'oh': 2, 'babi': 2, 'shift': 1, "b'i": 1, 'chord': 1, 'measur': 1, 'ung': 1, 'yo': 1, "b'1": 1, 'blow': 1, 'more': 1, 'fli': 1, 'fare': 1, 'snow': 1, 'hundr': 1, 'friend': 1, 'alberta': 1, 'weather': 1, 'bound': 1, 'four': 1, 'much': 1, 'springtim': 1, 'seven': 1, 'fall': 1, 'wheelchair': 1, 'crutch': 1, 'last': 1, 'hallway': 1, 'come': 1, 'dear': 1, 'drum': 1, "ev'rybodi": 1, 'peic': 1, 'face': 1, 'jamaican': 1, 'word': 1, 'clear': 1, 'red': 1, '

In [32]:
# the minimum document frequency (in proportion of the length of the corpus)
min_df = 0.1

# filtering items to obtain the vocabulary
vocabulary = [ k for k,v in doc_freq.items() if v >= min_df ]

# print vocabulary
print ("-- vocabulary (len={}): {}".format(len(vocabulary),vocabulary))

-- vocabulary (len=47): ['rung', 'light', 'ladder', 'other', "b'may", 'truth', 'hand', 'climb', 'song', 'forev', 'feet', 'foundat', 'god', 'courag', 'heart', 'young', 'upright', 'strong', 'busi', 'stand', 'swift', 'true', 'righteous', 'sung', 'wish', 'joy', 'star', 'may', 'bless', 'wind', 'chang', 'lie', 'b', "ev'ri", 'good', 'high', 'mind', 'time', 'way', 'thing', 'suit', 'eye', 'shoe', 'floor', 'lord', 'oh', 'babi']


In [33]:
import numpy as np

# create a dense matrix of vectors for each document
# each vector has the length of the vocabulary
vectors = np.zeros((len(song_dict_list),len(vocabulary)))

# fill these vectors with tf-idf values
for i in range(len(song_dict_list)):
    for j in range(len(vocabulary)):
        term     = vocabulary[j]
        term_tf  = term_freq[i].get(term, 0.0)   # 0.0 if term not found in doc
        term_idf = np.log(1 + 1 / doc_freq[term]) # smooth formula
        vectors[i,j] = term_tf * term_idf

# displaying results
for i in range(len(song_dict_list)):
    print("\n--- review: {}".format(song_dict_list[i]['lyrics']))
    print("--- bow: {}".format(bows[i]))
    print("--- tfidf vector: {}".format( vectors[i] ) )
    print("--- tfidf sorted: {}".format( 
            sorted( zip(vocabulary,vectors[i]), key=lambda x:-x[1] )
         ))


--- review: May God bless and keep you always May your wishes all come true May you always do for others And let others do for you May you build a ladder to the stars And climb on every rung May you stay forever young Forever young forever young May you stay forever young May you grow up to be righteous May you grow up to be true May you always know the truth And see the light surrounding you May you always be courageous Stand upright and be strong May you stay forever young Forever young forever young May you stay forever young May your hands always be busy May your feet always be swift May you have a strong foundation When the winds are changing shift May your heart always be joyful And may your song always be sung May you stay forever young Forever young forever young May you stay forever young
--- bow: ["b'may", 'god', 'bless', 'may', 'wish', 'true', 'may', 'other', 'other', 'may', 'ladder', 'star', 'climb', 'rung', 'may', 'young', 'forev', 'young', 'young', 'may', 'young', 'may',

In [34]:
corpus = [song_dict['lyrics'] for song_dict in song_dict_list]

In [35]:
from sklearn.feature_extraction.text import CountVectorizer

tf = CountVectorizer()

document_tf_matrix = tf.fit_transform(corpus).todense()

print(sorted(tf.vocabulary_))
print(document_tf_matrix)

['100', '12th', 'about', 'after', 'again', 'against', 'ago', 'ain', 'albert', 'alberta', 'alice', 'all', 'always', 'an', 'and', 'annotated', 'another', 'are', 'aren', 'as', 'ask', 'asked', 'baby', 'back', 'bartender', 'be', 'been', 'beer', 'before', 'better', 'big', 'bless', 'blow', 'blows', 'blues', 'bly', 'boomer', 'boot', 'born', 'bottom', 'bound', 'breaking', 'bringin', 'brought', 'bucket', 'build', 'busy', 'but', 'buttoned', 'by', 'calm', 'can', 'cat', 'cell', 'chained', 'change', 'changes', 'changing', 'child', 'chord', 'clear', 'climb', 'clothes', 'colt', 'come', 'corner', 'could', 'courageous', 'covered', 'cried', 'crutch', 'cry', 'cute', 'dead', 'deaf', 'dear', 'degree', 'destroy', 'did', 'didn', 'dirt', 'dixie', 'do', 'dog', 'dollers', 'don', 'done', 'down', 'drawer', 'driver', 'drum', 'each', 'else', 'enough', 'ev', 'ever', 'every', 'eyes', 'face', 'fall', 'fare', 'feet', 'fell', 'felt', 'filled', 'find', 'finding', 'flies', 'floor', 'for', 'forced', 'forever', 'forget', 'fo

In [36]:
from math import log

def idf(frequency_matrix):
    df =  float(len(document_tf_matrix)) / sum(frequency_matrix > 0)
    return [log(i) for i in df.getA()[0]]
print(sorted(tf.vocabulary_))
print(idf(document_tf_matrix))

['100', '12th', 'about', 'after', 'again', 'against', 'ago', 'ain', 'albert', 'alberta', 'alice', 'all', 'always', 'an', 'and', 'annotated', 'another', 'are', 'aren', 'as', 'ask', 'asked', 'baby', 'back', 'bartender', 'be', 'been', 'beer', 'before', 'better', 'big', 'bless', 'blow', 'blows', 'blues', 'bly', 'boomer', 'boot', 'born', 'bottom', 'bound', 'breaking', 'bringin', 'brought', 'bucket', 'build', 'busy', 'but', 'buttoned', 'by', 'calm', 'can', 'cat', 'cell', 'chained', 'change', 'changes', 'changing', 'child', 'chord', 'clear', 'climb', 'clothes', 'colt', 'come', 'corner', 'could', 'courageous', 'covered', 'cried', 'crutch', 'cry', 'cute', 'dead', 'deaf', 'dear', 'degree', 'destroy', 'did', 'didn', 'dirt', 'dixie', 'do', 'dog', 'dollers', 'don', 'done', 'down', 'drawer', 'driver', 'drum', 'each', 'else', 'enough', 'ev', 'ever', 'every', 'eyes', 'face', 'fall', 'fare', 'feet', 'fell', 'felt', 'filled', 'find', 'finding', 'flies', 'floor', 'for', 'forced', 'forever', 'forget', 'fo

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
document_tfidf_matrix = tfidf.fit_transform(corpus)
print(sorted(tfidf.vocabulary_))
print(document_tfidf_matrix.todense())

['100', '12th', 'about', 'after', 'again', 'against', 'ago', 'ain', 'albert', 'alberta', 'alice', 'all', 'always', 'an', 'and', 'annotated', 'another', 'are', 'aren', 'as', 'ask', 'asked', 'baby', 'back', 'bartender', 'be', 'been', 'beer', 'before', 'better', 'big', 'bless', 'blow', 'blows', 'blues', 'bly', 'boomer', 'boot', 'born', 'bottom', 'bound', 'breaking', 'bringin', 'brought', 'bucket', 'build', 'busy', 'but', 'buttoned', 'by', 'calm', 'can', 'cat', 'cell', 'chained', 'change', 'changes', 'changing', 'child', 'chord', 'clear', 'climb', 'clothes', 'colt', 'come', 'corner', 'could', 'courageous', 'covered', 'cried', 'crutch', 'cry', 'cute', 'dead', 'deaf', 'dear', 'degree', 'destroy', 'did', 'didn', 'dirt', 'dixie', 'do', 'dog', 'dollers', 'don', 'done', 'down', 'drawer', 'driver', 'drum', 'each', 'else', 'enough', 'ev', 'ever', 'every', 'eyes', 'face', 'fall', 'fare', 'feet', 'fell', 'felt', 'filled', 'find', 'finding', 'flies', 'floor', 'for', 'forced', 'forever', 'forget', 'fo

In [38]:
pairwise_similarity = document_tfidf_matrix * document_tfidf_matrix.T
pairwise_similarity

<11x11 sparse matrix of type '<class 'numpy.float64'>'
	with 121 stored elements in Compressed Sparse Row format>

In [39]:
pairwise_similarity.A

array([[ 1.        ,  0.81971268,  0.99265592,  0.94628364,  0.9494428 ,
         0.95725087,  0.15500751,  0.11007363,  0.04022103,  0.1205459 ,
         0.04801136],
       [ 0.81971268,  1.        ,  0.81843989,  0.77066382,  0.77251614,
         0.78615904,  0.16441761,  0.14024733,  0.0329806 ,  0.15045747,
         0.05589083],
       [ 0.99265592,  0.81843989,  1.        ,  0.95055161,  0.95060323,
         0.96387867,  0.14768078,  0.10723888,  0.04075206,  0.11534815,
         0.05188176],
       [ 0.94628364,  0.77066382,  0.95055161,  1.        ,  0.91702934,
         0.99540143,  0.17214969,  0.13367289,  0.04988764,  0.13543704,
         0.0620946 ],
       [ 0.9494428 ,  0.77251614,  0.95060323,  0.91702934,  1.        ,
         0.92686713,  0.1338413 ,  0.10825279,  0.03548429,  0.09359938,
         0.03562691],
       [ 0.95725087,  0.78615904,  0.96387867,  0.99540143,  0.92686713,
         1.        ,  0.17067205,  0.13378654,  0.04743112,  0.13372943,
         0.060

In [117]:
raw_html.find().count()

4479

In [41]:
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

In [42]:
# song_dict_list = []
# for song in parsed_songs:
#     song_dict  = {}
#     song_dict['lyrics'] = get_all_lyrics_chords(song)[0]
#     song_dict['chords'] = get_all_lyrics_chords(song)[1]
#     song_dict_list.append(song_dict)
# # extracting bows
#     bows = list(map(lambda song_dict: extract_bow_from_raw_text(song_dict['lyrics']), song_dict_list))

# # displaying bows
# for i in range(len(song_dict_list)):
#     print("\n--- chords: {}".format(song_dict_list[i]['chords']))
#     print("--- lyrics: {}".format(song_dict_list[i]['lyrics']))
#     print("--- bow: {}".format(bows[i])) 
    


In [43]:
song_dict_1 = song_dict_list[9]

In [44]:
any('m' in chord for chord in song_dict_1['chords'])

True

In [45]:
song_dict_list[1]

{'chords': ['F',
  'm',
  'C',
  'means',
  'F',
  'm',
  'chord',
  'and',
  'play',
  'C',
  'in',
  'the',
  'bass',
  'D',
  'F',
  'm',
  'C',
  'G',
  'B',
  'G',
  'D',
  'D',
  'F',
  'm',
  'C',
  'G',
  'G',
  'A',
  'A',
  'D',
  'D',
  'A',
  'A'],
 'lyrics': 'I annotated a chord for each measure it is used May God bless and keep you always May your wishes all come true May you always do for others And let others do for you May you build a ladder to the stars And climb on every rung And may you stay forever young Forever young Forever young May you stay forever young'}

In [46]:
contains_minor = []
lyrics_list = []

for song_dict in song_dict_list:
    contains_minor.append(any('m' in chord for chord in song_dict['chords']))
    lyrics_list.append(song_dict['lyrics'])
    

In [47]:
contains_minor

[True, True, True, False, True, False, True, True, False, True, False]

In [48]:
song_dict_list[10]['lyrics']

"I was born in Dixie in a boomer shack Just a little shanty by the railroad track Freight train was it taught me how to cry hummin' of the driver was my lullaby Oh Lord mama I got them in the bottom of my rambling shoes And when the whistle blows I gotta go baby don't you know Well it looks like I'm never gonna lose the freight train blues"

In [49]:
lyrics_list[10]

"I was born in Dixie in a boomer shack Just a little shanty by the railroad track Freight train was it taught me how to cry hummin' of the driver was my lullaby Oh Lord mama I got them in the bottom of my rambling shoes And when the whistle blows I gotta go baby don't you know Well it looks like I'm never gonna lose the freight train blues"

In [50]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
# document_tfidf_matrix = tfidf.fit_transform(corpus)
# print(sorted(tfidf.vocabulary_))
# print(document_tfidf_matrix.todense())

In [51]:
tfidf.fit(lyrics_list, contains_minor)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [52]:
tfidf.transform(lyrics_list, contains_minor)

<11x388 sparse matrix of type '<class 'numpy.float64'>'
	with 777 stored elements in Compressed Sparse Row format>

In [61]:
many_songs = html_docs[285:290]

In [54]:
parsed_songs_2 = parse_many(html_docs[150:216])

In [55]:
song_dict_list_2 = []
for song in parsed_songs_2:
    song_dict  = {}
    song_dict['lyrics'] = get_all_lyrics_chords(song)[0]
    song_dict['chords'] = get_all_lyrics_chords(song)[1]
    song_dict_list_2.append(song_dict)

contains_minor_train = []
lyrics_list_train = []    
    
for song_dict in song_dict_list_2:
    contains_minor_train.append(any('m' in chord for chord in song_dict['chords']))
    lyrics_list_train.append(song_dict['lyrics'])


In [56]:
parsed_songs_3 = parse_many(html_docs[800:824])

In [57]:
song_dict_list_3 = []
for song in parsed_songs_3:
    song_dict  = {}
    song_dict['lyrics'] = get_all_lyrics_chords(song)[0]
    song_dict['chords'] = get_all_lyrics_chords(song)[1]
    song_dict_list_3.append(song_dict)

contains_minor_test = []
lyrics_list_test = []    
    
for song_dict in song_dict_list_3:
    contains_minor_test.append(any('m' in chord for chord in song_dict['chords']))
    lyrics_list_test.append(song_dict['lyrics'])

In [58]:
tfidf.transform(lyrics_list_test, contains_minor_test).A

array([[ 0.        ,  0.        ,  0.        , ...,  0.20029357,
         0.        ,  0.28370588],
       [ 0.        ,  0.        ,  0.02969628, ...,  0.01244452,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.02995859, ...,  0.01255444,
         0.        ,  0.        ],
       ..., 
       [ 0.        ,  0.        ,  0.        , ...,  0.21305719,
         0.        ,  0.27434993],
       [ 0.        ,  0.        ,  0.        , ...,  0.11508824,
         0.        ,  0.07409853],
       [ 0.        ,  0.        ,  0.        , ...,  0.12390906,
         0.        ,  0.06382219]])

In [66]:
tfidf.fit(lyrics_list_train)

train_matrix = tfidf.transform(lyrics_list_train)

In [67]:
from sklearn.linear_model import LogisticRegression
logistic = LogisticRegression()

In [68]:
test_matrix = tfidf.transform(lyrics_list_test)

In [69]:
logistic.fit(train_matrix, contains_minor_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [72]:
logistic.predict_proba(test_matrix)

array([[ 0.25045162,  0.74954838],
       [ 0.37637933,  0.62362067],
       [ 0.37291807,  0.62708193],
       [ 0.30445711,  0.69554289],
       [ 0.33005776,  0.66994224],
       [ 0.31509862,  0.68490138],
       [ 0.34245908,  0.65754092],
       [ 0.27127326,  0.72872674],
       [ 0.32622825,  0.67377175],
       [ 0.33431648,  0.66568352],
       [ 0.37684139,  0.62315861],
       [ 0.3342627 ,  0.6657373 ],
       [ 0.29462296,  0.70537704],
       [ 0.32733405,  0.67266595],
       [ 0.25429811,  0.74570189],
       [ 0.32130983,  0.67869017],
       [ 0.32573896,  0.67426104],
       [ 0.25081082,  0.74918918],
       [ 0.37466524,  0.62533476],
       [ 0.37757843,  0.62242157],
       [ 0.36972394,  0.63027606],
       [ 0.31366668,  0.68633332],
       [ 0.35115255,  0.64884745],
       [ 0.34016589,  0.65983411]])

In [110]:
many_songs = parse_many(html_docs[216:1209])



In [80]:
%pdb

Automatic pdb calling has been turned OFF


In [116]:
raw_html.find().count()

4454