In [6]:
from selenium.webdriver import Chrome 
from bs4 import BeautifulSoup
import pymongo
import datetime
import time 
import pandas as pd
import re

In [7]:
mc = pymongo.MongoClient()

db = mc['chordify']

raw_html = db['raw_html']

In [14]:
raw_html.find().count()

2221

In [9]:
html_docs = list(raw_html.find())

In [86]:
def get_lines_from_song(html_doc):
    if 'html' in html_doc:
        html = html_doc['html']
    elif 'song_html' in html_doc:
        html = html_doc['song_html']
    else:
        raise KeyError (f"Html not found for {html_doc.get('_id')}")
    soup = BeautifulSoup(html, 'html.parser')
    song_body = soup.select_one('pre._1YgOS')
    song_lines = str(song_body).split('\n')
    return song_lines


def strip_html(text):
    result = []
    in_tag = False
    for char in text:
        if char == '<':
            in_tag = True
        if not in_tag:
            result.append(char)
        if char == '>':
            in_tag = False
    return ''.join(result)


def separate_lines(html_doc):
    song_lines = get_lines_from_song(html_doc)
    lines = []
    for i, song_line in enumerate(song_lines):
        if '_3L0Da' in song_line:
            lines.append({'chords': strip_html(song_line)})
        elif (('_3L0Da' in song_lines[i-1]) and (song_line == song_line) and (strip_html(song_line) == song_line)):
            lines[-1]['words'] = song_line
    return lines


def get_chords(line):
    chord_idxs = []
    chords = []
    c_string = line['chords']
    for chord in re.finditer('\w+', c_string):
        chord_idxs.append(chord.start())
        chords.append(chord.group())
    chord_tups = list(zip(chord_idxs, chords))
    return chord_idxs, chords, chord_tups


def get_words(line):
    word_idxs = []
    words = []
    if 'words'in line:
        w_string = line['words']
        for word in re.finditer(r"\w[\w']*", w_string):
            word_idxs.append(word.start())
            words.append(word.group())
    word_tups = list(zip(word_idxs, words))
    return word_idxs, words, word_tups


def merge_chord_word(line):
    chord_tups = get_chords(line)[2]
    word_tups = get_words(line)[2]
    word_list = get_words(line)[1]
    chord_idx_list = []
    for chord_tup in chord_tups:
        for i, word_tup in enumerate(word_tups):
            if word_tup[0] > chord_tup[0]:
                chord_idx_list.append((i-1, chord_tup[1]))
                break
    return (chord_idx_list, word_list)


def combine_ch_wd_lists(merged_line_1, merged_line_2):
    chord_idx_list_1, word_list_1 = merged_line_1
    chord_idx_list_2, word_list_2 = merged_line_2
    new_chord_tups = []
    for chord_idx_tup in chord_idx_list_2:
        new_chord_tups.append( ( ( (chord_idx_tup)[0] + len(word_list_1) ), chord_idx_tup[1] ) )
    all_chords_tups = chord_idx_list_1 + new_chord_tups
    all_words_list = word_list_1 + word_list_2
    return all_chords_tups, all_words_list


def parse_lines(lines):
    for i, line in enumerate(lines):
        if i == 0:
            parsed_line = merge_chord_word(line)
        else:
            parsed_line_next = merge_chord_word(line)
            parsed_line = combine_ch_wd_lists(parsed_line, parsed_line_next)
    return parsed_line 

In [87]:
def parse_song(html_doc):
    lines = separate_lines(html_doc)
    parsed_song = parse_lines(lines)
    return parsed_song 

In [88]:
def parse_many(some_docs):
    parsed_songs = []
    for some_doc in some_docs:
        try:
            parsed_song = parse_song(some_doc)
            parsed_songs.append(parsed_song)
        except KeyError as e:
            warnings.warn(e.message)
            continue 
    return parsed_songs 

In [89]:
def get_all_lyrics_chords(song):
    song_lyrics = ' '.join(song[1])
    song_chords = []
    for ch_tup in song[0]:
        song_chords.append(ch_tup[1])
    return (song_lyrics, song_chords) 

In [95]:
one_song = parse_song(html_docs[315])
one_song

([(0, 'C'),
  (15, 'G'),
  (17, 'C'),
  (44, 'D'),
  (49, 'C'),
  (53, 'F'),
  (60, 'C'),
  (65, 'G')],
 ['I',
  'was',
  'born',
  'in',
  'Dixie',
  'in',
  'a',
  'boomer',
  'shack',
  'Just',
  'a',
  'little',
  'shanty',
  'by',
  'the',
  'railroad',
  'track',
  'Freight',
  'train',
  'was',
  'it',
  'taught',
  'me',
  'how',
  'to',
  'cry',
  "hummin'",
  'of',
  'the',
  'driver',
  'was',
  'my',
  'lullaby',
  'Oh',
  'Lord',
  'mama',
  'I',
  'got',
  'them',
  'in',
  'the',
  'bottom',
  'of',
  'my',
  'rambling',
  'shoes',
  'And',
  'when',
  'the',
  'whistle',
  'blows',
  'I',
  'gotta',
  'go',
  'baby',
  "don't",
  'you',
  'know',
  'Well',
  'it',
  'looks',
  'like',
  "I'm",
  'never',
  'gonna',
  'lose',
  'the',
  'freight',
  'train',
  'blues'])

In [96]:
paragraph, ch_lst = get_all_lyrics_chords(one_song)

In [97]:
import unicodedata

def remove_accents(input_str):
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    only_ascii = nfkd_form.encode('ASCII', 'ignore')
    return only_ascii.decode()

input_string = remove_accents(paragraph)

In [98]:
from nltk.tokenize import sent_tokenize

sent_tokens = sent_tokenize(input_string)

sent_tokens

["I was born in Dixie in a boomer shack Just a little shanty by the railroad track Freight train was it taught me how to cry hummin' of the driver was my lullaby Oh Lord mama I got them in the bottom of my rambling shoes And when the whistle blows I gotta go baby don't you know Well it looks like I'm never gonna lose the freight train blues"]

In [105]:
from nltk.tokenize import word_tokenize

tokens = [sent for sent in map(word_tokenize, sent_tokens)]

list(enumerate(tokens))

[(0,
  ['I',
   'was',
   'born',
   'in',
   'Dixie',
   'in',
   'a',
   'boomer',
   'shack',
   'Just',
   'a',
   'little',
   'shanty',
   'by',
   'the',
   'railroad',
   'track',
   'Freight',
   'train',
   'was',
   'it',
   'taught',
   'me',
   'how',
   'to',
   'cry',
   'hummin',
   "'",
   'of',
   'the',
   'driver',
   'was',
   'my',
   'lullaby',
   'Oh',
   'Lord',
   'mama',
   'I',
   'got',
   'them',
   'in',
   'the',
   'bottom',
   'of',
   'my',
   'rambling',
   'shoes',
   'And',
   'when',
   'the',
   'whistle',
   'blows',
   'I',
   'got',
   'ta',
   'go',
   'baby',
   'do',
   "n't",
   'you',
   'know',
   'Well',
   'it',
   'looks',
   'like',
   'I',
   "'m",
   'never',
   'gon',
   'na',
   'lose',
   'the',
   'freight',
   'train',
   'blues'])]

In [106]:
import string

tokens_lower = [[word.lower() for word in sent]
                 for sent in tokens]

In [107]:
from nltk.corpus import stopwords

stopwords_ = "a,able,about,across,after,all,almost,also,am,among,an,and,any,\
are,as,at,be,because,been,but,by,can,could,dear,did,do,does,either,\
else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,\
how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,\
me,might,most,must,my,neither,no,of,off,often,on,only,or,other,our,\
own,rather,said,say,says,she,should,since,so,some,than,that,the,their,\
them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,\
what,when,where,which,while,who,whom,why,will,with,would,yet,you,your]".split(',')

print("--- stopwords in english: {}".format(stopwords_))

--- stopwords in english: ['a', 'able', 'about', 'across', 'after', 'all', 'almost', 'also', 'am', 'among', 'an', 'and', 'any', 'are', 'as', 'at', 'be', 'because', 'been', 'but', 'by', 'can', 'could', 'dear', 'did', 'do', 'does', 'either', 'else', 'ever', 'every', 'for', 'from', 'get', 'got', 'had', 'has', 'have', 'he', 'her', 'hers', 'him', 'his', 'how', 'however', 'i', 'if', 'in', 'into', 'is', 'it', 'its', 'just', 'least', 'let', 'like', 'likely', 'may', 'me', 'might', 'most', 'must', 'my', 'neither', 'no', 'of', 'off', 'often', 'on', 'only', 'or', 'other', 'our', 'own', 'rather', 'said', 'say', 'says', 'she', 'should', 'since', 'so', 'some', 'than', 'that', 'the', 'their', 'them', 'then', 'there', 'these', 'they', 'this', 'tis', 'to', 'too', 'twas', 'us', 'wants', 'was', 'we', 'were', 'what', 'when', 'where', 'which', 'while', 'who', 'whom', 'why', 'will', 'with', 'would', 'yet', 'you', 'your]']


In [108]:
import string

punctuation_ = set(string.punctuation)
print("--- punctuation: {}".format(string.punctuation))

def filter_tokens(sent):
    return([w for w in sent if not w in stopwords_ and not w in punctuation_])

tokens_filtered = list(map(filter_tokens, tokens_lower))

for sent in tokens_filtered:
    print("--- sentence tokens: {}".format(sent))

--- punctuation: !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
--- sentence tokens: ['born', 'dixie', 'boomer', 'shack', 'little', 'shanty', 'railroad', 'track', 'freight', 'train', 'taught', 'cry', 'hummin', 'driver', 'lullaby', 'oh', 'lord', 'mama', 'bottom', 'rambling', 'shoes', 'whistle', 'blows', 'ta', 'go', 'baby', "n't", 'know', 'well', 'looks', "'m", 'never', 'gon', 'na', 'lose', 'freight', 'train', 'blues']


In [109]:
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import PorterStemmer

stemmer_porter = PorterStemmer()
tokens_stemporter = [list(map(stemmer_porter.stem, sent)) for sent in tokens_filtered]
print("--- sentence tokens (porter): {}".format(tokens_stemporter[0]))

stemmer_snowball = SnowballStemmer('english')
tokens_stemsnowball = [list(map(stemmer_snowball.stem, sent)) for sent in tokens_filtered]
print("--- sentence tokens (snowball): {}".format(tokens_stemsnowball[0]))

--- sentence tokens (porter): ['born', 'dixi', 'boomer', 'shack', 'littl', 'shanti', 'railroad', 'track', 'freight', 'train', 'taught', 'cri', 'hummin', 'driver', 'lullabi', 'oh', 'lord', 'mama', 'bottom', 'rambl', 'shoe', 'whistl', 'blow', 'ta', 'go', 'babi', "n't", 'know', 'well', 'look', "'m", 'never', 'gon', 'na', 'lose', 'freight', 'train', 'blue']
--- sentence tokens (snowball): ['born', 'dixi', 'boomer', 'shack', 'littl', 'shanti', 'railroad', 'track', 'freight', 'train', 'taught', 'cri', 'hummin', 'driver', 'lullabi', 'oh', 'lord', 'mama', 'bottom', 'rambl', 'shoe', 'whistl', 'blow', 'ta', 'go', 'babi', "n't", 'know', 'well', 'look', "'m", 'never', 'gon', 'na', 'lose', 'freight', 'train', 'blue']


In [110]:
from nltk.util import ngrams

list(ngrams(tokens_stemsnowball[0],4))

[('born', 'dixi', 'boomer', 'shack'),
 ('dixi', 'boomer', 'shack', 'littl'),
 ('boomer', 'shack', 'littl', 'shanti'),
 ('shack', 'littl', 'shanti', 'railroad'),
 ('littl', 'shanti', 'railroad', 'track'),
 ('shanti', 'railroad', 'track', 'freight'),
 ('railroad', 'track', 'freight', 'train'),
 ('track', 'freight', 'train', 'taught'),
 ('freight', 'train', 'taught', 'cri'),
 ('train', 'taught', 'cri', 'hummin'),
 ('taught', 'cri', 'hummin', 'driver'),
 ('cri', 'hummin', 'driver', 'lullabi'),
 ('hummin', 'driver', 'lullabi', 'oh'),
 ('driver', 'lullabi', 'oh', 'lord'),
 ('lullabi', 'oh', 'lord', 'mama'),
 ('oh', 'lord', 'mama', 'bottom'),
 ('lord', 'mama', 'bottom', 'rambl'),
 ('mama', 'bottom', 'rambl', 'shoe'),
 ('bottom', 'rambl', 'shoe', 'whistl'),
 ('rambl', 'shoe', 'whistl', 'blow'),
 ('shoe', 'whistl', 'blow', 'ta'),
 ('whistl', 'blow', 'ta', 'go'),
 ('blow', 'ta', 'go', 'babi'),
 ('ta', 'go', 'babi', "n't"),
 ('go', 'babi', "n't", 'know'),
 ('babi', "n't", 'know', 'well'),
 ("n't"

In [111]:
from nltk.util import ngrams

def join_sent_ngrams(input_tokens, n):
    # first add the 1-gram tokens
    ret_list = list(input_tokens)
    
    #then for each n
    for i in range(2,n+1):
        # add each n-grams to the list
        ret_list.extend(['-'.join(tgram) for tgram in ngrams(input_tokens, i)])
    
    return(ret_list)

tokens_ngrams = list(map(lambda x : join_sent_ngrams(x, 3), tokens_stemsnowball))

print("--- sentence tokens: {}".format(tokens_ngrams[0]))

--- sentence tokens: ['born', 'dixi', 'boomer', 'shack', 'littl', 'shanti', 'railroad', 'track', 'freight', 'train', 'taught', 'cri', 'hummin', 'driver', 'lullabi', 'oh', 'lord', 'mama', 'bottom', 'rambl', 'shoe', 'whistl', 'blow', 'ta', 'go', 'babi', "n't", 'know', 'well', 'look', "'m", 'never', 'gon', 'na', 'lose', 'freight', 'train', 'blue', 'born-dixi', 'dixi-boomer', 'boomer-shack', 'shack-littl', 'littl-shanti', 'shanti-railroad', 'railroad-track', 'track-freight', 'freight-train', 'train-taught', 'taught-cri', 'cri-hummin', 'hummin-driver', 'driver-lullabi', 'lullabi-oh', 'oh-lord', 'lord-mama', 'mama-bottom', 'bottom-rambl', 'rambl-shoe', 'shoe-whistl', 'whistl-blow', 'blow-ta', 'ta-go', 'go-babi', "babi-n't", "n't-know", 'know-well', 'well-look', "look-'m", "'m-never", 'never-gon', 'gon-na', 'na-lose', 'lose-freight', 'freight-train', 'train-blue', 'born-dixi-boomer', 'dixi-boomer-shack', 'boomer-shack-littl', 'shack-littl-shanti', 'littl-shanti-railroad', 'shanti-railroad-tra

In [112]:
import os               # for environ variables in Part 3
from src.nlp_pipeline import extract_bow_from_raw_text

In [113]:
parsed_songs = parse_many(html_docs[305:316])
type(parsed_songs)

list

In [121]:
parsed_songs[6]

([(0, 'E'),
  (2, 'A'),
  (7, 'E'),
  (13, 'A'),
  (16, 'Bm'),
  (22, 'A'),
  (29, 'E'),
  (35, 'Bm'),
  (39, 'D'),
  (43, 'A'),
  (51, 'E'),
  (59, 'A'),
  (63, 'Bm'),
  (68, 'A'),
  (75, 'E'),
  (82, 'Bm'),
  (86, 'D'),
  (88, 'E'),
  (90, 'E'),
  (92, 'A'),
  (97, 'E'),
  (103, 'A'),
  (106, 'Bm'),
  (112, 'A'),
  (119, 'E'),
  (125, 'Bm'),
  (130, 'D'),
  (137, 'A'),
  (140, 'Bm'),
  (144, 'E'),
  (150, 'A'),
  (154, 'Bm'),
  (161, 'A'),
  (167, 'E'),
  (175, 'Bm'),
  (179, 'D'),
  (183, 'E'),
  (185, 'A'),
  (190, 'E'),
  (196, 'A'),
  (199, 'Bm'),
  (205, 'A'),
  (212, 'E')],
 ['Four',
  'strong',
  'winds',
  'that',
  'blow',
  'lonely',
  'Seven',
  'seas',
  'that',
  'run',
  'high',
  'All',
  'those',
  'things',
  'that',
  "don't",
  'change',
  'come',
  'what',
  'may',
  'But',
  'the',
  'good',
  'times',
  'are',
  'all',
  'gone',
  'And',
  "I'm",
  'bound',
  'for',
  'moving',
  'on',
  "I'll",
  'look',
  'for',
  'you',
  'if',
  'you',
  'ever',
  'come',
  

In [122]:
song_dict_list = []
for song in parsed_songs:
    song_dict  = {}
    song_dict['lyrics'] = get_all_lyrics_chords(song)[0]
    song_dict['chords'] = get_all_lyrics_chords(song)[1]
    song_dict_list.append(song_dict)
# extracting bows
    bows = list(map(lambda song_dict: extract_bow_from_raw_text(song_dict['lyrics']), song_dict_list))

# displaying bows
for i in range(len(song_dict_list)):
    print("\n--- chords: {}".format(song_dict_list[i]['chords']))
    print("--- lyrics: {}".format(song_dict_list[i]['lyrics']))
    print("--- bow: {}".format(bows[i])) 


--- chords: ['F', 'Am', 'Bb', 'C', 'F', 'Am', 'Bb', 'C7', 'C', 'F', 'C', 'F', 'Am', 'Bb', 'C', 'F', 'Am', 'Bb', 'C', 'C', 'F', 'C', 'F', 'Am', 'Bb', 'C', 'F', 'Am', 'Bb', 'C', 'C', 'F', 'C']
--- lyrics: May God bless and keep you always May your wishes all come true May you always do for others And let others do for you May you build a ladder to the stars And climb on every rung May you stay forever young Forever young forever young May you stay forever young May you grow up to be righteous May you grow up to be true May you always know the truth And see the light surrounding you May you always be courageous Stand upright and be strong May you stay forever young Forever young forever young May you stay forever young May your hands always be busy May your feet always be swift May you have a strong foundation When the winds are changing shift May your heart always be joyful And may your song always be sung May you stay forever young Forever young forever young May you stay forever young

In [123]:
from collections import Counter

# term occurence = counting distinct words in each bag
term_occ = list(map(lambda bow : Counter(bow), bows))

# term frequency = occurences over length of bag
term_freq = list()
for i in range(len(song_dict_list)):
    term_freq.append( {k: (v / float(len(bows[i])))
                       for k, v in term_occ[i].items()} )

# displaying occurences
for i in range(len(song_dict_list)):
    print("\n--- lyrics: {}".format(song_dict_list[i]['lyrics']))
    print("--- bow: {}".format(bows[i]))
    print("--- term_occ: {}".format(term_occ[i]))
    print("--- term_freq: {}".format(term_freq[i]))


--- lyrics: May God bless and keep you always May your wishes all come true May you always do for others And let others do for you May you build a ladder to the stars And climb on every rung May you stay forever young Forever young forever young May you stay forever young May you grow up to be righteous May you grow up to be true May you always know the truth And see the light surrounding you May you always be courageous Stand upright and be strong May you stay forever young Forever young forever young May you stay forever young May your hands always be busy May your feet always be swift May you have a strong foundation When the winds are changing shift May your heart always be joyful And may your song always be sung May you stay forever young Forever young forever young May you stay forever young
--- bow: ["b'may", 'god', 'bless', 'may', 'wish', 'true', 'may', 'other', 'other', 'may', 'ladder', 'star', 'climb', 'rung', 'may', 'young', 'forev', 'young', 'young', 'may', 'young', 'may',

In [124]:
# document occurence = number of documents having this word
# term frequency = occurences over length of bag

doc_occ = Counter( [word for bow in bows for word in set(bow)] )

# document frequency = occurences over length of corpus
doc_freq = {k: (v / float(len(song_dict_list)))
            for k, v in doc_occ.items()}

# displaying vocabulary
print("\n--- full vocabulary: {}".format(doc_occ))
print("\n--- doc freq: {}".format(doc_freq))


--- full vocabulary: Counter({'god': 8, 'other': 7, 'bless': 6, 'ladder': 6, 'truth': 6, 'star': 6, 'rung': 6, 'may': 6, 'young': 6, 'true': 6, 'strong': 6, 'hand': 6, 'climb': 6, 'wish': 6, 'b': 6, 'stand': 5, 'foundat': 5, 'busi': 5, 'courag': 5, 'righteous': 5, 'heart': 5, 'wind': 5, 'upright': 5, 'feet': 5, 'joy': 5, 'song': 5, 'sung': 5, 'swift': 5, 'forev': 4, "b'may": 3, 'light': 3, 'chang': 3, 'lie': 3, "ev'ri": 2, 'high': 2, 'mind': 2, 'thing': 2, 'way': 2, 'good': 2, 'time': 2, 'floor': 2, 'eye': 2, 'shoe': 2, 'suit': 2, 'babi': 2, 'oh': 2, 'lord': 2, 'shift': 1, "b'i": 1, 'chord': 1, 'measur': 1, 'ung': 1, 'yo': 1, "b'1": 1, 'seven': 1, 'four': 1, 'weather': 1, 'bound': 1, 'fli': 1, 'snow': 1, 'blow': 1, 'springtim': 1, 'more': 1, 'fall': 1, 'hundr': 1, 'much': 1, 'fare': 1, 'friend': 1, 'alberta': 1, 'pocket': 1, 'dirt': 1, 'crutch': 1, 'jamaican': 1, 'thought': 1, 'thumb': 1, 'peic': 1, 'red': 1, 'cute': 1, 'come': 1, 'drawer': 1, 'boot': 1, 'rum': 1, 'last': 1, 'shirt': 