In [12]:
# from selenium.webdriver import Chrome
from bs4 import BeautifulSoup
import pymongo
# import datetime
# import time
import pandas as pd
import numpy as np
# import re
import warnings
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import train_test_split
import string
%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
mc = pymongo.MongoClient()
db = mc['chordify']
raw_html = db['raw_html']
parsed_songs_db = db["parsed_songs"]

In [3]:
parsed_songs_db

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'chordify'), 'parsed_songs')

In [8]:
html_docs = list(raw_html.find())

In [99]:
raw_html.find().count()

5872

In [4]:
parsed_songs = list(parsed_songs_db.find())

In [48]:
sep_lines_song = separate_lines(html_docs[3])

In [5]:
parsed_songs[3]

{'_id': ObjectId('5afb47b886b136c399fd8c35'),
 'chord_idxs': [[0, 'Dm'],
  [5, 'C'],
  [9, 'Dm'],
  [10, 'C'],
  [14, 'Dm'],
  [17, 'C'],
  [19, 'Dm'],
  [21, 'C'],
  [25, 'F'],
  [28, 'Am'],
  [30, 'Dm'],
  [31, 'C'],
  [36, 'Dm'],
  [39, 'C'],
  [41, 'Dm'],
  [43, 'C'],
  [44, 'Dm'],
  [46, 'Dm'],
  [49, 'C'],
  [52, 'Dm'],
  [54, 'C'],
  [57, 'Dm'],
  [62, 'C'],
  [64, 'Dm'],
  [68, 'C'],
  [71, 'F'],
  [75, 'Am'],
  [77, 'Dm'],
  [80, 'C'],
  [83, 'Dm'],
  [86, 'C'],
  [89, 'Dm'],
  [90, 'C'],
  [93, 'Dm'],
  [97, 'C'],
  [99, 'Dm'],
  [101, 'C'],
  [104, 'Dm'],
  [107, 'C'],
  [110, 'Dm'],
  [111, 'C'],
  [114, 'F'],
  [120, 'Am'],
  [122, 'Dm'],
  [124, 'C'],
  [125, 'Dm'],
  [127, 'Dm'],
  [131, 'C'],
  [134, 'Dm'],
  [136, 'C']],
 'words': ['As',
  'I',
  'went',
  'out',
  'one',
  'morning',
  'to',
  'breathe',
  'the',
  'air',
  'around',
  'Tom',
  "Paine's",
  'I',
  'spied',
  'the',
  'fairest',
  'damsel',
  'that',
  'ever',
  'did',
  'walk',
  'in',
  'chains',
  '

In [6]:
def get_lines_from_song(html_doc):
    if 'html' in html_doc:
        html = html_doc['html']
    elif 'song_html' in html_doc:
        html = html_doc['song_html']
    else:
        raise KeyError (f"Html not found for {html_doc.get('_id')}")
    soup = BeautifulSoup(html, 'html.parser')
    song_body = soup.select_one('pre._1YgOS')
    song_lines = str(song_body).split('\n')
    return song_lines


def strip_html(text):
    result = []
    in_tag = False
    for char in text:
        if char == '<':
            in_tag = True
        if not in_tag:
            result.append(char)
        if char == '>':
            in_tag = False
    return ''.join(result)


def separate_lines(html_doc):
    song_lines = get_lines_from_song(html_doc)
    lines = []
    for i, song_line in enumerate(song_lines):
        if '_3L0Da' in song_line:
            lines.append({'chords': strip_html(song_line)})
        elif (('_3L0Da' in song_lines[i-1]) and (song_line == song_line) and (strip_html(song_line) == song_line)):
            lines[-1]['words'] = song_line
    return lines


def get_chords(line):
    chord_idxs = []
    chords = []
    c_string = line['chords']
    for chord in re.finditer('\w+', c_string):
        chord_idxs.append(chord.start())
        chords.append(chord.group())
    chord_tups = list(zip(chord_idxs, chords))
    return chord_idxs, chords, chord_tups


# def get_words(line):
#     word_idxs = []
#     words = []
#     if 'words'in line:
#         w_string = line['words']
#         for word in re.finditer('\w+', w_string):
#             word_idxs.append(word.start())
#             words.append(word.group())
#     word_tups = list(zip(word_idxs, words))
#     return word_idxs, words, word_tups

def get_words(line):
    word_idxs = []
    words = []
    if 'words'in line:
        w_string = line['words']
        for word in re.finditer(r"\w[\w']+", w_string):
            word_idxs.append(word.start())
            words.append(word.group())
    word_tups = list(zip(word_idxs, words))
    return word_idxs, words, word_tups


def merge_chord_word(line):
    chord_tups = get_chords(line)[2]
    word_tups = get_words(line)[2]
    word_list = get_words(line)[1]
    chord_idx_list = []
    for chord_tup in chord_tups:
        for i, word_tup in enumerate(word_tups):
            if word_tup[0] > chord_tup[0]:
                chord_idx_list.append((i-1, chord_tup[1]))
                break
    return (chord_idx_list, word_list)


def combine_ch_wd_lists(merged_line_1, merged_line_2):
    chord_idx_list_1, word_list_1 = merged_line_1
    chord_idx_list_2, word_list_2 = merged_line_2
    new_chord_tups = []
    for chord_idx_tup in chord_idx_list_2:
        new_chord_tups.append( ( ( (chord_idx_tup)[0] + len(word_list_1) ), chord_idx_tup[1] ) )
    all_chords_tups = chord_idx_list_1 + new_chord_tups
    all_words_list = word_list_1 + word_list_2
    return all_chords_tups, all_words_list


def parse_lines(lines):
    for i, line in enumerate(lines):
        if i == 0:
            parsed_line = merge_chord_word(line)
        else:
            parsed_line_next = merge_chord_word(line)
            parsed_line = combine_ch_wd_lists(parsed_line, parsed_line_next)
    return parsed_line 

def parse_song(html_doc):
    lines = separate_lines(html_doc)
    parsed_song = parse_lines(lines)
    return parsed_song

def parse_many(some_docs):
    parsed_songs = []
    for some_doc in some_docs:
        try:
            parsed_song = parse_song(some_doc)
            parsed_songs.append(parsed_song)
        except KeyError as e:
            warnings.warn(e.message)
            continue 
    return parsed_songs 

In [28]:
def get_phrase_for_chord(one_parsed_song):
    word_list = one_parsed_song['words']
    phrase_tups = []
    for chord_set in one_parsed_song['chord_idxs']:
        word_idx = chord_set[0]
        chord_name = chord_set[1]
        phrase = word_list[(word_idx - 8):(word_idx + 3)]
        phrase_tups.append((chord_name, phrase))
    return phrase_tups



In [29]:
one_parsed_song = parsed_songs[3]
one_parsed_song

{'_id': ObjectId('5afb47b886b136c399fd8c35'),
 'chord_idxs': [[0, 'Dm'],
  [5, 'C'],
  [9, 'Dm'],
  [10, 'C'],
  [14, 'Dm'],
  [17, 'C'],
  [19, 'Dm'],
  [21, 'C'],
  [25, 'F'],
  [28, 'Am'],
  [30, 'Dm'],
  [31, 'C'],
  [36, 'Dm'],
  [39, 'C'],
  [41, 'Dm'],
  [43, 'C'],
  [44, 'Dm'],
  [46, 'Dm'],
  [49, 'C'],
  [52, 'Dm'],
  [54, 'C'],
  [57, 'Dm'],
  [62, 'C'],
  [64, 'Dm'],
  [68, 'C'],
  [71, 'F'],
  [75, 'Am'],
  [77, 'Dm'],
  [80, 'C'],
  [83, 'Dm'],
  [86, 'C'],
  [89, 'Dm'],
  [90, 'C'],
  [93, 'Dm'],
  [97, 'C'],
  [99, 'Dm'],
  [101, 'C'],
  [104, 'Dm'],
  [107, 'C'],
  [110, 'Dm'],
  [111, 'C'],
  [114, 'F'],
  [120, 'Am'],
  [122, 'Dm'],
  [124, 'C'],
  [125, 'Dm'],
  [127, 'Dm'],
  [131, 'C'],
  [134, 'Dm'],
  [136, 'C']],
 'words': ['As',
  'I',
  'went',
  'out',
  'one',
  'morning',
  'to',
  'breathe',
  'the',
  'air',
  'around',
  'Tom',
  "Paine's",
  'I',
  'spied',
  'the',
  'fairest',
  'damsel',
  'that',
  'ever',
  'did',
  'walk',
  'in',
  'chains',
  '

In [30]:
type(one_parsed_song['chord_idxs'])

list

In [50]:
song_with_phrases = get_phrase_for_chord(one_parsed_song)
song_with_phrases

[('Dm', []),
 ('C', []),
 ('Dm',
  ['I',
   'went',
   'out',
   'one',
   'morning',
   'to',
   'breathe',
   'the',
   'air',
   'around',
   'Tom']),
 ('C',
  ['went',
   'out',
   'one',
   'morning',
   'to',
   'breathe',
   'the',
   'air',
   'around',
   'Tom',
   "Paine's"]),
 ('Dm',
  ['to',
   'breathe',
   'the',
   'air',
   'around',
   'Tom',
   "Paine's",
   'I',
   'spied',
   'the',
   'fairest']),
 ('C',
  ['air',
   'around',
   'Tom',
   "Paine's",
   'I',
   'spied',
   'the',
   'fairest',
   'damsel',
   'that',
   'ever']),
 ('Dm',
  ['Tom',
   "Paine's",
   'I',
   'spied',
   'the',
   'fairest',
   'damsel',
   'that',
   'ever',
   'did',
   'walk']),
 ('C',
  ['I',
   'spied',
   'the',
   'fairest',
   'damsel',
   'that',
   'ever',
   'did',
   'walk',
   'in',
   'chains']),
 ('F',
  ['damsel',
   'that',
   'ever',
   'did',
   'walk',
   'in',
   'chains',
   'I',
   "offer'd",
   'her',
   'my']),
 ('Am',
  ['did',
   'walk',
   'in',
   'chains',

In [37]:
type(with_phrases[3])

tuple

In [56]:
def make_phrase_list(song_with_phrase_tuples):
    phrases = []
    chords = []
    is_minor = []
    for phrase_tup in song_with_phrase_tuples:
        phrases.append(phrase_tup[1])
        chords.append(phrase_tup[0])
        is_minor.append('m' in phrase_tup[0])
    return phrases, chords, is_minor    

In [57]:
phrases, chords, is_minor = make_phrase_list(song_with_phrases)
phrases, chords, is_minor

([[],
  [],
  ['I',
   'went',
   'out',
   'one',
   'morning',
   'to',
   'breathe',
   'the',
   'air',
   'around',
   'Tom'],
  ['went',
   'out',
   'one',
   'morning',
   'to',
   'breathe',
   'the',
   'air',
   'around',
   'Tom',
   "Paine's"],
  ['to',
   'breathe',
   'the',
   'air',
   'around',
   'Tom',
   "Paine's",
   'I',
   'spied',
   'the',
   'fairest'],
  ['air',
   'around',
   'Tom',
   "Paine's",
   'I',
   'spied',
   'the',
   'fairest',
   'damsel',
   'that',
   'ever'],
  ['Tom',
   "Paine's",
   'I',
   'spied',
   'the',
   'fairest',
   'damsel',
   'that',
   'ever',
   'did',
   'walk'],
  ['I',
   'spied',
   'the',
   'fairest',
   'damsel',
   'that',
   'ever',
   'did',
   'walk',
   'in',
   'chains'],
  ['damsel',
   'that',
   'ever',
   'did',
   'walk',
   'in',
   'chains',
   'I',
   "offer'd",
   'her',
   'my'],
  ['did',
   'walk',
   'in',
   'chains',
   'I',
   "offer'd",
   'her',
   'my',
   'hand',
   'she',
   'took'],
  ['i

In [59]:
def make_phrase_chord_df(phrases, chords, is_minor):
    df = pd.DataFrame({
        'contains_minor': is_minor,
        'chords': chords,
        'words': phrases
    })
    return df

In [60]:
make_phrase_chord_df(phrases, chords, is_minor)

Unnamed: 0,chords,contains_minor,words
0,Dm,True,[]
1,C,False,[]
2,Dm,True,"[I, went, out, one, morning, to, breathe, the,..."
3,C,False,"[went, out, one, morning, to, breathe, the, ai..."
4,Dm,True,"[to, breathe, the, air, around, Tom, Paine's, ..."
5,C,False,"[air, around, Tom, Paine's, I, spied, the, fai..."
6,Dm,True,"[Tom, Paine's, I, spied, the, fairest, damsel,..."
7,C,False,"[I, spied, the, fairest, damsel, that, ever, d..."
8,F,False,"[damsel, that, ever, did, walk, in, chains, I,..."
9,Am,True,"[did, walk, in, chains, I, offer'd, her, my, h..."


In [49]:
sep_lines_song

[{'chords': 'Dm                 C                      Dm   C         Dm',
  'words': "As I went out one morning, to breathe the air around Tom Paine's"},
 {'chords': '    Dm               C            Dm      C       Dm',
  'words': 'I spied the fairest damsel, that ever did walk in chains'},
 {'chords': '    F              Am       Dm    C       Dm',
  'words': "I offer'd her my hand, she took me by the arm"},
 {'chords': '    Dm            C            Dm      C    Dm',
  'words': 'I knew that very instant, she meant to do me harm'},
 {'chords': 'Dm                 C             Dm       C     Dm',
  'words': 'Depart from me this moment, I told her with my voice'},
 {'chords': ' Dm                       C       Dm            C        Dm',
  'words': 'Said she, "But I don\'t wish to, said I, but you have no choice'},
 {'chords': 'F                 Am         Dm                C       Dm',
  'words': 'I beg you, sir, she pleaded, from the corners of her mouth'},
 {'chords': 'Dm       

In [93]:
def get_phrase_for_chord(one_parsed_song):
    word_list = one_parsed_song['words']
    chord_phrase_tups = []
    for chord_set in one_parsed_song['chord_idxs']:
        word_idx = chord_set[0]
        chord_name = chord_set[1]
        phrase = word_list[(word_idx - 8):(word_idx + 3)]
        chord_phrase_tups.append((chord_name, phrase))
    return chord_phrase_tups


def make_phrase_is_minor_list(chord_phrase_tuples):
    phrases = []
    chords = []
    is_minor = []
    for chord_phrase_tup in chord_phrase_tuples:
        phrases.append(chord_phrase_tup[1])
        chords.append(chord_phrase_tup[0])
        is_minor.append('m' in chord_phrase_tup[0])
    return phrases, chords, is_minor


def get_full_phrase_is_minor_list(parsed_songs):
    phrases_all = []
    chords_all = []
    is_minor_all = []
    for song in parsed_songs:
        chord_phrase_tups = get_phrase_for_chord(song)
        phrases, chords, is_minor = make_phrase_is_minor_list(chord_phrase_tups)
        phrases_all.extend(phrases)
        chords_all.extend(chords)
        is_minor_all.extend(is_minor)
    return phrases_all, chords_all, is_minor_all

def make_phrase_chord_df(phrases_all, chords_all, is_minor_all):
    df = pd.DataFrame({
        'is_minor': is_minor_all,
        'chords': chords_all,
        'words': phrases_all
    })
    return df

In [91]:
parsed_songs_all = list(parsed_songs_db.find())
len(parsed_songs_all)

216

In [81]:
some_parsed_songs = parsed_songs_all[:10]

In [82]:
some_parsed_songs[9]

{'_id': ObjectId('5afb47b886b136c399fd8c3b'),
 'chord_idxs': [[1, 'G'],
  [6, 'C'],
  [10, 'C'],
  [18, 'C'],
  [27, 'G'],
  [31, 'D']],
 'words': ['I',
  'went',
  'to',
  'the',
  'doctor',
  'he',
  'says',
  "I'm",
  'alright',
  'I',
  'knew',
  'he',
  'was',
  "lyin'",
  "I'm",
  'losing',
  'my',
  'sight',
  'He',
  'should',
  'have',
  'examined',
  'the',
  'eyes',
  'of',
  'my',
  'mind',
  '20',
  '20',
  'vision',
  'and',
  "walkin'",
  'round',
  'blind']}

In [83]:
phs, chgs, mns = get_full_phrase_is_minor_list(some_parsed_songs)

In [84]:
make_phrase_chord_df(phs, chgs, mns)

Unnamed: 0,chords,is_minor,words
0,Dm,True,[]
1,C,False,[]
2,Dm,True,"[I, went, out, one, morning, to, breathe, the,..."
3,C,False,"[went, out, one, morning, to, breathe, the, ai..."
4,Dm,True,"[to, breathe, the, air, around, Tom, Paine's, ..."
5,C,False,"[air, around, Tom, Paine's, I, spied, the, fai..."
6,Dm,True,"[Tom, Paine's, I, spied, the, fairest, damsel,..."
7,C,False,"[I, spied, the, fairest, damsel, that, ever, d..."
8,F,False,"[damsel, that, ever, did, walk, in, chains, I,..."
9,Am,True,"[did, walk, in, chains, I, offer'd, her, my, h..."


In [112]:
pwd

'/Users/emilynaftalin/galvanize/dsi/capstone/Guitar-Chord-Generator'

In [88]:
cd .. 

/Users/emilynaftalin/galvanize/dsi/capstone/Guitar-Chord-Generator


In [114]:
%run src/logistic_by_line.py

In [117]:
df_by_line.tail()

Unnamed: 0,chords,is_minor,words
2989,D,False,pound you There's a lawman on your trail like ...
2990,A,False,There's a lawman on your trail like to surroun...
2991,D,False,lawman on your trail like to surround you Boun...
2992,A,False,to surround you Bounty hunters are dancing all...
2993,E,False,you Bounty hunters are dancing all around you ...


In [118]:
logistic_score

0.76101468624833113