In [12]:
# from selenium.webdriver import Chrome
from bs4 import BeautifulSoup
import pymongo
# import datetime
# import time
import pandas as pd
import numpy as np
# import re
import warnings
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import train_test_split
import string
%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
mc = pymongo.MongoClient()
db = mc['chordify']
raw_html = db['raw_html']
parsed_songs_db = db["parsed_songs"]

In [3]:
parsed_songs_db

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'chordify'), 'parsed_songs')

In [8]:
html_docs = list(raw_html.find())

In [215]:
raw_html.find().count()

6070

In [127]:
parsed_songs = list(parsed_songs_db.find())

In [128]:
def get_lines_from_song(html_doc):
    if 'html' in html_doc:
        html = html_doc['html']
    elif 'song_html' in html_doc:
        html = html_doc['song_html']
    else:
        raise KeyError (f"Html not found for {html_doc.get('_id')}")
    soup = BeautifulSoup(html, 'html.parser')
    song_body = soup.select_one('pre._1YgOS')
    song_lines = str(song_body).split('\n')
    return song_lines


def strip_html(text):
    result = []
    in_tag = False
    for char in text:
        if char == '<':
            in_tag = True
        if not in_tag:
            result.append(char)
        if char == '>':
            in_tag = False
    return ''.join(result)


def separate_lines(html_doc):
    song_lines = get_lines_from_song(html_doc)
    lines = []
    for i, song_line in enumerate(song_lines):
        if '_3L0Da' in song_line:
            lines.append({'chords': strip_html(song_line)})
        elif (('_3L0Da' in song_lines[i-1]) and (song_line == song_line) and (strip_html(song_line) == song_line)):
            lines[-1]['words'] = song_line
    return lines


def get_chords(line):
    chord_idxs = []
    chords = []
    c_string = line['chords']
    for chord in re.finditer('\w+', c_string):
        chord_idxs.append(chord.start())
        chords.append(chord.group())
    chord_tups = list(zip(chord_idxs, chords))
    return chord_idxs, chords, chord_tups


def get_words(line):
    word_idxs = []
    words = []
    if 'words'in line:
        w_string = line['words']
        for word in re.finditer(r"\w[\w']+", w_string):
            word_idxs.append(word.start())
            words.append(word.group())
    word_tups = list(zip(word_idxs, words))
    return word_idxs, words, word_tups


def merge_chord_word(line):
    chord_tups = get_chords(line)[2]
    word_tups = get_words(line)[2]
    word_list = get_words(line)[1]
    chord_idx_list = []
    for chord_tup in chord_tups:
        for i, word_tup in enumerate(word_tups):
            if word_tup[0] > chord_tup[0]:
                chord_idx_list.append((i-1, chord_tup[1]))
                break
    return (chord_idx_list, word_list)


def combine_ch_wd_lists(merged_line_1, merged_line_2):
    chord_idx_list_1, word_list_1 = merged_line_1
    chord_idx_list_2, word_list_2 = merged_line_2
    new_chord_tups = []
    for chord_idx_tup in chord_idx_list_2:
        new_chord_tups.append( ( ( (chord_idx_tup)[0] + len(word_list_1) ), chord_idx_tup[1] ) )
    all_chords_tups = chord_idx_list_1 + new_chord_tups
    all_words_list = word_list_1 + word_list_2
    return all_chords_tups, all_words_list


def parse_lines(lines):
    for i, line in enumerate(lines):
        if i == 0:
            parsed_line = merge_chord_word(line)
        else:
            parsed_line_next = merge_chord_word(line)
            parsed_line = combine_ch_wd_lists(parsed_line, parsed_line_next)
    return parsed_line 

def parse_song(html_doc):
    lines = separate_lines(html_doc)
    parsed_song = parse_lines(lines)
    return parsed_song

def parse_many(some_docs):
    parsed_songs = []
    for some_doc in some_docs:
        try:
            parsed_song = parse_song(some_doc)
            parsed_songs.append(parsed_song)
        except KeyError as e:
            warnings.warn(e.message)
            continue 
    return parsed_songs 

In [129]:
def get_phrase_for_chord(one_parsed_song):
    word_list = one_parsed_song['words']
    chord_phrase_tups = []
    for chord_set in one_parsed_song['chord_idxs']:
        word_idx = chord_set[0]
        chord_name = chord_set[1]
        phrase = ' '.join(word_list[(word_idx - 8):(word_idx + 3)])
        chord_phrase_tups.append((chord_name, phrase))
    return chord_phrase_tups


def make_phrase_is_minor_list(chord_phrase_tuples):
    phrases = []
    chords = []
    is_minor = []
    for chord_phrase_tup in chord_phrase_tuples:
        phrases.append(chord_phrase_tup[1])
        chords.append(chord_phrase_tup[0])
        is_minor.append('m' in chord_phrase_tup[0])
    return phrases, chords, is_minor


def get_full_phrase_is_minor_list(parsed_songs):
    phrases_all = []
    chords_all = []
    is_minor_all = []
    for song in parsed_songs:
        chord_phrase_tups = get_phrase_for_chord(song)
        phrases, chords, is_minor = make_phrase_is_minor_list(chord_phrase_tups)
        phrases_all.extend(phrases)
        chords_all.extend(chords)
        is_minor_all.extend(is_minor)
    return phrases_all, chords_all, is_minor_all

def make_phrase_chord_df(phrases_all, chords_all, is_minor_all):
    df = pd.DataFrame({
        'is_minor': is_minor_all,
        'chords': chords_all,
        'words': phrases_all
    })
    return df

In [130]:
one_parsed_song = parsed_songs[3]
one_parsed_song

{'_id': ObjectId('5afb47b886b136c399fd8c35'),
 'chord_idxs': [[0, 'Dm'],
  [5, 'C'],
  [9, 'Dm'],
  [10, 'C'],
  [14, 'Dm'],
  [17, 'C'],
  [19, 'Dm'],
  [21, 'C'],
  [25, 'F'],
  [28, 'Am'],
  [30, 'Dm'],
  [31, 'C'],
  [36, 'Dm'],
  [39, 'C'],
  [41, 'Dm'],
  [43, 'C'],
  [44, 'Dm'],
  [46, 'Dm'],
  [49, 'C'],
  [52, 'Dm'],
  [54, 'C'],
  [57, 'Dm'],
  [62, 'C'],
  [64, 'Dm'],
  [68, 'C'],
  [71, 'F'],
  [75, 'Am'],
  [77, 'Dm'],
  [80, 'C'],
  [83, 'Dm'],
  [86, 'C'],
  [89, 'Dm'],
  [90, 'C'],
  [93, 'Dm'],
  [97, 'C'],
  [99, 'Dm'],
  [101, 'C'],
  [104, 'Dm'],
  [107, 'C'],
  [110, 'Dm'],
  [111, 'C'],
  [114, 'F'],
  [120, 'Am'],
  [122, 'Dm'],
  [124, 'C'],
  [125, 'Dm'],
  [127, 'Dm'],
  [131, 'C'],
  [134, 'Dm'],
  [136, 'C']],
 'words': ['As',
  'I',
  'went',
  'out',
  'one',
  'morning',
  'to',
  'breathe',
  'the',
  'air',
  'around',
  'Tom',
  "Paine's",
  'I',
  'spied',
  'the',
  'fairest',
  'damsel',
  'that',
  'ever',
  'did',
  'walk',
  'in',
  'chains',
  '

In [131]:
song_with_phrases = get_phrase_for_chord(one_parsed_song)
song_with_phrases

[('Dm', ''),
 ('C', ''),
 ('Dm', 'I went out one morning to breathe the air around Tom'),
 ('C', "went out one morning to breathe the air around Tom Paine's"),
 ('Dm', "to breathe the air around Tom Paine's I spied the fairest"),
 ('C', "air around Tom Paine's I spied the fairest damsel that ever"),
 ('Dm', "Tom Paine's I spied the fairest damsel that ever did walk"),
 ('C', 'I spied the fairest damsel that ever did walk in chains'),
 ('F', "damsel that ever did walk in chains I offer'd her my"),
 ('Am', "did walk in chains I offer'd her my hand she took"),
 ('Dm', "in chains I offer'd her my hand she took me by"),
 ('C', "chains I offer'd her my hand she took me by the"),
 ('Dm', 'hand she took me by the arm I knew that very'),
 ('C', 'me by the arm I knew that very instant she meant'),
 ('Dm', 'the arm I knew that very instant she meant to do'),
 ('C', 'I knew that very instant she meant to do me harm'),
 ('Dm', 'knew that very instant she meant to do me harm Depart'),
 ('Dm', 'very 

In [132]:
phrases, chords, is_minor = make_phrase_is_minor_list(song_with_phrases)
phrases, chords, is_minor

(['',
  '',
  'I went out one morning to breathe the air around Tom',
  "went out one morning to breathe the air around Tom Paine's",
  "to breathe the air around Tom Paine's I spied the fairest",
  "air around Tom Paine's I spied the fairest damsel that ever",
  "Tom Paine's I spied the fairest damsel that ever did walk",
  'I spied the fairest damsel that ever did walk in chains',
  "damsel that ever did walk in chains I offer'd her my",
  "did walk in chains I offer'd her my hand she took",
  "in chains I offer'd her my hand she took me by",
  "chains I offer'd her my hand she took me by the",
  'hand she took me by the arm I knew that very',
  'me by the arm I knew that very instant she meant',
  'the arm I knew that very instant she meant to do',
  'I knew that very instant she meant to do me harm',
  'knew that very instant she meant to do me harm Depart',
  'very instant she meant to do me harm Depart from me',
  'meant to do me harm Depart from me this moment I',
  'me harm Dep

In [133]:
def make_phrase_chord_df(phrases_all, chords_all, is_minor_all):
    df = pd.DataFrame({
        'is_minor': is_minor_all,
        'chords': chords_all,
        'words': phrases_all
    })
    return df

In [135]:
sample_df = make_phrase_chord_df(phrases, chords, is_minor)
sample_df.head(7)

Unnamed: 0,chords,is_minor,words
0,Dm,True,
1,C,False,
2,Dm,True,I went out one morning to breathe the air arou...
3,C,False,went out one morning to breathe the air around...
4,Dm,True,to breathe the air around Tom Paine's I spied ...
5,C,False,air around Tom Paine's I spied the fairest dam...
6,Dm,True,Tom Paine's I spied the fairest damsel that ev...


In [49]:
sep_lines_song

[{'chords': 'Dm                 C                      Dm   C         Dm',
  'words': "As I went out one morning, to breathe the air around Tom Paine's"},
 {'chords': '    Dm               C            Dm      C       Dm',
  'words': 'I spied the fairest damsel, that ever did walk in chains'},
 {'chords': '    F              Am       Dm    C       Dm',
  'words': "I offer'd her my hand, she took me by the arm"},
 {'chords': '    Dm            C            Dm      C    Dm',
  'words': 'I knew that very instant, she meant to do me harm'},
 {'chords': 'Dm                 C             Dm       C     Dm',
  'words': 'Depart from me this moment, I told her with my voice'},
 {'chords': ' Dm                       C       Dm            C        Dm',
  'words': 'Said she, "But I don\'t wish to, said I, but you have no choice'},
 {'chords': 'F                 Am         Dm                C       Dm',
  'words': 'I beg you, sir, she pleaded, from the corners of her mouth'},
 {'chords': 'Dm       

In [136]:
some_parsed_songs = parsed_songs[:10]

In [137]:
some_parsed_songs[9]

{'_id': ObjectId('5afb47b886b136c399fd8c3b'),
 'chord_idxs': [[1, 'G'],
  [6, 'C'],
  [10, 'C'],
  [18, 'C'],
  [27, 'G'],
  [31, 'D']],
 'words': ['I',
  'went',
  'to',
  'the',
  'doctor',
  'he',
  'says',
  "I'm",
  'alright',
  'I',
  'knew',
  'he',
  'was',
  "lyin'",
  "I'm",
  'losing',
  'my',
  'sight',
  'He',
  'should',
  'have',
  'examined',
  'the',
  'eyes',
  'of',
  'my',
  'mind',
  '20',
  '20',
  'vision',
  'and',
  "walkin'",
  'round',
  'blind']}

In [112]:
pwd

'/Users/emilynaftalin/galvanize/dsi/capstone/Guitar-Chord-Generator'

In [88]:
cd .. 

/Users/emilynaftalin/galvanize/dsi/capstone/Guitar-Chord-Generator


In [138]:
%run src/logistic_by_line.py

In [139]:
logistic_score

0.77036048064085449

In [179]:
import nltk, re, pprint
from nltk import word_tokenize

In [178]:
words_test

2281             3 0 2 0 You walk into the room with your
2086    fawn I courted her proudly but now she is gone...
992     and went barefoot servants too Outside in the ...
169     How long must I suffer such abuse Won't you le...
2675    know what it is Do you Mister Jones You have many
2634                                                     
2293    you will say when you get home Be cause someth...
374     the good die young Now I just turn around and ...
2121     innocence of a lamb she was gentle like a fawn I
150     room one time before I finally disappear Every...
388     Now I just turn around and he's go one Anybody...
680             Its done ground all to molasses M m m m m
394     friend Martin Can you tell me where he's gone ...
1289      you I don't want to meet your kin Make you spin
2517    milk Or else go home Because something is happ...
1186    you Simplify you classify you Deny Defy or cru...
1169    distance A wild cat did growl Two riders were ...
306       love

In [172]:
len(words_test)

749

In [188]:
len(words_train)

2245

In [174]:
test_matrix

<749x1380 sparse matrix of type '<class 'numpy.float64'>'
	with 6451 stored elements in Compressed Sparse Row format>

In [171]:
test_matrix.shape

(749, 1380)

In [170]:
print(test_matrix)

  (0, 1375)	0.305905288139
  (0, 1373)	0.175342859806
  (0, 1344)	0.325739547117
  (0, 1286)	0.511919092942
  (0, 1172)	0.174375823271
  (0, 976)	0.463725501206
  (0, 617)	0.511919092942
  (1, 1027)	0.25348176941
  (1, 921)	0.378370107467
  (1, 818)	0.250771935113
  (1, 618)	0.160423485516
  (1, 560)	0.237532623784
  (1, 512)	0.588409236892
  (1, 412)	0.369947757428
  (1, 250)	0.369947757428
  (1, 172)	0.166105184544
  (2, 1335)	0.380474394675
  (2, 1313)	0.357246118249
  (2, 1216)	0.315725455852
  (2, 1172)	0.129601604472
  (2, 1016)	0.363068894834
  (2, 845)	0.357246118249
  (2, 606)	0.206875694319
  (2, 320)	0.376594030508
  (2, 91)	0.376594030508
  :	:
  (745, 746)	0.416861551133
  (745, 684)	0.653420990882
  (745, 457)	0.231104147305
  (745, 415)	0.33200018972
  (745, 96)	0.244224684073
  (746, 1373)	0.42310604715
  (746, 101)	0.906080169116
  (747, 1373)	0.129947406496
  (747, 1207)	0.163926031882
  (747, 1178)	0.290907812008
  (747, 1172)	0.129230731236
  (747, 968)	0.3590608638

In [165]:
logistic.predict(test_matrix)

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,  True, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False,  True,
       False, False, False,  True, False, False, False, False, False,
       False, False, False,  True,  True, False, False, False,  True,
       False, False, False, False, False, False, False, False, False,
       False,  True, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False,  True,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False,  True, False, False,
        True, False, False, False, False, False, False, False,  True,
       False, False,

In [194]:
some_words = 'these are just sample words'

tokenizer = tfidf.build_tokenizer()

tokens = tokenizer(some_words)

sample_words = tfidf.transform(tokenizer('sadness awful set up down'))
sample_words

<5x1380 sparse matrix of type '<class 'numpy.float64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [195]:
text = nltk.Text(tokens)
text

<Text: these are just sample words...>

In [196]:
text_mat = tfidf.transform(text)
text_mat

<5x1380 sparse matrix of type '<class 'numpy.float64'>'
	with 4 stored elements in Compressed Sparse Row format>

In [197]:
logistic.predict_proba(sample_words)

array([[ 0.76429898,  0.23570102],
       [ 0.76429898,  0.23570102],
       [ 0.72391842,  0.27608158],
       [ 0.8944079 ,  0.1055921 ],
       [ 0.73894388,  0.26105612]])

In [201]:
%run src/LogisticLyricAnalyzer.py

In [213]:
tfidf.transform(['list', 'of', 'words'])        

<3x3 sparse matrix of type '<class 'numpy.float64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [211]:
lla = LogisticLyricAnalyzer(tfidf, logistic)
lla.predict(['list', 'of', 'words'])

ValueError: X has 3 features per sample; expecting 1380