In [1]:
# from selenium.webdriver import Chrome
# import datetime
# import time
import re
import string
import warnings

import numpy as np
import pandas as pd
import pymongo
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
# from sklearn.model_selection import train_test_split

%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
# Open a MongoDB connection with the driver defaults (the repr shown in the
# next cell reports localhost:27017) and select the 'chordify' database.
mc = pymongo.MongoClient()
db = mc['chordify']
# Handles to the two collections read further down in this notebook.
raw_html = db['raw_html']
parsed_songs_db = db["parsed_songs"]

In [3]:
parsed_songs_db

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'chordify'), 'parsed_songs')

In [4]:
# Materialise every document of the raw_html collection into an in-memory list.
html_docs = list(raw_html.find())

In [113]:
# Count on the collection itself: Cursor.count() was deprecated in PyMongo 3.7
# and removed in 4.0; count_documents({}) is the supported replacement.
raw_html.count_documents({})

6183

In [6]:
# Materialise every document of the parsed_songs collection into a list.
parsed_songs = list(parsed_songs_db.find())

In [7]:
def get_lines_from_song(html_doc):
    """Return the song's ``<pre class="_1YgOS">`` element as a list of lines.

    Parameters
    ----------
    html_doc : dict
        Document holding the page markup under the key ``'html'`` or,
        failing that, ``'song_html'``.

    Returns
    -------
    list of str
        The string form of the matched element (markup included) split on
        newlines, or an empty list when the page contains no such element.

    Raises
    ------
    KeyError
        If the document has neither an ``'html'`` nor a ``'song_html'`` key.
    """
    if 'html' in html_doc:
        html = html_doc['html']
    elif 'song_html' in html_doc:
        html = html_doc['song_html']
    else:
        raise KeyError(f"Html not found for {html_doc.get('_id')}")
    soup = BeautifulSoup(html, 'html.parser')
    song_body = soup.select_one('pre._1YgOS')
    if song_body is None:
        # select_one() returns None when nothing matches; str(None) would
        # otherwise be returned as the bogus single line 'None'.
        return []
    return str(song_body).split('\n')


def strip_html(text):
    """Remove everything between ``<`` and the next ``>`` (inclusive) from ``text``.

    A ``<`` with no closing ``>`` discards the rest of the string; a ``>``
    that is not preceded by an open ``<`` is kept as ordinary text.
    """
    kept = []
    cursor = 0
    while True:
        tag_open = text.find('<', cursor)
        if tag_open == -1:
            # No further tags: the remainder is plain text.
            kept.append(text[cursor:])
            break
        kept.append(text[cursor:tag_open])
        tag_close = text.find('>', tag_open)
        if tag_close == -1:
            # Unterminated tag swallows everything up to the end.
            break
        cursor = tag_close + 1
    return ''.join(kept)


def separate_lines(html_doc):
    """Split a song document into chord lines and the lyric lines below them.

    A line containing the marker ``'_3L0Da'`` is treated as a chord line and
    stored, with its tags stripped, under ``'chords'``. If the line directly
    after it contains no markup at all it is stored under ``'words'`` in the
    same dict.

    Parameters
    ----------
    html_doc : dict
        Document accepted by ``get_lines_from_song``.

    Returns
    -------
    list of dict
        One dict per chord line: always a ``'chords'`` key, plus ``'words'``
        when a tag-free line immediately follows.
    """
    # Local import keeps this cell self-contained.
    from html import unescape

    lines = []
    previous_was_chords = False
    for song_line in get_lines_from_song(html_doc):
        if '_3L0Da' in song_line:
            lines.append({'chords': strip_html(song_line)})
            previous_was_chords = True
            continue
        # Tracking the previous line explicitly avoids the old
        # song_lines[i-1] lookup, which wrapped round to the LAST line when
        # i == 0 and could index into an empty `lines` list.
        if previous_was_chords and strip_html(song_line) == song_line:
            # The lines come from serialised markup, so '&' arrives as
            # '&amp;' etc.; decode entities so they do not turn into
            # spurious words such as 'amp'.
            lines[-1]['words'] = unescape(song_line)
        previous_was_chords = False
    return lines


def get_chords(line):
    """Locate the chord tokens in a line's ``'chords'`` string.

    Parameters
    ----------
    line : dict
        Must contain a ``'chords'`` string.

    Returns
    -------
    tuple
        ``(chord_idxs, chords, chord_tups)``: the start offset of every
        token, the token texts, and the two zipped as ``(offset, text)``.

    Notes
    -----
    Tokens are runs of ``\\w`` characters, so a name containing ``#`` or
    ``/`` is split (``'F#m'`` yields ``'F'`` and ``'m'``).
    NOTE(review): widen the pattern if sharps/slash chords should be kept.
    """
    # Raw string: '\w' in a plain literal is an invalid escape sequence.
    matches = list(re.finditer(r'\w+', line['chords']))
    chord_idxs = [match.start() for match in matches]
    chords = [match.group() for match in matches]
    chord_tups = list(zip(chord_idxs, chords))
    return chord_idxs, chords, chord_tups


def get_words(line):
    """Locate the words in a line's optional ``'words'`` string.

    Parameters
    ----------
    line : dict
        May contain a ``'words'`` string; when the key is absent every
        returned list is empty.

    Returns
    -------
    tuple
        ``(word_idxs, words, word_tups)``: the start offset of every word,
        the word texts, and the two zipped as ``(offset, text)``.
    """
    word_idxs = []
    words = []
    if 'words' in line:
        # A word is one \w character followed by any number of \w or
        # apostrophes. The previous quantifier (+) required two characters
        # and therefore dropped one-letter words such as "I" and "a".
        for match in re.finditer(r"\w[\w']*", line['words']):
            word_idxs.append(match.start())
            words.append(match.group())
    word_tups = list(zip(word_idxs, words))
    return word_idxs, words, word_tups


def merge_chord_word(line):
    """Attach every chord of a line to the word it is positioned over.

    A chord is assigned to the last word whose start offset is not greater
    than the chord's offset. A chord placed before the first word gets index
    ``-1``. Lines without any words contribute no chords.

    Parameters
    ----------
    line : dict
        Line dict accepted by ``get_chords`` and ``get_words``.

    Returns
    -------
    tuple
        ``(chord_idx_list, word_list)`` where ``chord_idx_list`` holds
        ``(word_index, chord_name)`` tuples and ``word_list`` the line's words.
    """
    chord_tups = get_chords(line)[2]
    # Single call; the offsets and the word list come from the same result.
    word_starts, word_list, _ = get_words(line)
    chord_idx_list = []
    if not word_starts:
        return (chord_idx_list, word_list)
    last_word_idx = len(word_starts) - 1
    for chord_start, chord_name in chord_tups:
        # Default covers a chord sitting at or beyond the start of the final
        # word; previously such a chord was silently dropped because no
        # later word existed to trigger the append.
        word_idx = last_word_idx
        for i, word_start in enumerate(word_starts):
            if word_start > chord_start:
                word_idx = i - 1
                break
        chord_idx_list.append((word_idx, chord_name))
    return (chord_idx_list, word_list)


def combine_ch_wd_lists(merged_line_1, merged_line_2):
    """Concatenate two ``(chord_idx_list, word_list)`` pairs into one.

    The word indices of the second pair's chords are shifted by the number
    of words in the first pair so they keep pointing at the same words in
    the joined word list.

    Returns
    -------
    tuple
        ``(all_chord_tuples, all_words)``.
    """
    first_chords, first_words = merged_line_1
    second_chords, second_words = merged_line_2
    offset = len(first_words)
    shifted_chords = [(entry[0] + offset, entry[1]) for entry in second_chords]
    return first_chords + shifted_chords, first_words + second_words


def parse_lines(lines):
    """Merge every line of a song into a single ``(chords, words)`` pair.

    Each line is converted with ``merge_chord_word`` and appended to the
    running result with ``combine_ch_wd_lists``, which re-bases the chord
    word-indices onto the growing word list.

    Parameters
    ----------
    lines : list of dict
        Line dicts as produced by ``separate_lines``.

    Returns
    -------
    tuple
        ``(chord_idx_list, word_list)`` for the whole song; ``([], [])`` when
        ``lines`` is empty (this used to raise ``UnboundLocalError``).
    """
    # Starting from an empty pair makes the first line an ordinary case and
    # gives a defined result for a song with no lines.
    parsed_line = ([], [])
    for line in lines:
        parsed_line = combine_ch_wd_lists(parsed_line, merge_chord_word(line))
    return parsed_line

def parse_song(html_doc):
    """Parse one song document: split it into lines, then merge those lines."""
    return parse_lines(separate_lines(html_doc))

def parse_many(some_docs):
    """Parse several song documents, skipping those that raise ``KeyError``.

    Parameters
    ----------
    some_docs : iterable of dict
        Documents accepted by ``parse_song``.

    Returns
    -------
    list
        The ``parse_song`` result of every document that parsed; a document
        that raises ``KeyError`` is reported via ``warnings.warn`` and left
        out of the result.
    """
    parsed_songs = []
    for some_doc in some_docs:
        try:
            parsed_songs.append(parse_song(some_doc))
        except KeyError as e:
            # Python 3 exceptions have no `.message`; reading it raised
            # AttributeError inside this handler. args[0] is the original
            # text without the extra quotes str(KeyError) adds.
            warnings.warn(str(e.args[0]) if e.args else str(e))
    return parsed_songs

In [8]:
def get_phrase_for_chord(one_parsed_song):
    word_list = one_parsed_song['words']
    chord_phrase_tups = []
    for chord_set in one_parsed_song['chord_idxs']:
        word_idx = chord_set[0]
        chord_name = chord_set[1]
        phrase = ' '.join(word_list[(word_idx - 8):(word_idx + 3)])
        chord_phrase_tups.append((chord_name, phrase))
    return chord_phrase_tups


def make_phrase_is_minor_list(chord_phrase_tuples):
    """Unzip ``(chord, phrase)`` tuples and flag the chords that look minor.

    Parameters
    ----------
    chord_phrase_tuples : iterable of tuple
        ``(chord_name, phrase)`` pairs.

    Returns
    -------
    tuple
        ``(phrases, chords, is_minor)`` as three parallel lists. A chord is
        flagged when its name contains a lowercase ``'m'`` once every
        ``'maj'`` has been removed.
    """
    phrases = []
    chords = []
    is_minor = []
    for chord_phrase_tup in chord_phrase_tuples:
        chord_name = chord_phrase_tup[0]
        phrases.append(chord_phrase_tup[1])
        chords.append(chord_name)
        # A bare `'m' in chord_name` also matched the 'm' of 'maj', so major
        # sevenths such as 'Cmaj7' were labelled minor.
        is_minor.append('m' in chord_name.replace('maj', ''))
    return phrases, chords, is_minor


def get_full_phrase_is_minor_list(parsed_songs):
    """Collect phrases, chords and minor flags across a collection of songs.

    Every song goes through ``get_phrase_for_chord`` and
    ``make_phrase_is_minor_list``; the per-song lists are concatenated in
    song order.

    Returns
    -------
    tuple
        ``(phrases_all, chords_all, is_minor_all)`` as three parallel lists.
    """
    phrases_all, chords_all, is_minor_all = [], [], []
    for song in parsed_songs:
        song_phrases, song_chords, song_flags = make_phrase_is_minor_list(
            get_phrase_for_chord(song)
        )
        phrases_all += song_phrases
        chords_all += song_chords
        is_minor_all += song_flags
    return phrases_all, chords_all, is_minor_all

def make_phrase_chord_df(phrases_all, chords_all, is_minor_all):
    """Assemble the three parallel lists into one DataFrame.

    The resulting columns are ``'is_minor'``, ``'chords'`` and ``'words'``
    (the phrases), one row per chord occurrence.
    """
    column_data = dict(
        is_minor=is_minor_all,
        chords=chords_all,
        words=phrases_all,
    )
    return pd.DataFrame(column_data)

In [9]:
# Pick one stored song for inspection. The displayed document carries '_id',
# 'chord_idxs' ([word index, chord name] pairs) and 'words' (list of words).
one_parsed_song = parsed_songs[3]
one_parsed_song

{'_id': ObjectId('5afb47b886b136c399fd8c35'),
 'chord_idxs': [[0, 'Dm'],
  [5, 'C'],
  [9, 'Dm'],
  [10, 'C'],
  [14, 'Dm'],
  [17, 'C'],
  [19, 'Dm'],
  [21, 'C'],
  [25, 'F'],
  [28, 'Am'],
  [30, 'Dm'],
  [31, 'C'],
  [36, 'Dm'],
  [39, 'C'],
  [41, 'Dm'],
  [43, 'C'],
  [44, 'Dm'],
  [46, 'Dm'],
  [49, 'C'],
  [52, 'Dm'],
  [54, 'C'],
  [57, 'Dm'],
  [62, 'C'],
  [64, 'Dm'],
  [68, 'C'],
  [71, 'F'],
  [75, 'Am'],
  [77, 'Dm'],
  [80, 'C'],
  [83, 'Dm'],
  [86, 'C'],
  [89, 'Dm'],
  [90, 'C'],
  [93, 'Dm'],
  [97, 'C'],
  [99, 'Dm'],
  [101, 'C'],
  [104, 'Dm'],
  [107, 'C'],
  [110, 'Dm'],
  [111, 'C'],
  [114, 'F'],
  [120, 'Am'],
  [122, 'Dm'],
  [124, 'C'],
  [125, 'Dm'],
  [127, 'Dm'],
  [131, 'C'],
  [134, 'Dm'],
  [136, 'C']],
 'words': ['As',
  'I',
  'went',
  'out',
  'one',
  'morning',
  'to',
  'breathe',
  'the',
  'air',
  'around',
  'Tom',
  "Paine's",
  'I',
  'spied',
  'the',
  'fairest',
  'damsel',
  'that',
  'ever',
  'did',
  'walk',
  'in',
  'chains',
  '

In [10]:
# Pair every chord of the sample song with the words around it.
song_with_phrases = get_phrase_for_chord(one_parsed_song)
song_with_phrases

[('Dm', ''),
 ('C', ''),
 ('Dm', 'I went out one morning to breathe the air around Tom'),
 ('C', "went out one morning to breathe the air around Tom Paine's"),
 ('Dm', "to breathe the air around Tom Paine's I spied the fairest"),
 ('C', "air around Tom Paine's I spied the fairest damsel that ever"),
 ('Dm', "Tom Paine's I spied the fairest damsel that ever did walk"),
 ('C', 'I spied the fairest damsel that ever did walk in chains'),
 ('F', "damsel that ever did walk in chains I offer'd her my"),
 ('Am', "did walk in chains I offer'd her my hand she took"),
 ('Dm', "in chains I offer'd her my hand she took me by"),
 ('C', "chains I offer'd her my hand she took me by the"),
 ('Dm', 'hand she took me by the arm I knew that very'),
 ('C', 'me by the arm I knew that very instant she meant'),
 ('Dm', 'the arm I knew that very instant she meant to do'),
 ('C', 'I knew that very instant she meant to do me harm'),
 ('Dm', 'knew that very instant she meant to do me harm Depart'),
 ('Dm', 'very 

In [11]:
# Split the (chord, phrase) tuples into three parallel lists and display them.
phrases, chords, is_minor = make_phrase_is_minor_list(song_with_phrases)
phrases, chords, is_minor

(['',
  '',
  'I went out one morning to breathe the air around Tom',
  "went out one morning to breathe the air around Tom Paine's",
  "to breathe the air around Tom Paine's I spied the fairest",
  "air around Tom Paine's I spied the fairest damsel that ever",
  "Tom Paine's I spied the fairest damsel that ever did walk",
  'I spied the fairest damsel that ever did walk in chains',
  "damsel that ever did walk in chains I offer'd her my",
  "did walk in chains I offer'd her my hand she took",
  "in chains I offer'd her my hand she took me by",
  "chains I offer'd her my hand she took me by the",
  'hand she took me by the arm I knew that very',
  'me by the arm I knew that very instant she meant',
  'the arm I knew that very instant she meant to do',
  'I knew that very instant she meant to do me harm',
  'knew that very instant she meant to do me harm Depart',
  'very instant she meant to do me harm Depart from me',
  'meant to do me harm Depart from me this moment I',
  'me harm Dep

In [12]:
# NOTE(review): this is a verbatim repeat of the make_phrase_chord_df
# definition in cell In [8]; running this cell rebinds the same name to an
# identical function. Consider keeping a single definition.
def make_phrase_chord_df(phrases_all, chords_all, is_minor_all):
    """Build a DataFrame with columns 'is_minor', 'chords' and 'words' from three parallel lists."""
    df = pd.DataFrame({
        'is_minor': is_minor_all,
        'chords': chords_all,
        'words': phrases_all
    })
    return df

In [13]:
# Frame for the single sample song; show the first seven rows only.
sample_df = make_phrase_chord_df(phrases, chords, is_minor)
sample_df.head(7)

Unnamed: 0,chords,is_minor,words
0,Dm,True,
1,C,False,
2,Dm,True,I went out one morning to breathe the air arou...
3,C,False,went out one morning to breathe the air around...
4,Dm,True,to breathe the air around Tom Paine's I spied ...
5,C,False,air around Tom Paine's I spied the fairest dam...
6,Dm,True,Tom Paine's I spied the fairest damsel that ev...


In [15]:
# Work with the first ten stored songs only.
some_parsed_songs = parsed_songs[:10]

In [16]:
some_parsed_songs[9]

{'_id': ObjectId('5afb47b886b136c399fd8c3b'),
 'chord_idxs': [[1, 'G'],
  [6, 'C'],
  [10, 'C'],
  [18, 'C'],
  [27, 'G'],
  [31, 'D']],
 'words': ['I',
  'went',
  'to',
  'the',
  'doctor',
  'he',
  'says',
  "I'm",
  'alright',
  'I',
  'knew',
  'he',
  'was',
  "lyin'",
  "I'm",
  'losing',
  'my',
  'sight',
  'He',
  'should',
  'have',
  'examined',
  'the',
  'eyes',
  'of',
  'my',
  'mind',
  '20',
  '20',
  'vision',
  'and',
  "walkin'",
  'round',
  'blind']}

In [17]:
pwd

'/Users/emilynaftalin/galvanize/dsi/capstone/Guitar-Chord-Generator/notebooks'

In [18]:
cd .. 

/Users/emilynaftalin/galvanize/dsi/capstone/Guitar-Chord-Generator


In [40]:
# Run src/logistic_by_line.py. The cells below read logistic_score,
# words_test, words_train, test_matrix, tfidf, logistic and df_by_line, none
# of which is assigned in this notebook; presumably the script defines them
# (TODO confirm against src/logistic_by_line.py).
%run src/logistic_by_line.py

In [41]:
logistic_score

0.75834445927903871

In [42]:
import nltk, re, pprint
from nltk import word_tokenize

In [43]:
words_test

1565    gave them time to draw their own blades When a...
1241      you down or bring you down All I really want to
934       the line know what any of it is worth No reason
2489    of F Scott Fitzgerald's books You're very well...
1199    Beat or cheat or mistreat you Simplify you cla...
2733    law against you comin' around You should be ma...
917     the thief There's too much confusion I can't g...
295       me I wear the ball and chain My patron saint is
61        Won't you let me in your room one time before I
1139    worth No reason to get excited The thief he ki...
2441                                                     
2189    matter And so it did happen like it could have...
699     be alive Go down Old Hannah don't you rise no ...
280                       3 amp 4 amp 1 amp 2 amp 3 amp 4
1325                                                     
2367    what it is do you Mister Jones Well the sword ...
1756                               2 0 2 x4 e 3 3 3 3 3 3
2950    ponder

In [44]:
len(words_test)

749

In [45]:
len(words_train)

2245

In [46]:
test_matrix

<749x1393 sparse matrix of type '<class 'numpy.float64'>'
	with 6630 stored elements in Compressed Sparse Row format>

In [47]:
test_matrix.shape

(749, 1393)

In [48]:
print(test_matrix)

  (0, 1333)	0.295616186639
  (0, 1248)	0.518598944427
  (0, 1218)	0.168397108748
  (0, 1213)	0.317363368692
  (0, 1185)	0.330014911647
  (0, 1184)	0.369776153835
  (0, 849)	0.518598944427
  (1, 1386)	0.263769747309
  (1, 1306)	0.327389648958
  (1, 1218)	0.166165858921
  (1, 940)	0.345737014761
  (1, 836)	0.250075166203
  (1, 329)	0.568150234981
  (1, 159)	0.485868385233
  (1, 28)	0.233871751359
  (2, 1372)	0.418995562698
  (2, 1330)	0.253070587569
  (2, 1181)	0.14162411794
  (2, 941)	0.359913759494
  (2, 824)	0.229240520882
  (2, 811)	0.286710005467
  (2, 686)	0.434555289686
  (2, 652)	0.252641480531
  (2, 620)	0.21392588966
  (2, 617)	0.204737660947
  :	:
  (746, 260)	0.313519765199
  (746, 215)	0.320657457356
  (746, 41)	0.114857140185
  (747, 1388)	0.236835818033
  (747, 1386)	0.136536628694
  (747, 1282)	0.28704536069
  (747, 1181)	0.134101827023
  (747, 1180)	0.262253628971
  (747, 1026)	0.457239861379
  (747, 747)	0.215349472626
  (747, 711)	0.272815457879
  (747, 583)	0.29694531

In [None]:
logistic.predict(test_matrix)

In [None]:
# Tokenise a sample string with the vectorizer's own tokenizer.
some_words = 'these are just sample words'

tokenizer = tfidf.build_tokenizer()

tokens = tokenizer(some_words)

# NOTE(review): transform() is given the token list here, so each token is
# treated as its own document (one row per token), assuming tfidf is the
# scikit-learn TfidfVectorizer imported above. Confirm whether a single
# document, i.e. a one-element list holding the whole string, was intended.
sample_words = tfidf.transform(tokenizer('sadness awful set up down'))
sample_words

In [None]:
text = nltk.Text(tokens)
text

In [None]:
# NOTE(review): `text` wraps the token list built above, so this presumably
# vectorises each token as a separate document; confirm that is intended.
text_mat = tfidf.transform(text)
text_mat

In [None]:
logistic.predict_proba(sample_words)

In [106]:
from src.LogisticLyricAnalyzer import LogisticLyricAnalyzer

In [109]:
tfidf.vocabulary_;

In [108]:
# Build the project's analyzer from the tfidf and logistic objects.
# NOTE(review): the recorded output shows predict() prompting for words and
# echoing the typed text rather than 'no direction home'; confirm whether
# the argument passed here is actually used.
lla = LogisticLyricAnalyzer(tfidf, logistic)
lla.predict('no direction home')

Please enter some words (minimum 10): words love reaction me friend boyfriend hate rivers sunwhine mountains trucks cowboys
['words love reaction me friend boyfriend hate rivers sunwhine mountains trucks cowboys']


array([[ 0.77623431,  0.22376569]])

In [110]:
df_by_line

Unnamed: 0,chords,is_minor,words
0,F,False,
1,A,False,the street the dogs are barking and the day is...
2,F,False,fall then the dogs will lose their bark Then t...
3,A,False,Then the silent night will shattered from the ...
4,A,False,And i'm one too many mornings and a thousand m...
5,C,False,i'm one too many mornings and a thousand miles...
6,G,False,too many mornings and a thousand miles behind ...
7,C,False,behind From the crossroads of my doorstep my e...
8,G,False,room where my love and i have laid I gaze across
9,C,False,laid I gaze across the street to the sidewalks...
