# Song lyric collection and analysis with lyricsgenius

In [1]:
# get lyrics, tokenize by line, word tokenize, analysis
# IDEAS : - filter out adlibs
# -to check vocab size need to do : # unique words / total words
# - need to lemmatize words before removing stopwords

In [2]:
import numpy as np
import pandas as pd
import re
import nltk

import lyricsgenius as lg

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cedge\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\cedge\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
file = open("/Users/cedge/JupyterNotebooks/LyricAnalysis/data/auto_.txt","w")

In [4]:
genius = lg.Genius('zleUnA417r4r6kLBco4G748hX7O43uBFJoOFF55Kv2K9YF1fnsKxTbXoIHpzg3fN', skip_non_songs=True,
                   excluded_terms=["(Remix)", "(Live)"],remove_section_headers=True)

In [5]:
# use lyricsgenius to get song lyrics for input artist, return tokenized lyrics
def get_artist_lyrics(artist, n):
    i = 0
    song_tokens = []
    songs = (genius.search_artist(artist, max_songs=n, sort='popularity')).songs
    s = [song.lyrics for song in songs]
    for song in s:
        lyrics = song[song.index('\n')+1:]
        tokes = nltk.tokenize.RegexpTokenizer(r'\w+').tokenize(lyrics)
        tokes = [w.lower() for w in tokes if not w.lower() in nltk.corpus.stopwords.words()]
        song_tokens.append(tokes)
    return song_tokens

In [6]:
# same function but lemmatizing words
def get_artist_lyrics_lemma(artist, n):
    i = 0
    song_tokens = []
    songs = (genius.search_artist(artist, max_songs=n, sort='popularity')).songs
    s = [song.lyrics for song in songs]
    for song in s:
        lyrics = song[song.index('\n')+1:]
        tokes = nltk.tokenize.RegexpTokenizer(r'\w+').tokenize(lyrics)
        tokes = [w.lower() for w in tokes if not w.lower() in nltk.corpus.stopwords.words()]
        Lem = nltk.stem.wordnet.WordNetLemmatizer()
        lem_tokes = []
        for toke in tokes:
            lem_tokes.append(Lem.lemmatize(toke))
        song_tokens.append(lem_tokes)
    return song_tokens

In [8]:
# combine tokens from all songs into one list
def combine_tokens(song_tokens):
    combined_tokens = []
    for song in song_tokens:
        for toke in song:
            combined_tokens.append(toke)
    return combined_tokens

In [9]:
# given list of songs, returns most commonly used words
def most_common_lyrics(song_tokens):
    combined_tokens = combine_tokens(song_tokens)
    fd = nltk.FreqDist(combined_tokens)
    return fd.most_common(10)

In [10]:
def most_common_bigrams(song_tokens):
    combined_tokens = combine_tokens(song_tokens)
    bgs = nltk.bigrams(combined_tokens)
    fd = nltk.FreqDist(bgs)
    return fd.most_common(3)

In [11]:
# THIS IS WRONG BECAUSE SOME ARTISTS HAVE LOTS OF WORDS IN A SONG, SOME HAVE FEW WORDS
# given list of song lyrics, return approximate size of vocabulary
def vocab_size(song_tokens):
    combined_tokens = combine_tokens(song_tokens)
    fd = nltk.FreqDist(combined_tokens)
    return len(fd)

In [12]:
# given list of artists and number of songs, return most commonly used words/bigrams and size of vocabulary
def lyric_anal(artists,n):
    print('========Lyric Analysis!========\n')
    for a in artists:
        print('Getting lyrics for '+a+'...\n')
        # try...except block to account for random timeouts when pulling lyrics
        song_tokens = []
        while True:
            try:
                song_tokens = get_artist_lyrics_lemma(a,n)
                break
            except:
                pass
        print('Analysis...\n')
        common_words = most_common_lyrics(song_tokens)
        common_bigrams = most_common_bigrams(song_tokens)
        vocabulary = vocab_size(song_tokens)
        print('------------------------')
        print(a+'\'s most common words are:\n')
        print(common_words)
        print('------------------------')
        print(a+'\'s most common bigrams are:\n')
        print(common_bigrams)
        print('------------------------')
        print(a+'\'s [in progress]:\n')
        print(vocabulary)
        print('------------------------')

In [None]:
artists = ['Jimmy Hendrix','Led Zeppelin','Pink Floyd']
result = lyric_anal(artists,5)
print(result)


Getting lyrics for Jimmy Hendrix...

Searching for songs by Jimmy Hendrix...

Couldn't find the lyrics section. Please report this if the song has lyrics.
Song URL: https://genius.com/Joe-bonamassa-hey-baby-new-rising-sun-lyrics
Done. Found 0 songs.
Analysis...

------------------------
Jimmy Hendrix's most common words are:

[]
------------------------
Jimmy Hendrix's most common bigrams are:

[]
------------------------
Jimmy Hendrix's approximate vocabulary size is:

0
------------------------
Getting lyrics for Led Zeppelin...

Searching for songs by Led Zeppelin...

Song 1: "Stairway to Heaven"
Song 2: "Immigrant Song"
Song 3: "Kashmir"
Song 4: "Black Dog"
