# Song lyric collection and analysis with lyricsgenius

In [1]:
# Pipeline: get lyrics, tokenize by line, word-tokenize, analyze
# IDEAS: - filter out ad-libs
#        - to check vocab size, compute: number of unique words / total words
#        - need to lemmatize words before removing stopwords

In [2]:
import os
import re

import lyricsgenius as lg
import nltk
import numpy as np
import pandas as pd

# Fetch the NLTK data packages this notebook relies on; the captured output shows
# the call only reports "already up-to-date" when a package is present locally.
nltk.download('omw-1.4')    # Open Multilingual WordNet data (companion to wordnet)
nltk.download('stopwords')  # stop-word lists used to filter lyric tokens
nltk.download('wordnet')    # WordNet data used by WordNetLemmatizer

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\cedge\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cedge\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\cedge\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
#file = open("/Users/cedge/JupyterNotebooks/LyricAnalysis/data/auto_.txt","w")

In [4]:
# Genius API client shared by the fetch functions below.
# The access token is read from the GENIUS_ACCESS_TOKEN environment variable so the
# secret is never stored in the notebook (a KeyError here means it is not set).
# NOTE(review): a token was previously committed in this cell and should be revoked.
genius = lg.Genius(os.environ['GENIUS_ACCESS_TOKEN'], skip_non_songs=True,
                   excluded_terms=["(Remix)", "(Live)"], remove_section_headers=True)

In [5]:
# use lyricsgenius to get song lyrics for input artist, return tokenized lyrics
def get_artist_lyrics(artist, n, max_attempts=5):
    """Fetch an artist's most popular songs from Genius and tokenize their lyrics.

    Args:
        artist: Artist name passed to ``genius.search_artist``.
        n: Maximum number of songs to request (``max_songs``).
        max_attempts: How many times to try the search before giving up.
            Values below 1 are treated as 1.

    Returns:
        A list with one entry per song; each entry is the list of lower-cased
        word tokens of that song with stop words removed. Empty list when the
        search yields no artist.

    Raises:
        Exception: whatever the last failed search attempt raised, once
            ``max_attempts`` attempts have all failed.
    """
    attempts = max(1, max_attempts)
    found = None
    for attempt in range(1, attempts + 1):
        try:
            found = genius.search_artist(artist, max_songs=n, sort='popularity')
            break
        except Exception:
            # Requests to Genius time out intermittently, so retry -- but only a
            # bounded number of times, and never swallow KeyboardInterrupt.
            if attempt == attempts:
                raise

    # search_artist may hand back None when nothing matches; report "no songs"
    # instead of failing on the missing .songs attribute.
    if found is None:
        return []

    # Build these once: calling stopwords.words() per token re-reads the corpus
    # and makes every membership test a linear scan.
    # NOTE(review): words() without a language returns the stop words of every
    # language NLTK ships -- confirm that is intended rather than words('english').
    stop_words = set(nltk.corpus.stopwords.words())
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

    song_tokens = []
    for song in found.songs:
        text = song.lyrics or ''
        # Drop the first line of the lyrics text (presumably a title/header line
        # -- TODO confirm); keep the text as-is when it has no newline at all.
        _, newline, rest = text.partition('\n')
        lyrics = rest if newline else text
        lowered = (w.lower() for w in tokenizer.tokenize(lyrics))
        song_tokens.append([w for w in lowered if w not in stop_words])
    return song_tokens

In [6]:
# same function but lemmatizing words
def get_artist_lyrics_lemma(artist, n, max_attempts=5):
    """Fetch an artist's most popular songs from Genius and return lemmatized tokens.

    Args:
        artist: Artist name passed to ``genius.search_artist``.
        n: Maximum number of songs to request (``max_songs``).
        max_attempts: How many times to try the search before giving up.
            Values below 1 are treated as 1.

    Returns:
        A list with one entry per song; each entry is the list of that song's
        lower-cased word tokens, stop words removed, each passed through
        ``WordNetLemmatizer.lemmatize``. Empty list when the search yields no
        artist.

    Raises:
        Exception: whatever the last failed search attempt raised, once
            ``max_attempts`` attempts have all failed.
    """
    attempts = max(1, max_attempts)
    found = None
    for attempt in range(1, attempts + 1):
        try:
            found = genius.search_artist(artist, max_songs=n, sort='popularity')
            break
        except Exception:
            # Requests to Genius time out intermittently; retry a bounded number
            # of times and re-raise the final failure.
            if attempt == attempts:
                raise

    # search_artist may hand back None when nothing matches. Return an empty
    # list (not a message string) so callers can keep treating the result as a
    # list of token lists.
    if found is None:
        return []

    # Loop-invariant helpers, built once instead of per token / per song.
    # NOTE(review): words() without a language returns the stop words of every
    # language NLTK ships -- confirm that is intended rather than words('english').
    stop_words = set(nltk.corpus.stopwords.words())
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

    song_tokens = []
    for song in found.songs:
        text = song.lyrics or ''
        # Drop the first line of the lyrics text (presumably a title/header line
        # -- TODO confirm); keep the text as-is when it has no newline at all.
        _, newline, rest = text.partition('\n')
        lyrics = rest if newline else text
        lowered = (w.lower() for w in tokenizer.tokenize(lyrics))
        # Stop words are removed first and the survivors lemmatized, as before.
        song_tokens.append([lemmatizer.lemmatize(w) for w in lowered if w not in stop_words])
    return song_tokens

In [7]:
# flatten the per-song token lists into one list
def combine_tokens(song_tokens):
    """Return every token of every song, in original order, as a single flat list.

    Args:
        song_tokens: Iterable of per-song token iterables.

    Returns:
        A new list containing the items of each inner iterable, concatenated.
    """
    return [word for tokens in song_tokens for word in tokens]

In [8]:
# given list of songs, returns most commonly used words
def most_common_lyrics(song_tokens, top_n=10):
    """Return the most frequent tokens across all of the given songs.

    Args:
        song_tokens: List of per-song token lists.
        top_n: How many (token, count) pairs to return; defaults to 10, the
            value that used to be hard-coded.

    Returns:
        A list of ``(token, count)`` tuples, most frequent first.
    """
    combined_tokens = combine_tokens(song_tokens)
    return nltk.FreqDist(combined_tokens).most_common(top_n)

In [9]:
def most_common_bigrams(song_tokens, top_n=3):
    """Return the most frequent adjacent token pairs across the given songs.

    Bigrams are counted song by song. Counting them over one concatenated
    token list would also pair the last token of each song with the first
    token of the next one, which is not a phrase the artist actually used.

    Args:
        song_tokens: List of per-song token lists.
        top_n: How many (bigram, count) pairs to return; defaults to 3, the
            value that used to be hard-coded.

    Returns:
        A list of ``((first, second), count)`` tuples, most frequent first.
    """
    fd = nltk.FreqDist()
    for tokens in song_tokens:
        fd.update(nltk.bigrams(tokens))
    return fd.most_common(top_n)

In [10]:
# CAVEAT: this ratio is biased because some artists have lots of words in a song and some have few
# given list of song lyrics, return approximate size of vocabulary
def vocab_size(song_tokens):
    """Return the ratio of distinct tokens to total tokens across all songs.

    Also prints the two counts ('buckets' = distinct tokens, 'outcomes' = total
    tokens) for inspection.

    Args:
        song_tokens: List of per-song token lists.

    Returns:
        ``distinct / total`` as a float, or 0.0 when there are no tokens at
        all (previously this case raised ZeroDivisionError).
    """
    combined_tokens = combine_tokens(song_tokens)
    fd = nltk.FreqDist(combined_tokens)
    distinct, total = fd.B(), fd.N()    # num buckets, num outcomes
    print('buckets: '+str(distinct))
    print('outcomes: '+str(total))
    if total == 0:
        # nothing to measure; avoid dividing by zero
        return 0.0
    return distinct / total

In [11]:
# given list of artists and number of songs, return most commonly used words/bigrams and size of vocabulary
def lyric_anal(artists,n):
    """Print a lyric report per artist and return the collected figures.

    For each artist, lemmatized lyric tokens are fetched with
    ``get_artist_lyrics_lemma`` and summarized with ``most_common_lyrics``,
    ``most_common_bigrams`` and ``vocab_size``.

    Args:
        artists: Iterable of artist names.
        n: Number of songs to request per artist.

    Returns:
        A dict mapping each analyzed artist name to a dict with the keys
        ``'common_words'``, ``'common_bigrams'`` and ``'vocabulary'``.
        Artists for which no tokens were found are skipped and absent from
        the result.
    """
    results = {}
    print('========Lyric Analysis!========\n')
    for a in artists:
        print('Getting lyrics for '+a+'...\n')
        song_tokens = get_artist_lyrics_lemma(a,n)

        # Without any tokens the vocabulary ratio is undefined; skip the artist
        # rather than abort the whole run.
        if not any(song_tokens):
            print('No lyrics found for '+a+', skipping.\n')
            continue

        print('Analysis...\n')
        common_words = most_common_lyrics(song_tokens)
        common_bigrams = most_common_bigrams(song_tokens)
        vocabulary = vocab_size(song_tokens)
        print('------------------------')
        print(a+'\'s most common words are:\n')
        print(common_words)
        print('------------------------')
        print(a+'\'s most common bigrams are:\n')
        print(common_bigrams)
        print('------------------------')
        print(a+'\'s [in progress]:\n')
        print(vocabulary)
        print('------------------------')

        results[a] = {
            'common_words': common_words,
            'common_bigrams': common_bigrams,
            'vocabulary': vocabulary,
        }
    return results

In [12]:
# Fetch and tokenize 3 Young Thug songs (no lemmatization) for the ad-hoc checks
# in the following cells; the full multi-artist run is left commented out here.
#lyric_anal(['Young Thug', 'King Von'], 5)
song_tokens = get_artist_lyrics('Young Thug', 3)

test1
Searching for songs by Young Thug...

Song 1: "Best Friend"
Song 2: "The London"
Song 3: "Check"

Reached user-specified song limit (3).
Done. Found 3 songs.


In [13]:
# distinct-token / total-token ratio for the songs fetched above (shown as the cell result)
vocab_size(song_tokens)

buckets: 363
outcomes: 786


0.4618320610687023

In [14]:
# Recompute the vocab_size() ratio by hand to inspect the FreqDist directly.
combined_tokens = combine_tokens(song_tokens)
fd = nltk.FreqDist(combined_tokens)
print(fd)

t = 0  # NOTE(review): not referenced anywhere in the visible cells
print(fd.B())  # number of distinct tokens (buckets)
print(fd.N())  # total number of tokens counted (outcomes)
fd.B() / fd.N()

<FreqDist with 363 samples and 786 outcomes>
363
786


0.4618320610687023

In [15]:
# Run the printed analysis for four artists, requesting 3 songs per artist.
artists = ['Young Thug', 'MF DOOM', 'Riff Raff', 'Chief Keef']
result = lyric_anal(artists,3)


Getting lyrics for Young Thug...

Searching for songs by Young Thug...

Song 1: "Best Friend"
Song 2: "The London"
Song 3: "Check"

Reached user-specified song limit (3).
Done. Found 3 songs.
3
3
3
Analysis...

buckets: 355
outcomes: 786
------------------------
Young Thug's most common words are:

[('baby', 37), ('bitch', 24), ('check', 24), ('nigga', 23), ('money', 13), ('friend', 12), ('knew', 12), ('rack', 11), ('gon', 10), ('fleek', 9)]
------------------------
Young Thug's most common bigrams are:

[(('baby', 'baby'), 28), (('check', 'check'), 15), (('knew', 'knew'), 9)]
------------------------
Young Thug's [in progress]:

0.45165394402035625
------------------------
Getting lyrics for MF DOOM...

Searching for songs by MF DOOM...

Song 1: "Doomsday"
Song 2: "Beef Rapp"
Song 3: "Rapp Snitch Knishes"

Reached user-specified song limit (3).
Done. Found 3 songs.
3
3
3
Analysis...

buckets: 621
outcomes: 848
------------------------
MF DOOM's most common words are:

[('nigga', 11),

Timeout: Request timed out:
HTTPSConnectionPool(host='genius.com', port=443): Read timed out. (read timeout=5)