In [33]:
import pandas as pd
import numpy as np
from lyricsgenius import Genius
import re
import nltk


Getting Lyric to Base Script On

In [34]:
genius = Genius(
    "osqluZ575A3DVk4voz6gXj22gXGl2dcQkvLHY3BCgQcc0eG_KHGJnnOSLvllIIYn")
list_lyrics = []
list_title = []
list_artist = []

artist = genius.search_artist("Green Day", max_songs=5)
songs = artist.songs

for song in songs:
    list_lyrics.append(song.lyrics)
    list_title.append(song.title)
    list_artist.append(song.artist)



Searching for songs by Green Day...

Song 1: "Boulevard of Broken Dreams"
Song 2: "American Idiot"
Song 3: "Holiday"
Song 4: "Wake Me Up When September Ends"
Song 5: "Basket Case"

Reached user-specified song limit (5).
Done. Found 5 songs.


In [51]:
df = pd.DataFrame({"artist":list_artist, "title": list_title, "lyrics":list_lyrics})

Lyric Cleaning

In [52]:
#1. Lowercase
lowercase = []
for item in df["lyrics"]:
    result = item.lower()
    result = result.replace("\n"," ")
    lowercase.append(result)

df["lyrics"] = lowercase
df.head()

Unnamed: 0,artist,title,lyrics
0,Green Day,Boulevard of Broken Dreams,translationsitalianofrançaisboulevard of broke...
1,Green Day,American Idiot,translationsfrançaisamerican idiot lyrics[vers...
2,Green Day,Holiday,holiday lyrics[intro] say hey! cha! [verse 1]...
3,Green Day,Wake Me Up When September Ends,translationsрусскийwake me up when september e...
4,Green Day,Basket Case,translationsfrançaisbasket case lyrics[verse 1...


In [53]:
#2. Tokenize
token = []
for item in df["lyrics"]:
    res = item.split()
    token.append(res)

df["lyrics"] = token
df.head()

Unnamed: 0,artist,title,lyrics
0,Green Day,Boulevard of Broken Dreams,"[translationsitalianofrançaisboulevard, of, br..."
1,Green Day,American Idiot,"[translationsfrançaisamerican, idiot, lyrics[v..."
2,Green Day,Holiday,"[holiday, lyrics[intro], say, hey!, cha!, [ver..."
3,Green Day,Wake Me Up When September Ends,"[translationsрусскийwake, me, up, when, septem..."
4,Green Day,Basket Case,"[translationsfrançaisbasket, case, lyrics[vers..."


In [54]:
#3. Remove special characters and spaces
stripped = []
for item in df["lyrics"]:
    result = []
    for text in item:
        if "translations" in text:
            res.append("")
        elif "lyrics" in text:
            res.append("")
        elif "verse" in text:
            res.append("")
        elif "chorus" in text:
            res.append("")
        elif "outro" in text:
            res.append("")
        elif "intro" in text:
            res.append("")
        elif "instrumental" in text:
            res.append("")
        else:
            result.append(re.sub(r"[^a-zA-Z]","",text))
        result = [x for x in result if x != '']
    stripped.append(result)
df["lyrics"] = stripped
df.head()

Unnamed: 0,artist,title,lyrics
0,Green Day,Boulevard of Broken Dreams,"[of, broken, dreams, i, walk, a, lonely, road,..."
1,Green Day,American Idiot,"[idiot, dont, wanna, be, an, american, idiot, ..."
2,Green Day,Holiday,"[holiday, say, hey, cha, hear, the, sound, of,..."
3,Green Day,Wake Me Up When September Ends,"[me, up, when, september, ends, summer, has, c..."
4,Green Day,Basket Case,"[case, do, you, have, the, time, to, listen, t..."


In [55]:
#4 Stop Words

from nltk.corpus import stopwords
english_stopwords = stopwords.words('english')

checkList = df["lyrics"]
for index,item in enumerate(checkList):
    print(index)
    checkList[index] = [text for text in item if text not in english_stopwords]
df["lyrics"] = checkList
df.head()
        

0
1
2
3
4


Unnamed: 0,artist,title,lyrics
0,Green Day,Boulevard of Broken Dreams,"[broken, dreams, walk, lonely, road, one, ever..."
1,Green Day,American Idiot,"[idiot, dont, wanna, american, idiot, dont, wa..."
2,Green Day,Holiday,"[holiday, say, hey, cha, hear, sound, fallin, ..."
3,Green Day,Wake Me Up When September Ends,"[september, ends, summer, come, passed, innoce..."
4,Green Day,Basket Case,"[case, time, listen, whine, nothing, everythin..."


In [56]:
#5 Lemmatization
lemmatizer = nltk.WordNetLemmatizer()
lem = []
for item in df["lyrics"]:
    lem.append(lemmatizer.lemmatize(str(item)))
    
df["lyrics"] = lem
df.head()

Unnamed: 0,artist,title,lyrics
0,Green Day,Boulevard of Broken Dreams,"['broken', 'dreams', 'walk', 'lonely', 'road',..."
1,Green Day,American Idiot,"['idiot', 'dont', 'wanna', 'american', 'idiot'..."
2,Green Day,Holiday,"['holiday', 'say', 'hey', 'cha', 'hear', 'soun..."
3,Green Day,Wake Me Up When September Ends,"['september', 'ends', 'summer', 'come', 'passe..."
4,Green Day,Basket Case,"['case', 'time', 'listen', 'whine', 'nothing',..."


In [57]:
df.head()

Unnamed: 0,artist,title,lyrics
0,Green Day,Boulevard of Broken Dreams,"['broken', 'dreams', 'walk', 'lonely', 'road',..."
1,Green Day,American Idiot,"['idiot', 'dont', 'wanna', 'american', 'idiot'..."
2,Green Day,Holiday,"['holiday', 'say', 'hey', 'cha', 'hear', 'soun..."
3,Green Day,Wake Me Up When September Ends,"['september', 'ends', 'summer', 'come', 'passe..."
4,Green Day,Basket Case,"['case', 'time', 'listen', 'whine', 'nothing',..."
