In [33]:
import pandas as pd
import numpy as np
from lyricsgenius import Genius
import re
import nltk
import matplotlib.pyplot as plt


In [34]:
df = pd.read_csv("lyrics.csv")
df.head(5)

Unnamed: 0,Artist,Song,Lyrics
0,Kidz Bop Kids,Ghostbusters,Ghostbusters Lyrics(Ghostbusters!)\nIf there's...
1,Creedence Clearwater Revival,Have You Ever Seen The Rain,Have You Ever Seen the Rain? Lyrics[Verse 1]\n...
2,All Time Low,Blinding Lights,Blinding Lights Lyrics[Verse 1]\nI've been try...
3,Bastille,Pompeii,Translationsไทย / Phasa ThaiEspañolPortuguêsIt...
4,Bryan Adams,Summer Of '69,Summer of ’69 Lyrics[Intro]\nI got my first re...


Lyric Cleaning

In [35]:
#1. Lowercase
lowercase = []
for item in df["Lyrics"]:
    result = item.lower()
    result = result.replace("\n"," ")
    lowercase.append(result)

df["Lyrics"] = lowercase
df.head()

Unnamed: 0,Artist,Song,Lyrics
0,Kidz Bop Kids,Ghostbusters,ghostbusters lyrics(ghostbusters!) if there's ...
1,Creedence Clearwater Revival,Have You Ever Seen The Rain,have you ever seen the rain? lyrics[verse 1] s...
2,All Time Low,Blinding Lights,blinding lights lyrics[verse 1] i've been tryn...
3,Bastille,Pompeii,translationsไทย / phasa thaiespañolportuguêsit...
4,Bryan Adams,Summer Of '69,summer of ’69 lyrics[intro] i got my first rea...


In [36]:
#2. Tokenize
token = []
for item in df["Lyrics"]:
    res = item.split()
    token.append(res)

df["Lyrics"] = token
df.head()

Unnamed: 0,Artist,Song,Lyrics
0,Kidz Bop Kids,Ghostbusters,"[ghostbusters, lyrics(ghostbusters!), if, ther..."
1,Creedence Clearwater Revival,Have You Ever Seen The Rain,"[have, you, ever, seen, the, rain?, lyrics[ver..."
2,All Time Low,Blinding Lights,"[blinding, lights, lyrics[verse, 1], i've, bee..."
3,Bastille,Pompeii,"[translationsไทย, /, phasa, thaiespañolportugu..."
4,Bryan Adams,Summer Of '69,"[summer, of, ’69, lyrics[intro], i, got, my, f..."


In [37]:
#3. Remove special characters and spaces
stripped = []
for item in df["Lyrics"]:
    result = []
    for text in item:
        if "translations" in text:
            res.append("")
        elif "lyrics" in text:
            res.append("")
        elif "verse" in text:
            res.append("")
        elif "chorus" in text:
            res.append("")
        elif "outro" in text:
            res.append("")
        elif "intro" in text:
            res.append("")
        elif "instrumental" in text:
            res.append("")
        else:
            result.append(re.sub(r"[^a-zA-Z]","",text))
        result = [x for x in result if x != '']
    stripped.append(result)
df["Lyrics"] = stripped
df.head()

Unnamed: 0,Artist,Song,Lyrics
0,Kidz Bop Kids,Ghostbusters,"[ghostbusters, if, theres, somethin, strange, ..."
1,Creedence Clearwater Revival,Have You Ever Seen The Rain,"[have, you, ever, seen, the, rain, someone, to..."
2,All Time Low,Blinding Lights,"[blinding, lights, ive, been, tryna, call, ive..."
3,Bastille,Pompeii,"[phasa, thaiespaolportugusitalianopompeii, by,..."
4,Bryan Adams,Summer Of '69,"[summer, of, i, got, my, first, real, sixstrin..."


In [38]:
#4 Stop Words

from nltk.corpus import stopwords
english_stopwords = stopwords.words('english')

checkList = df["Lyrics"]
for index,item in enumerate(checkList):
    checkList[index] = [text for text in item if text not in english_stopwords]
df["Lyrics"] = checkList
df.head()
        

Unnamed: 0,Artist,Song,Lyrics
0,Kidz Bop Kids,Ghostbusters,"[ghostbusters, theres, somethin, strange, neig..."
1,Creedence Clearwater Revival,Have You Ever Seen The Rain,"[ever, seen, rain, someone, told, long, ago, t..."
2,All Time Low,Blinding Lights,"[blinding, lights, ive, tryna, call, ive, long..."
3,Bastille,Pompeii,"[phasa, thaiespaolportugusitalianopompeii, jes..."
4,Bryan Adams,Summer Of '69,"[summer, got, first, real, sixstring, bought, ..."


In [39]:
#5 Lemmatization
lemmatizer = nltk.WordNetLemmatizer()
lem = []
for item in df["Lyrics"]:
    lem.append(lemmatizer.lemmatize(str(item)))
    
df["Lyrics"] = lem
df.head()

Unnamed: 0,Artist,Song,Lyrics
0,Kidz Bop Kids,Ghostbusters,"['ghostbusters', 'theres', 'somethin', 'strang..."
1,Creedence Clearwater Revival,Have You Ever Seen The Rain,"['ever', 'seen', 'rain', 'someone', 'told', 'l..."
2,All Time Low,Blinding Lights,"['blinding', 'lights', 'ive', 'tryna', 'call',..."
3,Bastille,Pompeii,"['phasa', 'thaiespaolportugusitalianopompeii',..."
4,Bryan Adams,Summer Of '69,"['summer', 'got', 'first', 'real', 'sixstring'..."


In [40]:
df.to_csv("cleaned_lyrics.csv")