In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
import os
for dirname, _, filenames in os.walk('../../data/lyrics'):
    for filename in filenames:
        print(os.path.join(dirname, filename).replace("\\","/"))


from src.progress_bar import printProgressBar

../../data/lyrics/artist_song_lyrics.csv
../../data/lyrics/bb-t100-lyrics.csv
../../data/lyrics/bb_t100_lyrics_en.csv
../../data/lyrics/corona-lyrics.csv
../../data/lyrics/lyrics_invalid.json
../../data/lyrics/lyrics_invalid_updated.csv
../../data/lyrics/backups/artist_song_lyrics_bak.csv


## Preparing Billboard Lyrics

In [2]:
lyrics = pd.read_csv('../../data/lyrics/artist_song_lyrics.csv', encoding='utf-8', index_col=0)
lyrics.head()


Unnamed: 0,id,artist,first_artist,song,weeks_on_chart,peak_rank,lyrics,url
0,1093,2 Chainz Featuring Ariana Grande,2 Chainz,Rule The World,2,94,Yeah\nUh-huh\n(Hitmaka)\n2 Chainz\n\nTop down ...,https://genius.com/2-chainz-rule-the-world-lyrics
1,1099,2 Chainz Featuring Kendrick Lamar,2 Chainz,Momma I Hit A Lick,1,100,"I want it, I want it, I want it-it-it-it\n\nCh...",https://genius.com/2-chainz-momma-i-hit-a-lick...
2,1074,2 Chainz Featuring Travis Scott,2 Chainz,Whip,1,75,"Yeah\nDo it no hands, yeah, do it, no handstan...",https://genius.com/2-chainz-whip-lyrics
3,85,21 Savage,21 Savage,1.5,1,86,"My earrings cost a half a ticket, I don't hear...",https://genius.com/21-savage-15-lyrics
4,36,21 Savage,21 Savage,A Lot,23,12,I love you\nTurn my headphone down a little bi...,https://genius.com/21-savage-a-lot-lyrics


In [3]:
lyrics_updated = pd.read_csv('../../data/lyrics/lyrics_invalid_updated.csv', encoding='utf-8', index_col=0)
lyrics_updated.to_excel('../../data/output/lyrics_updated.xlsx')
lyrics_updated = lyrics_updated.fillna(" ")
lyrics_updated.tail()


Unnamed: 0,id,artist,first_artist,song,weeks_on_chart,peak_rank,lyrics,url,url_first_word,artist_first_word
1316,20,"benny blanco, Halsey & Khalid","benny blanco, Halsey",Eastside,52,9,"Uh\nYeah, yeah\n\nWhen I was young, I fell in ...",https://genius.com/Benny-blanco-halsey-and-kha...,,benny
1317,1092,"benny blanco, Tainy, Selena Gomez & J Balvin","benny blanco, Tainy, Selena Gomez",I Can't Get Enough,5,66,"Crazy\nI like that, you like that, so let's be...",https://genius.com/Benny-blanco-tainy-selena-g...,,benny
1318,3890,blackbear,blackbear,Hot Girl Bummer,42,11,"Fuck you, and you, and you\nI hate your friend...",https://genius.com/Blackbear-hot-girl-bummer-l...,,blackbear
1319,3693,for KING & COUNTRY,for KING,God Only Knows,1,94,Wide awake while the world is sound asleepin'\...,https://genius.com/For-king-and-country-god-on...,,for
1321,6822,twenty one pilots,twenty one pilots,Level Of Concern,11,23,"Need you, tell me\nNeed you, tell me\n\nPanic ...",https://genius.com/Twenty-one-pilots-level-of-...,,twenty


In [4]:
lyrics.iloc[231]

id                                                            10194
artist                                             Carrie Underwood
first_artist                                       Carrie Underwood
song                                          Favorite Time Of Year
weeks_on_chart                                                    4
peak_rank                                                        62
lyrics            Last updated: 10/6/2018, 5:16PM MSTOctoberOcto...
url               https://genius.com/Gerald-haywood-2018-haywood...
Name: 231, dtype: object

In [5]:
lyrics.update(lyrics_updated)
lyrics.iloc[231]

id                              10194.0
artist                 Carrie Underwood
first_artist           Carrie Underwood
song              Favorite Time Of Year
weeks_on_chart                      4.0
peak_rank                          62.0
lyrics                          !Error!
url                                    
Name: 231, dtype: object

In [6]:
def clean_string(string):
    i = string.find('EmbedShare URLCopyEmbedCopy')
    if i == 0:
        return string
    i -=1
    while i > 0 and string[i].isdigit():
        i -=1
    return string[:i+1]

lyrics['lyrics'] = lyrics['lyrics'].apply(lambda string: clean_string(string))


In [7]:
print(lyrics.shape)
print(lyrics.columns)

(1322, 8)
Index(['id', 'artist', 'first_artist', 'song', 'weeks_on_chart', 'peak_rank',
       'lyrics', 'url'],
      dtype='object')


In [8]:
import spacy
from spacy.language import Language
from langdetect import DetectorFactory
from spacy_langdetect import LanguageDetector

@Language.factory("language_detector")
def create_language_detector(nlp, name):
   return LanguageDetector(language_detection_function=None)
#

In [9]:
nlp = spacy.load("en_core_web_lg")
nlp.add_pipe('language_detector')

<spacy_langdetect.spacy_langdetect.LanguageDetector at 0x1f1ab982a30>

In [10]:
#document level
def detect_language(text, nlp):
    DetectorFactory.seed = 0
    if type(text) == str:
        doc = nlp(text)
        result = doc._.language['language'], doc._.language['score']
    else:
        result = {'-': -1}
    return result


# Retrieve data from cache to save time
try:
    languages = pd.read_csv('../../data/cache/lyrics_preparation_language.csv', index_col=0)
    scores = pd.read_csv('../../data/cache/lyrics_preparation_language_score.csv', index_col=0)
except:
    languages = []
    scores = []
    l = len(lyrics)

    printProgressBar(0, l+1, prefix = 'Progress:', suffix = 'Complete', length = 50)
    for i in range(0,l):
        language, score = detect_language(lyrics.loc[i, 'lyrics'], nlp)
        languages.append(language)
        scores.append(score)
        printProgressBar(i+1, l, prefix = 'Progress:', suffix = 'Complete', length = 50)

lyrics['language'] = languages
lyrics['language_score'] = scores
lyrics['language'].to_csv('../../data/cache/lyrics_preparation_language.csv')
lyrics['language_score'].to_csv('../../data/cache/lyrics_preparation_language_score.csv')

In [11]:
lyrics['length'] = lyrics['lyrics'].apply(lambda string: len(str(string)))
lyrics['word_count'] = lyrics['lyrics'].apply(lambda string: len(str(string).split()))

In [12]:
lyrics.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1322 entries, 0 to 1321
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              1322 non-null   float64
 1   artist          1322 non-null   object 
 2   first_artist    1322 non-null   object 
 3   song            1322 non-null   object 
 4   weeks_on_chart  1322 non-null   float64
 5   peak_rank       1322 non-null   float64
 6   lyrics          1322 non-null   object 
 7   url             1322 non-null   object 
 8   language        1322 non-null   object 
 9   language_score  1322 non-null   float64
 10  length          1322 non-null   int64  
 11  word_count      1322 non-null   int64  
dtypes: float64(4), int64(2), object(6)
memory usage: 134.3+ KB


In [13]:
lyrics = lyrics.rename(columns={'id' : 'billboard_id'})
lyrics['billboard_id'] = lyrics['billboard_id'].astype(int)
lyrics.sort_values(by='billboard_id', inplace=True)
lyrics.reset_index(inplace=True,drop=True)
lyrics['lyrics_id'] = lyrics.index
lyrics = lyrics[['billboard_id','lyrics_id', 'artist', 'first_artist', 'song', 'weeks_on_chart', 'peak_rank',
       'lyrics', 'url', 'length', 'word_count', 'language', 'language_score']]

lyrics

Unnamed: 0,billboard_id,lyrics_id,artist,first_artist,song,weeks_on_chart,peak_rank,lyrics,url,length,word_count,language,language_score
0,0,0,Ariana Grande,Ariana Grande,"Thank U, Next",28.0,1.0,Thought I'd end up with Sean\nBut he wasn't a ...,https://genius.com/Ariana-grande-thank-u-next-...,2409,460,en,0.999997
1,1,1,Halsey,Halsey,Without Me,52.0,1.0,Found you when your heart was broke\nI filled ...,https://genius.com/Halsey-without-me-lyrics,2095,435,en,0.999995
2,2,2,Mariah Carey,Mariah Carey,All I Want For Christmas Is You,43.0,1.0,I don't want a lot for Christmas\nThere is jus...,https://genius.com/Mariah-carey-all-i-want-for...,1918,388,en,0.999996
3,3,3,Travis Scott,Travis Scott,Sicko Mode,52.0,1.0,"Astro, yeah\nSun is down, freezin' cold\nThat'...",https://genius.com/Travis-scott-sicko-mode-lyrics,3943,771,en,0.999998
4,4,4,Post Malone & Swae Lee,Post Malone,Sunflower (Spider-Man: Into The Spider-Verse),53.0,1.0,"Ayy, ayy, ayy, ayy (Ooh)\nOoh, ooh, ooh, ooh (...",https://genius.com/Post-malone-and-swae-lee-su...,1534,305,en,0.999997
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1317,10459,1317,Eminem,Eminem,Gnat,1.0,60.0,"Yeah, yeah, sick\n(d.a. got that dope)\n\nThey...",https://genius.com/Eminem-gnat-lyrics,5762,1113,en,0.999999
1318,10477,1318,Gabby Barrett,Gabby Barrett,The First Noel,1.0,78.0,"The First Noel, the Angels did say\nWas to cer...",https://genius.com/Gabby-barrett-the-first-noe...,666,132,en,0.999996
1319,10483,1319,Popp Hunna,Popp Hunna,Adderall (Corvette Corvette),1.0,84.0,"(Bitch)\nCorvette, Corvette\nHop in a motherfu...",https://genius.com/Popp-hunna-adderall-corvett...,2366,515,en,0.999998
1320,10492,1320,Lil Durk,Lil Durk,Backdoor,1.0,93.0,"(Malik on the beat)\n(Ayo Bleu)\nNo, no, no, n...",https://genius.com/Lil-durk-backdoor-lyrics,2778,547,en,0.999996


In [14]:
pd.set_option('display.max_colwidth', 150)
lyrics_non_english = lyrics.loc[lyrics['language'] != 'en']
lyrics_non_english

Unnamed: 0,billboard_id,lyrics_id,artist,first_artist,song,weeks_on_chart,peak_rank,lyrics,url,length,word_count,language,language_score
30,30,30,Bad Bunny Featuring Drake,Bad Bunny,MIA,27.0,5.0,"Bless them\nAy, there ain't no second guess, then you done know so right now we carry di guess fi di gyal dem\nBad Bunny, Drake, SP\nBad Bunny, ba...",https://genius.com/Bad-bunny-and-drake-mia-dom-da-bomb-remix-lyrics,3115,637,es,0.714282
91,91,91,Anuel AA & Romeo Santos,Anuel AA,Ella Quiere Beber,20.0,61.0,"Check, check (Remix)\nI'mma show you why I'm the king of this shit\nUah\n(KOB, Real Hasta La Muerte, baby)\n\nCuando una mujer decide ser mala y n...",https://genius.com/Anuel-aa-and-romeo-santos-ella-quiere-beber-remix-lyrics,2609,511,es,0.999996
100,131,100,Pinkfong,Pinkfong,Baby Shark,20.0,32.0,"Baby shark, doo doo doo doo doo doo\nBaby shark, doo doo doo doo doo doo\nBaby shark, doo doo doo doo doo doo\nBaby shark!\n\nMommy shark, doo doo...",https://genius.com/Pinkfong-baby-shark-lyrics,1138,246,so,0.857141
121,192,121,Bad Bunny,Bad Bunny,Solo de Mi,1.0,93.0,"No me vuelvas a decir ""Bebé"" (¡No!)\nYo no soy tuyo ni de nadie, yo soy sólo de mí\nNo me vuelvas a decir ""Bebé"" (Yeh, eh)\nYa tú lo sabe' que yo ...",https://genius.com/Bad-bunny-solo-de-mi-lyrics,1852,399,es,0.999994
147,467,147,Anuel AA & Karol G,Anuel AA,Secreto,11.0,68.0,"Bebecita\nBebe-bebecita, uah\nBebecita\n\nLo de nosotro' e' un secreto, que nadie se entere (Uah)\nBaby, yo siempre me vengo contigo cuando tú te ...",https://genius.com/Karol-g-and-anuel-aa-secreto-lyrics,2709,498,es,0.999996
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1266,10180,1266,Bad Bunny,Bad Bunny,Hoy Cobre,1.0,81.0,"Yeh-yeh-yeh, ey\nBad Bunny, baby, be-bé\nEy, ey\n\nHoy cobré y hoy mismo lo' vo'a explotar, ey\nEl precio ni vo'a preguntar, ey\nFumando en la Guc...",https://genius.com/Bad-bunny-hoy-cobre-lyrics,1814,360,es,0.999996
1268,10186,1268,Bad Bunny,Bad Bunny,Maldita Pobreza,1.0,87.0,"Yo quiero comprarle un Ferrari a mi novia\nPero no puedo, pero no puedo\nYo quiero comprarle un Ferrari a mi novia\nPero no puedo, no tengo dinero...",https://genius.com/Bad-bunny-maldita-pobreza-lyrics,1783,334,es,0.999996
1269,10193,1269,Bad Bunny,Bad Bunny,La Droga,1.0,94.0,"Uh, uh-uh\nUh-uh, uh-uh\n\nTú ere' la droga de la que mami me hablaba (¡Ey!)\nLa que moría si probaba (¡Oh!)\nY yo de idiota pensando que me amaba...",https://genius.com/Bad-bunny-la-droga-lyrics,1929,380,es,0.999996
1270,10194,1270,Carrie Underwood,Carrie Underwood,Favorite Time Of Year,4.0,62.0,!Error,,6,1,de,0.999994


In [15]:
# lyrics_english = lyrics.loc[(lyrics['language'] == 'en') & (lyrics['language_sent'] == 'en')]
lyrics_english = lyrics.loc[(lyrics['language'] == 'en')]


In [16]:
lyrics.to_excel('../../data/output/bb-t100-lyrics.xlsx')
lyrics.to_csv('../../data/lyrics/bb-t100-lyrics.csv')

In [17]:
invalid_lyrics = pd.read_excel('..\..\data\output\lyrics_invalid.xlsx', index_col=0)
invalid_lyrics['lyrics_id'] = invalid_lyrics.index
invalid_lyrics.rename(columns={'id' : 'billboard_id'}, inplace=True)
invalid_lyrics.columns

Index(['billboard_id', 'artist', 'first_artist', 'song', 'weeks_on_chart',
       'peak_rank', 'lyrics', 'url', 'url_first_word', 'artist_first_word',
       'lyrics_id'],
      dtype='object')

In [18]:
pd.set_option('display.max_colwidth', 150)
invalid_lyrics_example = invalid_lyrics[['billboard_id', 'lyrics_id', 'artist', 'song', 'lyrics']].loc[[1321, 506, 97]]
invalid_lyrics_example.reset_index(drop=True, inplace=True)
print(invalid_lyrics_example.to_latex(index=False))

\begin{tabular}{rrlll}
\toprule
 billboard\_id &  lyrics\_id &                                             artist &             song &                                                                                                                                                 lyrics \\
\midrule
         6822 &       1321 &                                  twenty one pilots & Level Of Concern &                                                                                                                                                !Error! \\
         8299 &        506 &                                            J. Cole &   the.climb.back &                                                                                                                                               !NoSong! \\
         9707 &         97 & Ariana Grande Feat. Doja Cat \& Megan Thee Stallion &            34+35 & Mmm\textbackslash n\textbackslash nPensarás que estoy loca\textbackslash nPor la forma en 

In [19]:
lyrics_lat = lyrics[['billboard_id','lyrics_id', 'artist', 'first_artist', 'song', 'weeks_on_chart', 'peak_rank',
       'lyrics']].head()

print(lyrics_lat.to_latex(index=False))

\begin{tabular}{rrlllrrl}
\toprule
 billboard\_id &  lyrics\_id &                 artist &  first\_artist &                                          song &  weeks\_on\_chart &  peak\_rank &                                                                                                                                                 lyrics \\
\midrule
            0 &          0 &          Ariana Grande & Ariana Grande &                                 Thank U, Next &            28.0 &        1.0 & Thought I'd end up with Sean\textbackslash nBut he wasn't a match\textbackslash nWrote some songs about Ricky\textbackslash nNow I listen and laugh\textbackslash nEven almost got married\textbackslash nAnd for Pete, I... \\
            1 &          1 &                 Halsey &        Halsey &                                    Without Me &            52.0 &        1.0 & Found you when your heart was broke\textbackslash nI filled your cup until it overflowed\textbackslash nTook it so far to kee

## Preparing Corona Lyrics

In [20]:
corona_lyrics = pd.read_excel('../../data/input/corona-lyrics.xlsx')
corona_lyrics.head()

Unnamed: 0,artist,song,lyrics,url
0,Adam Hambrick,Between Me and the End of the World,Thank you's don't cut it\nI mean what do you get\nFor the one who sees a crowd running up\nAnd then decides to run in\nIts grace under fire\nIts c...,https://genius.com/Adam-hambrick-between-me-and-the-end-of-the-world-lyrics
1,Alexander 23,IDK You Yet,How can you miss someone you've never met?\n'Cause I need you now but I don't know you yet\nBut can you find me soon because I'm in my head?\nYeah...,https://genius.com/Alexander-23-idk-you-yet-lyrics
2,Alicia Keys,My House,"This song is dedicated to the moment; I have a feelin’ that we’re all going through a similar thing, so, I figured, we could break it down like th...",
3,ArtistsCAN,Lean on Me,"Sometimes in our lives, we all have pain\nWe all have sorrow\nBut if we are wise\nWe know that there's always tomorrow\n\nLean on me, when you're ...",https://genius.com/Artistscan-lean-on-me-lyrics
4,Avril Lavigne,We Are Warriors,We'll pick our battles 'cause we know we're gonna win the war (Win the war)\nWe're not rattled 'cause we shattered all of this before (This before...,https://genius.com/Avril-lavigne-we-are-warriors-lyrics


In [21]:
def get_lines(lyrics):
    lines = []
    list = lyrics.split('\n')
    for line in list:
        if len(line) > 0:
            line = line.replace('\u2005', ' ').replace('\u205f', ' ')
            lines.append(line.lower())
    return lines



In [22]:
corona_lines = pd.DataFrame(columns=['line', 'artist', 'song'])

for row in corona_lyrics.itertuples():
    df = pd.DataFrame(columns=['line', 'artist', 'song'])
    artist = row[1]
    song = row[2]
    lyrics = row[3]
    lines = get_lines(lyrics)
    df['line'] = lines
    df['artist'] = artist
    df['song'] = song
    corona_lines = corona_lines.append(df, ignore_index=True)

In [23]:
corona_lines.drop_duplicates(inplace=True, ignore_index=True)
corona_lines

Unnamed: 0,line,artist,song
0,thank you's don't cut it,Adam Hambrick,Between Me and the End of the World
1,i mean what do you get,Adam Hambrick,Between Me and the End of the World
2,for the one who sees a crowd running up,Adam Hambrick,Between Me and the End of the World
3,and then decides to run in,Adam Hambrick,Between Me and the End of the World
4,its grace under fire,Adam Hambrick,Between Me and the End of the World
...,...,...,...
1814,"just like you wanna stick a dick up in me, i want some motherfuckin' money",21 Savage & Metro Boomin Featuring Drake,Mr. Right Now
1815,and don't get in your motherfuckin' feelings when you see me in the motherfuckin' club randomly,21 Savage & Metro Boomin Featuring Drake,Mr. Right Now
1816,and one of your niggas wanna get down on me,21 Savage & Metro Boomin Featuring Drake,Mr. Right Now
1817,and i wanna give them some pussy 'cause they gave me some money,21 Savage & Metro Boomin Featuring Drake,Mr. Right Now


In [24]:
corona_keywords = ['corona', 'coronavirus', 'pandemic', 'quarantine', 'sars', 'disease', 'social', 'distance', 'distancing', 'flatten',
                   'curve', 'home', 'sanitizing', 'covid', 'germs', 'vaccination', 'isolation', 'virus', 'inside']
def is_corona_line(keywords):
    for word in corona_keywords:
        if word in keywords:
            return True
    return  False

In [25]:
corona_lines['corona'] = corona_lines['line'].apply(lambda line: is_corona_line(line))

In [26]:
corona_lines.sort_values(by='corona', inplace=True, ascending=False)
corona_lines

Unnamed: 0,line,artist,song,corona
1390,"solitary diamonds for my dawg inside, he waited, yeah (straight up)",Future Featuring Travis Scott,Solitaires,True
1132,why would a disease come around when you get rich?,"Turbo, Gunna and Young Thug",Quarantine Clean,True
319,and call this isolation a date,Drive-by Truckers,Quarantine Together,True
1430,it's like pneumonia symptoms and contracting covid instantly,Eminem,Gnat,True
89,gotta defeat covid,Alicia Keys,My House,True
...,...,...,...,...
629,probably over-wash my hands,Luke Combs,Six Feet Apart,False
628,watch a ball game from the stands,Luke Combs,Six Feet Apart,False
627,"catch a movie, catch a cab",Luke Combs,Six Feet Apart,False
626,pay some extra on the tab,Luke Combs,Six Feet Apart,False


In [27]:
corona_lines.to_excel('../../data/input/corona_lines_3.xlsx')