In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
import os
for dirname, _, filenames in os.walk('data/lyrics'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█', printEnd = "\r"):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
        printEnd    - Optional  : end character (e.g. "\r", "\r\n") (Str)
    """
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
    filledLength = int(length * iteration // total)
    bar = fill * filledLength + '-' * (length - filledLength)
    print(f'\r{prefix} |{bar}| {percent}% {suffix}', end = printEnd)
    # Print New Line on Complete
    if iteration == total:
        print()

data/lyrics\artist_song_lyrics.csv
data/lyrics\artist_song_lyrics_bak.csv
data/lyrics\filtered_lyrics.csv
data/lyrics\long_lyrics.csv
data/lyrics\lyrics_invalid.xlsx
data/lyrics\lyrics_invalid_updated.csv
data/lyrics\lyrics_valid.xlsx
data/lyrics\missing_lyrics.csv
data/lyrics\short_lyrics.csv


In [2]:
lyrics = pd.read_csv('data/lyrics/artist_song_lyrics.csv', encoding='utf-8', index_col=0)
lyrics.head()


Unnamed: 0,id,artist,first_artist,song,weeks_on_chart,peak_rank,lyrics,url
0,1093,2 Chainz Featuring Ariana Grande,2 Chainz,Rule The World,2,94,Yeah\nUh-huh\n(Hitmaka)\n2 Chainz\n\nTop down ...,https://genius.com/2-chainz-rule-the-world-lyrics
1,1099,2 Chainz Featuring Kendrick Lamar,2 Chainz,Momma I Hit A Lick,1,100,"I want it, I want it, I want it-it-it-it\n\nCh...",https://genius.com/2-chainz-momma-i-hit-a-lick...
2,1074,2 Chainz Featuring Travis Scott,2 Chainz,Whip,1,75,"Yeah\nDo it no hands, yeah, do it, no handstan...",https://genius.com/2-chainz-whip-lyrics
3,85,21 Savage,21 Savage,1.5,1,86,"My earrings cost a half a ticket, I don't hear...",https://genius.com/21-savage-15-lyrics
4,36,21 Savage,21 Savage,A Lot,23,12,I love you\nTurn my headphone down a little bi...,https://genius.com/21-savage-a-lot-lyrics


In [3]:
lyrics_updated = pd.read_csv('data/lyrics/lyrics_invalid_updated.csv', encoding='utf-8', index_col=0)
lyrics_updated.to_excel('data/output/lyrics_updated.xlsx')
lyrics_updated.tail()

Unnamed: 0,id,artist,first_artist,song,weeks_on_chart,peak_rank,lyrics,url,url_first_word,artist_first_word
1316,20,"benny blanco, Halsey & Khalid","benny blanco, Halsey",Eastside,52,9,"Uh\nYeah, yeah\n\nWhen I was young, I fell in ...",https://genius.com/Benny-blanco-halsey-and-kha...,,benny
1317,1092,"benny blanco, Tainy, Selena Gomez & J Balvin","benny blanco, Tainy, Selena Gomez",I Can't Get Enough,5,66,"Crazy\nI like that, you like that, so let's be...",https://genius.com/Benny-blanco-tainy-selena-g...,,benny
1318,3890,blackbear,blackbear,Hot Girl Bummer,42,11,"Fuck you, and you, and you\nI hate your friend...",https://genius.com/Blackbear-hot-girl-bummer-l...,,blackbear
1319,3693,for KING & COUNTRY,for KING,God Only Knows,1,94,Wide awake while the world is sound asleepin'\...,https://genius.com/For-king-and-country-god-on...,,for
1321,6822,twenty one pilots,twenty one pilots,Level Of Concern,11,23,"Need you, tell me\nNeed you, tell me\n\nPanic ...",https://genius.com/Twenty-one-pilots-level-of-...,,twenty


### Consolidation and Validation

In [4]:
lyrics.iloc[153]

id                                                             6252
artist                                     Bad Bunny X Daddy Yankee
first_artist                                              Bad Bunny
song                                                       La Santa
weeks_on_chart                                                    2
peak_rank                                                        53
lyrics            1,000,000+ views\n🔸 Bad Bunny - Yonaguni // 1M...
url               https://genius.com/Polka-delamusic-polkas-popu...
Name: 153, dtype: object

In [5]:
lyrics.update(lyrics_updated)
lyrics.iloc[153]

id                                                           6252.0
artist                                     Bad Bunny X Daddy Yankee
first_artist                                              Bad Bunny
song                                                       La Santa
weeks_on_chart                                                  2.0
peak_rank                                                      53.0
lyrics            Tú no ere' una santa, ni yo soy un santo\nNos ...
url               https://genius.com/Bad-bunny-and-daddy-yankee-...
Name: 153, dtype: object

In [6]:
def clean_string(string):
    i = string.find('EmbedShare URLCopyEmbedCopy')
    if i == 0:
        return string
    i -=1
    while i > 0 and string[i].isdigit():
        i -=1
    return string[:i+1]

lyrics['lyrics'] = lyrics['lyrics'].apply(lambda string: clean_string(string))


In [7]:
print(lyrics.shape)
print(lyrics.columns)

(1322, 8)
Index(['id', 'artist', 'first_artist', 'song', 'weeks_on_chart', 'peak_rank',
       'lyrics', 'url'],
      dtype='object')


In [8]:
import spacy
from spacy.language import Language
from langdetect import DetectorFactory
from spacy_langdetect import LanguageDetector

@Language.factory("language_detector")
def create_language_detector(nlp, name):
   return LanguageDetector(language_detection_function=None)
#

In [9]:
nlp = spacy.load("en_core_web_lg", exclude=['sentencizer'])
nlp.add_pipe("sentencizer", config={"punct_chars": ['\n']})
nlp.add_pipe('language_detector')


<spacy_langdetect.spacy_langdetect.LanguageDetector at 0x247ed799130>

In [20]:
import ast
#sentence level, each line is treated as one sentence
def detect_languages(text, nlp):
    DetectorFactory.seed = 0
    result = {}
    if type(text) == str:
        doc = nlp(text)
        for sent in doc.sents:
            language = sent._.language['language']
            score = sent._.language['score']
            if score < 0.75:
                break
            if language in result.keys():
                result[language] += 1
            else:
                result[language] = 1
    else:
        result = {'': 0}
    return dict(sorted(result.items(), key=lambda item: item[1], reverse=True))

# Retrieve data from cache to save time
try:
    languages = pd.read_csv('data/cache/lyrics_preparation_languages_sent.csv', index_col=0)
    languages['languages_sent'] = languages['languages_sent'].apply(lambda dict: ast.literal_eval(dict))
except:
    languages = []
    l = len(lyrics)

    printProgressBar(0, l+1, prefix = 'Progress:', suffix = 'Complete', length = 50)
    for i in range(0,l):
        languages.append(detect_languages(lyrics.loc[i, 'lyrics'], nlp))
        printProgressBar(i+1, l, prefix = 'Progress:', suffix = 'Complete', length = 50)

lyrics['languages_sent'] = languages
lyrics['languages_sent'].to_csv('data/cache/lyrics_preparation_languages_sent.csv')

In [19]:
#document level
def detect_language(text, nlp):
    DetectorFactory.seed = 0
    if type(text) == str:
        doc = nlp(text)
        result = doc._.language['language'], doc._.language['score']
    else:
        result = {'-': -1}
    return result


# Retrieve data from cache to save time
try:
    languages = pd.read_csv('data/cache/lyrics_preparation_language.csv', index_col=0)
    scores = pd.read_csv('data/cache/lyrics_preparation_language_score.csv', index_col=0)
except:
    languages = []
    scores = []
    l = len(lyrics)

    printProgressBar(0, l+1, prefix = 'Progress:', suffix = 'Complete', length = 50)
    for i in range(0,l):
        language, score = detect_language(lyrics.loc[i, 'lyrics'], nlp)
        languages.append(language)
        scores.append(score)
        printProgressBar(i+1, l, prefix = 'Progress:', suffix = 'Complete', length = 50)

lyrics['language'] = languages
lyrics['language_score'] = scores
lyrics['language'].to_csv('data/cache/lyrics_preparation_language.csv')
lyrics['language_score'].to_csv('data/cache/lyrics_preparation_language_score.csv')

In [12]:
lyrics['length'] = lyrics['lyrics'].apply(lambda string: len(str(string)))
lyrics['word_count'] = lyrics['lyrics'].apply(lambda string: len(str(string).split()))

In [13]:
lyrics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1322 entries, 0 to 1321
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              1322 non-null   float64
 1   artist          1322 non-null   object 
 2   first_artist    1322 non-null   object 
 3   song            1322 non-null   object 
 4   weeks_on_chart  1322 non-null   float64
 5   peak_rank       1322 non-null   float64
 6   lyrics          1322 non-null   object 
 7   url             1322 non-null   object 
 8   languages_sent  1322 non-null   object 
 9   language        1322 non-null   object 
 10  language_score  1322 non-null   float64
 11  length          1322 non-null   int64  
 12  word_count      1322 non-null   int64  
dtypes: float64(4), int64(2), object(7)
memory usage: 144.6+ KB


In [14]:
def get_main_lang(languages):
    if len(languages) > 0:
        language = list(languages)[0]
        weight = languages[language] / sum(languages.values())
        return language, weight

    else:
        return '', 0

lyrics['language_sent'] = lyrics['languages_sent'].apply(lambda languages: get_main_lang(languages)[0])
lyrics['language_sent_weight'] = lyrics['languages_sent'].apply(lambda languages: get_main_lang(languages)[1])

lyrics.describe()

Unnamed: 0,id,weeks_on_chart,peak_rank,language_score,length,word_count,language_sent_weight
count,1322.0,1322.0,1322.0,1322.0,1322.0,1322.0,1322.0
mean,5177.734493,8.881997,51.16112,0.99535,2344.66944,463.667927,0.794394
std,3242.740525,11.146522,28.710498,0.036788,992.718851,192.15167,0.316905
min,0.0,1.0,1.0,0.428569,6.0,1.0,0.0
25%,2366.75,1.0,28.0,0.999996,1633.25,324.0,0.666667
50%,5586.5,3.0,53.0,0.999997,2131.0,425.0,1.0
75%,7851.0,15.0,75.0,0.999998,2909.5,578.75,1.0
max,10497.0,61.0,100.0,1.0,9019.0,1171.0,1.0


In [15]:
pd.set_option('display.max_colwidth', 150)
lyrics_non_english = lyrics.loc[lyrics['language'] != 'en']
lyrics_non_english

Unnamed: 0,id,artist,first_artist,song,weeks_on_chart,peak_rank,lyrics,url,languages_sent,language,language_score,length,word_count,language_sent,language_sent_weight
32,8098.0,6ix9ine,6ix9ine,YaYa,1.0,99.0,"Ransom got that sauce in it\nSe pinta los labio', hoy no tiene horario\nQuiere que la busque en la Lambo\nEsta noche coronamo'\nDale, para termina...",https://genius.com/6ix9ine-yaya-lyrics,"{'id': 2, 'es': 1}",es,0.857140,1673,279,id,0.666667
52,7475.0,Agust D,Agust D,Daechwita,1.0,76.0,"명금일하 대취타 하랍신다\n예이!\n\nYeah, uh\n대취타 대취타 자 울려라 대취타\n대취타 대취타 자 울려라 대취타\n대취타 대취타 자 울려라 대취타\n대취타 대취타 자 울려라 대취타\n\n대취타 대취타 자 울려라 대취타 (Yeah-yeah)\n빛이나 빛...",https://genius.com/Agust-d-daechwita-lyrics,{'ko': 2},ko,0.999997,1219,329,ko,1.000000
62,9190.0,Anitta Featuring Cardi B & Myke Towers,Anitta,Me Gusta,1.0,91.0,"Uh-uh-uh-uh-uh-uh-uh\nGo!\nYah, yah, yah\nTra\nYah, yah, yah\n\nA mí me gusta\nEvery time you look at me that way\nA mí me gusta\nAll the dirty th...",https://genius.com/Anitta-me-gusta-lyrics,{'id': 1},id,0.714283,2111,439,id,1.000000
64,5982.0,Anuel AA,Anuel AA,KEII,1.0,83.0,"Ella ya no piensa en él (En él)\nÉl la convirtió en alguien que ella no e'\nNo le basta dar amor y ser fiel (Ser fiel)\nHoy se va modo Romeo, ella...",https://genius.com/Anuel-aa-keii-lyrics,{'es': 5},es,0.999997,2744,554,es,1.000000
65,7585.0,Anuel AA & Bad Bunny,Anuel AA,Hasta Que Dios Diga,1.0,86.0,"Brr\nHoy la noche se acaba, tú desnuda en mi cama (Cama)\nAnoche te soñé (Soñé), y me quedé con las gana' (Gana'; uah)\nBaby, apaga el celular que...",https://genius.com/Anuel-aa-and-bad-bunny-hasta-que-dios-diga-lyrics,{'es': 1},es,0.999998,3496,673,es,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
972,131.0,Pinkfong,Pinkfong,Baby Shark,20.0,32.0,"Baby shark, doo doo doo doo doo doo\nBaby shark, doo doo doo doo doo doo\nBaby shark, doo doo doo doo doo doo\nBaby shark!\n\nMommy shark, doo doo...",https://genius.com/Pinkfong-baby-shark-lyrics,{'so': 1},so,0.857141,1138,246,so,1.000000
1054,7565.0,Rosalia & Travis Scott,Rosalia,TKN,1.0,66.0,"Cosa' de familia, no la' tienen que escucha'\nLo' capo' con lo' capo' y yo soy la mamá\nLos secreto' solo con quien pueda' confia'\nMás-Más te val...",https://genius.com/Rosalia-and-travis-scott-tkn-lyrics,"{'es': 3, 'en': 1}",es,0.714283,1475,293,es,0.750000
1082,8963.0,"Sech, Daddy Yankee & J Balvin Featuring Rosalia & Farruko","Sech, Daddy Yankee & J Balvin",Relacion,11.0,64.0,"Ahora todo cambió, le toca a ella\nLatino gang\nAyer la vi perreando solita (Welcome to the remix)\nSe ve má' bonita\nAhora que no está con ese ma...",https://genius.com/Sech-daddy-yankee-and-j-balvin-relacion-remix-lyrics,"{'it': 1, 'es': 1}",es,0.999998,3695,725,it,0.500000
1083,2499.0,"Sech, Darell, Nicky Jam, Ozuna & Anuel AA","Sech, Darell, Nicky Jam, Ozuna",Otro Trago,20.0,34.0,"Sigue aquí tomándose otro tra–\n(¡This is the remix!)\nLa vida para ti no ha sido fácil\nHa sido, en el amor, muy demasia'o difícil\nHay algo en é...",https://genius.com/Sech-ozuna-and-anuel-aa-otro-trago-remix-lyrics,{},es,0.999996,4631,895,,0.000000


In [32]:
# lyrics_english = lyrics.loc[(lyrics['language'] == 'en') & (lyrics['language_sent'] == 'en')]
lyrics_english = lyrics.loc[(lyrics['language'] == 'en')]
lyrics_english = lyrics_english.rename(columns={'id' : 'billboard_id'})
lyrics_english['billboard_id'] = lyrics_english['billboard_id'].astype(int)
lyrics_english.sort_values(by='billboard_id', inplace=True)
lyrics_english.reset_index(inplace=True,drop=True)
lyrics_english['lyrics_id'] = lyrics_english.index
lyrics_english = lyrics_english[['billboard_id','lyrics_id', 'artist', 'first_artist', 'song', 'weeks_on_chart', 'peak_rank',
       'lyrics', 'length', 'word_count', 'language', 'language_score', 'languages_sent', 'language_sent', 'language_sent_weight']]

lyrics_english

Unnamed: 0,billboard_id,lyrics_id,artist,first_artist,song,weeks_on_chart,peak_rank,lyrics,length,word_count,language,language_score,languages_sent,language_sent,language_sent_weight
0,0,0,Ariana Grande,Ariana Grande,"Thank U, Next",28.0,1.0,"Thought I'd end up with Sean\nBut he wasn't a match\nWrote some songs about Ricky\nNow I listen and laugh\nEven almost got married\nAnd for Pete, ...",2409,460,en,0.999997,{'en': 8},en,1.000000
1,1,1,Halsey,Halsey,Without Me,52.0,1.0,Found you when your heart was broke\nI filled your cup until it overflowed\nTook it so far to keep you close (Keep you close)\nI was afraid to lea...,2095,435,en,0.999995,"{'en': 10, 'cy': 1, 'fi': 1}",en,0.833333
2,2,2,Mariah Carey,Mariah Carey,All I Want For Christmas Is You,43.0,1.0,I don't want a lot for Christmas\nThere is just one thing I need\nI don't care about the presents\nUnderneath the Christmas tree\nI just want you ...,1918,388,en,0.999996,"{'en': 16, 'et': 1}",en,0.941176
3,3,3,Travis Scott,Travis Scott,Sicko Mode,52.0,1.0,"Astro, yeah\nSun is down, freezin' cold\nThat's how we already know, winter's here\nMy dawg would probably do it for a Louis belt\nThat's just all...",3943,771,en,0.999998,{},,0.000000
4,4,4,Post Malone & Swae Lee,Post Malone,Sunflower (Spider-Man: Into The Spider-Verse),53.0,1.0,"Ayy, ayy, ayy, ayy (Ooh)\nOoh, ooh, ooh, ooh (Ooh)\nAyy, ayy\nOoh, ooh, ooh, ooh\n\nNeedless to say, I keep in check\nShe was a bad-bad, neverthel...",1534,305,en,0.999997,"{'en': 3, 'tl': 1, 'so': 1}",en,0.600000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1250,10459,1250,Eminem,Eminem,Gnat,1.0,60.0,"Yeah, yeah, sick\n(d.a. got that dope)\n\nThey say these bars are like COVID (Bars are like COVID; what?)\nYou get 'em right off the bat (You get ...",5762,1113,en,0.999999,{'en': 6},en,1.000000
1251,10477,1251,Gabby Barrett,Gabby Barrett,The First Noel,1.0,78.0,"The First Noel, the Angels did say\nWas to certain poor shepherds in fields as they lay\nIn fields where they lay keeping their sheep\nOn a cold w...",666,132,en,0.999996,{'en': 6},en,1.000000
1252,10483,1252,Popp Hunna,Popp Hunna,Adderall (Corvette Corvette),1.0,84.0,"(Bitch)\nCorvette, Corvette\nHop in a motherfuckin' jet like jet\nDidn't even think it could get like that\nThey be like, ""Popp, why you walk like...",2366,515,en,0.999998,{'en': 1},en,1.000000
1253,10492,1253,Lil Durk,Lil Durk,Backdoor,1.0,93.0,"(Malik on the beat)\n(Ayo Bleu)\nNo, no, no, no\nYeah, yeah, yeah\n(Aura)\nNo, no, no, no\n(Turn Me Up Josh)\nWhoa, oh, oh\n\nThem niggas act like...",2778,547,en,0.999996,{},,0.000000


In [33]:
lyrics_english.to_excel('data/output/bb-t100-lyrics.xlsx')
lyrics_english.to_csv('data/lyrics/bb-t100-lyrics.csv')


