## Initial Setup

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
import os
for dirname, _, filenames in os.walk('data'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


data\artist_song\artist_song.csv
data\artist_song\artist_song.json
data\billboard\billboard_top_100.csv
data\input\songs-2.csv
data\input\top100.json
data\lyrics\artist_song_lyrics _bak.csv
data\lyrics\artist_song_lyrics.csv
data\lyrics\artist_song_lyrics_bak_old.csv
data\lyrics\short_lyrics.csv


In [2]:
lyrics = pd.read_csv('data/lyrics/artist_song_lyrics.csv', index_col=0, encoding='utf-16')

In [3]:
lyrics.head()

Unnamed: 0,artist,song,weeks_on_chart,peak_rank,lyrics
0,2 Chainz Featuring Ariana Grande,Rule The World,2,94,[Intro: Ariana Grande & 2 Chainz ] Yeah Uh-hu...
1,2 Chainz Featuring Kendrick Lamar,Momma I Hit A Lick,1,100,"I want it, I want it, I want it-it-it-it [Chor..."
2,2 Chainz Featuring Travis Scott,Whip,1,75,"Yeah Do it no hands, yeah, do it, no handstand..."
3,21 Savage,1.5,1,86,"My earrings cost a half a ticket, I don't hear..."
4,21 Savage,A Lot,23,12,I love you Turn my headphone down a little bit...


In [4]:
lyrics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 1308
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   artist          1309 non-null   object
 1   song            1309 non-null   object
 2   weeks_on_chart  1309 non-null   int64 
 3   peak_rank       1309 non-null   int64 
 4   lyrics          1308 non-null   object
dtypes: int64(2), object(3)
memory usage: 61.4+ KB


In [5]:
import spacy
from spacy.language import Language
from spacy_langdetect import LanguageDetector

@Language.factory("language_detector")
def create_language_detector(nlp, name):
    return LanguageDetector(language_detection_function=None)

In [6]:
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe('language_detector')

<spacy_langdetect.spacy_langdetect.LanguageDetector at 0x1f637ad3c88>

In [15]:
def detect_language(text, nlp, ret):
    if type(text) == str:
        doc = nlp(text)
        result = doc._.language
    else:
        result = {'language' : '-', 'score' : -1}
    return result[ret]

In [16]:
lyrics['length'] = lyrics['lyrics'].apply(lambda string: len(str(string)))
lyrics['word_count'] = lyrics['lyrics'].apply(lambda string: len(str(string).split()))
lyrics['language'] = lyrics['lyrics'].apply(lambda string: detect_language(string, nlp, 'language'))
lyrics['language_score'] = lyrics['lyrics'].apply(lambda string: detect_language(string, nlp, 'score'))

lyrics.describe()

Unnamed: 0,weeks_on_chart,peak_rank,length,word_count,language_score
count,1309.0,1309.0,1309.0,1309.0,1309.0
mean,8.887701,51.262796,2656.345302,518.160428,0.992357
std,11.162152,28.697031,4788.666829,857.647005,0.069351
min,1.0,1.0,3.0,1.0,-1.0
25%,1.0,28.0,1633.0,321.0,0.999996
50%,3.0,53.0,2139.0,427.0,0.999997
75%,15.0,75.0,3005.0,591.0,0.999998
max,61.0,100.0,135356.0,24046.0,1.0


## Validating lyrics


### Exploring shortest lyrics
Based on the description, I will consider lyrics with a word count less than 321 as short.

In [17]:
lyrics_sort_asc = lyrics.sort_values(by='length')
lyrics_sort_asc.head(10)

Unnamed: 0,artist,song,weeks_on_chart,peak_rank,lyrics,length,word_count,language,language_score
1300,Zayn,Better,1,89,,3,1,-,-1.0
657,Kid Cudi & Eminem,The Adventures Of Moon Man & Slim Shady,1,22,!Error!,7,1,de,0.85714
48,A Boogie Wit da Hoodie Featuring Young Thug,Might Not Give Up,1,66,!Error!,7,1,de,0.999996
555,Juice WRLD,Fighting Demons,2,35,!Error!,7,1,de,0.999994
832,Mariah Carey,All I Want For Christmas Is You,42,1,!Error!,7,1,de,0.999995
953,Panic! At The Disco,Into The Unknown,2,98,!Error!,7,1,de,0.999995
1004,Post Malone,Myself,1,52,!Error!,7,1,de,0.999996
1058,Sam Smith,To Die For,6,46,!Error!,7,1,de,0.999997
294,DaBaby Featuring Offset,Baby Sitter,20,59,!Error!,7,1,de,0.999998
683,Lauv & Troye Sivan,I'm So Tired...,6,81,!Error!,7,1,de,0.999995


In [18]:
short_lyrics = lyrics_sort_asc[['lyrics','length', 'word_count']].loc[lyrics['word_count'] < 321]
# for string in short_lyrics:
#     print(string)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_seq_items', None)
print(short_lyrics)

                                                 lyrics  length  word_count
1300                                                NaN       3           1
657                                             !Error!       7           1
48                                              !Error!       7           1
555                                             !Error!       7           1
832                                             !Error!       7           1
953                                             !Error!       7           1
1004                                            !Error!       7           1
1058                                            !Error!       7           1
294                                             !Error!       7           1
683                                             !Error!       7           1
1203                                            !Error!       7           1
190                                             !Error!       7           1
529         

Exporting short/missing lyrics

In [19]:
short_lyrics = lyrics.loc[lyrics['word_count'] <= 321]
short_lyrics.to_csv('data/lyrics/short_lyrics.csv')

### Exploring longest lyrics

In [20]:
lyrics_sort_desc = lyrics.sort_values(by='length', ascending=False)
lyrics_sort_desc.head(10)


Unnamed: 0,artist,song,weeks_on_chart,peak_rank,lyrics,length,word_count,language,language_score
531,John Rich Featuring The Five,Shut Up About Politics,1,91,JAY AND SILENT BOB STRIKE BACK Screenplay by K...,135356,24046,en,0.999997
229,Carrie Underwood,Favorite Time Of Year,3,80,"Last updated: 10/6/2018, 5:16PM MST OctoberOct...",79241,13651,en,0.999994
1165,The Kid LAROI Featuring Machine Gun Kelly,"Fuck You, Goodbye",1,99,"# 03' Adolescence - J. Cole 1, 2 - mxmtoon 1.5...",58416,11291,en,0.999995
1051,SZA X Justin Timberlake,The Other Side,7,61,"2/1 FELIX SANDMAN - ""BOYS WITH EMOTIONS"" GARZ...",20267,3594,en,0.999997
631,Kelsea Ballerini x Halsey,The Other Girl,1,95,"2/1 FELIX SANDMAN - ""BOYS WITH EMOTIONS"" GARZ...",20267,3594,en,0.999995
646,Khalid x Disclosure,Know Your Worth,8,57,"2/1 FELIX SANDMAN - ""BOYS WITH EMOTIONS"" GARZ...",20267,3594,en,0.999996
1209,Trevor Daniel x Selena Gomez,Past Life,5,77,"8/1 Rob Markman - ""Easy Money Sniper"" vowl. & ...",19182,3406,en,0.999996
890,Moneybagg Yo X Megan Thee Stallion,All Dat,4,70,"10/1 Ant Clemons - ""4 Letter Word"" ft. Timba...",13926,2528,en,0.999995
191,Black Eyed Peas X J Balvin,RITMO (Bad Boys For Life),27,26,"10/1 Ant Clemons - ""4 Letter Word"" ft. Timbala...",13441,2528,en,0.999995
152,Bad Bunny X Anuel AA,Esta Cabron Ser Yo,1,97,favoritos 2020 por Rogelio 📝 Canciones 📝 • 3ra...,12635,2607,es,0.571428


I assume most of the songs have a length between 1 and 5 minutes. According to speech and publication coach Daphne Gray-Grant,
a fast speaker might get to 800 words per minute in 5 minutes. Now assuming that the lyrics will contain words that aren't sung or spoken
like "Intro" and some songs will contain interjections like "hey" which are quick to say, a word count of maximum 1000 words in
a chart song seems realistic. Some songs might be in a different language and thus can have a greater word count.

To validate long lyrics and detect outliers, I am looking a songs with a word count greater than 1000.

In [21]:
long_lyrics = lyrics_sort_desc[['lyrics','length', 'word_count']].loc[lyrics['word_count'] > 1000]
print(long_lyrics)

                                                 lyrics  length  word_count
531   JAY AND SILENT BOB STRIKE BACK Screenplay by K...  135356       24046
229   Last updated: 10/6/2018, 5:16PM MST OctoberOct...   79241       13651
1165  # 03' Adolescence - J. Cole 1, 2 - mxmtoon 1.5...   58416       11291
1051  2/1 FELIX SANDMAN -  "BOYS WITH EMOTIONS" GARZ...   20267        3594
631   2/1 FELIX SANDMAN -  "BOYS WITH EMOTIONS" GARZ...   20267        3594
646   2/1 FELIX SANDMAN -  "BOYS WITH EMOTIONS" GARZ...   20267        3594
1209  8/1 Rob Markman - "Easy Money Sniper" vowl. & ...   19182        3406
890   10/1 Ant Clemons -  "4 Letter Word"  ft. Timba...   13926        2528
191   10/1 Ant Clemons - "4 Letter Word" ft. Timbala...   13441        2528
152   favoritos 2020 por Rogelio 📝 Canciones 📝 • 3ra...   12635        2607
647   4/1 Ariana Grande & Victoria Monet - "MONOPOLY...   10503        1979
575   7/1 Hammerhedd - Grand Currents -8/8 ZICO (지코)...    9802        1988
216   Most P

Exporting long lyrics

In [None]:
long_lyrics = lyrics_sort_desc.loc[lyrics['word_count'] > 1000]
long_lyrics.to_csv('data/lyrics/long_lyrics.csv')


### Exploring songs classified as non english

In [25]:
non_english = lyrics.loc[lyrics['language'] != 'en']
non_english.head(15)

Unnamed: 0,artist,song,weeks_on_chart,peak_rank,lyrics,length,word_count,language,language_score
32,6ix9ine,YaYa,1,99,Ransom got that sauce in it Se pinta los labio...,1675,279,es,0.857139
48,A Boogie Wit da Hoodie Featuring Young Thug,Might Not Give Up,1,66,!Error!,7,1,de,0.999996
52,Agust D,Daechwita,1,76,"명금일하 대취타 하랍신다 예이! Yeah, uh 대취타 대취타 자 울려라 대취타 ...",1222,329,ko,1.0
64,Anuel AA,KEII,1,83,Ella ya no piensa en él (En él) Él la convirti...,2746,554,es,0.999995
65,Anuel AA & Bad Bunny,Hasta Que Dios Diga,1,86,"Brr Hoy la noche se acaba, tú desnuda en mi ca...",3498,673,es,0.999995
66,Anuel AA & Karol G,Secreto,11,68,"Bebecita Bebe-bebecita, uah Bebecita Lo de no...",2711,498,es,0.999997
67,Anuel AA & Romeo Santos,Ella Quiere Beber,20,61,"[Intro: Romeo Santos & Anuel AA ] Check, chec...",2762,544,es,0.999996
68,"Anuel AA, Daddy Yankee, Karol G, Ozuna & J Balvin",China,18,43,"[Intro: Rikrok, Anuel AA , Daddy Yankee & ...",4703,954,es,0.999996
69,Arcangel x Sech,Sigues Con El,3,78,* = letra incompleta. = Canción filtrada 02/1...,7029,1338,es,0.571425
98,Ariana Grande Featuring Doja Cat,Motive,2,32,Mu-Mu-Mu-Murda Dime por qué tengo este presen...,2099,383,es,0.999996


In [24]:
non_english.describe()

Unnamed: 0,weeks_on_chart,peak_rank,length,word_count,language_score
count,141.0,141.0,141.0,141.0,141.0
mean,8.177305,50.574468,2329.624113,422.907801,0.951364
std,9.836727,30.550248,1822.521654,333.01246,0.194553
min,1.0,1.0,3.0,1.0,-1.0
25%,1.0,22.0,1311.0,257.0,0.999995
50%,3.0,54.0,2099.0,400.0,0.999996
75%,15.0,76.0,3145.0,548.0,0.999997
max,52.0,99.0,12635.0,2607.0,1.0


In [29]:
non_english.loc[non_english['language_score'] < 0.80].head(20)

Unnamed: 0,artist,song,weeks_on_chart,peak_rank,lyrics,length,word_count,language,language_score
69,Arcangel x Sech,Sigues Con El,3,78,* = letra incompleta. = Canción filtrada 02/1...,7029,1338,es,0.571425
120,BTS,Filter,1,87,"너의 따분한 그 표정 지루한 발끝 Please, look at me now 핸드폰은...",823,203,ko,0.714282
122,BTS,Life Goes On,3,1,어느 날 세상이 멈췄어 아무런 예고도 하나 없이 봄은 기다림을 몰라서 눈치 없이 와...,1071,263,ko,0.571427
151,Bad Bunny Featuring Drake,MIA,27,5,"[Intro: Sean Paul, Bad Bunny & Drake ] Ble...",3497,743,es,0.571427
152,Bad Bunny X Anuel AA,Esta Cabron Ser Yo,1,97,favoritos 2020 por Rogelio 📝 Canciones 📝 • 3ra...,12635,2607,es,0.571428
214,CJ,Whoopty,6,51,"Heavy on the SSO shit, man Loyalty over royalt...",9020,444,lv,0.571428
270,"DJ Snake, J. Balvin & Tyga",Loco Contigo,2,95,"Tú me tienes loco, loco contigo Yo trato y tra...",1992,385,es,0.714284
383,Eminem Featuring Joyner Lucas,Lucky You,14,6,Wow.. shetty shetty Haan.. haa haa Haa tu rahe...,4607,911,id,0.714285
482,Hikaru Utada & Skrillex,Face My Fears,1,98,宇多田ヒカル & Skrillexの「Face My Fears」歌詞 ねえ　どれくらい ...,535,107,fr,0.714283
516,Jason Derulo,Take You Dancing,3,94,Da-da-da-da-da-da Da-da-da-da-da-da Da-da-da-d...,2250,385,id,0.571425
