## Initial Setup

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
import os
for dirname, _, filenames in os.walk('data'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


data\artist_song\artist_song.csv
data\artist_song\artist_song.json
data\billboard\billboard_top_100.csv
data\billboard\billboard_top_100_2020.csv
data\input\songs-2.csv
data\input\top100.json
data\input\top100_old.json
data\lyrics\artist_song_lyrics.csv
data\lyrics\artist_song_lyrics.xlsx
data\lyrics\artist_song_lyrics_bak.csv
data\lyrics\artist_song_lyrics_manual.csv
data\lyrics\filtered_lyrics.csv
data\lyrics\filtered_lyrics.xlsx
data\lyrics\filtered_lyrics_bak.csv
data\lyrics\filtered_lyrics_bak.xlsx
data\lyrics\long_lyrics.csv
data\lyrics\long_lyrics.xlsx
data\lyrics\lyrics.xlsx
data\lyrics\missing_lyrics.csv
data\lyrics\missing_lyrics.xlsx
data\lyrics\missing_lyrics_bak.csv
data\lyrics\missing_lyrics_bak.xlsx
data\lyrics\short_lyrics.csv
data\lyrics\~$filtered_lyrics_bak.xlsx


In [2]:
lyrics = pd.read_csv('data/lyrics/artist_song_lyrics.csv', index_col=0, encoding='utf-8')

In [3]:
lyrics.head()

Unnamed: 0,id,artist,first_artist,song,weeks_on_chart,peak_rank,lyrics
0,1093,2 Chainz Featuring Ariana Grande,2 Chainz,Rule The World,2,94,Yeah Uh-huh (Hitmaka) 2 Chainz Top down on th...
1,1099,2 Chainz Featuring Kendrick Lamar,2 Chainz,Momma I Hit A Lick,1,100,"I want it, I want it, I want it-it-it-it Chil..."
2,1074,2 Chainz Featuring Travis Scott,2 Chainz,Whip,1,75,"Yeah Do it no hands, yeah, do it, no handstand..."
3,85,21 Savage,21 Savage,1.5,1,86,"My earrings cost a half a ticket, I don't hear..."
4,36,21 Savage,21 Savage,A Lot,23,12,I love you Turn my headphone down a little bit...


In [4]:
lyrics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1322 entries, 0 to 1321
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              1322 non-null   int64 
 1   artist          1322 non-null   object
 2   first_artist    1322 non-null   object
 3   song            1322 non-null   object
 4   weeks_on_chart  1322 non-null   int64 
 5   peak_rank       1322 non-null   int64 
 6   lyrics          1321 non-null   object
dtypes: int64(3), object(4)
memory usage: 82.6+ KB


In [5]:
import spacy
from spacy.language import Language
from spacy_langdetect import LanguageDetector

@Language.factory("language_detector")
def create_language_detector(nlp, name):
   return LanguageDetector(language_detection_function=None)
#

In [6]:
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe('language_detector')


<spacy_langdetect.spacy_langdetect.LanguageDetector at 0x259ee7ca430>

In [7]:
def detect_language(text, nlp, ret):
    if type(text) == str:
        doc = nlp(text)
        result = doc._.language
    else:
        result = {'language' : '-', 'score' : -1}
    return result[ret]

In [8]:
lyrics['length'] = lyrics['lyrics'].apply(lambda string: len(str(string)))
lyrics['word_count'] = lyrics['lyrics'].apply(lambda string: len(str(string).split()))
# lyrics['language'] = ''
# lyrics['language_scrore'] = ''
lyrics['language'] = lyrics['lyrics'].apply(lambda string: detect_language(string, nlp, 'language'))
lyrics['language_score'] = lyrics['lyrics'].apply(lambda string: detect_language(string, nlp, 'score'))

lyrics.describe()

Unnamed: 0,id,weeks_on_chart,peak_rank,length,word_count,language_score
count,1322.0,1322.0,1322.0,1322.0,1322.0,1322.0
mean,5177.734493,8.881997,51.16112,2347.776097,462.195915,0.993945
std,3242.740525,11.146522,28.710498,2406.488584,425.475731,0.064779
min,0.0,1.0,1.0,3.0,1.0,-1.0
25%,2366.75,1.0,28.0,1585.5,314.0,0.999996
50%,5586.5,3.0,53.0,2097.0,419.0,0.999997
75%,7851.0,15.0,75.0,2885.0,569.0,0.999998
max,10497.0,61.0,100.0,79241.0,13651.0,1.0


## Validating lyrics


### Exploring shortest lyrics
Based on the description, I will consider lyrics with a word count less than 321 as short.

In [9]:
lyrics_sort_asc = lyrics.sort_values(by='length')
lyrics_sort_asc.head(10)

Unnamed: 0,id,artist,first_artist,song,weeks_on_chart,peak_rank,lyrics,length,word_count,language,language_score
1320,4080,j-hope Featuring Becky G.,j-hope,Chicken Noodle Soup,1,81,,3,1,-,-1.0
1247,24,Wham!,Wham!,Last Christmas,19,9,!Error!,7,1,de,0.999996
1001,8033,Pop Smoke Featuring Quavo,Pop Smoke,Aim For The Moon,2,34,!Error!,7,1,de,0.999994
1183,4910,The Weeknd,The Weeknd,Blinding Lights,55,1,!Error!,7,1,de,0.857138
1005,8056,Pop Smoke Featuring Swae Lee,Pop Smoke,Creature,1,57,!Error!,7,1,de,0.999994
182,198,Billie Eilish,Billie Eilish,Ocean Eyes,20,84,!Error!,7,1,de,0.999994
353,8601,Drake Featuring Lil Durk,Drake,Laugh Now Cry Later,19,2,!Error!,7,1,de,0.999997
1050,4474,Roddy Ricch & Gunna,Roddy Ricch,Start Wit Me,13,56,!Error!,7,1,de,0.999996
1046,7995,Rod Wave Featuring ATR Son Son,Rod Wave,Rags2Riches,20,12,!Error!,7,1,de,0.999995
1042,6777,Rod Wave,Rod Wave,The Greatest,1,78,!Error!,7,1,de,0.999995


In [10]:
short_lyrics = lyrics_sort_asc[['lyrics','length', 'word_count']].loc[lyrics['word_count'] < 321]
# for string in short_lyrics:
#     print(string)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_seq_items', None)
print(short_lyrics)

                                                 lyrics  length  word_count
1320                                                NaN       3           1
1247                                            !Error!       7           1
1001                                            !Error!       7           1
1183                                            !Error!       7           1
1005                                            !Error!       7           1
182                                             !Error!       7           1
353                                             !Error!       7           1
1050                                            !Error!       7           1
1046                                            !Error!       7           1
1042                                            !Error!       7           1
1007                                            !Error!       7           1
91                                              !Error!       7           1
1214        

Exporting short/missing lyrics

In [11]:
short_lyrics = lyrics.loc[lyrics['word_count'] <= 321]
short_lyrics.to_csv('data/lyrics/short_lyrics.csv')

### Exploring longest lyrics

In [12]:
lyrics_sort_desc = lyrics.sort_values(by='length', ascending=False)
lyrics_sort_desc.head(10)


Unnamed: 0,id,artist,first_artist,song,weeks_on_chart,peak_rank,lyrics,length,word_count,language,language_score
231,10194,Carrie Underwood,Carrie Underwood,Favorite Time Of Year,4,62,"Last updated: 10/6/2018, 5:16PM MST OctoberOct...",79241,13651,en,0.999996
1003,8053,Pop Smoke Featuring Quavo & Future,Pop Smoke,Snitchin,1,54,Definitely did take a long time 2020 been a sa...,16671,3123,en,0.999996
727,8961,Lil Durk,Lil Durk,The Voice,1,62,IDK when i listened to these The Heist - Mackl...,9941,2069,en,0.999993
216,9896,CJ,CJ,Whoopty,7,51,"Heavy on the SSO shit, man Loyalty over royalt...",9020,444,lv,0.571424
1114,9596,T.I. Featuring Lil Baby,T.I.,Pardon,1,97,"June, you're a genius Hitmaka Heard somebody ...",6018,1171,en,0.999996
801,1904,Logic Featuring Eminem,Logic,Homicide,6,5,"Son, you know why you the greatest alive? Why,...",5955,1089,en,0.999998
381,5627,Eminem,Eminem,Darkness,2,28,"I don't wanna be alone, I don't wanna be I don...",5929,1095,en,0.999996
949,964,Offset Featuring J. Cole,Offset,How Did I Get Here,1,65,"Uh, nobody, you know what I'm sayin', nobody, ...",5885,1088,en,0.999998
1207,9296,Tory Lanez,Tory Lanez,Money Over Fallouts,1,97,It's one of the crazier stories we’ve heard of...,5883,1148,en,0.999996
390,193,Eminem Featuring Joyner Lucas,Eminem,Lucky You,14,6,"Woah, Joyner, Joyner, yeah, yeah, yeah Yeah, ...",5800,1145,en,0.999998


I assume most of the songs have a length between 1 and 5 minutes. According to speech and publication coach Daphne Gray-Grant,
a fast speaker might get to 800 words per minute in 5 minutes. Now assuming that the lyrics will contain words that aren't sung or spoken
like "Intro" and some songs will contain interjections like "hey" which are quick to say, a word count of maximum 1000 words in
a chart song seems realistic. Some songs might be in a different language and thus can have a greater word count.

To validate long lyrics and detect outliers, I am looking a songs with a word count greater than 1000.

In [13]:
long_lyrics = lyrics_sort_desc[['lyrics','length', 'word_count']].loc[lyrics['word_count'] > 1000]
print(long_lyrics)

                                                 lyrics  length  word_count
231   Last updated: 10/6/2018, 5:16PM MST OctoberOct...   79241       13651
1003  Definitely did take a long time 2020 been a sa...   16671        3123
727   IDK when i listened to these The Heist - Mackl...    9941        2069
1114  June, you're a genius Hitmaka  Heard somebody ...    6018        1171
801   Son, you know why you the greatest alive? Why,...    5955        1089
381   I don't wanna be alone, I don't wanna be I don...    5929        1095
949   Uh, nobody, you know what I'm sayin', nobody, ...    5885        1088
1207  It's one of the crazier stories we’ve heard of...    5883        1148
390   Woah, Joyner, Joyner, yeah, yeah, yeah  Yeah, ...    5800        1145
382   Yeah, yeah, sick (d.a. got that dope)  They sa...    5765        1113
919   (Yeah) Hey! Leave me alone  Panic-stricken, ha...    5713        1064
712   Protests and growing national outcry continues...    5611        1132
948   Woo  I

Exporting long lyrics

In [14]:
long_lyrics = lyrics_sort_desc.loc[lyrics['word_count'] > 1000]
long_lyrics.to_excel('data/lyrics/long_lyrics.xlsx', encoding='utf-8')


### Exploring songs classified as non english

In [15]:
non_english = lyrics.loc[lyrics['language'] != 'en']
non_english.head(15)

Unnamed: 0,id,artist,first_artist,song,weeks_on_chart,peak_rank,lyrics,length,word_count,language,language_score
32,8098,6ix9ine,6ix9ine,YaYa,1,99,Ransom got that sauce in it Se pinta los labio...,1675,279,es,0.999996
52,7475,Agust D,Agust D,Daechwita,1,76,"명금일하 대취타 하랍신다 예이! Yeah, uh 대취타 대취타 자 울려라 대취타 ...",1221,329,ko,0.999996
64,5982,Anuel AA,Anuel AA,KEII,1,83,Ella ya no piensa en él (En él) Él la convirti...,2746,554,es,0.999994
65,7585,Anuel AA & Bad Bunny,Anuel AA,Hasta Que Dios Diga,1,86,"Brr Hoy la noche se acaba, tú desnuda en mi ca...",3498,673,es,0.999996
66,467,Anuel AA & Karol G,Anuel AA,Secreto,11,68,"Bebecita Bebe-bebecita, uah Bebecita Lo de no...",2711,498,es,0.999996
67,91,Anuel AA & Romeo Santos,Anuel AA,Ella Quiere Beber,20,61,"Check, check (Remix) I'mma show you why I'm th...",2612,511,es,0.999997
68,3051,"Anuel AA, Daddy Yankee, Karol G, Ozuna & J Balvin","Anuel AA, Daddy Yankee, Karol G, Ozuna",China,18,43,Honey came in and she caught me red-handed Cre...,4180,819,es,0.999995
69,6877,Arcangel x Sech,Arcangel,Sigues Con El,3,78,Si tú te vuelves loca por mí (Por mí) Y yo me ...,2526,487,es,0.999994
76,724,Ariana Grande,Ariana Grande,Ghostin,2,25,!Error!,7,1,de,0.999996
91,0,Ariana Grande,Ariana Grande,"Thank U, Next",28,1,!Error!,7,1,de,0.85714


In [16]:
non_english.describe()


Unnamed: 0,id,weeks_on_chart,peak_rank,length,word_count,language_score
count,102.0,102.0,102.0,102.0,102.0,102.0
mean,5809.480392,10.578431,53.529412,1347.911765,250.313725,0.948175
std,3559.741863,13.225715,31.902297,1434.886859,235.065273,0.211594
min,0.0,1.0,1.0,3.0,1.0,-1.0
25%,2518.5,1.0,26.75,7.0,1.0,0.999994
50%,6274.0,4.0,62.5,1338.5,281.5,0.999995
75%,8916.75,18.0,80.5,2169.75,410.0,0.999996
max,10483.0,55.0,99.0,9020.0,821.0,0.999999


In [17]:
non_english.loc[non_english['language_score'] > 0.80].head(20)

Unnamed: 0,id,artist,first_artist,song,weeks_on_chart,peak_rank,lyrics,length,word_count,language,language_score
32,8098,6ix9ine,6ix9ine,YaYa,1,99,Ransom got that sauce in it Se pinta los labio...,1675,279,es,0.999996
52,7475,Agust D,Agust D,Daechwita,1,76,"명금일하 대취타 하랍신다 예이! Yeah, uh 대취타 대취타 자 울려라 대취타 ...",1221,329,ko,0.999996
64,5982,Anuel AA,Anuel AA,KEII,1,83,Ella ya no piensa en él (En él) Él la convirti...,2746,554,es,0.999994
65,7585,Anuel AA & Bad Bunny,Anuel AA,Hasta Que Dios Diga,1,86,"Brr Hoy la noche se acaba, tú desnuda en mi ca...",3498,673,es,0.999996
66,467,Anuel AA & Karol G,Anuel AA,Secreto,11,68,"Bebecita Bebe-bebecita, uah Bebecita Lo de no...",2711,498,es,0.999996
67,91,Anuel AA & Romeo Santos,Anuel AA,Ella Quiere Beber,20,61,"Check, check (Remix) I'mma show you why I'm th...",2612,511,es,0.999997
68,3051,"Anuel AA, Daddy Yankee, Karol G, Ozuna & J Balvin","Anuel AA, Daddy Yankee, Karol G, Ozuna",China,18,43,Honey came in and she caught me red-handed Cre...,4180,819,es,0.999995
69,6877,Arcangel x Sech,Arcangel,Sigues Con El,3,78,Si tú te vuelves loca por mí (Por mí) Y yo me ...,2526,487,es,0.999994
76,724,Ariana Grande,Ariana Grande,Ghostin,2,25,!Error!,7,1,de,0.999996
91,0,Ariana Grande,Ariana Grande,"Thank U, Next",28,1,!Error!,7,1,de,0.85714


### Excel Export

In [18]:
lyrics.to_excel('data/lyrics/lyrics.xlsx',encoding='utf-8')

### Collecting Missing Lyrics

In [19]:
print(lyrics.loc[(lyrics['length'] <=  8 )].index)
missing_lyrics_ids = lyrics.loc[(lyrics['length'] <=  8 )].index.tolist()
missing_lyrics_ids.extend([1003, 190, 231, 727, 97, 196, 339, 374, 379, 497, 607, 654, 1019, 1178, 1249, 1254])
print(missing_lyrics_ids)

Int64Index([  76,   91,  153,  182,  192,  244,  263,  321,  353,  506,  538,
             633,  731,  736,  842,  887,  970,  993,  998,  999, 1001, 1005,
            1007, 1009, 1010, 1017, 1042, 1046, 1050, 1091, 1094, 1104, 1122,
            1131, 1183, 1190, 1214, 1216, 1247, 1320],
           dtype='int64')
[76, 91, 153, 182, 192, 244, 263, 321, 353, 506, 538, 633, 731, 736, 842, 887, 970, 993, 998, 999, 1001, 1005, 1007, 1009, 1010, 1017, 1042, 1046, 1050, 1091, 1094, 1104, 1122, 1131, 1183, 1190, 1214, 1216, 1247, 1320, 1003, 190, 231, 727, 97, 196, 339, 374, 379, 497, 607, 654, 1019, 1178, 1249, 1254]


In [20]:
missing_lyrics = lyrics.loc[missing_lyrics_ids]
missing_lyrics

Unnamed: 0,id,artist,first_artist,song,weeks_on_chart,peak_rank,lyrics,length,word_count,language,language_score
76,724,Ariana Grande,Ariana Grande,Ghostin,2,25,!Error!,7,1,de,0.999996
91,0,Ariana Grande,Ariana Grande,"Thank U, Next",28,1,!Error!,7,1,de,0.85714
153,6252,Bad Bunny X Daddy Yankee,Bad Bunny,La Santa,2,53,!Error!,7,1,de,0.999995
182,198,Billie Eilish,Billie Eilish,Ocean Eyes,20,84,!Error!,7,1,de,0.999994
192,10435,Bing Crosby & The Andrews Sisters,Bing Crosby,Mele Kalikimaka (Merry Christmas),1,36,!NoSong!,8,1,tl,0.999995
244,2408,Chris Brown Featuring Drake,Chris Brown,No Guidance,46,5,!Error!,7,1,de,0.857139
263,2118,DJ Khaled Featuring Cardi B & 21 Savage,DJ Khaled,Wish Wish,15,19,!Error!,7,1,de,0.714282
321,3477,Diplo Presents Thomas Wesley Featuring Morgan ...,Diplo Presents Thomas Wesley,Heartless,21,39,!NoSong!,8,1,tl,0.999993
353,8601,Drake Featuring Lil Durk,Drake,Laugh Now Cry Later,19,2,!Error!,7,1,de,0.999997
506,8299,J. Cole,J. Cole,the.climb.back,2,52,!NoSong!,8,1,tl,0.999995


#### Adding available genius links to missing lyrics

In [21]:
lyrics_links = [
    'https://genius.com/Ariana-grande-ghostin-lyrics',
    'https://genius.com/Ariana-grande-thank-u-next-lyrics',
    '',
    'https://genius.com/Billie-eilish-ocean-eyes-lyrics',
    'https://genius.com/Bing-crosby-and-the-andrews-sisters-mele-kalikimaka-lyrics',
    'https://genius.com/Chris-brown-no-guidance-lyrics',
    'https://genius.com/Dj-khaled-wish-wish-lyrics',
    'https://genius.com/Diplo-heartless-lyrics',
    'https://genius.com/Drake-laugh-now-cry-later-lyrics',
    'https://genius.com/J-cole-t-h-e-c-l-i-m-b-b-a-c-k-lyrics',
    'https://genius.com/John-lennon-and-yoko-ono-happy-xmas-war-is-over-lyrics',
    'https://genius.com/Keith-urban-and-p-nk-one-too-many-lyrics',
    'https://genius.com/Lil-mosey-blueberry-faygo-lyrics',
    'https://genius.com/Lil-nas-x-old-town-road-lyrics',
    'https://genius.com/Mariah-carey-all-i-want-for-christmas-is-you-lyrics',
    'https://genius.com/Migos-give-no-fxk-lyrics',
    'https://genius.com/Perry-como-its-beginning-to-look-a-lot-like-christmas-lyrics',
    'https://genius.com/Pop-smoke-tunnel-vision-outro-lyrics',
    'https://genius.com/Pop-smoke-diana-lyrics',
    'https://genius.com/Pop-smoke-for-the-night-lyrics',
    'https://genius.com/Pop-smoke-aim-for-the-moon-lyrics',
    'https://genius.com/Pop-smoke-creature-lyrics',
    'https://genius.com/Popp-hunna-adderall-corvette-corvette-lyrics',
    'https://genius.com/Post-malone-allergic-lyrics',
    'https://genius.com/Post-malone-better-now-lyrics',
    'https://genius.com/Post-malone-saint-tropez-lyrics',
    'https://genius.com/Rod-wave-the-greatest-lyrics',
    'https://genius.com/Rod-wave-rags2riches-lyrics',
    'https://genius.com/Roddy-ricch-start-wit-me-lyrics',
    'https://genius.com/Shawn-mendes-and-camila-cabello-senorita-lyrics',
    'https://genius.com/Sheck-wes-mo-bamba-lyrics',
    'https://genius.com/Summer-walker-playing-games-lyrics',
    'https://genius.com/Taylor-swift-cardigan-lyrics',
    'https://genius.com/Taylor-swift-dorothea-lyrics',
    'https://genius.com/The-weeknd-blinding-lights-lyrics',
    'https://genius.com/The-weeknd-scared-to-live-lyrics',
    'https://genius.com/Travis-scott-goosebumps-lyrics',
    'https://genius.com/Travis-scott-sicko-mode-lyrics',
    'https://genius.com/Wham-last-christmas-lyrics',
    'https://genius.com/J-hope-chicken-noodle-soup-lyrics',
    'https://genius.com/Pop-smoke-snitching-lyrics',
    'https://genius.com/Bing-crosby-ill-be-home-for-christmas-if-only-in-my-dreams-lyrics',
    '',
    'https://genius.com/Lil-durk-the-voice-lyrics',
     'https://genius.com/Ariana-grande-34-35-lyrics',
    'https://genius.com/Blake-shelton-nobody-but-you-lyrics',
'https://genius.com/Drake-how-bout-now-lyrics',
    'https://genius.com/Ed-sheeran-chris-stapleton-and-bruno-mars-blow-lyrics',
    'https://genius.com/Ellie-goulding-and-diplo-close-to-me-lyrics',
    'https://genius.com/Internet-money-lemonade-lyrics',
    'https://genius.com/Kane-brown-swae-lee-and-khalid-be-like-that-lyrics',
    'https://genius.com/Khalid-and-john-mayer-outta-my-head-lyrics',
'https://genius.com/Post-malone-and-swae-lee-sunflower-lyrics',
    'https://genius.com/The-kid-laroi-fck-you-goodbye-lyrics',
    'https://genius.com/Why-dont-we-fallin-adrenaline-lyrics',
    'https://genius.com/Xxxtentacion-and-lil-pump-arms-around-you-lyrics'

]
missing_lyrics['link'] = lyrics_links
missing_lyrics


Unnamed: 0,id,artist,first_artist,song,weeks_on_chart,peak_rank,lyrics,length,word_count,language,language_score,link
76,724,Ariana Grande,Ariana Grande,Ghostin,2,25,!Error!,7,1,de,0.999996,https://genius.com/Ariana-grande-ghostin-lyrics
91,0,Ariana Grande,Ariana Grande,"Thank U, Next",28,1,!Error!,7,1,de,0.85714,https://genius.com/Ariana-grande-thank-u-next-...
153,6252,Bad Bunny X Daddy Yankee,Bad Bunny,La Santa,2,53,!Error!,7,1,de,0.999995,
182,198,Billie Eilish,Billie Eilish,Ocean Eyes,20,84,!Error!,7,1,de,0.999994,https://genius.com/Billie-eilish-ocean-eyes-ly...
192,10435,Bing Crosby & The Andrews Sisters,Bing Crosby,Mele Kalikimaka (Merry Christmas),1,36,!NoSong!,8,1,tl,0.999995,https://genius.com/Bing-crosby-and-the-andrews...
244,2408,Chris Brown Featuring Drake,Chris Brown,No Guidance,46,5,!Error!,7,1,de,0.857139,https://genius.com/Chris-brown-no-guidance-lyrics
263,2118,DJ Khaled Featuring Cardi B & 21 Savage,DJ Khaled,Wish Wish,15,19,!Error!,7,1,de,0.714282,https://genius.com/Dj-khaled-wish-wish-lyrics
321,3477,Diplo Presents Thomas Wesley Featuring Morgan ...,Diplo Presents Thomas Wesley,Heartless,21,39,!NoSong!,8,1,tl,0.999993,https://genius.com/Diplo-heartless-lyrics
353,8601,Drake Featuring Lil Durk,Drake,Laugh Now Cry Later,19,2,!Error!,7,1,de,0.999997,https://genius.com/Drake-laugh-now-cry-later-l...
506,8299,J. Cole,J. Cole,the.climb.back,2,52,!NoSong!,8,1,tl,0.999995,https://genius.com/J-cole-t-h-e-c-l-i-m-b-b-a-...


#### Export to run LyricsGenius on remaining lyrics



In [22]:
missing_lyrics.to_excel('data/lyrics/missing_lyrics.xlsx', encoding='utf-8')

#### Now the lyrics are nearly complete

In [28]:
missing_lyrics_complete = pd.read_csv('data/lyrics/missing_lyrics.csv')
missing_lyrics_complete.set_index('Unnamed: 0.1', inplace=True)
missing_lyrics_complete.index.name = None
missing_lyrics_complete = missing_lyrics_complete.drop(['Unnamed: 0', 'link'], axis=1)
missing_lyrics_complete.head()

Unnamed: 0,id,artist,first_artist,song,weeks_on_chart,peak_rank,lyrics,length,word_count,language,language_score
76,724,Ariana Grande,Ariana Grande,Ghostin,2,25,I know you hear me when I cry I try to hold it...,7,1,de,0.999996
91,0,Ariana Grande,Ariana Grande,"Thank U, Next",28,1,Thought I'd end up with Sean But he wasn't a m...,7,1,de,0.85714
153,6252,Bad Bunny X Daddy Yankee,Bad Bunny,La Santa,2,53,!Error!,7,1,de,0.999995
182,198,Billie Eilish,Billie Eilish,Ocean Eyes,20,84,I've been watchin' you for some time Can't sto...,7,1,de,0.999994
192,10435,Bing Crosby & The Andrews Sisters,Bing Crosby,Mele Kalikimaka (Merry Christmas),1,36,Mele Kalikimaka is the thing to say On a brigh...,8,1,tl,0.999995


In [29]:
missing_lyrics_complete['length'] = missing_lyrics_complete['lyrics'].apply(lambda string: len(str(string)))
missing_lyrics_complete['word_count'] = missing_lyrics_complete['lyrics'].apply(lambda string: len(str(string).split()))
missing_lyrics_complete['language'] = missing_lyrics_complete['lyrics'].apply(lambda string: detect_language(string, nlp, 'language'))
missing_lyrics_complete['language_score'] = missing_lyrics_complete['lyrics'].apply(lambda string: detect_language(string, nlp, 'score'))
missing_lyrics_complete.head()


Unnamed: 0,id,artist,first_artist,song,weeks_on_chart,peak_rank,lyrics,length,word_count,language,language_score
76,724,Ariana Grande,Ariana Grande,Ghostin,2,25,I know you hear me when I cry I try to hold it...,1709,342,en,0.999997
91,0,Ariana Grande,Ariana Grande,"Thank U, Next",28,1,Thought I'd end up with Sean But he wasn't a m...,2427,459,en,0.999996
153,6252,Bad Bunny X Daddy Yankee,Bad Bunny,La Santa,2,53,!Error!,7,1,de,0.999994
182,198,Billie Eilish,Billie Eilish,Ocean Eyes,20,84,I've been watchin' you for some time Can't sto...,1196,211,en,0.999997
192,10435,Bing Crosby & The Andrews Sisters,Bing Crosby,Mele Kalikimaka (Merry Christmas),1,36,Mele Kalikimaka is the thing to say On a brigh...,1216,230,en,0.999995


In [30]:
new_lyrics = lyrics
new_lyrics.update(missing_lyrics_complete)
new_lyrics['id'] = new_lyrics['id'].astype(int)
new_lyrics.head()

Unnamed: 0,id,artist,first_artist,song,weeks_on_chart,peak_rank,lyrics,length,word_count,language,language_score
0,1093,2 Chainz Featuring Ariana Grande,2 Chainz,Rule The World,2.0,94.0,Yeah Uh-huh (Hitmaka) 2 Chainz Top down on th...,2905.0,604.0,en,0.999994
1,1099,2 Chainz Featuring Kendrick Lamar,2 Chainz,Momma I Hit A Lick,1.0,100.0,"I want it, I want it, I want it-it-it-it Chil...",2703.0,530.0,en,0.999997
2,1074,2 Chainz Featuring Travis Scott,2 Chainz,Whip,1.0,75.0,"Yeah Do it no hands, yeah, do it, no handstand...",3192.0,629.0,en,0.999994
3,85,21 Savage,21 Savage,1.5,1.0,86.0,"My earrings cost a half a ticket, I don't hear...",2482.0,471.0,en,0.999995
4,36,21 Savage,21 Savage,A Lot,23.0,12.0,I love you Turn my headphone down a little bit...,5037.0,1056.0,en,0.999997


In [32]:
filtered_lyrics = new_lyrics.loc[new_lyrics['language'] == 'en']
filtered_lyrics = filtered_lyrics.loc[(new_lyrics['length'] >  8 )]
filtered_lyrics.head()

Unnamed: 0,id,artist,first_artist,song,weeks_on_chart,peak_rank,lyrics,length,word_count,language,language_score
0,1093,2 Chainz Featuring Ariana Grande,2 Chainz,Rule The World,2.0,94.0,Yeah Uh-huh (Hitmaka) 2 Chainz Top down on th...,2905.0,604.0,en,0.999994
1,1099,2 Chainz Featuring Kendrick Lamar,2 Chainz,Momma I Hit A Lick,1.0,100.0,"I want it, I want it, I want it-it-it-it Chil...",2703.0,530.0,en,0.999997
2,1074,2 Chainz Featuring Travis Scott,2 Chainz,Whip,1.0,75.0,"Yeah Do it no hands, yeah, do it, no handstand...",3192.0,629.0,en,0.999994
3,85,21 Savage,21 Savage,1.5,1.0,86.0,"My earrings cost a half a ticket, I don't hear...",2482.0,471.0,en,0.999995
4,36,21 Savage,21 Savage,A Lot,23.0,12.0,I love you Turn my headphone down a little bit...,5037.0,1056.0,en,0.999997


In [33]:
filtered_lyrics.to_csv('data/lyrics/filtered_lyrics.csv')
filtered_lyrics.to_excel('data/lyrics/filtered_lyrics.xlsx', encoding='utf-8')