In [42]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
import os
# List every file below the Billboard data directory. Backslashes are replaced
# so that Windows-style paths are printed with forward slashes.
for root, _subdirs, files in os.walk('../../data/billboard'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path.replace("\\", "/"))

../../data/billboard/bb_t100_en.csv
../../data/billboard/billboard.csv
../../data/billboard/billboard_top_100_2020.csv
../../data/billboard/backups/bb-t100_backup.csv


#### Grouping song and artist

To avoid redundant requests for lyrics I might already have, I grouped the chart entries by artist and song.

In [79]:
# Load the Billboard chart data; the first CSV column becomes the row index.
bb_t100 = pd.read_csv('../../data/billboard/billboard.csv', index_col=0)

'2019-01-05'

In [83]:
# Collapse the chart rows into one row per (artist, song) pair, keeping the
# minimum date, the minimum id, the minimum (i.e. best) peak rank and the
# maximum number of weeks on the chart.
artist_song_pt = bb_t100.pivot_table(
    index=['artist', 'song'],
    values=['peak_rank', 'weeks_on_chart', 'id', 'date'],
    aggfunc={
        'peak_rank': 'min',
        'weeks_on_chart': 'max',
        'id': 'min',
        'date': 'min',
    },
)
artist_song_pt

Unnamed: 0_level_0,Unnamed: 1_level_0,date,id,peak_rank,weeks_on_chart
artist,song,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2 Chainz Featuring Ariana Grande,Rule The World,2019-03-16,1093,94,2
2 Chainz Featuring Kendrick Lamar,Momma I Hit A Lick,2019-03-16,1099,100,1
2 Chainz Featuring Travis Scott,Whip,2019-03-16,1074,75,1
21 Savage,1.5,2019-01-05,85,86,1
21 Savage,A Lot,2019-01-05,36,12,23
...,...,...,...,...,...
"benny blanco, Tainy, Selena Gomez & J Balvin",I Can't Get Enough,2019-03-16,1092,66,5
blackbear,Hot Girl Bummer,2019-09-28,3890,11,42
for KING & COUNTRY,God Only Knows,2019-09-14,3693,94,1
j-hope Featuring Becky G.,Chicken Noodle Soup,2019-10-12,4080,81,1


In [90]:
# Spot check: select the rows whose 'song' index level equals a known title.
artist_song_pt.loc[artist_song_pt.index.get_level_values('song') == 'Thank U, Next']

Unnamed: 0_level_0,Unnamed: 1_level_0,date,id,peak_rank,weeks_on_chart
artist,song,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Ariana Grande,"Thank U, Next",2019-01-05,0,1,28


In [45]:
# Count the distinct artist and song names. dropna=False means a missing value,
# if present, is counted as its own entry.
print('Unique artist names:', bb_t100['artist'].nunique(dropna=False))
print('Unique song names:', bb_t100['song'].nunique(dropna=False))

Unique artist names: 733
Unique song names: 1291


Turning the pivot table into a DataFrame.

In [98]:
# Flatten the (artist, song) MultiIndex of the pivot table into ordinary
# columns. reset_index() also gives the frame a fresh 0..n-1 row index.
# The column list fixes the column order of the resulting frame.
artist_song = artist_song_pt.reset_index()[
    ['id', 'artist', 'song', 'weeks_on_chart', 'peak_rank', 'date']
]

# Spot check on a known song; as the last expression this is the cell output.
artist_song.loc[artist_song['song'] == 'Thank U, Next']

Unnamed: 0,id,artist,song,weeks_on_chart,peak_rank,date
91,0,Ariana Grande,"Thank U, Next",28,1,2019-01-05


In [47]:
# Render the first five songs, without the id column, as a LaTeX table.
artist_song_lat = artist_song.head().loc[
    :, ['artist', 'song', 'weeks_on_chart', 'peak_rank', 'date']
]
print(artist_song_lat.to_latex())

\begin{tabular}{lllrrl}
\toprule
{} &                             artist &                song &  weeks\_on\_chart &  peak\_rank &        date \\
\midrule
0 &   2 Chainz Featuring Ariana Grande &      Rule The World &               2 &         94 &  2019-03-16 \\
1 &  2 Chainz Featuring Kendrick Lamar &  Momma I Hit A Lick &               1 &        100 &  2019-03-16 \\
2 &    2 Chainz Featuring Travis Scott &                Whip &               1 &         75 &  2019-03-16 \\
3 &                          21 Savage &                 1.5 &               1 &         86 &  2019-01-05 \\
4 &                          21 Savage &               A Lot &              23 &         12 &  2019-01-05 \\
\bottomrule
\end{tabular}



rough overview

In [48]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
artist_song.describe()

Unnamed: 0,id,weeks_on_chart,peak_rank
count,1322.0,1322.0,1322.0
mean,5177.734493,8.881997,51.16112
std,3242.740525,11.146522,28.710498
min,0.0,1.0,1.0
25%,2366.75,1.0,28.0
50%,5586.5,3.0,53.0
75%,7851.0,15.0,75.0
max,10497.0,61.0,100.0


#### Working with censored songs

Explicit song names are censored on Billboard but uncensored on Genius, which can cause problems when searching for the lyrics.

Identifying censored words.

In [49]:
# Song titles containing an asterisk, i.e. titles with at least one censored
# word. regex=False treats '*' as a literal character instead of a regex
# quantifier, so no escaping is needed.
censored_songnames = artist_song.loc[
    artist_song['song'].str.contains('*', regex=False), 'song'
].tolist()

# Every space-separated token of those titles that contains an asterisk.
censored_words = [
    word
    for songname in censored_songnames
    for word in songname.split(' ')
    if '*' in word
]

print(len(censored_words), 'censored words:', censored_words)

9 censored words: ['N*ggas', 'N*gga', 'Sh*t', 'SH*T', 'N***a', 'P*$$y', 'N**gas', 'B*tch', 'F*ck']


Putting every word in a dictionary makes it easy to replace them later with the created `get_uncensored_string` function

In [50]:
# Hand-maintained lookup from censored spelling to uncensored spelling; the
# list is small enough to write out manually (may change later).
censorship = {
    'N*ggas': 'Niggas',
    'N*gga': 'Nigga',
    'Sh*t': 'Shit',
    'SH*T': 'SHIT',
    'N***a': 'Nigga',
    'P*$$y': 'Pu$$y',
    'N**gas': 'Niggas',
    'B*tch': 'Bitch',
    'F*ck': 'Fuck',
}


def get_uncensored_string(censored_string):
    """Return ``censored_string`` with every known censored word spelled out.

    The string is split on single spaces. Each token that is a key of the
    module-level ``censorship`` dictionary is replaced by the mapped value;
    every other token is kept as is. The tokens are then re-joined with
    single spaces, so the original spacing is preserved.
    """
    return ' '.join(
        censorship.get(token, token) for token in censored_string.split(' ')
    )

`get_uncensored_string` in action:

In [51]:
# Rows whose title contains the literal text 'SH*T'. regex=False treats '*' as
# a plain character, and na=False makes missing titles count as non-matches.
artist_song.loc[artist_song['song'].str.contains('SH*T', regex=False, na=False)]

Unnamed: 0,id,artist,song,weeks_on_chart,peak_rank,date
293,6952,DaBaby Featuring Future & jetsonmade,LIGHTSKIN SH*T,1,53,2020-05-02


In [52]:
# Spell out the censored words in every song title, then show the row at
# position 293 as a spot check.
artist_song['song'] = artist_song['song'].apply(get_uncensored_string)
artist_song.iloc[293]


id                                                6952
artist            DaBaby Featuring Future & jetsonmade
song                                    LIGHTSKIN SHIT
weeks_on_chart                                       1
peak_rank                                           53
date                                        2020-05-02
Name: 293, dtype: object

In [53]:
def get_first_artist(artist):
    """Return the part of ``artist`` that precedes the first matching separator.

    The separators are tried in the fixed order listed below. The first one
    that occurs anywhere in ``artist`` decides the split, regardless of where
    in the string it occurs, and the text before its first occurrence is
    returned. If no separator occurs, ``artist`` is returned unchanged.
    """
    separators = (' Featuring ', ' & ', ' x ', ' X ', ', ', ' featuring ', ' Ft. ', ' ft. ')
    matching = next((sep for sep in separators if sep in artist), None)
    if matching is None:
        return artist
    head, _, _ = artist.partition(matching)
    return head

In [100]:
# Derive the lead artist of every entry from the full artist credit.
artist_song['first_artist'] = artist_song['artist'].apply(get_first_artist)



In [101]:
print(artist_song.columns)

# Move the 'first_artist' column next to 'artist'.
artist_song = artist_song[['id', 'artist', 'first_artist', 'song', 'weeks_on_chart', 'peak_rank', 'date']]

# Spot check on a known song; as the last expression this is the cell output.
artist_song.loc[artist_song['song'] == 'Thank U, Next']

Index(['id', 'artist', 'song', 'weeks_on_chart', 'peak_rank', 'date',
       'first_artist'],
      dtype='object')


Unnamed: 0,id,artist,first_artist,song,weeks_on_chart,peak_rank,date
91,0,Ariana Grande,Ariana Grande,"Thank U, Next",28,1,2019-01-05


Exporting the DataFrame as JSON to fetch the lyrics with the Genius framework.

In [102]:
# Export the song table as JSON (input for the lyrics lookup), then print the
# date column.
artist_song.to_json('../../data/artist_song/artist_song.json')
print(artist_song['date'])

# Spot check on a known song.
artist_song.loc[artist_song['song'] == 'Thank U, Next']

0       2019-03-16
1       2019-03-16
2       2019-03-16
3       2019-01-05
4       2019-01-05
           ...    
1317    2019-03-16
1318    2019-09-28
1319    2019-09-14
1320    2019-10-12
1321    2020-04-25
Name: date, Length: 1322, dtype: object


Unnamed: 0,id,artist,first_artist,song,weeks_on_chart,peak_rank,date
91,0,Ariana Grande,Ariana Grande,"Thank U, Next",28,1,2019-01-05


In [116]:
# Load the two lyrics CSV files; the first column of each becomes the row index.
lyrics_updated = pd.read_csv('../../data/lyrics/lyrics_invalid_updated.csv', encoding='utf-8', index_col=0)
lyrics = pd.read_csv('../../data/lyrics/bb-t100-lyrics.csv', encoding='utf-8', index_col=0)

# Order the songs by id and renumber the rows 0..n-1 in that order.
artist_song = artist_song.sort_values(by='id', ascending=True, ignore_index=True)

# Spot check on a known song; as the last expression this is the cell output.
artist_song.loc[artist_song['song'] == 'Thank U, Next']

Unnamed: 0,id,artist,first_artist,song,weeks_on_chart,peak_rank,date
0,0,Ariana Grande,Ariana Grande,"Thank U, Next",28,1,2019-01-05


In [117]:
# Copy each song's date into the lyrics table.
# NOTE(review): this assignment aligns on index labels, not on a key column, so
# it is only correct if the index of `lyrics` matches the id-sorted, renumbered
# index of `artist_song` -- TODO confirm, or join on the song id instead.
lyrics['first_appearance'] = artist_song['date']

In [118]:
# Spot check: show the lyrics row of a known song, including its first_appearance value.
lyrics.loc[lyrics['song'] == 'Thank U, Next']

Unnamed: 0,billboard_id,lyrics_id,artist,first_artist,song,weeks_on_chart,peak_rank,lyrics,url,length,word_count,language,language_score,first_appearance
0,0,0,Ariana Grande,Ariana Grande,"Thank U, Next",28.0,1.0,Thought I'd end up with Sean\nBut he wasn't a ...,https://genius.com/Ariana-grande-thank-u-next-...,2409,460,en,0.999997,2019-01-05


In [119]:
# The same song in artist_song, for comparison with the lyrics row.
artist_song.loc[artist_song['song'] == 'Thank U, Next']

Unnamed: 0,id,artist,first_artist,song,weeks_on_chart,peak_rank,date
0,0,Ariana Grande,Ariana Grande,"Thank U, Next",28,1,2019-01-05


In [120]:
# Persist the lyrics table; this overwrites the CSV file it was loaded from.
lyrics.to_csv('../../data/lyrics/bb-t100-lyrics.csv')
