In [1]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Print the path of every file below the Billboard data folder, with
# backslashes replaced by forward slashes.
for folder, _subdirs, file_names in os.walk('../../data/billboard'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path.replace("\\", "/"))

../../data/billboard/BB-T100.csv
../../data/billboard/bb_t100_en.csv
../../data/billboard/bb_t100_en_new.csv
../../data/billboard/billboard_top_100_2020.csv
../../data/billboard/backups/bb-t100_backup.csv


#### Grouping song and artist

To avoid redundant requests for lyrics I might already have, I grouped the chart entries by artist and song.

In [2]:
# Load the Billboard Top 100 chart data; the first CSV column becomes the row index.
bb_t100 = pd.read_csv(
    '../../data/billboard/BB-T100.csv',
    index_col=0,
)

In [3]:
# Collapse the chart rows to one row per (artist, song) pair, keeping the
# minimum peak_rank, id and date and the maximum weeks_on_chart of each pair.
bb_as_pt = bb_t100.pivot_table(
    index=['artist', 'song'],
    values=['peak_rank', 'weeks_on_chart', 'id', 'date'],
    aggfunc={
        'peak_rank': 'min',
        'weeks_on_chart': 'max',
        'id': 'min',
        'date': 'min',
    },
)
bb_as_pt

Unnamed: 0_level_0,Unnamed: 1_level_0,date,id,peak_rank,weeks_on_chart
artist,song,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2 Chainz Featuring Ariana Grande,Rule The World,2019-03-16,1093,94,2
2 Chainz Featuring Kendrick Lamar,Momma I Hit A Lick,2019-03-16,1099,100,1
2 Chainz Featuring Travis Scott,Whip,2019-03-16,1074,75,1
21 Savage,1.5,2019-01-05,85,86,1
21 Savage,A Lot,2019-01-05,36,12,23
...,...,...,...,...,...
"benny blanco, Tainy, Selena Gomez & J Balvin",I Can't Get Enough,2019-03-16,1092,66,5
blackbear,Hot Girl Bummer,2019-09-28,3890,11,42
for KING & COUNTRY,God Only Knows,2019-09-14,3693,94,1
j-hope Featuring Becky G.,Chicken Noodle Soup,2019-10-12,4080,81,1


In [4]:
# Spot check: keep only the rows whose 'song' index level equals this title.
bb_as_pt.loc[bb_as_pt.index.get_level_values('song') == 'Thank U, Next']

Unnamed: 0_level_0,Unnamed: 1_level_0,date,id,peak_rank,weeks_on_chart
artist,song,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Ariana Grande,"Thank U, Next",2019-01-05,0,1,28


In [5]:
# dropna=False counts a missing value as its own entry, exactly like
# len(Series.unique()) does.
print(f"Unique artist names: {bb_t100['artist'].nunique(dropna=False)}")
print(f"Unique song names: {bb_t100['song'].nunique(dropna=False)}")

Unique artist names: 733
Unique song names: 1291


Turning the pivot table into a flat DataFrame:

In [6]:
# Flatten the pivot table: reset_index() moves the (artist, song) index levels
# into ordinary columns, which replaces the manual itertuples/list/DataFrame
# round trip. The rename is chained (no inplace=True) so that no column-sliced
# frame is mutated in place, and the final selection fixes the column order.
bb_as = (
    bb_as_pt
    .reset_index()
    .rename(columns={'date': 'first_appearance'})
    .loc[:, ['id', 'artist', 'song', 'weeks_on_chart', 'peak_rank', 'first_appearance']]
)
bb_as.head()

# Spot check: show the rows of the flat frame whose song title matches exactly.
bb_as[bb_as['song'].eq('Thank U, Next')]

Unnamed: 0,id,artist,song,weeks_on_chart,peak_rank,first_appearance
91,0,Ariana Grande,"Thank U, Next",28,1,2019-01-05


Rough overview:

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
bb_as.describe()

Unnamed: 0,id,weeks_on_chart,peak_rank
count,1322.0,1322.0,1322.0
mean,5177.734493,8.881997,51.16112
std,3242.740525,11.146522,28.710498
min,0.0,1.0,1.0
25%,2366.75,1.0,28.0
50%,5586.5,3.0,53.0
75%,7851.0,15.0,75.0
max,10497.0,61.0,100.0


#### Working with censored songs

Explicit song names are censored on Billboard but uncensored on Genius, which can cause problems when searching for the lyrics.

Identifying censored words.

In [9]:
# Censored titles contain at least one '*'. regex=False matches the asterisk
# literally, so no escaping is needed (the pattern '\*' in a normal string is
# an invalid escape sequence), and na=False treats a missing title as
# "not censored" so the boolean mask never contains NaN.
is_censored = bb_as['song'].str.contains('*', regex=False, na=False)
censored_songnames = bb_as.loc[is_censored, 'song'].tolist()

# Collect every space-separated token that contains the mask character.
censored_words = [
    word
    for songname in censored_songnames
    for word in songname.split(' ')
    if '*' in word
]

print(len(censored_words), 'censored words:', censored_words)

9 censored words: ['N*ggas', 'N*gga', 'Sh*t', 'SH*T', 'N***a', 'P*$$y', 'N**gas', 'B*tch', 'F*ck']


Putting every censored word in a dictionary makes it easy to replace them later with the `get_uncensored_string` function defined below.

In [10]:
# Hand-written lookup from censored word to uncensored word; maintained
# manually because the list is small, may change later.
censorship = {
    'N*ggas': 'Niggas',
    'N*gga': 'Nigga',
    'Sh*t': 'Shit',
    'SH*T': 'SHIT',
    'N***a': 'Nigga',
    'P*$$y': 'Pu$$y',
    'N**gas': 'Niggas',
    'B*tch': 'Bitch',
    'F*ck': 'Fuck',
}


def get_uncensored_string(censored_string):
    """Return ``censored_string`` with its censored words replaced.

    The string is split on single spaces. A token is replaced only when it is
    exactly a key of ``censorship``; every other token is kept unchanged. The
    tokens are then joined with single spaces again.
    """
    return ' '.join(
        censorship.get(token, token)
        for token in censored_string.split(' ')
    )

`get_uncensored_string` in action:

In [11]:
# Literal substring match: regex=False removes the need for the invalid "\*"
# escape, and na=False replaces the "== True" workaround for missing titles.
bb_as.loc[bb_as['song'].str.contains('SH*T', regex=False, na=False)]

Unnamed: 0,id,artist,song,weeks_on_chart,peak_rank,first_appearance
293,6952,DaBaby Featuring Future & jetsonmade,LIGHTSKIN SH*T,1,53,2020-05-02


In [12]:
# Replace the censored words in every song title, then display the row at
# position 293 to check the result.
bb_as['song'] = bb_as['song'].map(get_uncensored_string)
bb_as.iloc[293]


id                                                  6952
artist              DaBaby Featuring Future & jetsonmade
song                                      LIGHTSKIN SHIT
weeks_on_chart                                         1
peak_rank                                             53
first_appearance                              2020-05-02
Name: 293, dtype: object

In [13]:
def get_first_artist(artist):
    """Return the first credited artist of a combined artist string.

    The string is cut at the separator that occurs EARLIEST in the string,
    regardless of the separator's position in the list below. For example
    "benny blanco, Tainy, Selena Gomez & J Balvin" yields "benny blanco"
    because ", " appears before " & ".

    NOTE(review): a name that itself contains a separator (for example
    "for KING & COUNTRY") is still cut at that separator; handling such
    names would need a list of known artist names.

    Parameters
    ----------
    artist : str
        Artist credit, possibly naming several collaborators.

    Returns
    -------
    str
        The text before the earliest separator, or ``artist`` unchanged when
        it contains none of the separators.
    """
    separators = (' Featuring ', ' & ', ' x ', ' X ', ', ', ' featuring ', ' Ft. ', ' ft. ')
    # str.find returns -1 when the separator is absent; keep real hits only.
    hits = [artist.find(sep) for sep in separators]
    cut = min((pos for pos in hits if pos >= 0), default=None)
    if cut is None:
        return artist
    return artist[:cut]

In [14]:
# Derive the first credited artist for every row.
bb_as['first_artist'] = bb_as['artist'].map(get_first_artist)



In [15]:
# Reorder the columns so that first_artist sits directly after artist.
bb_as = bb_as.loc[
    :,
    [
        'id',
        'artist',
        'first_artist',
        'song',
        'weeks_on_chart',
        'peak_rank',
        'first_appearance',
    ],
]

bb_as.head()

Unnamed: 0,id,artist,first_artist,song,weeks_on_chart,peak_rank,first_appearance
0,1093,2 Chainz Featuring Ariana Grande,2 Chainz,Rule The World,2,94,2019-03-16
1,1099,2 Chainz Featuring Kendrick Lamar,2 Chainz,Momma I Hit A Lick,1,100,2019-03-16
2,1074,2 Chainz Featuring Travis Scott,2 Chainz,Whip,1,75,2019-03-16
3,85,21 Savage,21 Savage,1.5,1,86,2019-01-05
4,36,21 Savage,21 Savage,A Lot,23,12,2019-01-05


Exporting the DataFrame as JSON so the lyrics can be fetched with the Genius framework.

In [16]:
# Create the target folder first so the export also works when it does not
# exist yet; DataFrame.to_json does not create missing directories.
os.makedirs('../../data/artist_song', exist_ok=True)
bb_as.to_json('../../data/artist_song/BB-AS.json')