In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
import os
for dirname, _, filenames in os.walk('data'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

data\artist_song\artist_song.csv
data\artist_song\artist_song.json
data\billboard\billboard_top_100.csv
data\input\songs-2.csv
data\input\top100.json
data\lyrics\artist_song_lyrics _bak.csv
data\lyrics\artist_song_lyrics.csv
data\lyrics\artist_song_lyrics_bak_old.csv
data\lyrics\short_lyrics.csv


#### Grouping song and artist

To avoid redundent lyrics requests for lyrics I might already I have, I grouped artists and songs.

In [2]:
billboard_top_100 = pd.read_csv('data/billboard/billboard_top_100.csv')

In [3]:
artist_song_pt = pd.pivot_table(billboard_top_100, index=['artist', 'song'], values=['peak_rank', 'weeks_on_chart'], aggfunc={'peak_rank' : 'min', 'weeks_on_chart' : 'max'})
artist_song_pt

Unnamed: 0_level_0,Unnamed: 1_level_0,peak_rank,weeks_on_chart
artist,song,Unnamed: 2_level_1,Unnamed: 3_level_1
2 Chainz Featuring Ariana Grande,Rule The World,94,2
2 Chainz Featuring Kendrick Lamar,Momma I Hit A Lick,100,1
2 Chainz Featuring Travis Scott,Whip,75,1
21 Savage,1.5,86,1
21 Savage,A Lot,12,23
...,...,...,...
"benny blanco, Tainy, Selena Gomez & J Balvin",I Can't Get Enough,66,5
blackbear,Hot Girl Bummer,11,42
for KING & COUNTRY,God Only Knows,94,1
j-hope Featuring Becky G.,Chicken Noodle Soup,81,1


In [4]:
print('Unique song names:', len(billboard_top_100['artist'].unique()))
print('Unique artist names:',len(billboard_top_100['song'].unique()))

Unique song names: 722
Unique artist names: 1279


turning the pivot table into a data frame

In [5]:
artist_song_list = []

for row in artist_song_pt.itertuples():
    artist_song_list.append(row)

artist_song = pd.DataFrame(artist_song_list)

artist_song[['artist', 'song']] = pd.DataFrame(artist_song['Index'].tolist())

artist_song = artist_song[['artist', 'song', 'weeks_on_chart', 'peak_rank']]
artist_song.head()

Unnamed: 0,artist,song,weeks_on_chart,peak_rank
0,2 Chainz Featuring Ariana Grande,Rule The World,2,94
1,2 Chainz Featuring Kendrick Lamar,Momma I Hit A Lick,1,100
2,2 Chainz Featuring Travis Scott,Whip,1,75
3,21 Savage,1.5,1,86
4,21 Savage,A Lot,23,12


rough overview

In [6]:
artist_song.describe()

Unnamed: 0,weeks_on_chart,peak_rank
count,1309.0,1309.0
mean,8.887701,51.262796
std,11.162152,28.697031
min,1.0,1.0
25%,1.0,28.0
50%,3.0,53.0
75%,15.0,75.0
max,61.0,100.0


#### Working with censored songs

Explicit songnames are censored on billboard but uncensored on genius, which can cause problems when searching for the lyrics.

Identifying censored words.

In [7]:
censored_songnames = artist_song.loc[artist_song['song'].str.contains('\*')]['song']
censored_songnames = censored_songnames.tolist()

censored_words = []
for songname in censored_songnames:
        words = songname.split(' ')
        for word in words:
            if '*' in word:
                censored_words.append(word)

print(len(censored_words), 'censored words:', censored_words)

9 censored words: ['N*ggas', 'N*gga', 'Sh*t', 'SH*T', 'N***a', 'P*$$y', 'N**gas', 'B*tch', 'F*ck']


Putting every word in a dictionary makes it easy to replace them later with the created `get_uncensored_string` function

In [8]:
#manual solution due to small size of list, may change later
censorship = {
                 'N*ggas' : 'Niggas',
                 'N*gga' : 'Nigga',
                 'Sh*t' : 'Shit',
                 'SH*T' : 'SHIT',
                'N***a' : 'Nigga',
                 'P*$$y' : 'Pu$$y',
                 'N**gas' : 'Niggas',
                 'B*tch' : 'Bitch',
                 'F*ck' : 'Fuck'
}


def get_uncensored_string(censored_string):
    uncensored_words = []
    words = censored_string.split(' ')
    for word in words:
        if word in censorship:
            word = censorship[word]
        uncensored_words.append(word)
    return ' '.join(uncensored_words)

`get_uncesored_string` in action:

In [9]:
artist_song.loc[artist_song['song'].str.contains("F\*ck") == True]

Unnamed: 0,artist,song,weeks_on_chart,peak_rank
1165,The Kid LAROI Featuring Machine Gun Kelly,"F*ck You, Goodbye",1,99


In [10]:
artist_song['song'] = artist_song['song'].apply(lambda song: get_uncensored_string(song))
artist_song.iloc[1165]

artist            The Kid LAROI Featuring Machine Gun Kelly
song                                      Fuck You, Goodbye
weeks_on_chart                                            1
peak_rank                                                99
Name: 1165, dtype: object

In [11]:
artist_song.head()

Unnamed: 0,artist,song,weeks_on_chart,peak_rank
0,2 Chainz Featuring Ariana Grande,Rule The World,2,94
1,2 Chainz Featuring Kendrick Lamar,Momma I Hit A Lick,1,100
2,2 Chainz Featuring Travis Scott,Whip,1,75
3,21 Savage,1.5,1,86
4,21 Savage,A Lot,23,12


Exporting DataFrame as a json to get lyrics with the genius framework

In [12]:
artist_song.to_json('data/artist_song/artist_song.json')


