## Web Scraping the Billboard Top 100 Songs chart 
## With BeautifulSoup and Requests

In [16]:
import requests
from bs4 import BeautifulSoup
import re
import pickle
import numpy as np
import pandas as pd
import datetime

In [17]:
# takes you to the most recent week
base_url = 'https://www.billboard.com/charts/hot-100/'

In [18]:
# get dates for the past 12 weeks
today = datetime.datetime.today()
numweeks = 12
date_list = [today - datetime.timedelta(weeks=x) for x in range(numweeks)]
dates = [d.strftime('%Y-%m-%d') for d in date_list]
print(dates)


['2020-04-21', '2020-04-14', '2020-04-07', '2020-03-31', '2020-03-24', '2020-03-17', '2020-03-10', '2020-03-03', '2020-02-25', '2020-02-18', '2020-02-11', '2020-02-04']


In [19]:
page = requests.get(base_url)
soup = BeautifulSoup(page.content, 'html.parser')

In [20]:
songs = soup.find_all('span', attrs={'class':'chart-element__information__song'})
artists = soup.find_all('span', attrs={'class':'chart-element__information__artist'})

In [21]:
# Default ranking
delta_default = soup.find_all('span', attrs={'class':'chart-element__information__delta__text text--default'})
# Last week ranking
delta_last = soup.find_all('span', attrs={'class':'chart-element__information__delta__text text--last'})
# Peak ranking
delta_peak = soup.find_all('span', attrs={'class':'chart-element__information__delta__text text--peak'})
# Num weeks on chart
delta_week = soup.find_all('span', attrs={'class':'chart-element__information__delta__text text--week'})

In [22]:
# make sure all same length
vars = [songs, artists, delta_default, delta_last, delta_peak, delta_week]
for v in vars:
    print(len(v))

100
100
100
100
100
100


In [23]:
songs_list = [s.text for s in songs]
artists_list = [a.text for a in artists]
delta_default_list = [d.text for d in delta_default]
delta_last_list = [d.text for d in delta_last]
delta_peak_list = [d.text for d in delta_peak]
delta_week_list = [d.text for d in delta_week]


In [24]:
song_tuples = list(zip(songs_list, artists_list, delta_default_list, delta_last_list, delta_peak_list, delta_week_list))

In [25]:
columns = ["Song", "Artist", "Rank_Default", "Rank_LastWeek", "Rank_Peak", "Num_Weeks_on_Chart"]

In [85]:
songs_df = pd.DataFrame(song_tuples, columns=columns)
print(songs_df.shape)
print(songs_df.head())

(100, 6)
              Song       Artist Rank_Default Rank_LastWeek    Rank_Peak  \
0  Blinding Lights   The Weeknd           +1   2 Last Week  1 Peak Rank   
1     Toosie Slide        Drake           -1   1 Last Week  1 Peak Rank   
2          The Box  Roddy Ricch            -   3 Last Week  1 Peak Rank   
3  Don't Start Now     Dua Lipa            -   4 Last Week  2 Peak Rank   
4           Say So     Doja Cat           +3   8 Last Week  5 Peak Rank   

  Num_Weeks_on_Chart  
0  20 Weeks on Chart  
1   2 Weeks on Chart  
2  19 Weeks on Chart  
3  24 Weeks on Chart  
4  15 Weeks on Chart  


In [86]:
# extract only numbers from all 'rankings' columns - convert all to integers
songs_df['Rank_LastWeek'] = songs_df['Rank_LastWeek'].str.extract('(\d+)')
songs_df['Rank_LastWeek'] = pd.to_numeric(songs_df['Rank_LastWeek'])

songs_df['Rank_Peak'] = songs_df['Rank_Peak'].str.extract('(\d+)')
songs_df['Rank_Peak'] = pd.to_numeric(songs_df['Rank_Peak'])

songs_df['Num_Weeks_on_Chart'] = songs_df['Num_Weeks_on_Chart'].str.extract('(\d+)')
songs_df['Num_Weeks_on_Chart'] = pd.to_numeric(songs_df['Num_Weeks_on_Chart'])


In [87]:
songs_df.head()

Unnamed: 0,Song,Artist,Rank_Default,Rank_LastWeek,Rank_Peak,Num_Weeks_on_Chart
0,Blinding Lights,The Weeknd,+1,2.0,1,20
1,Toosie Slide,Drake,-1,1.0,1,2
2,The Box,Roddy Ricch,-,3.0,1,19
3,Don't Start Now,Dua Lipa,-,4.0,2,24
4,Say So,Doja Cat,+3,8.0,5,15


In [88]:
# Rank_Default: convert to integers convert +n to just n and -n to neg numebrs
songs_df['Rank_Default'] = songs_df['Rank_Default'].map(lambda x: x.lstrip('+'))
songs_df['Rank_Default'] = songs_df['Rank_Default'].str.replace('^-$', '', regex=True)
songs_df['Rank_Default'] = songs_df['Rank_Default'].str.strip()
songs_df['Rank_Default'] = pd.to_numeric(songs_df['Rank_Default'])

In [89]:
songs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Song                100 non-null    object 
 1   Artist              100 non-null    object 
 2   Rank_Default        70 non-null     float64
 3   Rank_LastWeek       86 non-null     float64
 4   Rank_Peak           100 non-null    int64  
 5   Num_Weeks_on_Chart  100 non-null    int64  
dtypes: float64(2), int64(2), object(2)
memory usage: 4.8+ KB


In [90]:
songs_df.head()

Unnamed: 0,Song,Artist,Rank_Default,Rank_LastWeek,Rank_Peak,Num_Weeks_on_Chart
0,Blinding Lights,The Weeknd,1.0,2.0,1,20
1,Toosie Slide,Drake,-1.0,1.0,1,2
2,The Box,Roddy Ricch,,3.0,1,19
3,Don't Start Now,Dua Lipa,,4.0,2,24
4,Say So,Doja Cat,3.0,8.0,5,15


In [93]:
# count how many missing values in each col
songs_df.isnull().sum()

Song                   0
Artist                 0
Rank_Default          30
Rank_LastWeek         14
Rank_Peak              0
Num_Weeks_on_Chart     0
dtype: int64

In [94]:
# replace missing values with zeroes
songs_df.fillna(0, inplace=True)

In [98]:
# final df
songs_df.info()
songs_df.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Song                100 non-null    object 
 1   Artist              100 non-null    object 
 2   Rank_Default        100 non-null    float64
 3   Rank_LastWeek       100 non-null    float64
 4   Rank_Peak           100 non-null    int64  
 5   Num_Weeks_on_Chart  100 non-null    int64  
dtypes: float64(2), int64(2), object(2)
memory usage: 4.8+ KB


Unnamed: 0,Rank_Default,Rank_LastWeek,Rank_Peak,Num_Weeks_on_Chart
count,100.0,100.0,100.0,100.0
mean,-0.22,39.15,34.36,11.98
std,6.029523,30.184743,26.836023,10.236575
min,-29.0,0.0,1.0,1.0
25%,-2.0,11.75,12.0,4.0
50%,0.0,36.5,27.5,10.0
75%,2.0,62.25,53.25,17.5
max,14.0,100.0,100.0,49.0


In [99]:
songs_df.to_csv('billboard_top100.csv', index=None)