### Helpers

In [10]:
import requests
from IPython.display import HTML, display
from bs4 import BeautifulSoup
import pickle
import numpy as np
import pandas as pd
import sys
import re

In [11]:
# From "49 Years of Music - A Data Driven Study of Lyrics" by Carl Sharpe
def _fetch_soup(url):
    '''Download ``url`` and return its HTML parsed with BeautifulSoup.

    Raises requests.HTTPError on a non-2xx response so that an error page is
    never parsed as if it were chart data.
    '''
    # A timeout keeps one stalled request from hanging the whole collection run.
    page = requests.get(url, timeout=30)
    page.raise_for_status()
    return BeautifulSoup(page.content, "html.parser")


def _scrape_bobborst_year(year):
    '''Return a list of song records (dicts) for ``year`` from bobborst.com.'''
    url = "http://www.bobborst.com/popculture/top-100-songs-of-the-year/?year=" + str(year)
    soup = _fetch_soup(url)
    table = soup.find('table', {'class': 'sortable alternate songtable'})
    if table is None:
        raise ValueError("No song table found for " + str(year) + " at " + url)
    records = []
    # Rows 0 and 1 are skipped (presumably header rows - verify against the page).
    # Slicing instead of indexing tolerates a table with fewer than 100 song rows.
    for table_row in table.find_all('tr')[2:102]:
        cells = table_row.find_all('td')
        records.append({
            "Rank": cells[0].get_text(strip=True),
            "Artist": cells[1].get_text(strip=True),
            "Song Title": cells[2].get_text(strip=True),
            "Year": year
        })
    return records


def _scrape_billboard_year(year):
    '''Return a list of song records (dicts) for ``year`` from billboard.com.'''
    url = "https://www.billboard.com/charts/year-end/" + str(year) + "/hot-100-songs"
    soup = _fetch_soup(url)
    all_ranks = soup.find_all("div", class_="ye-chart-item__rank")
    all_titles = soup.find_all('div', class_="ye-chart-item__title")
    all_artists = soup.find_all("div", class_="ye-chart-item__artist")
    # The three lists are matched by position, so they must line up exactly.
    if not (len(all_ranks) == len(all_titles) == len(all_artists)):
        raise ValueError("Mismatched rank/title/artist counts for " + str(year) + " at " + url)
    return [
        {
            "Rank": rank.get_text(strip=True),
            "Song Title": title.get_text(strip=True),
            "Artist": artist.get_text(strip=True),
            "Year": year
        }
        for rank, title, artist in zip(all_ranks, all_titles, all_artists)
    ]


def collect_songs_from_billboard(start_year,end_year):
    '''Collect year-end top-100 song data for every year from start_year to end_year.

    Years up to and including 2016 are scraped from bobborst.com; later years
    are scraped from billboard.com. A progress message is written to stdout
    for each year.

    Parameters:

    start_year (int): the first year to collect (inclusive).
    end_year (int): the last year to collect (inclusive).

    Returns:

    dataframe with the columns "Artist", "Rank", "Song Title" (text scraped
    from the page) and "Year" (int). It is empty, but still has these
    columns, when start_year > end_year.

    Raises:

    requests.HTTPError: if a page responds with an error status.
    ValueError: if the expected chart markup is not found on a page.
    '''
    columns = ["Artist", "Rank", "Song Title", "Year"]
    ### Billboard doesn't have its own complete results from 1970 to 2016,
    ### so we'll use bobborst.com as our primary and collect from billboard as needed
    last_alternate_site_year = 2016

    # Gather plain dicts and build the frame once at the end: growing a
    # DataFrame row by row is quadratic, and DataFrame.append no longer
    # exists in pandas 2.x.
    records = []
    for year in range(int(start_year), int(end_year) + 1):
        if year <= last_alternate_site_year:
            sys.stdout.write("\r" + "Collecting Songs from " + str(year) + " via http://www.bobborst.com")
            sys.stdout.flush()
            records.extend(_scrape_bobborst_year(year))
        else:
            sys.stdout.write("\r" + "Collecting Songs from " + str(year) + " via https://www.billboard.com")
            sys.stdout.flush()
            records.extend(_scrape_billboard_year(year))

    # Passing the columns explicitly fixes their order and keeps the 'Year'
    # column present even when no records were collected.
    dataset = pd.DataFrame(records, columns=columns)
    dataset['Year'] = dataset['Year'].astype(int)
    return dataset

In [12]:
# Scrape the year-end charts for 1970 through 2018 inclusive (one HTTP request per year).
all_songs = collect_songs_from_billboard(start_year=1970, end_year=2018)

Collecting Songs from 2018 via https://www.billboard.com

In [13]:
# Preview the first five rows, the last five rows, and the (rows, columns) shape.
display(all_songs.head(5), all_songs.tail(5), all_songs.shape)

Unnamed: 0,Artist,Rank,Song Title,Year
0,Simon and Garfunkel,1,Bridge Over Troubled Water,1970
1,Carpenters,2,(They Long To Be) Close To You,1970
2,Guess Who,3,American Woman / No Sugar Tonight,1970
3,B.J. Thomas,4,Raindrops Keep Fallin' On My Head,1970
4,Edwin Starr,5,War,1970


Unnamed: 0,Artist,Rank,Song Title,Year
4895,Luke Combs,96,One Number Away,2018
4896,Rae Sremmurd & Juicy J,97,Powerglide,2018
4897,Dua Lipa,98,IDGAF,2018
4898,J Balvin & Willy William Featuring Beyonce,99,Mi Gente,2018
4899,Imagine Dragons,100,Believer,2018


(4900, 4)