This notebook can be used to scrape data from the Billboard music chart rankings:
- A timeframe is defined, and the weekly dates within it are generated.
- Each weekly date is appended to the base Billboard URL inside a loop over the weeks. In each iteration, the HTML page for that week is downloaded and BeautifulSoup is used to scrape the chart data, which is later assembled into a DataFrame.
- A new variable flags songs designated as NEW or RE-ENTRY.
- The DataFrame is exported as an Excel file.

In [1]:
import numpy as np
import requests
from bs4 import BeautifulSoup, SoupStrainer
import pandas as pd
from os.path import join
from os import makedirs
from datetime import datetime, timedelta
import datetime
import math

In [2]:
# Generate dates for previous weeks
#
# Builds `week_date`: one 'YYYY-MM-DD' string per week, starting one week
# after `start_date` and stepping 7 days at a time for `weeks_since` weeks.

start_date = datetime.datetime(2020,1,4)
end_date = datetime.datetime(2022,4,30)

# Number of whole weeks in the timeframe. timedelta.days is read directly
# instead of slicing str(timedelta), which is only correct when the day
# count happens to have 3 or 4 digits.
weeks_since = (end_date - start_date).days // 7

week_date = []
date = start_date
for i in range(0,weeks_since):
    date += datetime.timedelta(days=7)
    # Four-digit year, zero-padded month and day: the format used in the chart URL
    week_date.append(date.strftime('%Y-%m-%d'))

In [3]:
# Define columns for dataframe
# One independent, initially empty accumulator list per output column;
# the scraping loop appends one value per chart row to each of them.

(this_week, artist, tracks, last_week,
 peak_pos, wks_chart, week, new_re) = ([] for _ in range(8))

In [4]:
# looping through weeks to scrape data
#
# For every date in `week_date` the chart page is fetched (or read from the
# local "html" cache when a file for that week already exists), parsed with
# BeautifulSoup, and one value per chart row is appended to each column list.
#
# NOTE: the column lists are only appended to, never reset here. Re-running
# this cell without re-running the cell that defines the lists duplicates rows.

makedirs("html", exist_ok=True)  # cache directory for the downloaded pages

# A single session is shared by all requests; each page is downloaded at most once.
with requests.Session() as s:
    for chart_date in week_date:
        billboard_url = "https://www.billboard.com/charts/billboard-korea-100/"+chart_date+"/"
        new_name = chart_date+".html"
        path = join("html", new_name)

        try:
            # Reuse a previously downloaded page when one exists.
            cached_page = open(path,"r",encoding="utf-8")
        except FileNotFoundError:
            r = s.get(billboard_url, timeout=30)
            # Fail loudly on HTTP errors so an error page is never written to
            # the cache (a cached file is not fetched again).
            r.raise_for_status()
            with open(path,"w",encoding="utf-8") as file:
                file.write(r.text)
            cached_page = open(path,"r",encoding="utf-8")

        with cached_page as file:
            # Only the chart row containers are parsed; the rest of the page is skipped.
            soup = BeautifulSoup(markup=file, features="lxml",
                            parse_only=SoupStrainer(name="div",
                            attrs={"class":"o-chart-results-list-row-container"}))
        rows = soup("ul")

        # Every third <ul> is read as one chart entry.
        for i in range(0,len(rows),3):
            spans = rows[i].find_all('span')
            for text in spans[1].stripped_strings:
                if text=="NEW" or text=="RE-\nENTRY":
                    # The second span holds the NEW / RE-ENTRY label; the
                    # artist is then read from the fourth span instead.
                    new_re.append(text)
                    for artist_text in spans[3].stripped_strings:
                        artist.append(artist_text)
                else:
                    artist.append(text)
                    new_re.append("NULL")
            for text in rows[i].h3.stripped_strings:
                tracks.append(text)
            # NOTE(review): the indices below are the same for labelled and
            # unlabelled rows, so for NEW / RE-ENTRY rows `peak_pos` receives
            # the same span that supplied the artist. Presumably the later
            # cleaning step exists to mask these values -- confirm against
            # the page markup.
            for text in spans[2].stripped_strings:
                last_week.append(text)
            for text in spans[3].stripped_strings:
                peak_pos.append(text)
            for text in spans[4].stripped_strings:
                wks_chart.append(text)
            this_week.append(i // 3 + 1)  # 1-based position from the row index
            week.append(chart_date)

In [5]:
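# Alternative to the scraping cell above: if the HTML files are already downloaded
# into the "html" folder, uncomment this cell and run it instead to parse them from disk.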

# for j in range(0,weeks_since):
#     billboard_url = "https://www.billboard.com/charts/billboard-korea-100/"+week_date[j]+"/"
    
#     new_name = week_date[j]+".html"
#     path = join("html", new_name)
  
#     with open(path,"r",encoding="utf-8") as file:
#         soup = BeautifulSoup(markup=file, features="lxml",
#                         parse_only=SoupStrainer(name="div",
#                         attrs={"class":"o-chart-results-list-row-container"}))
#     rows = soup("ul")
        
#     for i in range(0,len(rows),3):
#         for text in rows[i].find_all('span')[1].stripped_strings:
#             if text=="NEW" or text=="RE-\nENTRY":
#                 new_re.append(text)
#                 for text in rows[i].find_all('span')[3].stripped_strings:
#                     artist.append(text)
#             else:
#                 artist.append(text)
#                 new_re.append("NULL")
#         for text in rows[i].h3.stripped_strings:
#             tracks.append(text)
#         for text in rows[i].find_all('span')[2].stripped_strings:
#                 last_week.append(text)
#         for text in rows[i].find_all('span')[3].stripped_strings:
#             peak_pos.append(text)
#         for text in rows[i].find_all('span')[4].stripped_strings:
#             wks_chart.append(text)
#         this_week.append(math.floor((i+3)/3))
#         week.append(week_date[j])

In [6]:
# Create dataframe

# (column label, accumulated values) pairs, listed in the column order of the frame
dic = dict([
    ("This Week", this_week),
    ("New/Re-Entry", new_re),
    ("Artist", artist),
    ("Track", tracks),
    ("Last Week", last_week),
    ("Peak Position", peak_pos),
    ("Weeks on Chart", wks_chart),
    ("Week Of", week),
])
df = pd.DataFrame(data=dic)
df.head()

Unnamed: 0,This Week,New/Re-Entry,Artist,Track,Last Week,Peak Position,Weeks on Chart,Week Of
0,1,,CHANGMO,Meteor,1,1,7,2020-01-11
1,2,,Red Velvet,Psycho,2,2,3,2020-01-11
2,3,,IU,Blueming,3,1,8,2020-01-11
3,4,,MAMAMOO,Hip,4,2,9,2020-01-11
4,5,,Noel,Late Night,6,3,10,2020-01-11


In [7]:
# Check for non-integers in variables 

def check_letters(string):
    """Return 'NULL' if `string` contains any alphabetic character, otherwise `string` unchanged."""
    for c in string:
        if c.isalpha():
            return 'NULL'
    return string
    
def check_letters2(string):
    """Return 'NULL' if `string` contains a '-' character, otherwise `string` unchanged."""
    return 'NULL' if "-" in string else string

In [8]:
# Column -> cleaning function applied element-wise to that column
column_cleaners = {
    'Last Week': check_letters,
    'Peak Position': check_letters,
    'Weeks on Chart': check_letters2,
}
for column_name, cleaner in column_cleaners.items():
    df[column_name] = df[column_name].apply(cleaner)
df.head()

Unnamed: 0,This Week,New/Re-Entry,Artist,Track,Last Week,Peak Position,Weeks on Chart,Week Of
0,1,,CHANGMO,Meteor,1,1,7,2020-01-11
1,2,,Red Velvet,Psycho,2,2,3,2020-01-11
2,3,,IU,Blueming,3,1,8,2020-01-11
3,4,,MAMAMOO,Hip,4,2,9,2020-01-11
4,5,,Noel,Late Night,6,3,10,2020-01-11


In [9]:
# Export to Excel file for further analysis
# NOTE(review): this is an absolute, machine-specific path; the export fails
# on any machine where this folder does not exist.
output_path = r"C:\Users\Theo Lee\Desktop\Coding\KpopPopularity\billboard_data.xlsx"
df.to_excel(output_path, index=False)