In [1]:
# Import dependencies 
from splinter import Browser
from bs4 import BeautifulSoup as soup
import pandas as pd
import requests
from datetime import datetime, timedelta
import time

## Scraping the Billboard top 50 TikTok Songs

In [2]:
#Create an empty list to store dates
date_list = []
# Create an empty list to store dictionaries
top_chart_list = []

In [3]:
#Create datetime objects for the start and end of our dataset
start_date = datetime.strptime('2024-06-08', '%Y-%m-%d')
end_date = datetime.strptime('2023-09-16', '%Y-%m-%d')

#Generating a list of all the weeks to use to scrape billboard for all their different top 50 charts
current_date = start_date
while current_date >= end_date:
    date_list.append(current_date.strftime('%Y-%m-%d'))
    current_date -= timedelta(days=7)

2024-06-08
2024-06-01
2024-05-25
2024-05-18
2024-05-11
2024-05-04
2024-04-27
2024-04-20
2024-04-13
2024-04-06
2024-03-30
2024-03-23
2024-03-16
2024-03-09
2024-03-02
2024-02-24
2024-02-17
2024-02-10
2024-02-03
2024-01-27
2024-01-20
2024-01-13
2024-01-06
2023-12-30
2023-12-23
2023-12-16
2023-12-09
2023-12-02
2023-11-25
2023-11-18
2023-11-11
2023-11-04
2023-10-28
2023-10-21
2023-10-14
2023-10-07
2023-09-30
2023-09-23
2023-09-16


In [4]:
# Here we've created a nested for loop that loops through each week in the weeks list
# This guides the browser to the corresponding week's top 50
# We then utilized BeautifulSoup to scrap the artist name and song name from each weekly chart
# Results are then appended to a list of dictionaries to be turned into a DataFrame


browser = Browser('chrome')
for date in date_list:
    url = 'https://www.billboard.com/charts/tiktok-billboard-top-50/' + date + "/"
    browser.visit(url)
    # Create a Beautiful Soup object
    html = browser.html
    soup_info = soup(html, 'html.parser')
    # Extract all the text elements
    Top_chart_elements = soup_info.find_all("div", class_="o-chart-results-list-row-container")
    # Loop through the text elements
    for article in Top_chart_elements:
    
        date = date
        title = article.find("h3", class_="c-title").get_text()
        try:
            artist = article.find("span", class_="c-label a-no-trucate a-font-primary-s lrv-u-font-size-14@mobile-max u-line-height-normal@mobile-max u-letter-spacing-0021 lrv-u-display-block a-truncate-ellipsis-2line u-max-width-330 u-max-width-230@tablet-only u-font-size-20@tablet").get_text()
        except:
            artist = artist = article.find("span", class_="c-label a-no-trucate a-font-primary-s lrv-u-font-size-14@mobile-max u-line-height-normal@mobile-max u-letter-spacing-0021 lrv-u-display-block a-truncate-ellipsis-2line u-max-width-330 u-max-width-230@tablet-only").get_text()

    
        track_name = title.strip()
        artist_name = artist.strip()
    
        # Store in a dictionary
        article_dict = {
            "title": track_name,
            "artist": artist_name,
            "date": date
            }
    
        # Append the dictionary to the list
        top_chart_list.append(article_dict)

In [5]:
browser.quit()

In [6]:
# Initial DataFrame with Artist, song name and the date that it appeared in the top 50

top_chart_df = pd.DataFrame(top_chart_list)
top_chart_df_copy = top_chart_df.copy()

Unnamed: 0,title,artist,date
0,Million Dollar Baby,Tommy Richman,2024-06-08
1,Tell Ur Girlfriend,Lay Bankz,2024-06-08
2,U My Everything,Sexyy Red & Drake,2024-06-08
3,Birds Of A Feather,Billie Eilish,2024-06-08
4,Not Like Us,Kendrick Lamar,2024-06-08
...,...,...,...
1945,XXL,LANY,2023-09-16
1946,Falling Behind,Laufey,2023-09-16
1947,Power Trip,J. Cole Featuring Miguel,2023-09-16
1948,Beautiful,Bazzi Featuring Camila Cabello,2023-09-16


## Obtaining Song URI codes for list of songs

In order to obtain the song attributes that we are after, we first have to obtain weach song's "URI" number, which is a unique identifier 
for each song. We begin by setting up our authorization credentials to use the Spotify API (ID numbers have been removed).


In [19]:
eric_id= "xxxxxxxxx"
eric_secert = "xxxxxxxxx"
jack_id = "xxxxxxxxxxx"
jack_secret = "xxxxxxxxx"
CLIENT_ID = "your_id_here"
CLIENT_SECRET = "your_secret_here"

AUTH_URL = "https://accounts.spotify.com/api/token"
auth_response = requests.post(AUTH_URL, {
    'grant_type': 'client_credentials',
    'client_id': jack_id,
    'client_secret': jack_secret,
})

#Convert response to JSON
auth_response_data = auth_response.json()

#Save the access token
access_token = auth_response_data['access_token']

#Need to pass access token into header to send properly formed GET request to API server
headers = {
    'Authorization': 'Bearer {token}'.format(token=access_token)
}

In [14]:
# This for loop allows us to use the Spotify API query function to lookup each song by Artist and Song name to obtain the URI.

BASE_URL = 'https://api.spotify.com/v1/'

URI_list = []

for i, row in top_chart_df.iterrows():
    track = row[0]
    artist = row[1]
    r = requests.get(BASE_URL + f'search?q={artist}+{track}&type=artist,track&offset=0&limit=1', headers=headers)
    info = r.json()
    URI = info['tracks']['items'][0]['id']
    URI_list.append(URI)

URI_list

  track = row[0]
  artist = row[1]


['5AJ9hqTS2wcFQCELCFRO7A',
 '3lMzT16MjAKKXF7pSZn13B',
 '2MjXWroB9wlTG2kqv3avfS',
 '6dOtVTDdiauQNBQEDOtlAB',
 '6AI3ezQ4o3HUoP6Dhudph3',
 '4PhTvtwhgfkn9jlgPEBbue',
 '28drn6tQo95MRvO0jQEo5C',
 '7iabz12vAuVQYyekFIWJxD',
 '1C84d9abZVKWHT2YYpoean',
 '6XjDF6nds4DE2BBbagZol6',
 '6NjWCIYu1W8xa3HIvcIhd4',
 '1aHy1ipWQRNauI0ee9uYfi',
 '6Uwi2Qk3H7fM4b4W4ExrAp',
 '2FQrifJ1N335Ljm3TjTVVf',
 '3KjRNFs97uhUD2ssfU1coJ',
 '4ZJ4vzLQekI0WntDbanNC7',
 '29TPjc8wxfz4XMn21O7VsZ',
 '629DixmZGHc7ILtEntuiWE',
 '2qSkIjg1o9h3YT9RAgYN75',
 '5rcnAHBclBs0OGV4rATEnR',
 '2GxrNKugF82CnoRFbQfzPf',
 '1xUddpWyEuYl5T3mduKnOJ',
 '6O3WfmAQIgnLBGVzZJVS40',
 '2tudvzsrR56uom6smgOcSf',
 '22yRHdYBLZMi7xnvBioqkP',
 '1i8dJGpKO0xQiKGCVslJqB',
 '0mIdZEa3kU6btHfzbEibz4',
 '3hFnQwSt5zAWaYpLuMSWB7',
 '7rIhp6EWLNtM8qFIQruJPT',
 '48lxT5qJF0yYyf2z4wB4xW',
 '3QaPy1KgI7nu9FJEQUgn6h',
 '2prqm9sPLj10B4Wg0wE5x9',
 '0e1KTuawmiFLiK0Lh3nNtM',
 '2V852FRIFO7mFgfiyDMum1',
 '37adYGaYaAWTGhBaOzX4Fh',
 '6sghEq6gM4Ugadwa12H7Sa',
 '5UNzhuQS8ak02kW0EXsI24',
 

## Obtaining song attributes

Now that we have the URI for each song, we're able to perform an additional API call to obtain each track's "Audio Features"


In [20]:
# Initialize index to zero and create an empty dataframe to store data
index = 0
temp_df = pd.DataFrame()


for URI in URI_list:
    Track_URL = f'https://api.spotify.com/v1/audio-features/{URI}'
    track_info = requests.get(Track_URL, headers=headers)
    print(track_details)
    track_details = track_info.json()

    #Wait a second to not do too many API calls
    time.sleep(1)
    # Convert the results from the api call into a dataframe row, with the index iterating each call
    df = pd.DataFrame(track_details, index = [index])
    #Drop unneeded columns
    df = df.drop(columns=['type', 'id', 'track_href', 'analysis_url', 'duration_ms', 'time_signature' ])
    index+=1
    # Add the row to the growing temporary dataframe
    temp_df = pd.concat([temp_df, df])

{'error': {'status': 429}}
0
{'danceability': 0.852, 'energy': 0.697, 'key': 1, 'loudness': -5.52, 'mode': 0, 'speechiness': 0.0439, 'acousticness': 0.0973, 'instrumentalness': 0.00037, 'liveness': 0.0678, 'valence': 0.919, 'tempo': 138.029, 'type': 'audio_features', 'id': '5AJ9hqTS2wcFQCELCFRO7A', 'uri': 'spotify:track:5AJ9hqTS2wcFQCELCFRO7A', 'track_href': 'https://api.spotify.com/v1/tracks/5AJ9hqTS2wcFQCELCFRO7A', 'analysis_url': 'https://api.spotify.com/v1/audio-analysis/5AJ9hqTS2wcFQCELCFRO7A', 'duration_ms': 155152, 'time_signature': 4}
1
{'danceability': 0.866, 'energy': 0.741, 'key': 7, 'loudness': -4.66, 'mode': 1, 'speechiness': 0.245, 'acousticness': 0.0933, 'instrumentalness': 0, 'liveness': 0.0297, 'valence': 0.614, 'tempo': 135.07, 'type': 'audio_features', 'id': '3lMzT16MjAKKXF7pSZn13B', 'uri': 'spotify:track:3lMzT16MjAKKXF7pSZn13B', 'track_href': 'https://api.spotify.com/v1/tracks/3lMzT16MjAKKXF7pSZn13B', 'analysis_url': 'https://api.spotify.com/v1/audio-analysis/3lMzT1

In [23]:
# Add the temporay dataframe of metric data to the dataframe we already made with the song titles, artists, and dates
tiktok_dataset = pd.concat([top_chart_df_copy, temp_df], axis=1)

Unnamed: 0,title,artist,date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,uri
0,Million Dollar Baby,Tommy Richman,2024-06-08,0.852,0.697,1,-5.520,0,0.0439,0.09730,0.000370,0.0678,0.919,138.029,spotify:track:5AJ9hqTS2wcFQCELCFRO7A
1,Tell Ur Girlfriend,Lay Bankz,2024-06-08,0.866,0.741,7,-4.660,1,0.2450,0.09330,0.000000,0.0297,0.614,135.070,spotify:track:3lMzT16MjAKKXF7pSZn13B
2,U My Everything,Sexyy Red & Drake,2024-06-08,0.811,0.640,5,-5.630,0,0.2740,0.09570,0.000000,0.1270,0.527,145.031,spotify:track:2MjXWroB9wlTG2kqv3avfS
3,Birds Of A Feather,Billie Eilish,2024-06-08,0.747,0.507,2,-10.171,1,0.0358,0.20000,0.060800,0.1170,0.438,104.978,spotify:track:6dOtVTDdiauQNBQEDOtlAB
4,Not Like Us,Kendrick Lamar,2024-06-08,0.898,0.472,1,-7.001,1,0.0776,0.01070,0.000000,0.1410,0.214,101.061,spotify:track:6AI3ezQ4o3HUoP6Dhudph3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1945,XXL,LANY,2023-09-16,0.386,0.731,10,-4.494,1,0.0327,0.01470,0.000298,0.2630,0.521,151.507,spotify:track:4kFbxA8gCGx47zJrZ9KiQT
1946,Falling Behind,Laufey,2023-09-16,0.449,0.353,7,-12.986,1,0.0915,0.94200,0.078600,0.1090,0.550,165.888,spotify:track:4KGGeE7RJsgLNZmnxGFlOj
1947,Power Trip,J. Cole Featuring Miguel,2023-09-16,0.644,0.445,10,-7.428,0,0.0330,0.00482,0.945000,0.2330,0.509,100.050,spotify:track:7iv2SMNIgKxXerLuGNAXKC
1948,Beautiful,Bazzi Featuring Camila Cabello,2023-09-16,0.638,0.717,2,-4.722,1,0.0337,0.34600,0.000000,0.1050,0.249,100.027,spotify:track:4VUwkH455At9kENOfzTqmF


## Convert to csv (for reference and viewing) and json files (for use in our javascript code)

In [24]:
#Save dataframe as a csv
tiktok_dataset.to_csv("Resources/scraped_tiktok_data.csv", index = False)
#Create a json file out of the dataset
json_scraped_tiktok_data = tiktok_dataset.to_json(orient='records')

with open('Resources/json_output_scraped_tiktok_data', 'w') as f:
    f.write(json_scraped_tiktok_data)