In [45]:
import requests
from datetime import datetime
import time
from bs4 import BeautifulSoup as bs
import urllib.request as urllib
import re
import pandas as pd
import json

In [46]:
def convert_to_written_key(key_int):
    """
    Converts the int representing a key given by spotify into its actual key.

    Spotify reports the key in pitch-class notation (0 = C, 1 = C#/Db, ..., 11 = B)
    and uses -1 when no key could be detected for the track.

    Args:
        key_int: Integer pitch class taken from Spotify's audio features.

    Returns:
        The written name of the key, or None when key_int is -1 (no key detected).

    Raises:
        IndexError: If key_int is neither -1 nor within 0-11.
    """
    keys = ["C", "C#/Db", "D", "D#/Eb", "E", "F", "F#/Gb", "G", "G#/Ab", "A", "A#/Bb", "B"]

    # -1 is the "no key detected" sentinel. Plain list indexing would wrap it
    # around and silently report the track as being in B.
    if key_int == -1:
        return None

    # Reject every other out-of-range value explicitly so negative numbers
    # cannot wrap around to a wrong (but plausible-looking) key.
    if not 0 <= key_int < len(keys):
        raise IndexError(f"key_int must be -1 or within 0-{len(keys) - 1}, got {key_int}")

    return keys[key_int]

In [73]:
def get_spotify_token():
    """
    Creates an API OAuth token for the spotify API using the client-credentials flow.

    The client ID and client secret are read from the SPOTIFY_CLIENT_ID and
    SPOTIFY_CLIENT_SECRET environment variables so that credentials are never
    stored in the notebook itself.

    Returns:
        The access token string from Spotify's token endpoint.

    Raises:
        RuntimeError: If either environment variable is missing or empty.
        requests.HTTPError: If Spotify rejects the token request.
    """
    # Local import keeps this cell self-contained; `os` is not imported at the top of the notebook.
    import os

    client_id = os.environ.get("SPOTIFY_CLIENT_ID")
    client_secret = os.environ.get("SPOTIFY_CLIENT_SECRET")
    if not client_id or not client_secret:
        raise RuntimeError(
            "Set the SPOTIFY_CLIENT_ID and SPOTIFY_CLIENT_SECRET environment "
            "variables before requesting a Spotify token."
        )

    auth_url = 'https://accounts.spotify.com/api/token'

    auth_response = requests.post(auth_url, {
        'grant_type': 'client_credentials',
        'client_id': client_id,
        'client_secret': client_secret,
    }, timeout=30)

    # Fail with the real HTTP error instead of a KeyError on 'access_token'
    # when the credentials are wrong or the service is unavailable.
    auth_response.raise_for_status()

    return auth_response.json()['access_token']

In [111]:
def make_spotify_request(request_url, token, max_retries=10):
    """
    Sends an authenticated GET request to the spotify API and returns the parsed JSON.

    Transient failures (connection errors, HTTP 429 rate limiting, 5xx responses
    and unparseable bodies) are retried in a loop, up to max_retries attempts in
    total. Each response is parsed exactly once, so a retry never issues an
    extra, redundant request.

    Args:
        request_url: Full URL of the spotify API endpoint to call.
        token: OAuth bearer token, e.g. from get_spotify_token().
        max_retries: Maximum number of attempts before giving up.

    Returns:
        The decoded JSON body, or None when spotify answers 404 or when every
        attempt failed.

    Note:
        KeyboardInterrupt is not intercepted, so interrupting the kernel stops
        the request immediately instead of being retried.
    """
    header = {"Authorization": f"Bearer {token}"}

    for _ in range(max_retries):
        try:
            response = requests.get(request_url, headers=header, timeout=30)
        except (requests.exceptions.RequestException, ConnectionResetError):
            print("\nConnection Reset Error.")
            time.sleep(5)
            continue

        # If spotify says we're using its API too much, wait however long
        # it tells us to, then try again.
        if response.status_code == 429:
            wait_time = int(response.headers.get("Retry-After", 5))
            print(f"\nwaiting {wait_time} seconds, at Spotify's request.")
            time.sleep(wait_time)
            continue

        if response.status_code == 404:
            print("\nbroken song.")
            return None

        # Server-side errors are usually temporary, so they are worth another attempt.
        if response.status_code >= 500:
            print("\nRandom spotify API Error.")
            time.sleep(5)
            continue

        try:
            return response.json()
        except ValueError:
            # The body was not valid JSON; treat it like any other transient failure.
            print("\nRandom spotify API Error.")
            time.sleep(5)

    print(f"\ngiving up after {max_retries} attempts.")
    return None

In [54]:
# Where we will accumulate the rows of all songs, keyed by a "month-year" label.
# When this cell is re-run in a live kernel, anything already collected is saved
# to backup.csv before the accumulator is reset. On a fresh kernel neither `rows`
# nor `rows_to_df` exists yet, so there is nothing to back up and the step is skipped
# (calling it unconditionally would raise a NameError on Restart & Run All).
if globals().get("rows") and "rows_to_df" in globals():
    rows_to_df().to_csv("backup.csv")
rows = {}

In [113]:
def get_most_popular_100_over_time(year, month):
    """
    Collects spotify audio features for the songs charting in a given month.

    Scrapes the umdmusic.com chart page for the 15th of the given month, searches
    spotify for every listed song, and fetches the audio features of the first
    search hit.

    Args:
        year: Year of the chart to fetch.
        month: Month of the chart to fetch (1-12).

    Returns:
        None when the requested month is the current month or lies in the future.
        Otherwise a list of rows, one per song that was found on spotify, laid out as
        [year, month, song_name, artist, danceability, energy, key, speechiness,
        acousticness, instrumentalness, liveness, valence, tempo, duration_ms].
    """
    # Imported locally: in this notebook the name `urllib` refers to `urllib.request`.
    from urllib.parse import quote

    # Audio features copied into each row, in output column order.
    feature_names = ("danceability", "energy", "key", "speechiness", "acousticness",
                     "instrumentalness", "liveness", "valence", "tempo", "duration_ms")

    # Gets the current date to make sure we don't go into the future.
    current_date = datetime.now()

    # Comparing (year, month) tuples also rejects every month of a later year,
    # not just the remaining months of the current one. This check runs before
    # any network access so skipped months cost nothing.
    if (year, month) >= (current_date.year, current_date.month):
        return None

    # Creates the token needed for spotify API requests.
    token = get_spotify_token()

    # Create the url for the given date.
    umd_url = f"http://www.umdmusic.com/default.asp?Lang=English&Chart=D&ChDay=15&ChMonth={month}&ChYear={year}&ChBand=&ChSong="

    # Call that URL and import it into a BeautifulSoup object. The `with` block
    # closes the connection even if reading or parsing fails.
    with urllib.urlopen(umd_url) as html:
        soup = bs(html.read())

    all_month_rows = []

    # The page stores all the songs in a big table (see e.g.
    # http://www.umdmusic.com/default.asp?Lang=English&Chart=D&ChDay=09&ChMonth=09&ChYear=1986&ChBand=&ChSong=),
    # so we walk over every table row (<tr>) in the html.
    for row in soup.find_all("tr"):
        # Every row for a song contains exactly nine columns, so any row with
        # a different number of cells is not a song and is skipped.
        cells = row.find_all("td")
        if len(cells) != 9:
            continue

        # Song name and artist are always located in the 5th cell in a row.
        # stripped_strings makes a list of items split on <br>
        cell_text = list(cells[4].stripped_strings)

        # A nine-cell row without both a title and an artist is not a usable song row.
        if len(cell_text) < 2:
            continue
        song_name, artist = cell_text[0], cell_text[1]

        # Percent-encode the whole search term. Replacing only spaces would let
        # characters such as "&", "#" or "?" in a title or artist break the query string.
        search_term = quote(f"{song_name}, {artist}", safe="")

        # Use spotify's search API to try and find a song matching the search.
        spotify_song = make_spotify_request(
            f"https://api.spotify.com/v1/search?q={search_term}&type=track&market=US&limit=1",
            token)

        # If no song is found on the parameters, we won't be able to get data on it,
        # so we skip it.
        if (spotify_song is None
                or "tracks" not in spotify_song
                or len(spotify_song["tracks"]["items"]) == 0):
            continue

        # Get the ID of the song from first returned search item.
        song_id = spotify_song["tracks"]["items"][0]["id"]

        # Use the ID gotten above to get all song characteristics of the given song.
        song_characteristics = make_spotify_request(
            f"https://api.spotify.com/v1/audio-features/{song_id}",
            token)

        # Skip the song when the request failed or the response lacks any of the
        # features we need (e.g. an error payload instead of audio features).
        if song_characteristics is None or any(
                song_characteristics.get(name) is None for name in feature_names):
            continue

        features = {name: song_characteristics[name] for name in feature_names}
        # Spotify reports the key as an int; store the written key name instead.
        features["key"] = convert_to_written_key(int(song_characteristics["key"]))

        # Create a row based off this song's data, and add it to the overall list of lists.
        all_month_rows.append([year, month, song_name, artist]
                              + [features[name] for name in feature_names])
    return all_month_rows

In [114]:
# Walk through every month of every year from 2000 through 2021 and store that
# month's songs in `rows` under a "month-year" label.
for year in range(2000, 2022):
    # Blank out whatever progress label is still showing from the previous year.
    print("                   ", end="\r")
    for month in range(1, 13):
        label = f"{month}-{year}"

        # Months that are already stored are left alone, so an interrupted run
        # can be resumed without fetching everything again.
        if label not in rows:
            print(label, end="\r")
            rows[label] = get_most_popular_100_over_time(year, month)

        # To collect less data while experimenting, add a `break` here and
        # another one after the inner loop.



12-2021            

In [117]:
# Build a single dataframe out of everything collected in `rows` so far.
def rows_to_df():
    """
    Flattens the per-month song lists held in the global `rows` dict into one DataFrame.

    Months whose entry is None are skipped.

    Returns:
        A pandas DataFrame with one row per song and the fourteen song columns.
    """
    column_names = [
        "year", "month", "song_name", "artist", "danceability", "energy", "key",
        "speechiness", "acousticness", "instrumentalness", "liveness",
        "valence", "tempo", "duration_ms",
    ]

    song_records = [
        record
        for month_records in rows.values()
        if month_records is not None
        for record in month_records
    ]

    return pd.DataFrame(data=song_records, columns=column_names)

# Materialize everything collected so far as a DataFrame. The bare `df` on the
# last line makes the notebook render it as this cell's output.
df = rows_to_df()
df

Unnamed: 0,year,month,song_name,artist,danceability,energy,key,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,2000,1,What A Girl Wants,CHRISTINA AGUILERA,0.754,0.819,C,0.1120,0.0684,0.000000,0.3450,0.8180,142.137,215800
1,2000,1,Smooth,SANTANA featuring ROB THOMAS,0.691,0.764,A,0.0386,0.1120,0.004190,0.0509,0.8330,115.980,240013
2,2000,1,Back At One,BRIAN McKNIGHT,0.661,0.346,B,0.0300,0.4520,0.000000,0.1310,0.2560,129.752,263667
3,2000,1,I Knew I Loved You,SAVAGE GARDEN,0.610,0.497,A,0.0279,0.3200,0.000061,0.0884,0.7390,85.031,250360
4,2000,1,I Wanna Love You Forever,JESSICA SIMPSON,0.590,0.666,E,0.0244,0.1560,0.000000,0.1360,0.0743,104.042,263800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21045,2021,9,Summer Of Love,SHAWN MENDES & TAINY,0.776,0.808,B,0.1170,0.0297,0.000127,0.1030,0.4940,123.988,184104
21046,2021,9,Ain't Shit,DOJA CAT,0.855,0.488,D#/Eb,0.2080,0.5060,0.000000,0.3470,0.6190,124.097,174253
21047,2021,9,Motley Crew,POST MALONE,0.797,0.631,D#/Eb,0.0786,0.0904,0.000004,0.0998,0.2880,129.915,184213
21048,2021,9,Remote Control,KANYE WEST,0.792,0.548,E,0.0820,0.0784,0.000073,0.1670,0.1970,110.105,198614


In [118]:
# Persist the final dataset. The DataFrame index is written as well (pandas'
# default), so it shows up as an unnamed first column when the CSV is read back.
df.to_csv("song_characteristics.csv")