In [None]:
!pip install spotipy

In [None]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy.client import SpotifyException
import json
import pandas as pd

# Scraping Spotify Data

In [None]:
# Read the Spotify API credentials; the context manager closes the file
# handle as soon as the JSON has been parsed.
with open('/content/authorization.json') as credentials_file:
    credentials = json.load(credentials_file)
client_id = credentials['client_id']
client_secret = credentials['client_secret']

# Client-credentials flow: app-level access, no user login involved.
client_credentials_manager = SpotifyClientCredentials(client_id=client_id,client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [None]:
def get_track_features(data):
    """Flatten a Spotify track object and its audio features into one dict.

    Args:
        data: A track object as returned by the Spotify Web API (for example
            one entry of ``sp.search(...)['tracks']['items']``). The keys
            ``album``, ``explicit``, ``is_local``, ``name``, ``popularity``
            and ``id`` are read.

    Returns:
        dict: Album and track metadata merged with the audio features from
        ``sp.audio_features`` (without the bookkeeping fields), or ``{}``
        when no audio features are available for the track.

    Note:
        Uses the module-level Spotify client ``sp``.
    """
    album = data['album']
    track = {
        'album_type': album['album_type'],
        # NOTE(review): these are the album's artists, not the track's own
        # data['artists'] -- confirm that this is intended.
        'artists': "~".join(sorted(artist['name'] for artist in album['artists'])),
        'album_name': album['name'],
        'total_tracks': album['total_tracks'],
        'explicit': data['explicit'],
        'is_local': data['is_local'],
        'name': data['name'],
        'popularity': data['popularity'],
    }

    # Get Audio Features. Guard against an empty/None response as well as a
    # None entry, instead of failing on the [0] lookup.
    feature_list = sp.audio_features(data['id'])
    song_features = feature_list[0] if feature_list else None
    if song_features is None:
        return {}

    # https://developer.spotify.com/documentation/web-api/reference/#category-tracks
    # Leave out identifier/link fields. Filtering (rather than `del`) keeps the
    # API response untouched and tolerates a response that lacks one of them.
    bookkeeping_keys = ('type', 'id', 'uri', 'track_href', 'analysis_url')
    track.update(
        (key, value) for key, value in song_features.items()
        if key not in bookkeeping_keys
    )

    return track

In [None]:
# Collect up to MAX_RESULTS search hits per release year and save one CSV per year.
years = list(range(2010, 2022))
PAGE_SIZE = 50       # results requested per search call (the endpoint's documented maximum)
MAX_RESULTS = 1000   # search results examined per year, same as before
for year in years:
    year_tracks = []
    # Page through the results 50 at a time instead of issuing one request per track.
    for offset in range(0, MAX_RESULTS, PAGE_SIZE):
        try:
            page = sp.search(q='year:{}'.format(year), type='track', limit=PAGE_SIZE, offset=offset)
        except SpotifyException:
            # Stop paging this year when the API rejects the request; keep what was collected.
            break

        for track_result in page['tracks']['items']:
            track_features = get_track_features(track_result)
            # get_track_features returns {} when a track has no audio features.
            if track_features != {}:
                year_tracks.append(track_features)
    df = pd.DataFrame(year_tracks)
    df.to_csv("{}.csv".format(year), index=False)

In [None]:
# Load the twelve per-year CSVs (2010 .. 2021) in chronological order.
# The df1 .. df12 names are kept because later cells refer to them.
(df1, df2, df3, df4, df5, df6,
 df7, df8, df9, df10, df11, df12) = [
    pd.read_csv("/content/{}.csv".format(year)) for year in range(2010, 2022)
]

In [None]:
# All yearly frames, oldest first.
yearly_frames = [df1, df2, df3, df4, df5, df6, df7, df8, df9, df10, df11, df12]
combined_df = pd.concat(yearly_frames)

# Three consecutive groups of four yearly frames each.
quarter_1_df = pd.concat(yearly_frames[0:4])
quarter_2_df = pd.concat(yearly_frames[4:8])
quarter_3_df = pd.concat(yearly_frames[8:12])

In [None]:
# Keep the first occurrence of every fully identical row.
combined_df = combined_df.drop_duplicates()

In [None]:
# Write each frame to its own CSV, without the index column.
for frame, filename in (
    (combined_df, "2010_2021_songs.csv"),
    (quarter_1_df, "quarter_1_songs.csv"),
    (quarter_2_df, "quarter_2_songs.csv"),
    (quarter_3_df, "quarter_3_songs.csv"),
):
    frame.to_csv(filename, index=False)

In [None]:
# Row count of the combined 2010-2021 frame.
len(combined_df)

# Scraping Billboard Hot 100 Data

In [None]:
# Read the Spotify API credentials; the context manager closes the file
# handle as soon as the JSON has been parsed.
with open('/content/authorization.json') as credentials_file:
    credentials = json.load(credentials_file)
client_id = credentials['client_id']
client_secret = credentials['client_secret']

# Client-credentials flow: app-level access, no user login involved.
client_credentials_manager = SpotifyClientCredentials(client_id=client_id,client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [None]:
def get_track_features(data):
    """Flatten a Spotify track object and its audio features into one dict.

    Args:
        data: A track object as returned by the Spotify Web API (for example
            one entry of ``sp.search(...)['tracks']['items']``). The keys
            ``album``, ``explicit``, ``is_local``, ``name``, ``popularity``
            and ``id`` are read.

    Returns:
        dict: Album and track metadata merged with the audio features from
        ``sp.audio_features`` (without the bookkeeping fields), or ``{}``
        when no audio features are available for the track.

    Note:
        Uses the module-level Spotify client ``sp``.
    """
    album = data['album']
    track = {
        'album_type': album['album_type'],
        # NOTE(review): these are the album's artists, not the track's own
        # data['artists'] -- confirm that this is intended.
        'artists': "~".join(sorted(artist['name'] for artist in album['artists'])),
        'album_name': album['name'],
        'total_tracks': album['total_tracks'],
        'explicit': data['explicit'],
        'is_local': data['is_local'],
        'name': data['name'],
        'popularity': data['popularity'],
    }

    # Get Audio Features. Guard against an empty/None response as well as a
    # None entry, instead of failing on the [0] lookup.
    feature_list = sp.audio_features(data['id'])
    song_features = feature_list[0] if feature_list else None
    if song_features is None:
        return {}

    # https://developer.spotify.com/documentation/web-api/reference/#category-tracks
    # Leave out identifier/link fields. Filtering (rather than `del`) keeps the
    # API response untouched and tolerates a response that lacks one of them.
    bookkeeping_keys = ('type', 'id', 'uri', 'track_href', 'analysis_url')
    track.update(
        (key, value) for key, value in song_features.items()
        if key not in bookkeeping_keys
    )

    return track

In [None]:
class Error(Exception):
    """Common ancestor of the custom exceptions defined in this notebook."""


class HTTPERROR(Error):
    """Signals an HTTP response whose status code is something other than 200."""

def get_billboard_data(chart_date=None, timeout=30):
    """Scrape a Billboard Hot 100 chart page into (rank, song, artist) tuples.

    Args:
        chart_date: Date of the chart to request, as a ``datetime.date`` or a
            ``YYYY-MM-DD`` string. Defaults to today's date, so results differ
            between runs; pass a fixed date for a reproducible evaluation.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        list[tuple[int, str, str]]: ``(rank, song, artist)`` with ranks
        counted from 1 in page order. The list is empty when the page holds
        no elements with the expected CSS classes.

    Raises:
        HTTPERROR: If the response status code is not 200.
    """
    # Kept local: neither requests nor bs4 is imported at the top of the notebook.
    from datetime import date
    import requests
    from bs4 import BeautifulSoup

    if chart_date is None:
        chart_date = date.today()
    # Formatting a date object yields its ISO form (YYYY-MM-DD).
    URL = "https://www.billboard.com/charts/hot-100/{}".format(chart_date)
    # Without a timeout an unresponsive server would block the cell indefinitely.
    page = requests.get(URL, timeout=timeout)

    if page.status_code != 200:
        raise HTTPERROR("Request to {} failed with status code {}".format(URL, page.status_code))

    soup = BeautifulSoup(page.content, "html.parser")

    # The class strings are tied to the page markup at the time of writing.
    all_songs = soup.find_all("span", {"class": "chart-element__information__song text--truncate color--primary"})
    all_artists = soup.find_all("span", {"class": "chart-element__information__artist text--truncate color--secondary"})

    song_titles = [tag.text for tag in all_songs]
    artist_names = [tag.text for tag in all_artists]

    # zip stops at the shorter list, so ranks follow the number of matched pairs.
    ranks = range(1, len(song_titles) + 1)
    return list(zip(ranks, song_titles, artist_names))

In [None]:
# Fetch the chart once and keep the (rank, song, artist) tuples for the cells below.
billboard_data = get_billboard_data()

https://www.billboard.com/charts/hot-100/2021-09-11


In [None]:
# Show the first (rank, song, artist) entry as a quick sanity check.
billboard_data[0]

(1, 'Butter', 'BTS')

In [None]:
def _search_first_track(artist, song):
    """Return the first Spotify search hit for an artist/track pair, or None."""
    items = sp.search(q="artist:{} track:{}".format(artist, song))['tracks']['items']
    return items[0] if items else None


# Look up every chart entry on Spotify and collect one single-row frame per match.
# A SpotifyException from the API propagates unchanged; re-raising the bare
# class, as before, discarded the original status and message.
main = []
for rank, song, artist in billboard_data:
    track_result = _search_first_track(artist, song)
    if track_result is None:
        # Retry with the lead artist only: drop " Featuring ..." and " & ..." parts.
        lead_artist = artist.split(" Featuring")[0].split(" &")[0]
        track_result = _search_first_track(lead_artist, song)
        if track_result is None:
            continue

    track_features = get_track_features(track_result)
    # Test for "no audio features" BEFORE adding the rank; once the rank is
    # added the dict is never empty and the check could not filter anything.
    if track_features == {}:
        continue
    track_features["rank"] = rank
    main.append(pd.DataFrame([track_features]))

In [None]:
# Stack the single-row frames and save them without the index column.
billboard_tracks = pd.concat(main)
billboard_tracks.to_csv("test.csv", index=False)

# Adding Zhuolin & Zea Teng Data

In [None]:
# Mount Google Drive at /content/gdrive; the cells below read files from under that path.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Source file for this section, read from the mounted Drive folder.
zhuolin_csv_path = "/content/gdrive/MyDrive/DSA3101 Grp 13/Data/spotify data_zhuolin.csv"
df = pd.read_csv(zhuolin_csv_path)

In [None]:
# Preview the first rows of the loaded file.
df.head()

Unnamed: 0,id,title,first_artist,all_artists,danceability,energy,key,loudness,mode,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,013AWvizllIUEC2FOBzOnh,Truly Madly Deeply,Savage Garden,['Savage Garden'],0.555,0.698,0,-8.874,1,0.403,0.0,0.0851,0.816,168.12,277267,4
1,015qd1I4v00JIoK7yOUgKC,Rush Rush,Paula Abdul,['Paula Abdul'],0.706,0.44,4,-11.577,1,0.763,0.0,0.0958,0.395,90.012,292933,4
2,02HqgLNHnZSOyckc9eQhq4,Freedom - Radio Version,DJ BoBo,['DJ BoBo'],0.631,0.85,0,-6.806,1,0.00764,0.0155,0.324,0.512,140.017,236693,4
3,02lTDOxHeXTHsdwXoz6lpC,In the Meantime,Spacehog,['Spacehog'],0.269,0.895,9,-6.382,1,0.083,0.00266,0.153,0.16,168.479,299507,4
4,0303qZHRLFhKkOgO80toX9,Lazy Line Painter Jane,Belle & Sebastian,"['Belle & Sebastian', 'Monica Queen']",0.343,0.747,6,-5.73,1,0.127,1e-05,0.31,0.237,125.389,347733,4


In [None]:
# Fetch speechiness for every track id. audio_features accepts a list of ids,
# so request them in batches instead of issuing one API call per track.
AUDIO_FEATURES_BATCH = 100  # documented maximum number of ids per audio-features request
track_ids = list(df["id"])
s = []
for start in range(0, len(track_ids), AUDIO_FEATURES_BATCH):
    batch_ids = track_ids[start:start + AUDIO_FEATURES_BATCH]
    batch_features = sp.audio_features(batch_ids) or [None] * len(batch_ids)
    # A None entry means no audio features for that id; store a missing value
    # rather than failing on the ['speechiness'] lookup.
    s.extend(
        features['speechiness'] if features is not None else None
        for features in batch_features
    )
df["speechiness"] = s

In [None]:
# Preview again to confirm the new speechiness column.
df.head()

Unnamed: 0,id,title,first_artist,all_artists,danceability,energy,key,loudness,mode,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,speechiness
0,013AWvizllIUEC2FOBzOnh,Truly Madly Deeply,Savage Garden,['Savage Garden'],0.555,0.698,0,-8.874,1,0.403,0.0,0.0851,0.816,168.12,277267,4,0.0287
1,015qd1I4v00JIoK7yOUgKC,Rush Rush,Paula Abdul,['Paula Abdul'],0.706,0.44,4,-11.577,1,0.763,0.0,0.0958,0.395,90.012,292933,4,0.0493
2,02HqgLNHnZSOyckc9eQhq4,Freedom - Radio Version,DJ BoBo,['DJ BoBo'],0.631,0.85,0,-6.806,1,0.00764,0.0155,0.324,0.512,140.017,236693,4,0.0436
3,02lTDOxHeXTHsdwXoz6lpC,In the Meantime,Spacehog,['Spacehog'],0.269,0.895,9,-6.382,1,0.083,0.00266,0.153,0.16,168.479,299507,4,0.118
4,0303qZHRLFhKkOgO80toX9,Lazy Line Painter Jane,Belle & Sebastian,"['Belle & Sebastian', 'Monica Queen']",0.343,0.747,6,-5.73,1,0.127,1e-05,0.31,0.237,125.389,347733,4,0.0632


Columns needed:
album_type
artists
album_name
total_tracks
explicit
is_local
name (aka title)
popularity
speechiness


In [None]:
def get_additional_features(data):
    """Pick the album-level and track-level metadata fields out of a track object.

    Args:
        data: Track object (dict) with an ``album`` entry plus the
            ``explicit``, ``is_local`` and ``popularity`` keys.

    Returns:
        dict: ``album_type``, ``artists`` (album artist names, sorted and
        joined with ``~``), ``album_name``, ``total_tracks``, ``explicit``,
        ``is_local`` and ``popularity``.
    """
    album_info = data['album']

    # Album artist names in alphabetical order.
    artist_names = [entry['name'] for entry in album_info['artists']]
    artist_names.sort()

    return {
        'album_type': album_info['album_type'],
        'artists': "~".join(artist_names),
        'album_name': album_info['name'],
        'total_tracks': album_info['total_tracks'],
        'explicit': data['explicit'],
        'is_local': data['is_local'],
        'popularity': data['popularity'],
    }

In [None]:
# One metadata dict per track id, in the same order as the rows of df.
new_data = [get_additional_features(sp.track(track_id)) for track_id in list(df["id"])]

In [None]:
# Turn the list of per-track dicts into one list of values per column.
column_order = ("album_name", "artists", "album_type", "explicit",
                "is_local", "popularity", "total_tracks")
final_dict = {}
for record in new_data:
    for column in column_order:
        # setdefault creates the column's list the first time it is seen.
        final_dict.setdefault(column, []).append(record[column])

In [None]:
# Each key of final_dict becomes a column.
add_df = pd.DataFrame.from_dict(final_dict)

In [None]:
# Put the additional metadata columns next to the original ones.
frames_side_by_side = [df, add_df]
new_df = pd.concat(frames_side_by_side, axis="columns")

In [None]:
# Preview the widened frame.
new_df.head()

Unnamed: 0,id,title,first_artist,all_artists,danceability,energy,key,loudness,mode,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,speechiness,album_name,artists,album_type,explicit,is_local,popularity,total_tracks
0,013AWvizllIUEC2FOBzOnh,Truly Madly Deeply,Savage Garden,['Savage Garden'],0.555,0.698,0,-8.874,1,0.403,0.0,0.0851,0.816,168.12,277267,4,0.0287,Savage Garden,Savage Garden,album,False,False,72,11
1,015qd1I4v00JIoK7yOUgKC,Rush Rush,Paula Abdul,['Paula Abdul'],0.706,0.44,4,-11.577,1,0.763,0.0,0.0958,0.395,90.012,292933,4,0.0493,Spellbound,Paula Abdul,album,False,False,61,11
2,02HqgLNHnZSOyckc9eQhq4,Freedom - Radio Version,DJ BoBo,['DJ BoBo'],0.631,0.85,0,-6.806,1,0.00764,0.0155,0.324,0.512,140.017,236693,4,0.0436,Freedom,DJ BoBo,single,False,False,0,4
3,02lTDOxHeXTHsdwXoz6lpC,In the Meantime,Spacehog,['Spacehog'],0.269,0.895,9,-6.382,1,0.083,0.00266,0.153,0.16,168.479,299507,4,0.118,Resident Alien,Spacehog,album,False,False,61,13
4,0303qZHRLFhKkOgO80toX9,Lazy Line Painter Jane,Belle & Sebastian,"['Belle & Sebastian', 'Monica Queen']",0.343,0.747,6,-5.73,1,0.127,1e-05,0.31,0.237,125.389,347733,4,0.0632,Push Barman To Open Old Wounds,Belle & Sebastian,album,False,False,8,25


In [None]:
# Columns to keep, in output order: metadata first, then the audio features.
metadata_columns = ['album_type', "artists", 'album_name', 'total_tracks',
                    'explicit', 'is_local', 'title', 'popularity']
audio_columns = ['danceability', 'energy', 'key', 'loudness', 'mode',
                 'speechiness', 'acousticness', 'instrumentalness',
                 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature']
new_df = new_df[metadata_columns + audio_columns]

In [None]:
# Rename the 'title' column to 'name'.
new_df = new_df.rename({'title': 'name'}, axis="columns")

In [None]:
# Save the enriched frame without the index column.
zhuolin_added_path = "spotify data_zhuolin_added.csv"
new_df.to_csv(zhuolin_added_path, index=False)

# Jieyi Data

In [None]:
# Load the API credentials from the credentials file used by the earlier
# sections instead of embedding them in the notebook.
# NOTE(review): the client secret that used to be hard-coded here should be
# treated as exposed and rotated.
with open('/content/authorization.json') as credentials_file:
    credentials = json.load(credentials_file)
client_id = credentials['client_id']
client_secret = credentials['client_secret']
client_credentials_manager = SpotifyClientCredentials(client_id=client_id,client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [None]:
def get_track_features(data):
    """Flatten a Spotify track object and its audio features into one dict.

    Args:
        data: A track object as returned by the Spotify Web API (for example
            the ``track`` entry of a playlist item). The keys ``album``,
            ``explicit``, ``is_local``, ``name``, ``popularity`` and ``id``
            are read.

    Returns:
        dict: Album and track metadata merged with the audio features from
        ``sp.audio_features`` (without the bookkeeping fields), or ``{}``
        when no audio features are available for the track.

    Note:
        Uses the module-level Spotify client ``sp``.
    """
    album = data['album']
    track = {
        'album_type': album['album_type'],
        # NOTE(review): these are the album's artists, not the track's own
        # data['artists'] -- confirm that this is intended.
        'artists': "~".join(sorted(artist['name'] for artist in album['artists'])),
        'album_name': album['name'],
        'total_tracks': album['total_tracks'],
        'explicit': data['explicit'],
        'is_local': data['is_local'],
        'name': data['name'],
        'popularity': data['popularity'],
    }

    # Get Audio Features. Guard against an empty/None response as well as a
    # None entry, instead of failing on the [0] lookup.
    feature_list = sp.audio_features(data['id'])
    song_features = feature_list[0] if feature_list else None
    if song_features is None:
        return {}

    # https://developer.spotify.com/documentation/web-api/reference/#category-tracks
    # Leave out identifier/link fields. Filtering (rather than `del`) keeps the
    # API response untouched and tolerates a response that lacks one of them.
    bookkeeping_keys = ('type', 'id', 'uri', 'track_href', 'analysis_url')
    track.update(
        (key, value) for key, value in song_features.items()
        if key not in bookkeeping_keys
    )

    return track

In [None]:
playlist_uri = ["2Grs4rKhpXyvDH69yU0rXc", "0COB45vygvENL4A9BDKtLi", "5UwDgZFj9d46gjtXT6H1k1"]

data = []

for uri in playlist_uri:
    # Request only the 'tracks' field of the playlist: a paged list of items.
    page = sp.user_playlist('jieyi33', uri, 'tracks')['tracks']

    # Follow the paging links so tracks beyond the first page are not dropped;
    # sp.next returns None once there is no further page.
    while page:
        for i in page['items']:
            # NOTE(review): an item's 'track' can presumably be null (e.g. a
            # track that is no longer available) -- skip it instead of crashing.
            if i['track'] is None:
                continue
            track_features = get_track_features(i['track'])
            # Skip tracks without audio features so no empty row is added.
            if track_features != {}:
                data.append(track_features)
        page = sp.next(page)

df = pd.DataFrame(data)

In [None]:
# Save the playlist tracks without the index column.
jieyi_csv_path = "spotify data_jieyi.csv"
df.to_csv(jieyi_csv_path, index=False)

# Combining Data

In [None]:
# Mount Google Drive at /content/gdrive; the CSVs combined below are read from under that path.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# One frame per contributor. Each variable now reads the file that carries the
# same name (zhuolin_df and zeateng_df previously read each other's file).
terry_df = pd.read_csv("/content/gdrive/MyDrive/DSA3101 Grp 13/Data/2010_2021_songs.csv")
jieyi_df = pd.read_csv("/content/gdrive/MyDrive/DSA3101 Grp 13/Data/spotify data_jieyi.csv")
zhuolin_df = pd.read_csv("/content/gdrive/MyDrive/DSA3101 Grp 13/Data/spotify data_zhuolin_added.csv")
zeateng_df = pd.read_csv("/content/gdrive/MyDrive/DSA3101 Grp 13/Data/spotify data_zeateng_added.csv")


In [None]:
# Split the "~"-joined artist string into a list of names. The .str accessor
# leaves missing values as NaN, whereas calling x.split on a NaN cell raises.
# NOTE(review): only terry_df is converted here; list cells are unhashable, so
# any later de-duplication over this column has to account for that.
new_artists = terry_df['artists'].str.split("~")
terry_df['artists'] = new_artists

In [None]:
# Stack the contributors' frames; each frame keeps its own row labels.
member_frames = [terry_df, jieyi_df, zhuolin_df, zeateng_df]
combined_df = pd.concat(member_frames)

In [None]:
# Display the combined frame.
combined_df

Unnamed: 0,album_type,artists,album_name,total_tracks,explicit,is_local,name,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,album,Mac Miller,K.I.D.S.,16,True,False,The Spins,80,0.794,0.766,9,-3.894,0,0.0377,0.01350,0.000000,0.0785,0.550,127.007,195873,4
1,album,Waylon Jennings~Willie Nelson,Waylon & Willie,11,False,False,The Year 2003 Minus 25 - Remastered,43,0.626,0.457,4,-13.785,1,0.0384,0.35500,0.000009,0.0668,0.474,102.166,184267,4
2,album,Bruno Mars,Doo-Wops & Hooligans,10,False,False,Talking to the Moon,82,0.523,0.606,1,-4.754,0,0.0301,0.51200,0.000000,0.1060,0.065,145.837,217867,4
3,single,Anthem Lights,Best of the Year Medleys: 2007 - 2012,6,False,False,Best of 2010: Baby / Break Even / Need You Now...,42,0.417,0.443,5,-6.261,1,0.0328,0.70900,0.000000,0.2170,0.470,119.903,155342,4
4,single,Roar,I Can't Handle Change,6,True,False,I Can't Handle Change,81,0.247,0.438,5,-8.479,1,0.0358,0.04550,0.000871,0.1110,0.390,185.716,198213,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
793,album,Mariah Carey,Daydream,12,False,False,One Sweet Day,66,0.568,0.495,1,-8.964,1,0.0299,0.35300,0.000000,0.0839,0.303,128.234,281067,4
794,compilation,Mariah Carey,#1's,17,False,False,I'll Be There,0,0.402,0.509,5,-9.392,1,0.0324,0.68000,0.000000,0.8230,0.326,81.509,263933,4
795,album,Lighthouse Family,Postcards From Heaven,11,False,False,High,40,0.664,0.622,10,-8.505,1,0.0253,0.09940,0.073800,0.1120,0.614,102.342,310333,4
796,album,All Saints,All Saints,13,False,False,Never Ever,60,0.738,0.541,1,-5.485,1,0.0311,0.55900,0.000000,0.0492,0.309,134.187,387573,4


In [None]:
# Drop fully identical rows, keeping the first occurrence. Rows are compared
# on a copy in which list cells (such as split artist names) are converted to
# tuples, because drop_duplicates cannot hash lists.
hashable_view = combined_df.copy()
for column in hashable_view.select_dtypes(include="object").columns:
    hashable_view[column] = hashable_view[column].map(
        lambda value: tuple(value) if isinstance(value, list) else value
    )
# A positional mask avoids label alignment on the repeated row labels left by concat.
combined_df = combined_df[(~hashable_view.duplicated()).to_numpy()]

In [None]:
# Save the final combined data set without the index column.
combined_csv_path = "combined_data.csv"
combined_df.to_csv(combined_csv_path, index=False)