<a href="https://colab.research.google.com/github/AdrianMPCodes/AMP-and-RZ---Is-Good-Music-Biased-Predicting-Grammy-Winning-Albums/blob/main/AMP_%26_RZ_Scraping_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

# Webscraping

### Wikipedia
- Using Wikipedia to get the Grammy Award for Album of the Year nominees and winners
- https://en.wikipedia.org/wiki/Grammy_Award_for_Album_of_the_Year
- Using data from 2000s-2020s

In [None]:
import requests
import time
from bs4 import BeautifulSoup

In [None]:
# Download the Wikipedia article listing the Album of the Year nominees and winners.
WIKI_URL = "https://en.wikipedia.org/wiki/Grammy_Award_for_Album_of_the_Year"
response = requests.get(WIKI_URL, timeout=30)
# Stop here with a clear HTTP error instead of parsing an error page and
# silently finding no tables in the cells below.
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

In [None]:
# Every decade is rendered as its own "wikitable" on the page
tables = soup.find_all("table", class_="wikitable")
# Keep the last three tables only: 2000s, 2010s, 2020s
recent_tables = tables[-3:]

In [None]:
# Collect every nominated album and its artist from each of the decade tables
albums = []
artists = []
for table in recent_tables:
  # tr with content starts at index = 2
  rows = table.find_all("tr")[2:]

  for r in rows:
    cols = r.find_all("td")
    if not cols:
      # A row without any <td> cell has nothing to read; indexing cols[0]
      # on it would raise IndexError
      continue

    title_tag = cols[0].find("i")
    if title_tag is None:
      # Only rows whose first cell holds an italicised title are albums
      continue

    artist_link = cols[1].find("a")
    # Various artists have no <a> tag, so the lookup returns None and the
    # label has to be filled in by hand
    artists.append("Various Artists" if artist_link is None else artist_link.text)
    albums.append(title_tag.text)

In [None]:
# Assemble the scraped lists into one DataFrame; "won" starts as False for
# every album and is switched to True for winners in a later cell
df_wiki = pd.DataFrame({
  'album_name': albums,
  "artists": artists,
  "won": False,
})

In [None]:
# Find winning albums for each table, replace False w/ True in the "won" column.
# Winner rows are selected by the background style set on their <tr>.
for table in recent_tables:
  win_rows = table.find_all("tr", attrs={"style": "background:#FAEB86;"})
  for row in win_rows:
    title_tag = row.find("i")
    if title_tag is None:
      # Highlighted row without an italicised album title: nothing to match
      continue
    # One .loc call selects rows and column together, so the write always
    # lands on df_wiki itself. The previous df_wiki["won"][idx] = True was a
    # chained assignment, which pandas may apply to a temporary copy.
    df_wiki.loc[df_wiki['album_name'] == title_tag.text, "won"] = True


In [None]:
# Applying fixes to the dataset before using spotify API

def _apply_fix(frame, mask, column, new_value):
  """Write `new_value` into `column` for the rows of `frame` selected by `mask`.

  Prints a note when `mask` selects no rows, so a fix that no longer matches
  anything (page changed, or this cell was re-run) is visible instead of silent.
  Returns the number of rows that were selected.
  """
  matched = int(mask.sum())
  if matched == 0:
    print("Note: no rows matched for fix ->", new_value)
  frame.loc[mask, column] = new_value
  return matched

# The "Dixie Chicks" changed their official name to "The Chicks", so we must reflect
# this in our data set before searching through spotify's API
_apply_fix(df_wiki, df_wiki["artists"] == "Dixie Chicks", "artists", "The Chicks")

# Must change a couple more names as they appear different on spotify.
# Rows are selected by their content instead of hard-coded row numbers
# (52, 77, 10), which point at the wrong album as soon as the scraped table shifts.
_apply_fix(df_wiki, df_wiki["artists"] == "The Black Eyed Peas", "artists", "Black Eyed Peas")

# Compare with spaces removed and case ignored so "Girl" is found however it is spaced
girl_mask = df_wiki["album_name"].str.replace(" ", "", regex=False).str.lower() == "girl"
_apply_fix(df_wiki, girl_mask, "album_name", "G I R L")

# Prefix match: the scraped title may or may not carry a "Soundtrack" suffix
o_brother_mask = df_wiki["album_name"].str.startswith("O Brother, Where Art Thou?", na=False)
_apply_fix(df_wiki, o_brother_mask, "album_name", "O Brother, Where Art Thou? (Original Motion Picture Soundtrack)")

df_wiki

Unnamed: 0,album_name,artists,won
0,Supernatural,Santana,True
1,FanMail,TLC,False
2,Fly,The Chicks,False
3,Millennium,Backstreet Boys,False
4,When I Look in Your Eyes,Diana Krall,False
...,...,...,...
134,Music of the Spheres,Coldplay,False
135,Renaissance,Beyoncé,False
136,Special,Lizzo,False
137,Un Verano Sin Ti,Bad Bunny,False


In [None]:
# Write the nominee table to disk (row index left out of the file)
WIKI_CSV_PATH = "df_wiki.csv"
df_wiki.to_csv(WIKI_CSV_PATH, index=False)

### Spotify scraping meta-data 

- Getting each album's statistics from the Spotify API

In [None]:
!pip install Spotipy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting Spotipy
  Downloading spotipy-2.22.1-py3-none-any.whl (28 kB)
Collecting redis>=3.5.3
  Downloading redis-4.5.3-py3-none-any.whl (238 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m238.6/238.6 KB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
Collecting async-timeout>=4.0.2
  Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB)
Installing collected packages: async-timeout, redis, Spotipy
Successfully installed Spotipy-2.22.1 async-timeout-4.0.2 redis-4.5.3


In [None]:
import spotipy
import spotipy.util as util
from spotipy.oauth2 import SpotifyClientCredentials
import spotipy.oauth2 as oauth2

In [None]:
# Build the Spotify client. Credentials are read from the environment (or typed
# in when missing) and are never stored in the notebook.
# NOTE(review): the client id/secret previously committed in this cell should be
# treated as leaked and rotated in the Spotify developer dashboard.
import os
from getpass import getpass

cid = os.environ.get("SPOTIPY_CLIENT_ID") or getpass("Spotify client ID: ")
secret = os.environ.get("SPOTIPY_CLIENT_SECRET") or getpass("Spotify client secret: ")
client_credentials_manager = SpotifyClientCredentials(client_id=cid, client_secret=secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [None]:
# Reload the nominee table saved by df_wiki.to_csv("df_wiki.csv").
# The relative path matches where that call wrote the file, so the cell no
# longer depends on the working directory being Colab's /content.
df_spotify = pd.read_csv("df_wiki.csv")

In [None]:
# Look up every nominated album on Spotify and keep the id, URI, track count and
# release date of the top hit. Exactly one entry is appended per album, found or
# not, so the lists always stay aligned with the rows of df_spotify.
artists, album_ids, album_uris, list_no_tracks, list_release_dates = [], [], [], [], []

for i in range(len(df_spotify['album_name'])):
  album_name = df_spotify["album_name"][i]
  query = "album: " + album_name + " artist: " + df_spotify["artists"][i]

  result_list = sp.search(query, limit = 1, type='album', market='US')['albums']['items']
  if result_list:
    print(album_name)
  else:
    # Find the more difficult to search albums by looking for the album name only
    print("No results found for query: ", query,". Using brute force instead.")  # Used to fix mistakes
    result_list = sp.search(q=album_name, type="album", limit=2)['albums']['items']
    if result_list:
      # The name-only hit may be a different release: surface it for review
      print("  matched by name only:", result_list[0]['name'], "- please verify")
    else:
      print("  still nothing found; storing empty values for this album")

  # An empty dict yields None for every field, which keeps the row aligned
  # instead of silently shortening the lists
  result = result_list[0] if result_list else {}
  album_ids.append(result.get('id'))
  album_uris.append(result.get('uri'))
  list_no_tracks.append(result.get('total_tracks'))
  list_release_dates.append(result.get('release_date'))

Supernatural
FanMail
Fly
Millennium
When I Look in Your Eyes
Two Against Nature
Kid A
The Marshall Mathers LP
Midnite Vultures
You're the One
O Brother, Where Art Thou? (Original Motion Picture Soundtrack)
Acoustic Soul
All That You Can't Leave Behind
Love and Theft
Stankonia
Come Away with Me
The Eminem Show
Home
Nellyville
The Rising
Speakerboxxx/The Love Below
Elephant
Fallen
Justified
Under Construction
Genius Loves Company
American Idiot
The College Dropout
Confessions
The Diary of Alicia Keys
How to Dismantle an Atomic Bomb
Chaos and Creation in the Backyard
The Emancipation of Mimi
Late Registration
Love. Angel. Music. Baby.
Taking the Long Way
Continuum
FutureSex/LoveSounds
St. Elsewhere
Stadium Arcadium
River: The Joni Letters
Back to Black
Echoes, Silence, Patience & Grace
Graduation
These Days
Raising Sand
In Rainbows
Tha Carter III
Viva la Vida or Death and All His Friends
Year of the Gentleman
Fearless
Big Whiskey & the GrooGrux King
The E.N.D.
The Fame
I Am... Sasha Fierc

In [None]:
# Attach the Spotify lookups to the nominee table, one new column per list
for column_name, column_values in (
  ("album_id", album_ids),
  ("num_of_tracks", list_no_tracks),
  ("rel_date", list_release_dates),
):
  df_spotify[column_name] = column_values
df_spotify

Unnamed: 0,album_name,artists,won,album_id,num_of_tracks,rel_date
0,Supernatural,Santana,True,10aiDpdFGyfCFEcqpx6XTq,14,1999-06-15
1,FanMail,TLC,False,1CvjjpvqVMoyprsf74bpYW,17,1999-02-23
2,Fly,The Chicks,False,3y6G5El2I6QrJA9BdfAbqA,13,1999-08-27
3,Millennium,Backstreet Boys,False,5ySxm9hxBNss01WCL7GLyQ,12,1999-05-18
4,When I Look in Your Eyes,Diana Krall,False,6NqwpeiAjwYAppRNA7F3yD,13,1999-01-01
...,...,...,...,...,...,...
134,Music of the Spheres,Coldplay,False,06mXfvDsRZNfnsGZvX2zpb,12,2021-10-15
135,Renaissance,Beyoncé,False,6FJxoadUE4JNVwWHghBwnb,16,2022-07-29
136,Special,Lizzo,False,1NgFBv1PxMG1zhFDW1OrRr,12,2022-07-15
137,Un Verano Sin Ti,Bad Bunny,False,3RQQmkQEvNCY4prGKE6oc5,23,2022-05-06


In [None]:
# For every album, collect the Spotify URIs of its tracks
# (a single request per album, asking for up to 50 tracks)
list_track_ids = [
  [item['uri'] for item in sp.album_tracks(album_id, limit=50, offset=0, market='US')['items']]
  for album_id in album_ids
]

# Store each album's track URI list next to the album itself
df_spotify["track_id_list"] = list_track_ids
df_spotify

Unnamed: 0,album_name,artists,won,album_id,num_of_tracks,rel_date,track_id_list
0,Supernatural,Santana,True,10aiDpdFGyfCFEcqpx6XTq,14,1999-06-15,"[spotify:track:3ZJMi7jX3j34ORbXFjrzQi, spotify..."
1,FanMail,TLC,False,1CvjjpvqVMoyprsf74bpYW,17,1999-02-23,"[spotify:track:4O4Q1S0hojqia1lbiHEoN5, spotify..."
2,Fly,The Chicks,False,3y6G5El2I6QrJA9BdfAbqA,13,1999-08-27,"[spotify:track:7yNTZj1ugKCeRJdSP1meOk, spotify..."
3,Millennium,Backstreet Boys,False,5ySxm9hxBNss01WCL7GLyQ,12,1999-05-18,"[spotify:track:6sbXGUn9V9ZaLwLdOfpKRE, spotify..."
4,When I Look in Your Eyes,Diana Krall,False,6NqwpeiAjwYAppRNA7F3yD,13,1999-01-01,"[spotify:track:0h3dxP1Akmx0CKzoJTHbBa, spotify..."
...,...,...,...,...,...,...,...
134,Music of the Spheres,Coldplay,False,06mXfvDsRZNfnsGZvX2zpb,12,2021-10-15,"[spotify:track:1a3G9SNslcKsPAOuIikaxd, spotify..."
135,Renaissance,Beyoncé,False,6FJxoadUE4JNVwWHghBwnb,16,2022-07-29,"[spotify:track:1MpCaOeUWhox2Fgigbe1cL, spotify..."
136,Special,Lizzo,False,1NgFBv1PxMG1zhFDW1OrRr,12,2022-07-15,"[spotify:track:7GSwmKxanoWEdpurOCldDe, spotify..."
137,Un Verano Sin Ti,Bad Bunny,False,3RQQmkQEvNCY4prGKE6oc5,23,2022-05-06,"[spotify:track:6Xom58OOXk2SoU711L2IXO, spotify..."


Audio analysis for each track
- read more about Spotify's audio features: 
- https://developer.spotify.com/documentation/web-api/reference/#/operations/get-several-audio-features 

In [None]:
# Summarise every album by the mean and the variance of its tracks' audio features
list_means = []
list_vars = []

# Bookkeeping fields returned with each feature record that are not audio features
NON_FEATURE_COLUMNS = ['type', 'id', 'uri', 'analysis_url', 'track_href']

for tracklist in df_spotify["track_id_list"]:
  # One request per album: audio_features accepts a list of track URIs, so
  # there is no need for a separate API call per track
  album_features = sp.audio_features(tracklist) if tracklist else []

  # A track without feature data comes back as None; drop it together with its URI
  kept = [(uri, feats) for uri, feats in zip(tracklist, album_features) if feats is not None]
  df_track_feats = pd.DataFrame([feats for _, feats in kept], index = [uri for uri, _ in kept])
  df_track_feats = df_track_feats.drop(columns = NON_FEATURE_COLUMNS, errors = "ignore")

  # First twelve remaining columns (danceability ... duration_ms in the output below)
  feature_block = df_track_feats.iloc[:, 0:12]
  list_means.append(feature_block.mean())
  # Not named `vars`, to avoid shadowing the builtin
  list_vars.append(feature_block.var())

# Show a small sample instead of dumping one Series per album
list_means[:3]

[danceability             0.582786
 energy                   0.792071
 key                      7.071429
 loudness                -6.360286
 mode                     0.428571
 speechiness              0.059243
 acousticness             0.206586
 instrumentalness         0.191212
 liveness                 0.179429
 valence                  0.734000
 tempo                  112.041000
 duration_ms         319684.714286
 dtype: float64, danceability             0.691765
 energy                   0.620059
 key                      5.764706
 loudness                -7.341235
 mode                     0.529412
 speechiness              0.159865
 acousticness             0.073153
 instrumentalness         0.021650
 liveness                 0.213459
 valence                  0.607824
 tempo                  114.408765
 duration_ms         223631.294118
 dtype: float64, danceability             0.593846
 energy                   0.591192
 key                      4.153846
 loudness              

In [None]:
# One row of feature means per album, labelled by the album's track list;
# reset_index then turns that label into an ordinary column
album_feature_means = pd.DataFrame(list_means, index = df_spotify["track_id_list"])
df_music_features = album_feature_means.reset_index()

In [None]:
# Save both Spotify tables to disk, without the row index
for frame, csv_path in (
  (df_music_features, "df_music_feature_avgs.csv"),
  (df_spotify, "df_spotify.csv"),
):
  frame.to_csv(csv_path, index = False)

### Webscraping: Artist Demographics
- Get personal information of each artist
- https://musicbrainz.org/ 

In [None]:
# Reload the nominee table from disk to save time
df_copy = pd.read_csv("df_wiki.csv")
df_artist_info = df_copy[["artists"]].copy()
# Search slug: spaces become hyphens, then everything is lower-cased
df_artist_info["url"] = df_artist_info["artists"].str.replace(' ', '-').str.lower()
df_artist_info

Unnamed: 0,artists,url
0,Santana,santana
1,TLC,tlc
2,The Chicks,the-chicks
3,Backstreet Boys,backstreet-boys
4,Diana Krall,diana-krall
...,...,...
134,Coldplay,coldplay
135,Beyoncé,beyoncé
136,Lizzo,lizzo
137,Bad Bunny,bad-bunny


In [70]:
import re
from urllib.parse import quote

# For each artist/group, get their gender and type (group or person) from MusicBrainz
start_link = "https://musicbrainz.org/search?query="
end_link = "&type=artist&method=indexed"

artists_type = []
artists_sex = []

# NOTE(review): requests are sent back-to-back; consider throttling them if
# MusicBrainz starts rejecting or rate-limiting the lookups.
for i in range(len(df_artist_info)):
  artist = df_artist_info["url"][i]
  artist = artist.replace("é", "e").replace("á", "a") # Remove accented characters in artists name before finding their link
  # Percent-encode the name: an unescaped "&" in it would otherwise end the
  # query parameter early and search for only part of the name
  artist_link = start_link + quote(artist) + end_link
  response = requests.get(artist_link, timeout=30)
  soup = BeautifulSoup(response.text, "html.parser")

  # Get the first row of the main table of the website for each link
  first_row = soup.find("tr", attrs={"data-score": "100"})

  # Default to "NaN" so exactly one entry is appended per artist, including
  # when the page has no matching row; otherwise the lists would end up
  # shorter than the DataFrame they are assigned to
  artist_type, artist_sex = "NaN", "NaN"

  if first_row is not None:
    cells = first_row.find_all("td")
    # Check that the artist is who we are actually looking for
    artist_name = re.sub(r"[\(\[].*?[\)\]]", "", cells[0].text) # Name from website, remove parenthetical
    artist_stage_name = df_artist_info["artists"][i] # Name from our Database

    if artist_stage_name.lower() in artist_name.lower():
      # Find each artist's type (group or person) and gender
      artist_type = cells[2].text
      artist_sex = cells[3].text

  artists_type.append(artist_type)
  artists_sex.append(artist_sex)

In [71]:
# Add the scraped type and sex lists to the artist table as new columns
df_artist_info["type"], df_artist_info["sex"] = artists_type, artists_sex
df_artist_info

Unnamed: 0,artists,url,type,sex
0,Santana,santana,Group,
1,TLC,tlc,Group,
2,The Chicks,the-chicks,Group,
3,Backstreet Boys,backstreet-boys,Group,
4,Diana Krall,diana-krall,Person,Female
...,...,...,...,...
134,Coldplay,coldplay,Group,
135,Beyoncé,beyoncé,Person,Female
136,Lizzo,lizzo,Person,Female
137,Bad Bunny,bad-bunny,Person,Male


In [72]:
# Save the scraped table; changes were made to this file because some artists weren't found
df_artist_info.to_csv("artist_primary_info.csv", index = False)

# Load the updated copy of the file from GitHub
UPDATED_ARTIST_INFO_URL = "https://github.com/AdrianMPCodes/AMP-and-RZ---Is-Good-Music-Biased-Predicting-Grammy-Winning-Albums/blob/701c43fae679fb6bdb8d968f786509d2a85f81be/Data%20Collection%20and%20Cleaning/updated_artist_primary_info.csv?raw=true"
df_updated_artist_info = pd.read_csv(UPDATED_ARTIST_INFO_URL)
df_updated_artist_info

Unnamed: 0,artists,url,type,sex
0,Santana,santana,Group,Male
1,TLC,tlc,Group,Female
2,The Chicks,the-chicks,Group,Female
3,Backstreet Boys,backstreet-boys,Group,Male
4,Diana Krall,diana-krall,Person,Female
...,...,...,...,...
134,Coldplay,coldplay,Group,Male
135,Beyonce,beyonce,Person,Female
136,Lizzo,lizzo,Person,Female
137,Bad Bunny,bad-bunny,Person,Male


### Combining the album data, music data, and demographic data into one final dataframe

In [74]:
# Put the artist, album and audio-feature tables side by side, drop the helper
# columns, and keep only the first occurrence of any repeated column name
df_all = (
  pd.concat([df_updated_artist_info, df_spotify, df_music_features], axis = 1)
  .drop(columns = ["url", "album_id", "track_id_list", "rel_date"])
  .loc[:, lambda frame: ~frame.columns.duplicated()]
)
df_all

Unnamed: 0,artists,type,sex,album_name,won,num_of_tracks,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,Santana,Group,Male,Supernatural,True,14,0.582786,0.792071,7.071429,-6.360286,0.428571,0.059243,0.206586,0.191212,0.179429,0.734000,112.041000,319684.714286
1,TLC,Group,Female,FanMail,False,17,0.691765,0.620059,5.764706,-7.341235,0.529412,0.159865,0.073153,0.021650,0.213459,0.607824,114.408765,223631.294118
2,The Chicks,Group,Female,Fly,False,13,0.593846,0.591192,4.153846,-8.442769,1.000000,0.035015,0.207601,0.001895,0.154046,0.568231,129.986462,220581.461538
3,Backstreet Boys,Group,Male,Millennium,False,12,0.643000,0.618417,4.833333,-6.766333,0.500000,0.030092,0.217875,0.000001,0.184825,0.511167,120.629417,230227.750000
4,Diana Krall,Person,Female,When I Look in Your Eyes,False,13,0.532000,0.166946,5.769231,-16.167308,0.538462,0.040623,0.877846,0.017048,0.123092,0.326000,109.450923,250796.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134,Coldplay,Group,Male,Music of the Spheres,False,12,0.419075,0.562008,3.583333,-11.275250,0.666667,0.053567,0.375252,0.330368,0.258567,0.216550,140.935750,209217.750000
135,Beyonce,Person,Female,Renaissance,False,16,0.729500,0.665750,4.000000,-7.248938,0.687500,0.138194,0.062996,0.003560,0.260787,0.429656,117.673000,233856.937500
136,Lizzo,Person,Female,Special,False,12,0.693000,0.672417,5.916667,-5.537500,0.416667,0.078292,0.109871,0.000001,0.217617,0.658500,116.024917,176376.916667
137,Bad Bunny,Person,Male,Un Verano Sin Ti,False,23,0.756652,0.657391,5.304348,-6.060696,0.478261,0.119965,0.258104,0.000249,0.216961,0.484826,118.279391,213529.173913


In [75]:
# Save the combined table (row index left out of the file)
COMBINED_CSV_PATH = "df_combinedVariables.csv"
df_all.to_csv(COMBINED_CSV_PATH, index = False)

### Webscraping for testing data

In [76]:
import re
# using Insider to get top 10 albums so far
response = requests.get("https://www.insider.com/best-albums-ranked-metacritic-critics-2023-3")
soup = BeautifulSoup(response.text, "html.parser")

top10 = soup.find_all("h2")

test_albums = []
test_artists = []
# Get album and artist. The parsing expects headings shaped like
#   <rank>. "<album>" by <artist>
# Any other <h2> on the page is skipped instead of raising IndexError.
for t in top10:
  _, dot, albumArtist = t.text.partition(".")
  quoted = re.search('"([^"]*)"', albumArtist)
  if not dot or quoted is None:
    continue

  # Look for "by " only after the closing quote, so an album title that
  # itself contains "by " cannot be mistaken for the separator
  _, by, artist = albumArtist[quoted.end():].partition("by ")
  if not by:
    continue

  test_albums.append(quoted.group(1))
  test_artists.append(artist.strip())

In [None]:
# Add Our Picks for 2023, written as (album, artist) pairs
our_picks = [
  ("Gumbo", "Young Nudy"),
  ("Like..?", "Ice Spice"),
  ("Love Sick", "Don Toliver"),
  ("Let's Start Here", "Lil Yachty"),
  ("My 21st Century Blues", "RAYE"),
  ("Out On Bond", "Babytron"),
  ("Red Moon in Venus", "Kali Uchis"),
]
test_albums = test_albums + [pick_album for pick_album, _ in our_picks]
test_artists = test_artists + [pick_artist for _, pick_artist in our_picks]

In [None]:
# Test set: one row per candidate album with its artist
df_test = pd.DataFrame({
  "album_name": test_albums,
  "artists": test_artists,
})
df_test

Unnamed: 0,album_name,artists
0,This Is Why,Paramore
1,This Stupid World,Yo La Tengo
2,Gigi's Recovery,The Murder Capital
3,The Candle and the Flame,Robert Forster
4,The Great White Sea Eagle,"James Yorkston, Nina Persson, and The Second H..."
5,Raven,Kelela
6,The the Car Around,Gaz Coombes
7,Heavy Heavy,Young Fathers
8,12,Ryuichi Sakamoto
9,"Desire, I Want to Turn Into You",Caroline Polachek


In [None]:
# Look up every test album on Spotify and keep the id, URI, track count and
# release date of the top hit. Exactly one entry is appended per album, found or
# not, so the lists always match the number of rows in df_test.
artists, album_ids, album_uris, list_no_tracks, list_release_dates = [], [], [], [], []

for i in range(len(df_test['album_name'])):
  album_name = df_test["album_name"][i]
  query = "album: " + album_name + " artist: " + df_test["artists"][i]

  result_list = sp.search(query, limit = 1, type='album', market='US')['albums']['items']
  if not result_list:
    # Find the more difficult to search albums by looking for the album name only
    print("No results found for query: ", query,". Using brute force instead.")  # Used to fix mistakes
    result_list = sp.search(q=album_name, type="album", limit=2)['albums']['items']
    if result_list:
      # The name-only hit may be a different release: surface it for review
      print("  matched by name only:", result_list[0]['name'], "- please verify")
    else:
      print("  still nothing found; storing empty values for this album")

  # An empty dict yields None for every field, which keeps the row aligned
  # instead of making the column assignments below fail on a length mismatch
  result = result_list[0] if result_list else {}
  album_ids.append(result.get('id'))
  album_uris.append(result.get('uri'))
  list_no_tracks.append(result.get('total_tracks'))
  list_release_dates.append(result.get('release_date'))

df_test["album_ids"] = album_ids
df_test["album_uris"] = album_uris
df_test["tracklist"] = list_no_tracks
df_test["rel_date"] = list_release_dates
df_test

Unnamed: 0,album_name,artists,album_ids,album_uris,tracklist,rel_date
0,This Is Why,Paramore,6tG8sCK4htJOLjlWwb7gZB,spotify:album:6tG8sCK4htJOLjlWwb7gZB,10,2023-02-10
1,This Stupid World,Yo La Tengo,3LaJpJFSY3cmLFEHJl2z6E,spotify:album:3LaJpJFSY3cmLFEHJl2z6E,9,2023-02-10
2,Gigi's Recovery,The Murder Capital,6ZbJKlwcSja0MbKg2dQQLJ,spotify:album:6ZbJKlwcSja0MbKg2dQQLJ,12,2023-01-20
3,The Candle and the Flame,Robert Forster,1e0pkJp4dZGq6vGa7L47rd,spotify:album:1e0pkJp4dZGq6vGa7L47rd,9,2023-02-03
4,The Great White Sea Eagle,"James Yorkston, Nina Persson, and The Second H...",3sE3Ya11uxwRYdeMQNkePS,spotify:album:3sE3Ya11uxwRYdeMQNkePS,12,2023-01-13
5,Raven,Kelela,06uhdSmIYrWRkdnAPjcRcT,spotify:album:06uhdSmIYrWRkdnAPjcRcT,15,2023-02-10
6,The the Car Around,Gaz Coombes,6ZbeAO3P5TePTAeA0hgfVP,spotify:album:6ZbeAO3P5TePTAeA0hgfVP,9,2023-01-13
7,Heavy Heavy,Young Fathers,6CmlLROLOUJZnZ8QeCCpqD,spotify:album:6CmlLROLOUJZnZ8QeCCpqD,10,2023-02-03
8,12,Ryuichi Sakamoto,0kvmLk15RUoNqsn8acxqf4,spotify:album:0kvmLk15RUoNqsn8acxqf4,12,2023-01-17
9,"Desire, I Want to Turn Into You",Caroline Polachek,22PkV1Le9P3X4RY4xtmK0q,spotify:album:22PkV1Le9P3X4RY4xtmK0q,12,2023-02-14


In [None]:
# For every test album, collect the Spotify URIs of its tracks
# (a single request per album, asking for up to 50 tracks)
list_track_ids = [
  [item['uri'] for item in sp.album_tracks(album_id, limit=50, offset=0, market='US')['items']]
  for album_id in album_ids
]

# Store each album's track URI list next to the album itself
df_test["track_id_list"] = list_track_ids
df_test

Unnamed: 0,album_name,artists,album_ids,album_uris,tracklist,rel_date,track_id_list
0,This Is Why,Paramore,6tG8sCK4htJOLjlWwb7gZB,spotify:album:6tG8sCK4htJOLjlWwb7gZB,10,2023-02-10,"[spotify:track:7HdXRMw14roDx2a0COWk3M, spotify..."
1,This Stupid World,Yo La Tengo,3LaJpJFSY3cmLFEHJl2z6E,spotify:album:3LaJpJFSY3cmLFEHJl2z6E,9,2023-02-10,"[spotify:track:0qe5zk6E7SnQkMmIheGx4E, spotify..."
2,Gigi's Recovery,The Murder Capital,6ZbJKlwcSja0MbKg2dQQLJ,spotify:album:6ZbJKlwcSja0MbKg2dQQLJ,12,2023-01-20,"[spotify:track:2xcxbUio7PmSclx3Ohr4Tq, spotify..."
3,The Candle and the Flame,Robert Forster,1e0pkJp4dZGq6vGa7L47rd,spotify:album:1e0pkJp4dZGq6vGa7L47rd,9,2023-02-03,"[spotify:track:43UPg7v2mdd01XObLezSOJ, spotify..."
4,The Great White Sea Eagle,"James Yorkston, Nina Persson, and The Second H...",3sE3Ya11uxwRYdeMQNkePS,spotify:album:3sE3Ya11uxwRYdeMQNkePS,12,2023-01-13,"[spotify:track:4GZxOH1P8EdjJyEZY0SfPo, spotify..."
5,Raven,Kelela,06uhdSmIYrWRkdnAPjcRcT,spotify:album:06uhdSmIYrWRkdnAPjcRcT,15,2023-02-10,"[spotify:track:1o6heh4aefLv3GJYjtfLa6, spotify..."
6,The the Car Around,Gaz Coombes,6ZbeAO3P5TePTAeA0hgfVP,spotify:album:6ZbeAO3P5TePTAeA0hgfVP,9,2023-01-13,"[spotify:track:3oJo2EQfgA6kniqH1yHpqW, spotify..."
7,Heavy Heavy,Young Fathers,6CmlLROLOUJZnZ8QeCCpqD,spotify:album:6CmlLROLOUJZnZ8QeCCpqD,10,2023-02-03,"[spotify:track:79yGgP2ruLB7SavHhjPFI3, spotify..."
8,12,Ryuichi Sakamoto,0kvmLk15RUoNqsn8acxqf4,spotify:album:0kvmLk15RUoNqsn8acxqf4,12,2023-01-17,"[spotify:track:4hMWShv7tRCNuekpPFiHgH, spotify..."
9,"Desire, I Want to Turn Into You",Caroline Polachek,22PkV1Le9P3X4RY4xtmK0q,spotify:album:22PkV1Le9P3X4RY4xtmK0q,12,2023-02-14,"[spotify:track:4bp2wYweUPvsBrQfntdYcr, spotify..."


In [None]:
# Summarise every test album by the mean and the variance of its tracks' audio features
list_means = []
list_vars = []

# Bookkeeping fields returned with each feature record that are not audio features
NON_FEATURE_COLUMNS = ['type', 'id', 'uri', 'analysis_url', 'track_href']

for tracklist in df_test["track_id_list"]:
  # One request per album: audio_features accepts a list of track URIs, so
  # there is no need for a separate API call per track
  album_features = sp.audio_features(tracklist) if tracklist else []

  # A track without feature data comes back as None; drop it together with its URI
  kept = [(uri, feats) for uri, feats in zip(tracklist, album_features) if feats is not None]
  df_track_feats = pd.DataFrame([feats for _, feats in kept], index = [uri for uri, _ in kept])
  df_track_feats = df_track_feats.drop(columns = NON_FEATURE_COLUMNS, errors = "ignore")

  # First twelve remaining columns (danceability ... duration_ms in the output below)
  feature_block = df_track_feats.iloc[:, 0:12]
  list_means.append(feature_block.mean())
  # Not named `vars`, to avoid shadowing the builtin
  list_vars.append(feature_block.var())

# Show a small sample instead of dumping one Series per album
list_means[:3]

[danceability             0.630600
 energy                   0.638600
 key                      5.900000
 loudness                -8.579300
 mode                     0.400000
 speechiness              0.041980
 acousticness             0.100010
 instrumentalness         0.039207
 liveness                 0.153660
 valence                  0.506400
 tempo                  129.629100
 duration_ms         217684.200000
 dtype: float64, danceability             0.539000
 energy                   0.618111
 key                      3.333333
 loudness                -9.305444
 mode                     1.000000
 speechiness              0.045600
 acousticness             0.339700
 instrumentalness         0.812000
 liveness                 0.143600
 valence                  0.422311
 tempo                  126.529111
 duration_ms         325244.444444
 dtype: float64, danceability             0.452333
 energy                   0.446850
 key                      6.000000
 loudness              

In [None]:
# One row of feature means per test album, labelled by the album's track list;
# reset_index then turns that label into an ordinary column
test_feature_means = pd.DataFrame(list_means, index = df_test["track_id_list"])
df_test_music_features = test_feature_means.reset_index()
df_test_music_features

Unnamed: 0,track_id_list,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,"[spotify:track:7HdXRMw14roDx2a0COWk3M, spotify...",0.6306,0.6386,5.9,-8.5793,0.4,0.04198,0.10001,0.039207,0.15366,0.5064,129.6291,217684.2
1,"[spotify:track:0qe5zk6E7SnQkMmIheGx4E, spotify...",0.539,0.618111,3.333333,-9.305444,1.0,0.0456,0.3397,0.812,0.1436,0.422311,126.529111,325244.444444
2,"[spotify:track:2xcxbUio7PmSclx3Ohr4Tq, spotify...",0.452333,0.44685,6.0,-13.167083,0.25,0.04325,0.273954,0.015215,0.115325,0.167758,118.17075,226810.25
3,"[spotify:track:43UPg7v2mdd01XObLezSOJ, spotify...",0.589778,0.512222,5.555556,-11.287222,0.666667,0.028978,0.209047,0.11931,0.118067,0.563333,113.680222,246847.444444
4,"[spotify:track:4GZxOH1P8EdjJyEZY0SfPo, spotify...",0.4,0.176467,3.0,-18.247833,0.666667,0.04245,0.746333,0.005786,0.156292,0.263342,117.267583,219070.0
5,"[spotify:track:1o6heh4aefLv3GJYjtfLa6, spotify...",0.433873,0.436067,5.266667,-13.124267,0.2,0.059913,0.53466,0.291131,0.1666,0.252967,112.614733,250648.066667
6,"[spotify:track:3oJo2EQfgA6kniqH1yHpqW, spotify...",0.486111,0.727889,4.333333,-6.522556,0.222222,0.039922,0.117007,0.053451,0.151322,0.451222,112.229333,253008.888889
7,"[spotify:track:79yGgP2ruLB7SavHhjPFI3, spotify...",0.4166,0.7012,4.8,-7.3413,0.9,0.06795,0.145315,0.080483,0.22735,0.28284,126.8719,196087.1
8,"[spotify:track:4hMWShv7tRCNuekpPFiHgH, spotify...",0.310117,0.049522,5.583333,-28.178333,0.666667,0.058033,0.847772,0.91675,0.138817,0.079508,84.987083,305951.166667
9,"[spotify:track:4bp2wYweUPvsBrQfntdYcr, spotify...",0.601333,0.55425,4.75,-8.650583,0.583333,0.04795,0.306567,0.002493,0.150217,0.330383,118.39325,226917.916667


In [None]:
# Save the test-album table (row index left out of the file)
TESTING_CSV_PATH = "df_testing.csv"
df_test.to_csv(TESTING_CSV_PATH, index = False)

In [None]:
# Reload the test-album table from disk to save time
df_copy = pd.read_csv("df_testing.csv")
df_test_artist_info = df_copy[["artists"]].copy()
# Search slug: spaces become hyphens, then everything is lower-cased
df_test_artist_info["url"] = df_test_artist_info["artists"].str.replace(' ', '-').str.lower()
df_test_artist_info

Unnamed: 0,artists,url
0,Paramore,paramore
1,Yo La Tengo,yo-la-tengo
2,The Murder Capital,the-murder-capital
3,Robert Forster,robert-forster
4,"James Yorkston, Nina Persson, and The Second H...","james-yorkston,-nina-persson,-and-the-second-h..."
5,Kelela,kelela
6,Gaz Coombes,gaz-coombes
7,Young Fathers,young-fathers
8,Ryuichi Sakamoto,ryuichi-sakamoto
9,Caroline Polachek,caroline-polachek


In [None]:
# Get Sex of Each Artist
import re
from urllib.parse import quote

# For each artist/group, get their gender and type (group or person) from MusicBrainz
start_link = "https://musicbrainz.org/search?query="
end_link = "&type=artist&method=indexed"

artists_type = []
artists_sex = []

# NOTE(review): requests are sent back-to-back; consider throttling them if
# MusicBrainz starts rejecting or rate-limiting the lookups.
for i in range(len(df_test_artist_info)):
  artist = df_test_artist_info["url"][i]
  artist = artist.replace("é", "e").replace("á", "a") # Remove accented characters in artists name before finding their link
  # Percent-encode the name: an unescaped "&" in it would otherwise end the
  # query parameter early and search for only part of the name
  artist_link = start_link + quote(artist) + end_link
  response = requests.get(artist_link, timeout=30)
  soup = BeautifulSoup(response.text, "html.parser")

  # Get the first row of the main table of the website for each link
  first_row = soup.find("tr", attrs={"data-score": "100"})

  # Default to "NaN" so exactly one entry is appended per artist, including
  # when the page has no matching row; otherwise the lists would end up
  # shorter than the DataFrame they are assigned to
  artist_type, artist_sex = "NaN", "NaN"

  if first_row is not None:
    cells = first_row.find_all("td")
    # Check that the artist is who we are actually looking for
    artist_name = re.sub(r"[\(\[].*?[\)\]]", "", cells[0].text) # Name from website, remove parenthetical
    artist_stage_name = df_test_artist_info["artists"][i] # Name from our Database

    if artist_stage_name.lower() in artist_name.lower():
      # Find each artist's type (group or person) and gender
      artist_type = cells[2].text
      artist_sex = cells[3].text

  artists_type.append(artist_type)
  artists_sex.append(artist_sex)

In [None]:
# Add the scraped type and sex lists to the test artist table as new columns
df_test_artist_info["type"], df_test_artist_info["sex"] = artists_type, artists_sex
df_test_artist_info

Unnamed: 0,artists,url,type,sex
0,Paramore,paramore,Group,
1,Yo La Tengo,yo-la-tengo,Group,
2,The Murder Capital,the-murder-capital,Group,
3,Robert Forster,robert-forster,Person,Male
4,"James Yorkston, Nina Persson, and The Second H...","james-yorkston,-nina-persson,-and-the-second-h...",,
5,Kelela,kelela,Person,Female
6,Gaz Coombes,gaz-coombes,Person,Male
7,Young Fathers,young-fathers,Group,
8,Ryuichi Sakamoto,ryuichi-sakamoto,,
9,Caroline Polachek,caroline-polachek,Person,Female


In [None]:
# Save the scraped table; changes were made to this file because some artists weren't found
df_test_artist_info.to_csv("test_artist_primary_info.csv", index = False)

# Load the updated copy of the file from GitHub
UPDATED_TEST_ARTIST_INFO_URL = "https://github.com/AdrianMPCodes/AMP-and-RZ---Is-Good-Music-Biased-Predicting-Grammy-Winning-Albums/blob/main/Data%20Collection%20and%20Cleaning/updated_test_artist_primary_info.csv?raw=true"
df_test_updated_artist_info = pd.read_csv(UPDATED_TEST_ARTIST_INFO_URL)
df_test_updated_artist_info

Unnamed: 0,artists,url,type,sex
0,Paramore,paramore,Group,Mixed
1,Yo La Tengo,yo-la-tengo,Group,Male
2,The Murder Capital,the-murder-capital,Group,Male
3,Robert Forster,robert-forster,Person,Male
4,"James Yorkston, Nina Persson, and The Second H...","james-yorkston,-nina-persson,-and-the-second-h...",Group,Mixed
5,Kelela,kelela,Person,Female
6,Gaz Coombes,gaz-coombes,Person,Male
7,Young Fathers,young-fathers,Group,Male
8,Ryuichi Sakamoto,ryuichi-sakamoto,Male,Male
9,Caroline Polachek,caroline-polachek,Person,Female


### Merge all test DataFrames

In [None]:
# Put the test artist, album and audio-feature tables side by side, drop the
# helper columns, keep only the first occurrence of any repeated column name,
# and give the track-count column the same name as in the training table
df_test_all = (
  pd.concat([df_test_updated_artist_info, df_test, df_test_music_features], axis = 1)
  .drop(columns = ["url", "album_ids", "track_id_list", "album_uris", "rel_date"])
  .loc[:, lambda frame: ~frame.columns.duplicated()]
  .rename(columns={"tracklist": "num_of_tracks"})
)
df_test_all

Unnamed: 0,artists,type,sex,album_name,num_of_tracks,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,Paramore,Group,Mixed,This Is Why,10,0.6306,0.6386,5.9,-8.5793,0.4,0.04198,0.10001,0.039207,0.15366,0.5064,129.6291,217684.2
1,Yo La Tengo,Group,Male,This Stupid World,9,0.539,0.618111,3.333333,-9.305444,1.0,0.0456,0.3397,0.812,0.1436,0.422311,126.529111,325244.444444
2,The Murder Capital,Group,Male,Gigi's Recovery,12,0.452333,0.44685,6.0,-13.167083,0.25,0.04325,0.273954,0.015215,0.115325,0.167758,118.17075,226810.25
3,Robert Forster,Person,Male,The Candle and the Flame,9,0.589778,0.512222,5.555556,-11.287222,0.666667,0.028978,0.209047,0.11931,0.118067,0.563333,113.680222,246847.444444
4,"James Yorkston, Nina Persson, and The Second H...",Group,Mixed,The Great White Sea Eagle,12,0.4,0.176467,3.0,-18.247833,0.666667,0.04245,0.746333,0.005786,0.156292,0.263342,117.267583,219070.0
5,Kelela,Person,Female,Raven,15,0.433873,0.436067,5.266667,-13.124267,0.2,0.059913,0.53466,0.291131,0.1666,0.252967,112.614733,250648.066667
6,Gaz Coombes,Person,Male,The the Car Around,9,0.486111,0.727889,4.333333,-6.522556,0.222222,0.039922,0.117007,0.053451,0.151322,0.451222,112.229333,253008.888889
7,Young Fathers,Group,Male,Heavy Heavy,10,0.4166,0.7012,4.8,-7.3413,0.9,0.06795,0.145315,0.080483,0.22735,0.28284,126.8719,196087.1
8,Ryuichi Sakamoto,Male,Male,12,12,0.310117,0.049522,5.583333,-28.178333,0.666667,0.058033,0.847772,0.91675,0.138817,0.079508,84.987083,305951.166667
9,Caroline Polachek,Person,Female,"Desire, I Want to Turn Into You",12,0.601333,0.55425,4.75,-8.650583,0.583333,0.04795,0.306567,0.002493,0.150217,0.330383,118.39325,226917.916667


In [None]:
# Save the combined test table (row index left out of the file)
TEST_COMBINED_CSV_PATH = "df_test_combinedVariables.csv"
df_test_all.to_csv(TEST_COMBINED_CSV_PATH, index = False)