In [1]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd

# Scraping

## Collect data for the "Big Four"
- Source: https://totalmusicawards.com/grammy-awards-winners-archive/

### Collect data for album of the year and record of the year

In [2]:
awards =  ["album-of-the-year",
        "record-of-the-year"]
all_noms = []
for award in awards:

    url = f"https://totalmusicawards.com/grammy-awards/{award}-winners-nominees-archive/"
    html = requests.get(url).content
    doc = BeautifulSoup(html, "html.parser")
    all_years = doc.find_all("p")
    for one_year in all_years[9:73]:
        one_nom = {}
        winner = one_year.find("strong").text
        year = re.findall(r"^\d+", winner)[0]
        musician = re.findall(r", (.+$)", winner)[0]
        work = re.findall(r"^\d+: (.*), .+$", winner)[0]

        one_nom['year'] = year
        one_nom['musician'] = musician
        one_nom['work'] = work
        one_nom['category'] = award
        one_nom['status'] = 'winner'
        all_noms.append(one_nom)

        noms = one_year.find_all('br')
        for nom in noms:
            one_nom = {}
            nominee = nom.next.strip()
            musician = re.findall(r"^.+, (.+$)", nominee)
            work = re.findall(r"(.+), .+$", nominee)
            if len(musician)==1: 
                one_nom['year'] = year
                one_nom['musician'] = musician[0]
                one_nom['work'] = work[0]
                one_nom['category'] = award
                one_nom['status'] = 'nominee'
                all_noms.append(one_nom)
            else:
                typo_row =  {'year': '1980',
                              'musician': 'Supertramp',
                              'work': 'Breakfast in America',
                              'category': award,
                              'status': 'nominee'}
                all_noms.append(typo_row)
    


In [16]:
first_df = pd.DataFrame.from_dict(all_noms)
first_df.head(2)

Unnamed: 0,year,musician,work,category,status
0,2022,Jon Batiste,We Are,album-of-the-year,winner
1,2022,Tony Bennett & Lady Gaga,Love for Sale,album-of-the-year,nominee


### Collect data for best new artist

In [4]:
award = "best-new-artist"
artist_noms = []

url = f"https://totalmusicawards.com/grammy-awards/{award}-winners-nominees-archive/"
html = requests.get(url).content
doc = BeautifulSoup(html, "html.parser")
all_years = doc.find_all("p")

In [5]:
for one_year in all_years[3:66]:
    one_nom = {}
    winner = one_year.find("strong").text
    year = re.findall(r"(\d+): .*$", winner)[0]
    musician = re.findall(r"\d+: (.*$)", winner)[0]

    one_nom['year'] = year
    one_nom['musician'] = musician
    one_nom['work'] = 'NA'
    one_nom['category'] = award
    one_nom['status'] = 'winner'
    artist_noms.append(one_nom)

    noms = one_year.find_all('br')
    for nom in noms:
        one_nom = {}
        nominee = nom.next.strip()

        one_nom['year'] = year
        one_nom['musician'] = nominee
        one_nom['work'] = 'NA'
        one_nom['category'] = award
        one_nom['status'] = 'nominee'
        artist_noms.append(one_nom)


In [6]:
second_df = pd.DataFrame.from_dict(artist_noms)
second_df.head(2)

Unnamed: 0,year,musician,work,category,status
0,2022,Olivia Rodrigo,,best-new-artist,winner
1,2022,Arooj Aftab,,best-new-artist,nominee


## Collect 2023 grammys nominations

In [17]:
df_2023 = pd.read_csv("2023-grammy.csv")
df_2023.head(2)

Unnamed: 0,year,musician,work,category,status
0,2023,Lizzo,About Damn Time,record-of-the-year,winner
1,2023,Abba,Don't Shut Me Down,record-of-the-year,nominee


In [8]:
df = pd.concat([first_df, second_df, df_2023], ignore_index=True)
df.sample(10)

Unnamed: 0,year,musician,work,category,status
588,1975,Elton John,Don’t Let the Sun Go Down on Me,record-of-the-year,nominee
41,2017,Justin Bieber,Purpose,album-of-the-year,nominee
748,2010,MGMT,,best-new-artist,nominee
811,1997,LeAnn Rimes,,best-new-artist,winner
359,2020,Khalid,Talk,record-of-the-year,nominee
195,1986,Dire Straits,Brothers in Arms,album-of-the-year,nominee
126,2000,Dixie Chicks,Fly,album-of-the-year,nominee
803,1999,Andrea Bocelli,,best-new-artist,nominee
807,1998,Fiona Apple,,best-new-artist,nominee
551,1983,Vangelis,Chariots of Fire,record-of-the-year,nominee


In [9]:
df["cleaned_musician"] = df['musician'].str.replace('[^\w\s]','')
df.sample(10)

  df["cleaned_musician"] = df['musician'].str.replace('[^\w\s]','')


Unnamed: 0,year,musician,work,category,status,cleaned_musician
582,1976,Captain & Tennille,Love Will Keep Us Together,record-of-the-year,winner,Captain Tennille
66,2012,Lady Gaga,Born This Way,album-of-the-year,nominee,Lady Gaga
807,1998,Fiona Apple,,best-new-artist,nominee,Fiona Apple
456,2002,Train,Drops of Jupiter (Tell Me),record-of-the-year,nominee,Train
972,1965,Astrud Gilberto,,best-new-artist,nominee,Astrud Gilberto
797,2000,Macy Gray,,best-new-artist,nominee,Macy Gray
937,1972,Carly Simon,,best-new-artist,winner,Carly Simon
784,2003,Avril Lavigne,,best-new-artist,nominee,Avril Lavigne
487,1995,Sheryl Crow,All I Wanna Do,record-of-the-year,winner,Sheryl Crow
481,1997,Smashing Pumpkins,1979,record-of-the-year,nominee,Smashing Pumpkins


In [10]:
df.to_csv("all-three-awards.csv",index=False)

In [11]:
df.year = df.year.astype(int)

In [12]:
df.dtypes

year                 int64
musician            object
work                object
category            object
status              object
cleaned_musician    object
dtype: object

For this project, I only did analysis on all nominations from 1990 to 2023.

In [13]:
since_1990 = df[df.year>1989]
unique = since_1990.cleaned_musician.unique().tolist()
unique

['Jon Batiste',
 'Tony Bennett  Lady Gaga',
 'Justin Bieber',
 'Doja Cat',
 'Billie Eilish',
 'HER',
 'Lil Nas X',
 'Olivia Rodrigo',
 'Taylor Swift',
 'Kanye West',
 'Jhene Aiko',
 'Black Pumas',
 'Coldplay',
 'Jacob Collier',
 'Haim',
 'Dua Lipa',
 'Post Malone',
 'Where Do We Go Billie Eilish',
 'Bon Iver',
 'Lana Del Rey',
 'Ariana Grande',
 'Lizzo',
 'Vampire Weekend',
 'Kacey Musgraves',
 'Cardi B',
 'Brandi Carlile',
 'Drake',
 'Kendrick Lamar  others',
 'Janelle Monae',
 'Bruno Mars',
 'Childish Gambino',
 'JayZ',
 'Kendrick Lamar',
 'Lorde',
 'Adele',
 'Beyonce',
 'Sturgill Simpson',
 'Alabama Shakes',
 'Chris Stapleton',
 'the Weeknd',
 'Beck',
 'Pharrell Williams',
 'Sam Smith',
 'Ed Sheeran',
 'Daft Punk',
 'Sara Bareilles',
 'Macklemore  Ryan Lewis',
 'Mumford  Sons',
 'Black Keys',
 'Frank Ocean',
 'fun',
 'Jack White',
 'Foo Fighters',
 'Lady Gaga',
 'Rihanna',
 'Arcade Fire',
 'Eminem',
 'Lady Antebellum',
 'Katy Perry',
 'Black Eyed Peas',
 'Dave Matthews Band',
 'Robe

In [14]:
# Create a separate csv of unique musician name for cross reference 
artist = pd.DataFrame(unique, columns = ['musician'])
artist

Unnamed: 0,musician
0,Jon Batiste
1,Tony Bennett Lady Gaga
2,Justin Bieber
3,Doja Cat
4,Billie Eilish
...,...
346,Latto
347,Måneskin
348,Tobe Nwigwe
349,Molly Tuttle


In [15]:
artist.to_csv("ethnicity-match.csv", index=False)