In [1]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd

# Scraping

## Collect data for the "Big Four"
- Source: https://totalmusicawards.com/grammy-awards-winners-archive/

### Collect data for album of the year and record of the year

In [2]:
awards =  ["album-of-the-year",
        "record-of-the-year"]
all_noms = []
for award in awards:

    url = f"https://totalmusicawards.com/grammy-awards/{award}-winners-nominees-archive/"
    html = requests.get(url).content
    doc = BeautifulSoup(html, "html.parser")
    all_years = doc.find_all("p")
    for one_year in all_years[9:73]:
        one_nom = {}
        winner = one_year.find("strong").text
        year = re.findall(r"^\d+", winner)[0]
        musician = re.findall(r", (.+$)", winner)[0]
        work = re.findall(r"^\d+: (.*), .+$", winner)[0]

        one_nom['year'] = year
        one_nom['musician'] = musician
        one_nom['work'] = work
        one_nom['category'] = award
        one_nom['status'] = 'winner'
        all_noms.append(one_nom)

        noms = one_year.find_all('br')
        for nom in noms:
            one_nom = {}
            nominee = nom.next.strip()
            musician = re.findall(r"^.+, (.+$)", nominee)
            work = re.findall(r"(.+), .+$", nominee)
            if len(musician)==1: 
                one_nom['year'] = year
                one_nom['musician'] = musician[0]
                one_nom['work'] = work[0]
                one_nom['category'] = award
                one_nom['status'] = 'nominee'
                all_noms.append(one_nom)
            else:
                typo_row =  {'year': '1980',
                              'musician': 'Supertramp',
                              'work': 'Breakfast in America',
                              'category': award,
                              'status': 'nominee'}
                all_noms.append(typo_row)
    


In [3]:
first_df = pd.DataFrame.from_dict(all_noms)
first_df

Unnamed: 0,year,musician,work,category,status
0,2022,Jon Batiste,We Are,album-of-the-year,winner
1,2022,Tony Bennett & Lady Gaga,Love for Sale,album-of-the-year,nominee
2,2022,Justin Bieber,Justice,album-of-the-year,nominee
3,2022,Doja Cat,Planet Her,album-of-the-year,nominee
4,2022,Billie Eilish,Happier Than Ever,album-of-the-year,nominee
...,...,...,...,...,...
667,1959,Domenico Modugno,Nel Blu Dipinto Di Blu (Volare),record-of-the-year,winner
668,1959,Perry Como,Catch a Falling Star,record-of-the-year,nominee
669,1959,David Seville,The Chipmunk Song (Christmas Don’t Be Late),record-of-the-year,nominee
670,1959,Peggy Lee,Fever,record-of-the-year,nominee


### Collect data for best new artist

In [4]:
award = "best-new-artist"
artist_noms = []

url = f"https://totalmusicawards.com/grammy-awards/{award}-winners-nominees-archive/"
html = requests.get(url).content
doc = BeautifulSoup(html, "html.parser")
all_years = doc.find_all("p")

In [5]:
for one_year in all_years[3:66]:
    one_nom = {}
    winner = one_year.find("strong").text
    year = re.findall(r"(\d+): .*$", winner)[0]
    musician = re.findall(r"\d+: (.*$)", winner)[0]

    one_nom['year'] = year
    one_nom['musician'] = musician
    one_nom['work'] = 'NA'
    one_nom['category'] = award
    one_nom['status'] = 'winner'
    artist_noms.append(one_nom)

    noms = one_year.find_all('br')
    for nom in noms:
        one_nom = {}
        nominee = nom.next.strip()

        one_nom['year'] = year
        one_nom['musician'] = nominee
        one_nom['work'] = 'NA'
        one_nom['category'] = award
        one_nom['status'] = 'nominee'
        artist_noms.append(one_nom)


In [6]:
second_df = pd.DataFrame.from_dict(artist_noms)
second_df

Unnamed: 0,year,musician,work,category,status
0,2022,Olivia Rodrigo,,best-new-artist,winner
1,2022,Arooj Aftab,,best-new-artist,nominee
2,2022,Jimmie Allen,,best-new-artist,nominee
3,2022,Baby Keem,,best-new-artist,nominee
4,2022,Finneas,,best-new-artist,nominee
...,...,...,...,...,...
324,1960,Bobby Darin,,best-new-artist,winner
325,1960,Edd Byrnes,,best-new-artist,nominee
326,1960,Mark Murphy,,best-new-artist,nominee
327,1960,Johnny Restivo,,best-new-artist,nominee


In [7]:
df_2023 = pd.read_csv("2023-grammy.csv")
df_2023

Unnamed: 0,year,musician,work,category,status
0,2023,Lizzo,About Damn Time,record-of-the-year,winner
1,2023,Abba,Don't Shut Me Down,record-of-the-year,nominee
2,2023,Adele,Easy On Me,record-of-the-year,nominee
3,2023,Beyonce,BREAK MY SOUL,record-of-the-year,nominee
4,2023,Mary J. Blige,Good Morning Gorgeous,record-of-the-year,nominee
5,2023,Brandi Carlile Featuring Lucius,You And Me On The Rock,record-of-the-year,nominee
6,2023,Doja Cat,Woman,record-of-the-year,nominee
7,2023,Steve Lacy,Bad Habit,record-of-the-year,nominee
8,2023,Harry Styles,As It Was,record-of-the-year,nominee
9,2023,Kendrick Lamar,The Heart Part 5,record-of-the-year,nominee


In [8]:
df = pd.concat([first_df, second_df, df_2023], ignore_index=True)
df.sample(10)

Unnamed: 0,year,musician,work,category,status
860,1988,Swing Out Sister,,best-new-artist,nominee
50,2015,Beyonce,Beyonce,album-of-the-year,nominee
877,1984,Big Country,,best-new-artist,nominee
644,1964,Barbra Streisand,Happy Days Are Here Again,record-of-the-year,nominee
364,2019,Bad Bunny & J Balvin,"I Like It, Cardi B",record-of-the-year,nominee
835,1993,Jon Secada,,best-new-artist,nominee
78,2010,Dave Matthews Band,Big Whiskey and the GrooGrux King,album-of-the-year,nominee
71,2011,Lady Antebellum,Need You Now,album-of-the-year,nominee
397,2013,Black Keys,Lonely Boy,record-of-the-year,nominee
321,1961,Bob Newhart,The Button-Down Mind of Bob Newhart,album-of-the-year,winner


In [9]:
df["cleaned_musician"] = df['musician'].str.replace('[^\w\s]','')
df.sample(10)

  df["cleaned_musician"] = df['musician'].str.replace('[^\w\s]','')


Unnamed: 0,year,musician,work,category,status,cleaned_musician
533,1986,Dire Straits,Money for Nothing,record-of-the-year,nominee,Dire Straits
977,1964,John Gary,,best-new-artist,nominee,John Gary
435,2006,Gwen Stefani,Hollaback Girl,record-of-the-year,nominee,Gwen Stefani
426,2008,Justin Timberlake,What Goes Around…Comes Around,record-of-the-year,nominee,Justin Timberlake
506,1992,R.E.M.,Losing My Religion,record-of-the-year,nominee,REM
512,1990,Bette Midler,Wind Beneath My Wings,record-of-the-year,winner,Bette Midler
737,2012,The Band Perry,,best-new-artist,nominee,The Band Perry
681,2022,Saweetie,,best-new-artist,nominee,Saweetie
436,2006,Kanye West,Gold Digger,record-of-the-year,nominee,Kanye West
57,2014,Macklemore & Ryan Lewis,The Heist,album-of-the-year,nominee,Macklemore Ryan Lewis


In [10]:
df.to_csv("all-three-awards.csv",index=False)

In [11]:
df.year = df.year.astype(int)

In [12]:
df.dtypes

year                 int64
musician            object
work                object
category            object
status              object
cleaned_musician    object
dtype: object

In [13]:
since_1990 = df[df.year>1989]
unique = since_1990.cleaned_musician.unique().tolist()
unique

['Jon Batiste',
 'Tony Bennett  Lady Gaga',
 'Justin Bieber',
 'Doja Cat',
 'Billie Eilish',
 'HER',
 'Lil Nas X',
 'Olivia Rodrigo',
 'Taylor Swift',
 'Kanye West',
 'Jhene Aiko',
 'Black Pumas',
 'Coldplay',
 'Jacob Collier',
 'Haim',
 'Dua Lipa',
 'Post Malone',
 'Where Do We Go Billie Eilish',
 'Bon Iver',
 'Lana Del Rey',
 'Ariana Grande',
 'Lizzo',
 'Vampire Weekend',
 'Kacey Musgraves',
 'Cardi B',
 'Brandi Carlile',
 'Drake',
 'Kendrick Lamar  others',
 'Janelle Monae',
 'Bruno Mars',
 'Childish Gambino',
 'JayZ',
 'Kendrick Lamar',
 'Lorde',
 'Adele',
 'Beyonce',
 'Sturgill Simpson',
 'Alabama Shakes',
 'Chris Stapleton',
 'the Weeknd',
 'Beck',
 'Pharrell Williams',
 'Sam Smith',
 'Ed Sheeran',
 'Daft Punk',
 'Sara Bareilles',
 'Macklemore  Ryan Lewis',
 'Mumford  Sons',
 'Black Keys',
 'Frank Ocean',
 'fun',
 'Jack White',
 'Foo Fighters',
 'Lady Gaga',
 'Rihanna',
 'Arcade Fire',
 'Eminem',
 'Lady Antebellum',
 'Katy Perry',
 'Black Eyed Peas',
 'Dave Matthews Band',
 'Robe

In [14]:
# Create a separate csv of unique musician name for cross reference 
artist = pd.DataFrame(unique, columns = ['musician'])
artist

Unnamed: 0,musician
0,Jon Batiste
1,Tony Bennett Lady Gaga
2,Justin Bieber
3,Doja Cat
4,Billie Eilish
...,...
346,Latto
347,Måneskin
348,Tobe Nwigwe
349,Molly Tuttle


In [15]:
artist.to_csv("ethnicity-match.csv", index=False)