In [1]:
# Dependencies
import pandas as pd
from sqlalchemy import create_engine


In [None]:
# Establish connection to postgres for saving data later.
engine = create_engine(f'postgresql://postgres:postgres@localhost:5432/music_db')
connection = engine.connect()

In [3]:
# Setup url scrape. We are using Wikipedia because it has the most complete data.  
# Each year is on a separate page.  

#webpages = ['https://en.wikipedia.org/wiki/62nd_Annual_Grammy_Awards', 
#           'https://en.wikipedia.org/wiki/61st_Annual_Grammy_Awards',
#           'https://en.wikipedia.org/wiki/60th_Annual_Grammy_Awards',
#            'https://en.wikipedia.org/wiki/59th_Annual_Grammy_Awards',
#            'https://en.wikipedia.org/wiki/58th_Annual_Grammy_Awards',
#            'https://en.wikipedia.org/wiki/57th_Annual_Grammy_Awards',
#            'https://en.wikipedia.org/wiki/56th_Annual_Grammy_Awards',
#            'https://en.wikipedia.org/wiki/55th_Annual_Grammy_Awards',
#            'https://en.wikipedia.org/wiki/54th_Annual_Grammy_Awards',
#            'https://en.wikipedia.org/wiki/53rd_Annual_Grammy_Awards'
#            ]

# For each page, create a separate file.  We will merge later.


In [4]:
#Create data frame from csv file.
grammy_csv = "Resources/the_grammy_awards.csv"

grammy_df = pd.read_csv(grammy_csv)
grammy_df.head()


Unnamed: 0,year,title,published_at,updated_at,category,nominee,artist,workers,img,winner
0,2019,62nd Annual GRAMMY Awards (2019),2020-05-19T05:10:28-07:00,2020-05-19T05:10:28-07:00,Record Of The Year,Bad Guy,Billie Eilish,"Finneas O'Connell, producer; Rob Kinelski & Fi...",https://www.grammy.com/sites/com/files/styles/...,True
1,2019,62nd Annual GRAMMY Awards (2019),2020-05-19T05:10:28-07:00,2020-05-19T05:10:28-07:00,Record Of The Year,"Hey, Ma",Bon Iver,"BJ Burton, Brad Cook, Chris Messina & Justin V...",https://www.grammy.com/sites/com/files/styles/...,True
2,2019,62nd Annual GRAMMY Awards (2019),2020-05-19T05:10:28-07:00,2020-05-19T05:10:28-07:00,Record Of The Year,7 rings,Ariana Grande,"Charles Anderson, Tommy Brown, Michael Foster ...",https://www.grammy.com/sites/com/files/styles/...,True
3,2019,62nd Annual GRAMMY Awards (2019),2020-05-19T05:10:28-07:00,2020-05-19T05:10:28-07:00,Record Of The Year,Hard Place,H.E.R.,"Rodney “Darkchild” Jerkins, producer; Joseph H...",https://www.grammy.com/sites/com/files/styles/...,True
4,2019,62nd Annual GRAMMY Awards (2019),2020-05-19T05:10:28-07:00,2020-05-19T05:10:28-07:00,Record Of The Year,Talk,Khalid,"Disclosure & Denis Kosiak, producers; Ingmar C...",https://www.grammy.com/sites/com/files/styles/...,True


In [5]:
# Drop unnecessary columns. Winner will be added back later.
grammy_df.drop(['title', 'published_at', 'updated_at', 'img', 'winner'], axis=1, inplace=True)
grammy_df

Unnamed: 0,year,category,nominee,artist,workers
0,2019,Record Of The Year,Bad Guy,Billie Eilish,"Finneas O'Connell, producer; Rob Kinelski & Fi..."
1,2019,Record Of The Year,"Hey, Ma",Bon Iver,"BJ Burton, Brad Cook, Chris Messina & Justin V..."
2,2019,Record Of The Year,7 rings,Ariana Grande,"Charles Anderson, Tommy Brown, Michael Foster ..."
3,2019,Record Of The Year,Hard Place,H.E.R.,"Rodney “Darkchild” Jerkins, producer; Joseph H..."
4,2019,Record Of The Year,Talk,Khalid,"Disclosure & Denis Kosiak, producers; Ingmar C..."
...,...,...,...,...,...
4805,1958,Best Classical Performance - Instrumentalist (...,Tchaikovsky: Piano Concerto No. 1 In B Flat Mi...,,"Van Cliburn, artist (Symphony Of The Air Orche..."
4806,1958,Best Classical Performance - Instrumentalist (...,Segovia Golden Jubilee,,"Andres Segovia, artist"
4807,1958,Best Classical Performance - Chamber Music (In...,Beethoven: Quartet 130,,"Hollywood String Quartet (Alvin Dinkin, Paul S..."
4808,1958,Best Classical Performance - Vocal Soloist (Wi...,Operatic Recital,,


In [6]:
# Drop all rows pre-2009 (first year of our Billboard database)
# Convert year to integer 
grammy_df['year'] = grammy_df['year'].astype(str).astype(int) 
clean_grammy_df = grammy_df.loc[grammy_df['year'] > 2008, :]
clean_grammy_df

Unnamed: 0,year,category,nominee,artist,workers
0,2019,Record Of The Year,Bad Guy,Billie Eilish,"Finneas O'Connell, producer; Rob Kinelski & Fi..."
1,2019,Record Of The Year,"Hey, Ma",Bon Iver,"BJ Burton, Brad Cook, Chris Messina & Justin V..."
2,2019,Record Of The Year,7 rings,Ariana Grande,"Charles Anderson, Tommy Brown, Michael Foster ..."
3,2019,Record Of The Year,Hard Place,H.E.R.,"Rodney “Darkchild” Jerkins, producer; Joseph H..."
4,2019,Record Of The Year,Talk,Khalid,"Disclosure & Denis Kosiak, producers; Ingmar C..."
...,...,...,...,...,...
1314,2009,Best Classical Vocal Performance,Verismo Arias,,"Renée Fleming, soloist; David Frost, producer;..."
1315,2009,Best Classical Contemporary Composition,"Higdon, Jennifer: Percussion Concerto",,"Jennifer Higdon, composer"
1316,2009,Best Classical Crossover Album,Yo-Yo Ma & Friends: Songs Of Joy And Peace,Yo-Yo Ma & Various Artists,"Steven Epstein, producer; Richard King, engine..."
1317,2009,Best Short Form Music Video,Boom Boom Pow,The Black Eyed Peas,"Mathew Cullen & Mark Kudsi, video directors; J..."


In examining the data, it appears that the compiler incorrectly marked everyone a winner for all awards.  This is obviously incorrect.  Comparing to the Wikipedia sites for each year, it seems that the first record for each award for each year is the winner, and the rest are the remaining nominees.  Thus, we need to correct the data in winner column to reflect this. We deleted this column above, and will now add it back with the correct data.


In [7]:
# Declare variable to hold current award name to know when it changes
award = ''

# Declare a list that holds True or False
winner_list = []

# Go through the grammy dataframe, storing appropriate value in winner_list we can later use.
for index, row in clean_grammy_df.iterrows():
    # if the current row is not equal to the stored award, keep the True in winner and change the value of award
    if row['category'] != award:
        award = row['category']
        winner_list.append(True)
    else: 
        winner_list.append(False)

winner_list

[True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 True,


In [8]:
# Using winner_list, let's update the clean_grammy_df winner column by merging

clean_grammy_df['winner'] = winner_list

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_grammy_df['winner'] = winner_list


In [9]:
clean_grammy_df

Unnamed: 0,year,category,nominee,artist,workers,winner
0,2019,Record Of The Year,Bad Guy,Billie Eilish,"Finneas O'Connell, producer; Rob Kinelski & Fi...",True
1,2019,Record Of The Year,"Hey, Ma",Bon Iver,"BJ Burton, Brad Cook, Chris Messina & Justin V...",False
2,2019,Record Of The Year,7 rings,Ariana Grande,"Charles Anderson, Tommy Brown, Michael Foster ...",False
3,2019,Record Of The Year,Hard Place,H.E.R.,"Rodney “Darkchild” Jerkins, producer; Joseph H...",False
4,2019,Record Of The Year,Talk,Khalid,"Disclosure & Denis Kosiak, producers; Ingmar C...",False
...,...,...,...,...,...,...
1314,2009,Best Classical Vocal Performance,Verismo Arias,,"Renée Fleming, soloist; David Frost, producer;...",True
1315,2009,Best Classical Contemporary Composition,"Higdon, Jennifer: Percussion Concerto",,"Jennifer Higdon, composer",True
1316,2009,Best Classical Crossover Album,Yo-Yo Ma & Friends: Songs Of Joy And Peace,Yo-Yo Ma & Various Artists,"Steven Epstein, producer; Richard King, engine...",True
1317,2009,Best Short Form Music Video,Boom Boom Pow,The Black Eyed Peas,"Mathew Cullen & Mark Kudsi, video directors; J...",True


In [10]:
# Save the file for other group members
clean_grammy_df.to_csv('grammys_clean.csv')

In [13]:
# Load dataframe as table to Postgres, using connection from above
clean_grammy_df.to_sql('grammys', engine, if_exists='replace',index=False) #drops old table and creates new empty table
