## Import libraries

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import requests

In [2]:
artist_name = 'Nirvana'
artist_URL = 'https://genius.com/artists/' + artist_name.replace(' ', '-')
response = requests.get(artist_URL)

# Artist page html
soup = BeautifulSoup(response.content, 'html.parser')

## Get links to album

In [3]:
# Get album links 
albums = {}
album_names = []
album_links = []

for a in soup.find_all('a', class_="vertical_album_card"):
    albums[a['title']] = {
        'album_name': a['title'],
        'album_link': a['href'],
    }
    album_names.append(a['title'])
    album_links.append(a['href'])
albums

{'Nevermind (Deluxe Edition)': {'album_name': 'Nevermind (Deluxe Edition)',
  'album_link': 'https://genius.com/albums/Nirvana/Nevermind-deluxe-edition'},
 'Nevermind (Super Deluxe Edition)': {'album_name': 'Nevermind (Super Deluxe Edition)',
  'album_link': 'https://genius.com/albums/Nirvana/Nevermind-super-deluxe-edition'},
 'Live At The Paramount': {'album_name': 'Live At The Paramount',
  'album_link': 'https://genius.com/albums/Nirvana/Live-at-the-paramount'},
 'Icon ': {'album_name': 'Icon ',
  'album_link': 'https://genius.com/albums/Nirvana/Icon'},
 'Live at Reading': {'album_name': 'Live at Reading',
  'album_link': 'https://genius.com/albums/Nirvana/Live-at-reading'},
 'Sliver: The Best of the Box': {'album_name': 'Sliver: The Best of the Box',
  'album_link': 'https://genius.com/albums/Nirvana/Sliver-the-best-of-the-box'}}

## Parse song data from each album page

In [4]:
for album in albums.keys():
    response = requests.get(albums[album]['album_link'])
    soup = BeautifulSoup(response.content, 'html.parser')

    # Get song names
    unclean_songs = soup.find_all('h3', class_='chart_row-content-title')
    song_names =  []

    # Clean up song name data before saving
    for unclean_song_name in unclean_songs:
        clean_song_name = unclean_song_name.text
        clean_song_name = clean_song_name[0:-10].strip()
        song_names.append(clean_song_name)

    albums[album]['songs'] = song_names

    # Get album release date
    release_date = soup.find('div', class_='metadata_unit')
    albums[album]['release_date'] = release_date.text[len('Released '):]


## Put album data into pandas dataframe

In [5]:
# Goal df: 
# Index, artist_name, album_name, release_year, song_name
# int, string, string, string, string

rows = list()

for album in albums:
    for song in albums[album]['songs']:
        row = {
            'artist_name': [artist_name],
            'album_name': [albums[album]['album_name']],
            'release_year': [albums[album]['release_date']],
            'song_name': [song],
        }

        row = pd.DataFrame.from_dict(row)

        rows.append(row)

df = pd.concat(rows)
df

Unnamed: 0,artist_name,album_name,release_year,song_name
0,Nirvana,Nevermind (Deluxe Edition),"September 27, 2011",Smells Like Teen Spirit
0,Nirvana,Nevermind (Deluxe Edition),"September 27, 2011",In Bloom
0,Nirvana,Nevermind (Deluxe Edition),"September 27, 2011",Come As You Are
0,Nirvana,Nevermind (Deluxe Edition),"September 27, 2011",Breed
0,Nirvana,Nevermind (Deluxe Edition),"September 27, 2011",Lithium
...,...,...,...,...
0,Nirvana,Sliver: The Best of the Box,"November 1, 2005",Heart-Shaped Box
0,Nirvana,Sliver: The Best of the Box,"November 1, 2005",Do Re Mi
0,Nirvana,Sliver: The Best of the Box,"November 1, 2005",You Know You're Right
0,Nirvana,Sliver: The Best of the Box,"November 1, 2005",All Apologies


In [6]:
df.to_csv('live-workshop-song-data.csv')