In [5]:
from requests_html import HTMLSession
from bs4 import BeautifulSoup
import pandas as pd
# Raise pandas' display limits so frames of up to 1399 rows / 99 columns are
# rendered in full instead of being truncated in the notebook output.
pd.options.display.max_rows = 1399
pd.options.display.max_columns = 99

In [6]:
# Shared HTTP session; get_page_data() reuses it for every later page fetch.
session = HTMLSession()
# "All" discography listing for Nirvana on AllMusic - the crawl's entry point.
url = 'https://www.allmusic.com/artist/nirvana-mn0000357406/discography/all'
response = session.get(url)
# Fail fast on a 4xx/5xx answer: otherwise the error page would be parsed as
# if it were the discography and simply yield no album links.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

In [7]:
def get_page_data(url):
    """
    Download a page and return it parsed as a BeautifulSoup object.

    The request goes through the module-level ``session`` and the response
    body is parsed with Python's built-in 'html.parser'.

    Raises requests.HTTPError when the server answers with a 4xx/5xx status.
    """
    response = session.get(url)
    # Without this check an error page would be parsed like real content and
    # only surface later as an AttributeError or as silently empty data.
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')

In [8]:
def _collect_links(soup, tag_name, css_class):
    """
    Return the href of the first <a> inside every ``tag_name`` element that
    carries ``css_class``. Elements without an anchor, or whose anchor has no
    (or an empty) href, are skipped.
    """
    links = []
    for element in soup.find_all(tag_name, class_=css_class):
        anchor = element.find('a')
        if anchor is None:
            continue
        # .get() instead of ['href'] so an anchor without href is skipped
        # rather than aborting the whole scrape with a KeyError.
        href = anchor.get('href')
        if href:
            links.append(href)
    return links


def get_album_page_links(soup):
    """
    Input a BeautifulSoup object to get links to all albums present on that discography page.
    Links are read from the first anchor of every <td class="title"> cell, in document order.
    """
    return _collect_links(soup, 'td', 'title')


def get_track_page_links(soup):
    """
    Input a BeautifulSoup object to get links to all tracks present on that album page.
    Links are read from the first anchor of every <div class="title"> element, in document order.
    """
    return _collect_links(soup, 'div', 'title')

# Links to every album page, collected from the discography `soup` parsed earlier.
album_links = get_album_page_links(soup)

In [9]:
def calculate_rating(classes):
    """
    A function to parse classes from AllMusic website as a numeric rating.

    Expects the list of css classes found on the rating element.
    'rating-allmusic-1' .. 'rating-allmusic-9' map to 1.0 .. 5.0 in steps of 0.5.
    An empty string is returned for 'rating-unrated', for 'rating-allmusic-0',
    for a missing/empty class list and for a class list holding no known
    rating class.
    """
    # Checked in this order, first match wins (same precedence as before).
    ratings_by_class = {
        'rating-unrated': '',
        'rating-allmusic-0': '',
        'rating-allmusic-1': 1.0,
        'rating-allmusic-2': 1.5,
        'rating-allmusic-3': 2.0,
        'rating-allmusic-4': 2.5,
        'rating-allmusic-5': 3.0,
        'rating-allmusic-6': 3.5,
        'rating-allmusic-7': 4.0,
        'rating-allmusic-8': 4.5,
        'rating-allmusic-9': 5.0,
    }
    # Tag.get('class') yields None when the attribute is absent.
    if not classes:
        return ''
    for css_class, rating in ratings_by_class.items():
        if css_class in classes:
            return rating
    # No rating class present: treat as unrated instead of hitting an
    # UnboundLocalError on an unassigned local.
    return ''

In [10]:
def _attribute_names(page, css_class):
    """
    Return the link texts, each minus its last 4 characters, found inside the
    <div> carrying ``css_class``; an empty list when the page has no such div.
    """
    section = page.find('div', class_=css_class)
    if section is None:
        return []
    # NOTE(review): the [:-4] slice drops a 4-character suffix from every link
    # text; what that suffix is cannot be seen here - confirm on a live page.
    return [link.text[:-4] for link in section.find_all('a')]


def get_track_data(url):
    """
    This function expects a track URL as a parameter. It returns information about the track
    as a dictionary.

    Two pages are fetched: ``url`` itself (title and composers) and
    ``url + '/attributes'`` (genres, styles, moods, themes).

    Returns a dict with the keys 'track_title' (str) and 'track_composers',
    'track_genres', 'track_styles', 'track_moods', 'track_themes' (lists of str).
    A missing attribute section yields an empty list.
    """
    overview_page = get_page_data(url)
    attributes_page = get_page_data(url + '/attributes')

    track_title = overview_page.find('h1').text.strip()
    # Composer paragraphs without a link are skipped rather than raising.
    track_composers = [
        paragraph.a.text
        for paragraph in overview_page.find_all('p', class_='song-composer')
        if paragraph.a is not None
    ]

    return {
        'track_title': track_title,
        'track_composers': track_composers,
        'track_genres': _attribute_names(attributes_page, 'attribute-tab-genres'),
        'track_styles': _attribute_names(attributes_page, 'attribute-tab-styles'),
        'track_moods': _attribute_names(attributes_page, 'attribute-tab-moods'),
        'track_themes': _attribute_names(attributes_page, 'attribute-tab-themes'),
    }
        

In [11]:
def get_data(url):
    """
    This function expects an album page URL. It will scrape all data about the album and all tracks from that album.

    Returns a list with one dictionary per track linked from the album page.
    Each dictionary holds the six ``track_*`` entries produced by
    ``get_track_data`` followed by ten ``album_*`` entries that are identical
    for every track of the album. The list is empty when the album page has
    no track links.

    Optional album sections that are absent from the page fall back to ''
    (None for the duration) instead of raising.
    """
    album_page = get_page_data(url)

    def find_div(css_class):
        """Return the first <div> with ``css_class`` on the album page, or None."""
        return album_page.find('div', class_=css_class)

    # ALBUM data
    album_title = album_page.find('h1', class_='album-title').text.strip()

    # Several values are wrapped in a one-element list on purpose: the
    # cleaning cells further down unwrap them with x[0] / ', '.join(x).
    release_div = find_div('release-date')
    album_release_date = [release_div.span.text if release_div is not None else '']

    duration_div = find_div('duration')
    album_duration = [duration_div.text.strip() if duration_div is not None else None]

    # Genre used to be read unguarded; mirror the other optional fields.
    genre_div = find_div('genre')
    genre_inner = genre_div.div if genre_div is not None else None
    genre_link = genre_inner.a if genre_inner is not None else None
    album_genre = genre_link.text if genre_link is not None else ''

    styles_div = find_div('styles')
    album_styles = [
        [a.text for a in styles_div.find_all('a')] if styles_div is not None else ''
    ]

    recording_date_div = find_div('recording-date')
    album_recording_date = [
        recording_date_div.div.text if recording_date_div is not None else ''
    ]

    recording_location_div = find_div('recording-location')
    album_recording_location = [
        recording_location_div.li.text.strip() if recording_location_div is not None else ''
    ]

    album_moods = [x.text.strip() for x in album_page.find_all('span', class_='mood')]
    album_themes = [x.text.strip() for x in album_page.find_all('span', class_='theme')]

    # A page without a rating element is reported as unrated ('').
    rating_div = find_div('allmusic-rating')
    album_rating = calculate_rating(rating_div.get('class')) if rating_div is not None else ''

    album_fields = {
        'album_title': album_title,
        'album_release_date': album_release_date,
        'album_duration': album_duration,
        'album_genres': album_genre,
        'album_styles': album_styles,
        'album_recording_date': album_recording_date,
        'album_recording_location': album_recording_location,
        'album_moods': album_moods,
        'album_themes': album_themes,
        'album_rating': album_rating,
    }

    # TRACK data merged with the shared album data; track keys come first so
    # the resulting column order is unchanged.
    return [
        {**get_track_data(track_link), **album_fields}
        for track_link in get_track_page_links(album_page)
    ]

### Fetching all data from AllMusic

In [12]:
# Scrape the albums one after another. get_data() returns one dict per track,
# so extend() keeps all_tracks a flat list of track records.
all_tracks = []
for album_link in album_links:
    all_tracks.extend(get_data(album_link))

### Cleaning fetched data

In [41]:
import pandas as pd

In [42]:
# One row per scraped track; the columns are the keys of the track dictionaries.
df = pd.DataFrame(all_tracks)

In [43]:
# Remove the release-date column. Reassigning (instead of inplace=True) with
# errors='ignore' lets this cell be re-run without a KeyError once the column
# is already gone.
df = df.drop(columns='album_release_date', errors='ignore')

In [44]:
# Report the frame's dimensions; df.shape is (rows, columns).
print(f'Columns:\t{df.shape[1]}\nRows:\t\t{df.shape[0]}')

Columns:	15
Rows:		1224


In [45]:
def _join_values(value):
    """
    Join a list of strings with ', '. Values that are already strings are
    returned unchanged so that re-running this cell does not mangle them
    (', '.join over a string would insert a separator between every character).
    """
    return value if isinstance(value, str) else ', '.join(value)


# Columns whose cells hold a list of strings.
for column in [
    'track_composers',
    'track_genres',
    'track_styles',
    'track_moods',
    'track_themes',
    'album_recording_date',
    'album_recording_location',
    'album_moods',
    'album_themes',
]:
    df[column] = df[column].apply(_join_values)

# album_duration cells are one-element lists holding the duration text or None.
# NOTE(review): [9:] presumably strips a 9-character label in front of the
# actual duration - confirm against the scraped text. str(None)[9:] is ''.
df['album_duration'] = df['album_duration'].apply(
    lambda x: x if isinstance(x, str) else str(x[0])[9:]
)

# album_styles cells are one-element lists wrapping the list of style names
# (or ''). Join with ', ' like every other multi-valued column; the former ''
# separator ran the style names together into one unsplittable string.
df['album_styles'] = df['album_styles'].apply(
    lambda x: x if isinstance(x, str) else ', '.join(x[0])
)

In [46]:
# Renaming the title columns to *_name, then capitalizing every word in every
# track and album title

df = df.rename(columns={'track_title': 'track_name', 'album_title': 'album_name'})
name_columns = ['track_name', 'album_name']
df.loc[:, name_columns] = df.loc[:, name_columns].apply(lambda column: column.str.title())

In [47]:
# Cleaning album names

# Exact-match renames, applied one after another.
album_renames = {
    'Sliver: The Best Of The Box': 'Sliver - The Best Of The Box',
    'Live & Loud': 'Live And Loud',
}
for old_name, new_name in album_renames.items():
    df.loc[df['album_name'] == old_name, 'album_name'] = new_name

# Remove the 'Live And Loud' rows that carry no composers. The index is left
# as is (no reset): the track-name fixes below address rows by index label.
rows_to_drop = df.index[(df['album_name'] == 'Live And Loud') & (df['track_composers'] == '')]
df = df.drop(index=rows_to_drop)

In [52]:
# Cleaning track names

# Manual corrections, keyed by row label in df's index.
# NOTE(review): these labels depend on the order in which the tracks were
# scraped - re-check them whenever the data is fetched again.
track_name_fixes = {
    108: 'Oh Me',
    111: 'Where Did You Sleep Last Night',
    80: 'Heart-Shaped Box',
    386: 'Heart-Shaped Box',
    559: 'Heart-Shaped Box',
    568: "Help Me, I'M Hungry",
    592: 'Where Did You Sleep Last Night',
    604: 'Oh The Guilt',
    684: 'Oh The Guilt',
}

# Assigning through .loc to a label that does not exist would silently append
# a new, otherwise empty row - verify every label before touching the frame.
missing_labels = [label for label in track_name_fixes if label not in df.index]
if missing_labels:
    raise KeyError(f'Row labels not found in df: {missing_labels}')

for label, fixed_name in track_name_fixes.items():
    df.loc[label, 'track_name'] = fixed_name

In [53]:
# Cleaning the missing data format

# Cells equal to this text are blanked. It is presumably a site prompt shown
# when a track has no attributes, cut short by the [:-4] slice applied while
# scraping - TODO confirm.
placeholder = 'Would you like to contrib'
for column in ['track_genres', 'track_styles', 'track_moods', 'track_themes']:
    df.loc[df[column] == placeholder, column] = ''

### Saving data to .csv file

In [54]:
# Write the cleaned frame to 'nirvana_allmusic.csv' without the index column.
df.to_csv('nirvana_allmusic.csv', index=False)