# Spotify

In [145]:
import spotipy
import pandas as pd
import math
from spotipy.oauth2 import SpotifyClientCredentials

import warnings
# Silence every warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings("ignore")

# Raise the pandas display limits so long/wide frames are rendered without truncation.
pd.options.display.max_rows = 6000
pd.options.display.max_columns = 999

In [4]:
# Build the Spotify Web API client.
# Credentials are deliberately not written in the notebook: when called without
# arguments, SpotifyClientCredentials reads SPOTIPY_CLIENT_ID and
# SPOTIPY_CLIENT_SECRET from the environment, so export both before running
# this cell. Any key pair that was ever committed in a notebook should be
# treated as leaked and rotated.
sp = spotipy.Spotify(
    client_credentials_manager=SpotifyClientCredentials(),
    retries=20,
    status_retries=20,
)

sound_uri = 'spotify:artist:5xUf6j4upBrXZPg6AI4MRK'

In [6]:
# Bare artist ID used for the album lookups below; it replaces the full
# 'spotify:artist:...' URI form assigned to the same name in the previous cell.
sound_uri = '5xUf6j4upBrXZPg6AI4MRK'

# Accumulator that get_data() appends one dict per track to.
all_tracks = []

def get_data(results):
    """
    Collect every track of every album in a paged Spotify album listing.

    Parameters
    ----------
    results : dict
        First page of an album listing (as returned by ``sp.artist_albums``),
        with an ``'items'`` list and a ``'next'`` entry. Further pages are
        fetched with ``sp.next``.

    Side effects
    ------------
    Appends one dict per track (track/album metadata plus the Spotify audio
    features) to the module-level ``all_tracks`` list. Returns nothing.

    A track whose audio features cannot be read is skipped on its own; the
    remaining tracks of the same album are still processed.
    """
    feature_names = (
        'acousticness', 'danceability', 'energy', 'instrumentalness',
        'key', 'liveness', 'loudness', 'mode', 'speechiness',
        'tempo', 'time_signature', 'valence',
    )

    # Copy the first page so the caller's dict is not extended in place.
    albums = list(results['items'])
    while results['next']:
        results = sp.next(results)
        albums.extend(results['items'])

    for album in albums:
        page = sp.album_tracks(album['uri'])
        tracks = list(page['items'])
        while page['next']:
            page = sp.next(page)
            tracks.extend(page['items'])

        for track in tracks:
            new_track = {
                'track_name': track['name'],
                'album_name': album['name'],
                'album_type': album['album_type'],
                # release_date starts with the four-digit year
                'release_year': album['release_date'][0:4],
                'artists': ', '.join([artist['name'] for artist in track['artists']]),
                # whole seconds, rounded down
                'duration': math.floor(track['duration_ms'] / 1000),
            }

            try:
                features = sp.audio_features(tracks=track['id'])[0]
                for name in feature_names:
                    new_track[name] = features[name]
            except Exception:
                # Best effort: the lookup failed or returned no usable entry for
                # this track. Skip only this track (the previous `break` threw
                # away the rest of the album). `Exception` rather than a bare
                # except, so Ctrl-C still interrupts a long run.
                continue

            all_tracks.append(new_track)

# Collect full-length albums first, then singles, into `all_tracks`.
for release_group in ('album', 'single'):
    get_data(sp.artist_albums(sound_uri, album_type=release_group))

In [102]:
# One row per collected track; the columns are the keys of the track dicts.
spot = pd.DataFrame(all_tracks)

### Cleaning

In [103]:
# Keep only the releases named below (original studio albums plus the
# Screaming Life/Fopp EP); every other release is discarded.
albums = [
    'Ultramega OK (Expanded Reissue)',
    'Louder Than Love',
    'Badmotorfinger',
    'Superunknown (20th Anniversary)',
    'Down On The Upside',
    'King Animal (Deluxe Version)',
    'Screaming Life/Fopp',
]
indices = [label for label, title in spot['album_name'].items() if title in albums]

spot = spot.loc[indices,].reset_index(drop=True)

# Everything that is left is a studio album, except the EP.
spot['album_type'] = 'studio album'
is_ep = spot['album_name'] == 'Screaming Life/Fopp'
spot.loc[is_ep, 'album_type'] = 'extended play'

## Saving

In [104]:
# Saved WITH the default index column; the merge section re-reads this file
# with usecols=range(1,19) to skip that first column.
spot.to_csv('soundgarden_spotify.csv')

# Wiki

In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import re

In [106]:
url = 'https://en.wikipedia.org/wiki/List_of_songs_recorded_by_Soundgarden'

# requests has no default timeout, so set one to keep the cell from hanging on a
# stalled connection. raise_for_status() stops here on an HTTP error instead of
# letting an error page be parsed as if it were the article.
response = requests.get(url, timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.content, 'html.parser')

In [112]:
# Take the second table with class "wikitable" (index 1) — presumably the song list; verify if the page layout changes.
table = soup.find_all('table', class_ = 'wikitable')[1]

In [165]:
# Every <tr> of the table, minus the first (header) row.
rows = table('tr')
rows.pop(0)

data = []

for row in rows:
    heading = row('th')[0]
    cells = row('td')
    data.append({
        # the title is the text between the first pair of double quotes
        'track_title': heading.text.strip().split('"')[1],
        'album_title': cells[1].text.strip('"').strip(),
        'writers': ', '.join(link.text for link in cells[0].find_all('a')),
        'release_year': cells[2].text.strip(),
    })

wiki = pd.DataFrame(data)

In [167]:
# Persist the Wikipedia song table without the index column.
wiki.to_csv('soundgarden_wiki.csv', index=False)

# AllMusic

In [2]:
from selenium import webdriver
from selenium.webdriver.firefox.service import Service as FirefoxService
from webdriver_manager.firefox import GeckoDriverManager
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException


import pandas as pd

In [3]:
from requests_html import HTMLSession
# requests_html session. NOTE(review): `session` is not referenced by any other
# code visible in this notebook — confirm whether it is still needed.
session = HTMLSession()

In [4]:

# Start Firefox through a geckodriver binary installed by webdriver_manager,
# then open the artist's discography page on AllMusic.
gecko_service = FirefoxService(GeckoDriverManager().install())
driver = webdriver.Firefox(service=gecko_service)
driver.get("https://www.allmusic.com/artist/soundgarden-mn0000001098#discography")


  driver = webdriver.Firefox(service=FirefoxService(GeckoDriverManager().install()))


In [24]:
# Choose the value 'all' in the discography <select> element (id "releaseType").
select = Select(driver.find_element(By.XPATH, '//*[@id="releaseType"]'))
select.select_by_value('all')

# Snapshot the HTML of the page as currently rendered by the browser.
source = driver.page_source

# Parse the snapshot with the lxml parser (BeautifulSoup is imported in the Wiki section).
soup = BeautifulSoup(source, 'lxml')

In [25]:
def get_page_data(url):
    """
    Navigate the shared Selenium `driver` to `url` and return the rendered
    page parsed as a BeautifulSoup object (lxml parser).
    """
    driver.get(url)
    return BeautifulSoup(driver.page_source, 'lxml')

In [26]:
def get_album_page_links(soup):
    """
    Return the href of the first link inside every <td class="meta"> cell of a
    discography page; cells without a link are skipped.
    """
    first_anchors = (cell.find('a') for cell in soup.find_all('td', class_ = 'meta'))
    return [anchor['href'] for anchor in first_anchors if anchor]

def get_track_page_links(soup):
    """
    Return the href of the first link inside every <div class="title"> element
    of an album page; elements without a link are skipped.
    """
    first_anchors = (block.find('a') for block in soup.find_all('div', class_ = 'title'))
    return [anchor['href'] for anchor in first_anchors if anchor]

# Album page URLs found on the discography page parsed into `soup` above.
album_links = get_album_page_links(soup)

In [27]:
def calculate_rating(classes):
    """
    Translate the CSS classes of an AllMusic rating element into a rating.

    Parameters
    ----------
    classes : collection of str, or None
        CSS class names of the rating element (e.g. the list returned by
        BeautifulSoup's ``tag.get('class')``). ``None`` is treated as empty.

    Returns
    -------
    float or str
        ``ratingAllmusic1`` .. ``ratingAllmusic9`` map to 1.0 .. 5.0 in steps
        of 0.5. ``''`` is returned for ``rating-unrated``, ``ratingAllmusic0``
        and — instead of failing with UnboundLocalError — when none of the
        known classes is present.
    """
    classes = classes or ()

    # Checked in the same priority order as before: the "unrated" markers win.
    if 'rating-unrated' in classes or 'ratingAllmusic0' in classes:
        return ''

    for step in range(1, 10):
        if f'ratingAllmusic{step}' in classes:
            return 0.5 + 0.5 * step

    return ''

In [9]:
def _track_grid_texts(grid_id, delay):
    """
    Return the comma-joined link texts of the element with id `grid_id`, or ''
    when that element does not appear within `delay` seconds.
    """
    try:
        # until() returns the located element, so no second lookup is needed.
        grid = WebDriverWait(driver, delay).until(
            EC.presence_of_element_located((By.ID, grid_id)))
    except TimeoutException:
        return ''
    # The last four characters of every link text are dropped — presumably a
    # suffix rendered next to each name; TODO confirm against the live page.
    return ', '.join(link.text[:-4] for link in grid.find_elements(By.TAG_NAME, 'a'))


def get_track_data(url):
    """
    Scrape one AllMusic track page.

    Parameters
    ----------
    url : str
        URL of the track page; it is loaded in the shared Selenium `driver`.

    Returns
    -------
    dict
        Keys 'track_title', 'track_composers', 'track_genres', 'track_styles',
        'track_moods' and 'track_themes'. Every value is a string; list-like
        values are joined with ', ' and missing sections yield ''.

    A page without the moods/themes sidebar link no longer raises: moods and
    themes are simply returned empty, so one odd track page cannot abort the
    scrape of the whole album.
    """
    page = get_page_data(url)

    def linked_names(css_class):
        # Link texts inside the first <div class=css_class>, '' if there is no such div.
        container = page.find('div', class_=css_class)
        if not container:
            return ''
        return ', '.join(anchor.text for anchor in container.find_all('a'))

    track_title = page.find('h1').text.strip()
    track_composers = linked_names('composer')
    genres = linked_names('genre')
    styles = linked_names('styles')

    # OPEN MOODS & THEMES TAB on track page.
    # find_elements returns an empty list (rather than raising) when the link is absent.
    moods = ''
    themes = ''
    sidebar_links = driver.find_elements(By.XPATH, '//*[@id="moodsThemesSidebarLink"]')
    if sidebar_links:
        driver.execute_script("arguments[0].click();", sidebar_links[0])
        delay = 2
        moods = _track_grid_texts('moodsGrid', delay)
        themes = _track_grid_texts('themesGrid', delay)

    return {
        'track_title': track_title,
        'track_composers': track_composers,
        'track_genres': genres,
        'track_styles': styles,
        'track_moods': moods,
        'track_themes': themes,
    }

In [10]:
def _album_grid_texts(grid_id, delay):
    """
    Return the comma-joined link texts of the element with id `grid_id`, or ''
    when that element does not appear within `delay` seconds.
    """
    try:
        # until() returns the located element, so no second lookup is needed.
        grid = WebDriverWait(driver, delay).until(
            EC.presence_of_element_located((By.ID, grid_id)))
    except TimeoutException:
        return ''
    return ', '.join(link.text for link in grid.find_elements(By.TAG_NAME, 'a'))


def get_data(url):
    """
    Scrape one AllMusic album page together with all of its track pages.

    NOTE: this definition reuses the name of the Spotify `get_data` defined
    earlier in the notebook and replaces it once this cell has run.

    Parameters
    ----------
    url : str
        URL of the album page; it is loaded in the shared Selenium `driver`.

    Returns
    -------
    list of dict
        One dict per track that could be scraped, combining the track fields
        returned by `get_track_data` with the album-level fields. Several album
        fields are single-element lists (see the comment in the body). The list
        is empty when the page has no track listing.

    Scraping is best effort: failures are printed and whatever was gathered so
    far is returned. Only `Exception` is caught, so Ctrl-C still stops a long run.
    """
    album_page = get_page_data(url)

    def div(css_class):
        # First <div> carrying the given class, or None.
        return album_page.find('div', class_=css_class)

    # ALBUM data
    album_title = album_page.find(id='albumTitle').text.strip()
    print(album_title)

    # These fields are intentionally wrapped in single-element lists: the
    # cleaning cell that follows the scrape unwraps them with x[0].
    release_div = div('release-date')
    album_release_date = [release_div.span.text if release_div else '']
    duration_div = div('duration')
    album_duration = [duration_div.text.strip() if duration_div else None]
    genre_div = div('genre')
    album_genre = [genre_div.div.a.text if genre_div else '']
    styles_div = div('styles')
    album_styles = [[a.text for a in styles_div.find_all('a')] if styles_div else '']
    recording_date_div = div('recording-date')
    album_recording_date = [recording_date_div.div.text if recording_date_div else '']
    recording_location_div = div('recording-location')
    album_recording_location = [recording_location_div.text.strip() if recording_location_div else '']

    # Fallback moods/themes taken from the static page; replaced below when the
    # moods & themes tab can be opened.
    album_moods = [x.text.strip() for x in album_page.find_all('span', class_='mood')]
    album_themes = [x.text.strip() for x in album_page.find_all('span', class_='theme')]

    # A page without a rating element yields '' instead of an AttributeError.
    rating_div = div('allmusicRating')
    album_rating = calculate_rating(rating_div.get('class')) if rating_div else ''

    # OPEN MOODS & THEMES TAB on album page
    try:
        # find_elements returns [] (rather than raising) when the tab is absent.
        tab_buttons = driver.find_elements(By.XPATH, '//*[@id="moodsThemesTab"]')
        if tab_buttons:
            driver.execute_script("arguments[0].click();", tab_buttons[0])
            delay = 2
            album_moods = _album_grid_texts('moodsGrid', delay)
            album_themes = _album_grid_texts('themesGrid', delay)
    except Exception as exc:
        print(f'Moods & themes tab could not be read: {exc!r}')

    # TRACK data
    all_tracks_data = []

    # OPEN TRACK LISTING TAB on album page
    try:
        listing_buttons = driver.find_elements(By.XPATH, '//*[@id="trackListingSidebarLink"]')
        if not listing_buttons:
            print('No Track Listing Button')
        else:
            print('Button was found')
            driver.execute_script("arguments[0].click();", listing_buttons[0])
            print('Button was clicked')
            delay = 2
            WebDriverWait(driver, delay).until(EC.presence_of_element_located((By.ID, 'trackListing')))
            album_page = BeautifulSoup(driver.page_source, 'lxml')
            track_links = get_track_page_links(album_page)
            print(track_links)
            for track_link in track_links:
                all_tracks_data.append(get_track_data(track_link))
    except TimeoutException:
        print('Page took too much time to load')
    except Exception as exc:
        # Previously every failure here was reported as a missing button.
        print(f'Track listing could not be scraped completely: {exc!r}')

    # FINAL data: one record per scraped track, album fields repeated on each.
    return [
        {
            'track_title': track['track_title'],
            'album_title': album_title,
            'album_release_date': album_release_date,
            'track_composers': track['track_composers'],
            'track_genres': track['track_genres'],
            'track_styles': track['track_styles'],
            'track_moods': track['track_moods'],
            'track_themes': track['track_themes'],
            'album_duration': album_duration,
            'album_genres': album_genre,
            'album_styles': album_styles,
            'album_recording_date': album_recording_date,
            'album_recording_location': album_recording_location,
            'album_moods': album_moods,
            'album_themes': album_themes,
            'album_rating': album_rating,
        }
        for track in all_tracks_data
    ]

### Fetching all data from AllMusic

In [None]:
# Scrape every album page in turn and pool the per-track records.
all_tracks = []
for album_link in album_links:
    all_tracks.extend(get_data(album_link))

In [72]:
# One row per scraped AllMusic track; the columns are the keys of the track dicts.
allmusic = pd.DataFrame(all_tracks)

In [73]:
# The album genre column is not used further on.
allmusic = allmusic.drop(columns='album_genres')

In [74]:
# Unwrap the single-element lists produced by the scraper and strip the field
# labels / date prefixes. NOTE: this cell is not idempotent — it must run exactly
# once on the freshly scraped frame (a second run would index into the strings).
allmusic['album_duration'] = allmusic['album_duration'].apply(lambda x: str(x[0]).replace('Duration\n',''))
allmusic['album_styles'] = allmusic['album_styles'].apply(lambda x: ', '.join(x[0]))
# Keep only the last space-separated token of the dates.
allmusic['album_recording_date'] = allmusic['album_recording_date'].apply(lambda x: str(x[0]).split(' ')[-1])
allmusic['album_release_date'] = allmusic['album_release_date'].apply(lambda x: x[0].split(' ')[-1])
allmusic['album_recording_location'] = allmusic['album_recording_location'].apply(lambda x: x[0].replace('Recording Location\n',''))
# Moods/themes are either an already joined string or a list of names. Lists are
# joined with ', ' — the same separator the joined strings use; joining with ''
# fused the names into one unreadable word.
allmusic['album_moods'] = allmusic['album_moods'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)
allmusic['album_themes'] = allmusic['album_themes'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)

In [76]:
# Persist the cleaned AllMusic data without the index column.
allmusic.to_csv('soundgarden_allmusic.csv', index=False)

# Billboard

In [85]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import pandas as pd

In [86]:
# Start Chrome through a chromedriver binary installed by webdriver_manager and
# open the artist page on Billboard. This rebinds `driver`.
# NOTE(review): no quit() of the earlier Firefox driver is visible in this notebook.
chrome_service = ChromeService(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service)

driver.get("https://www.billboard.com/artist/soundgarden/")

In [87]:
def extract_data(chart_name = ""):
    """
    Read the chart-history table currently shown in the shared `driver`.

    Returns a list with one dict per direct child of the element with class
    'artist-chart-history-items'. Each dict holds `chart_name` under 'chart'
    plus the text of the title, debut date, peak position, peak date and
    weeks-on-chart cells ('weaks_on_chart' is the key used downstream).
    """
    history = driver.find_element(By.CLASS_NAME, 'artist-chart-history-items')
    entries = history.find_elements(By.XPATH, '*')

    # (output key, locator strategy, locator value) in lookup order
    cell_locators = (
        ('track_name', By.ID, 'title-of-a-story'),
        ('debut_date', By.CLASS_NAME, 'artist-chart-row-debut-date'),
        ('peak_pos', By.CLASS_NAME, 'artist-chart-row-peak-pos'),
        ('peak_date', By.CLASS_NAME, 'artist-chart-row-peak-date'),
        ('weaks_on_chart', By.CLASS_NAME, 'artist-chart-row-week-on-chart'),
    )

    data = []
    for entry in entries:
        # Locate every cell first, then read the texts.
        cells = [(key, entry.find_element(how, what)) for key, how, what in cell_locators]
        track = {'chart': chart_name}
        for key, cell in cells:
            track[key] = cell.text
        data.append(track)

    return data

In [None]:
import time

driver.refresh()

# Drop-down that switches between charts. Named `chart_selector` rather than
# `filter`, so the builtin filter() is not shadowed for the rest of the session.
chart_selector = driver.find_element(By.XPATH, '//*[@id="artist-chart-selector"]')

# List items 1..54 of the drop-down are visited, one chart each.
N_CHART_ENTRIES = 54
delay = 2

all_data = []

for current_li in range(1, N_CHART_ENTRIES + 1):
    # Open the chart drop-down (clicked through JavaScript).
    try:
        WebDriverWait(driver, delay).until(EC.element_to_be_clickable((By.ID, 'artist-chart-selector')))
        driver.execute_script("arguments[0].click();", chart_selector)
    except TimeoutException:
        print('Filter took too much time to load!')

    time.sleep(1)

    # Pick the current entry of the drop-down.
    li_path = f'//*[@id="main-wrapper"]/main/div[2]/div[2]/div/div[2]/div[1]/nav/ul/li[{current_li}]/a'
    new_li = driver.find_element(By.XPATH, li_path)
    try:
        WebDriverWait(driver, delay).until(EC.element_to_be_clickable((By.XPATH, li_path)))
        driver.execute_script("arguments[0].click();", new_li)
    except TimeoutException:
        print('List item took too much time to load!')

    # Give the chart table a moment to update before reading it.
    time.sleep(1)

    # Read the label once and reuse it for the progress line and the records.
    chart_name = new_li.get_attribute('text').strip()
    print(f"Chart: {chart_name}", end='\r')

    all_data.extend(extract_data(chart_name=chart_name))

driver.refresh()

In [94]:
import pandas as pd

# One row per chart entry collected above; the columns are the keys of the entry dicts.
bill = pd.DataFrame(all_data)

In [95]:
# Only song charts are taken into consideration; every other chart is dropped.
useful_charts = [
    'Mainstream Rock Airplay',
    'Alternative Airplay',
    'Radio Songs',
    'LyricFind U.S.',
    'Billboard Hot 100',
    'Rock Digital Song Sales',
    'Hot Alternative Songs',
    'LyricFind Global',
    'Alternative Digital Song Sales',
    'Billboard Global 200',
    'Canadian Digital Song Sales',
    'Hot Rock Songs',
    'Hot Rock & Alternative Songs',
    'Digital Song Sales',
    'Rock Streaming Songs',
    'Alternative Streaming Songs',
]
on_song_chart = bill['chart'].isin(useful_charts)
indices = bill.index[on_song_chart]

bill = bill.loc[indices,].reset_index(drop=True)

## Cleaning

In [97]:
# Display one random row as a quick sanity check of the filtered chart data.
bill.sample()

Unnamed: 0,chart,track_name,debut_date,peak_pos,peak_date,weaks_on_chart
51,LyricFind U.S.,4th Of July,07.11.20,7,07.10.21,2


## Saving

In [98]:
# Persist the Billboard chart data without the index column.
bill.to_csv('soundgarden_billboard.csv', index=False)

# Merging Datasets

In [312]:
# Reload the four scraped datasets from disk.
# usecols=range(1,19) skips column 0 of the Spotify file, which was saved with its index column.
spot = pd.read_csv('soundgarden_spotify.csv', usecols=range(1,19))
wiki = pd.read_csv('soundgarden_wiki.csv')
allmusic = pd.read_csv('soundgarden_allmusic.csv')
bill = pd.read_csv('soundgarden_billboard.csv')

### Cleaning Spotify

In [313]:
# Use the plain album title for the expanded reissue.
is_ultramega_reissue = spot['album_name'] == 'Ultramega OK (Expanded Reissue)'
spot.loc[is_ultramega_reissue, 'album_name'] = 'Ultramega OK'

In [314]:
# Remove Badmotorfinger tracks whose name contains 'Live'.
is_live_badmotorfinger = (spot['album_name'] == 'Badmotorfinger') & (spot['track_name'].str.contains('Live'))
spot = spot.drop(spot.index[is_live_badmotorfinger])

In [315]:
# Renumber the rows 0..n-1 after the drop in the previous cell.
spot = spot.reset_index(drop=True)

In [316]:
# Strip the ' - Remastered' suffix from Badmotorfinger track names.
on_badmotorfinger = spot['album_name'] == 'Badmotorfinger'
spot.loc[on_badmotorfinger, 'track_name'] = (
    spot.loc[on_badmotorfinger, 'track_name'].str.replace(' - Remastered','')
)

In [317]:
# Collapse the anniversary edition to the plain album title.
# The pattern is a regular expression, so the opening parenthesis is escaped; a
# raw string is used because '\(' in a normal string literal is an invalid
# escape sequence (a warning today, an error in a future Python version).
spot.loc[spot['album_name'].str.contains(r'Superunknown \(20'), 'album_name'] = 'Superunknown'

In [318]:
# Remove the rows of the two deluxe Superunknown editions.
deluxe_editions = ['Superunknown (Super Deluxe)', 'Superunknown (Deluxe Edition)']
spot = spot.drop(spot.index[spot['album_name'].isin(deluxe_editions)])

In [319]:
# Renumber the rows 0..n-1 after the drop in the previous cell.
spot = spot.reset_index(drop=True)

In [320]:
# Any edition whose title contains 'King Animal' gets the plain album title.
is_king_animal = spot['album_name'].str.contains('King Animal')
spot.loc[is_king_animal, 'album_name'] = 'King Animal'

In [321]:
# Strip the ' - Remastered' suffix from the Screaming Life track names.
on_screaming_life = spot['album_name'].str.contains('Screaming Life')
spot.loc[on_screaming_life, 'track_name'] = (
    spot.loc[on_screaming_life, 'track_name'].str.replace(' - Remastered', '')
)

In [322]:
# Keep the rows whose album title contains one of the relevant names
# (the names are OR-ed together into a single regular expression).
relevant_albums = [
    'Ultramega OK',
    'Louder Than Love',
    'Down On The Upside',
    'Badmotorfinger',
    'Superunknown',
    'King Animal',
    'Screaming Life/Fopp',
]
pattern = '|'.join(relevant_albums)
on_relevant_album = spot['album_name'].str.contains(pattern)
spot = spot.loc[on_relevant_album,].reset_index(drop=True)

In [323]:
# Join key: lower-cased track name, a space, lower-cased album name.
spot_track_lower = spot['track_name'].str.lower()
spot_album_lower = spot['album_name'].str.lower()
spot['track_album'] = spot_track_lower + ' ' + spot_album_lower

### Cleaning Wikipedia

In [324]:
# Drop the Wikipedia release year column before the merge.
wiki = wiki.drop(columns='release_year')

In [325]:
# Put both EP titles under the combined 'Screaming Life/Fopp' title.
is_ep_title = wiki['album_title'].isin(['Screaming Life', 'Fopp'])
wiki.loc[is_ep_title, 'album_title'] = 'Screaming Life/Fopp'

In [326]:
# Remove the comma from this title.
has_comma_title = wiki['track_title'] == 'No Wrong, No Right'
wiki.loc[has_comma_title, 'track_title'] = 'No Wrong No Right'

In [327]:
# Rename the reprise to the plain title.
is_reprise = wiki['track_title'] == 'Full On (Reprise)'
wiki.loc[is_reprise, 'track_title'] = 'Full On'

In [328]:
# Join key: lower-cased track title, a space, lower-cased album title.
wiki_track_lower = wiki['track_title'].str.lower()
wiki_album_lower = wiki['album_title'].str.lower()
wiki['track_album'] = wiki_track_lower + ' ' + wiki_album_lower

### Merging (Spotify+Wikipedia)

In [329]:
# Left join on the track/album key: every Spotify row is kept. The Wikipedia
# title columns are then dropped.
wiki_title_columns = ['track_title', 'album_title']
v1 = (
    spot
    .merge(wiki, how='left', left_on=['track_album'], right_on=['track_album'])
    .drop(wiki_title_columns, axis=1)
)

In [330]:
# Save the Spotify+Wikipedia merge with missing values written as empty strings (v1 itself is not modified).
v1.replace(np.nan, '').to_csv('soundgarden_v1.csv', index=False)

### Cleaning AllMusic

In [339]:
# Rename the reprise to the plain title, as was done for the Wikipedia data.
is_reprise_title = allmusic['track_title'] == 'Full On (Reprise)'
allmusic.loc[is_reprise_title, 'track_title'] = 'Full On'

In [340]:
# Join key: lower-cased track title, a space, lower-cased album title.
allmusic_track_lower = allmusic['track_title'].str.lower()
allmusic_album_lower = allmusic['album_title'].str.lower()
allmusic['track_album'] = allmusic_track_lower + ' ' + allmusic_album_lower

### Merging (+AllMusic)

In [357]:
# Left join on the track/album key: every row of v1 is kept and the AllMusic columns are added where the key matches.
v2 = v1.merge(allmusic, how='left', on='track_album')

In [358]:
# Save the three-source merge with missing values written as empty strings (v2 itself is not modified).
v2.replace(np.nan, '').to_csv('soundgarden_v2.csv', index=False)

### Cleaning Billboard

In [360]:
# Keep one row per (track, album, release year) combination.
dedupe_keys = ['track_name', 'album_name', 'release_year']
sound = v2.drop_duplicates(subset=dedupe_keys)

In [361]:
# Add one group of columns per chart (<chart>_debut_date, <chart>_peak_pos,
# <chart>_peak_date, <chart>_weaks_on_chart) and fill them for every Billboard
# entry whose track name matches exactly one row of `sound`.

# Title-cased track names are what gets compared with Billboard's names. Computed
# once: 'track_name' is not modified inside the loop.
titled_names = sound['track_name'].str.title()
chart_fields = ('debut_date', 'peak_pos', 'peak_date', 'weaks_on_chart')

for i in bill.index:
    matches = sound.index[(titled_names == bill.at[i, 'track_name']).to_numpy()]
    if matches.size != 1:
        # The song is absent from `sound`, or ambiguous in it: leave it out.
        continue

    # The song occurs exactly once in `sound`: write the Billboard data there.
    index = matches[0]
    chart = bill.at[i, 'chart'].lower().replace(' ', '_')

    for field in chart_fields:
        column = f'{chart}_{field}'
        if column not in sound.columns:
            # object dtype so that both text and numbers can be stored
            sound[column] = pd.Series('', index=sound.index, dtype=object)
        # A single .loc assignment: the chained form sound[column][index] = ...
        # writes to a temporary object under pandas copy-on-write and is lost.
        sound.loc[index, column] = bill.at[i, field]


In [363]:
# Save the final dataset (with the Billboard columns) with missing values written as empty strings.
sound.replace(np.nan, '').to_csv('soundgarden_v3.csv', index=False)

KeyError: 'authors'