# Spotiify Data Extraction

In [465]:
import spotipy
import pandas as pd
import math
from spotipy.oauth2 import SpotifyClientCredentials

import warnings
warnings.filterwarnings("ignore")

pd.options.display.max_rows = 6000
pd.options.display.max_columns = 999

In [466]:
sp = spotipy.Spotify(
    client_credentials_manager=SpotifyClientCredentials(
        client_id='c3c6cbaa304c4c86b120e8d9603d44a4', 
        client_secret='b8819ea70b3142819602499ee139c24f'), retries=20, status_retries=20)

nirvana_uri = 'spotify:artist:1w5Kfo2jwwIPruYS2UWh56'

In [566]:
pearl_uri = '1w5Kfo2jwwIPruYS2UWh56'

all_tracks = []

def get_data(results):
    albums = results['items']
    
    while results['next']:
        results = sp.next(results)
        albums.extend(results['items'])
    
    for album in albums:
        if album['name'] not in ['Ten', 'Vs.', 'Vitalogy', 'No Code', 'Yield', 'Binaural', 'Riot Act', 'Pearl Jam', 'Backspacer', 'Lightning Bolt' ,'Gigaton', 'Last Kiss', 'Merkin Ball']:
            continue
            
        result = sp.album_tracks(album['uri'])
        tracks = result['items']
        
        while result['next']:
            result = sp.next(result)
            tracks.extend(result['items'])
            
        for track in tracks:
            # if track['id'] == '6DifPvqyEMk6zj0uDvcOpl': continue
            
            new_track = {}
            new_track['track_name'] = track['name']
            new_track['album_name'] = album['name']
            new_track['album_type'] = album['album_type']
            new_track['release_year'] = album['release_date'][0:4]
            new_track['artists'] = ', '.join([artist['name'] for artist in track['artists']])
            new_track['duration'] = math.floor(track['duration_ms']/1000)

            try:
                audio = sp.audio_features(tracks=track['id'])
                new_track['acousticness'] = audio[0]['acousticness']
                new_track['danceability'] = audio[0]['danceability']
                new_track['energy'] = audio[0]['energy']
                new_track['instrumentalness'] = audio[0]['instrumentalness']
                new_track['key'] = audio[0]['key']
                new_track['liveness'] = audio[0]['liveness']
                new_track['loudness'] = audio[0]['loudness']
                new_track['mode'] = audio[0]['mode']
                new_track['speechiness'] = audio[0]['speechiness']
                new_track['tempo'] = audio[0]['tempo']
                new_track['time_signature'] = audio[0]['time_signature']
                new_track['valence'] = audio[0]['valence']
            except:
                break
            
            all_tracks.append(new_track)

get_data(sp.artist_albums(pearl_uri, album_type='album')) # Studio Albums
get_data(sp.artist_albums(pearl_uri, album_type='single')) # Last Kiss and Merkin Ball

In [560]:
spot = pd.DataFrame(all_tracks)

## Cleaning The Data

In [569]:
# Cleaning Track Names

spot['track_name'] = spot['track_name'].apply(lambda x: x.split(' - ')[0])
spot.loc[114, 'track_name'] = 'Aye Davanita'
spot.loc[14, 'track_name'] = 'My Father\'s Son'
spot.loc[35, 'track_name'] = 'Better Man'
spot.loc[149, 'track_name'] = 'Long Road'

In [570]:
# Cleaning Album Types

spot.loc[spot['album_type'] == 'single', 'album_type'] = 'extended play'

## Saving The Data

In [572]:
spot.to_csv('pearl_spotify.csv', index=False)

# Wikipedia Data Extraction

In [474]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import re

In [475]:
url = 'https://en.m.wikipedia.org/wiki/Category:Pearl_Jam_songs'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

In [476]:
links = [x.a['href'] if not x.span else '' for x in soup.find('div', class_='mw-content-ltr').find_all('li')]
while '' in links: links.remove('')
links = links[3:]

In [477]:
all_tracks = []
for link in links:
    page_link = 'https://en.wikipedia.org' + link
    song_page = requests.get(page_link, )
    soup = BeautifulSoup(song_page.content, 'html.parser')
    
    table_rows = soup.find('table', class_='infobox').find_all('tr')
    
    title = table_rows[0].text.strip('/"')
    
    track = {}
    track['track_title'] = title
    for row in table_rows:
        if row.th and row.td:
            if row.th.text.lower() == 'released':
                release_date = row.find('td').text
                track['release'] = release_date.split('\xa0(')[0].replace('[1]','')[-5:].strip()
            if row.th.text.lower() == 'songwriter(s)':
                authors  = row.td.text
                track['authors'] = ', '.join(authors.replace('[2]','').split('\n')).lstrip(', ').rstrip(', ')
            if row.th.text.lower() == 'producer(s)':
                producers  = row.td.text
                track['producers'] = ', '.join(producers.replace('[1]','').split('\n')).lstrip(', ').rstrip(', ')
    all_tracks.append(track)

df = pd.DataFrame(all_tracks)

In [478]:
df = df.replace(np.nan,'')
df.sort_values(by='release', inplace=True)
df.to_csv('pearl_wiki.csv', index=False)
wiki = df

## Cleaning The data

In [479]:
indices = [1, 9, 37]
wiki.loc[indices,'track_title'] = wiki.loc[indices,'track_title'].str.title()
wiki.loc[22,'track_title'] = '1/2 Full'

## Saving The Data

In [573]:
pearl_v1 = spot.merge(wiki, how='left', left_on=['track_name'], right_on=['track_title']).drop(['track_title', 'release'], axis=1)
pearl_v1.to_csv('pearl_v1.csv', index=False)

# AllMusic Data Extraction

In [532]:
from requests_html import HTMLSession
from bs4 import BeautifulSoup
import pandas as pd

In [596]:
session = HTMLSession()
url = 'https://www.allmusic.com/artist/pearl-jam-mn0000037730/discography/all'
response = session.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

In [498]:
def get_page_data(url):
    """
    Input a page URL to get a BeautifulSoup object from that page.
    """
    response = session.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup

In [499]:
def get_album_page_links(soup):
    """
    Input a BeautifulSoup object to get links to all albums present on that discography page
    """
    links = []
    data = soup.find_all('td', class_ = 'title')
    for album in data:
        if album.find('a'):
            links.append(album.find('a')['href'])
    return links

def get_track_page_links(soup):
    """
    Input a BeautifulSoup object to get links to all tracks present on that album page
    """
    links = []
    data = soup.find_all('div', class_ = 'title')
    for track in data:
        if track.find('a'):
            links.append(track.find('a')['href'])
    return links

album_links = get_album_page_links(soup)

In [500]:
def calculate_rating(classes):
    """
    A function to parse classes from AllMusic website as a rating on a scale from [0.5-5.0].
    This function will expect a list of css classes as a parameter, and output 
    rating according to present class.
    """
    if 'rating-unrated' in classes: rating = ''
    elif 'rating-allmusic-0' in classes: rating = ''
    elif 'rating-allmusic-1' in classes: rating = 1.0
    elif 'rating-allmusic-2' in classes: rating = 1.5
    elif 'rating-allmusic-3' in classes: rating = 2.0
    elif 'rating-allmusic-4' in classes: rating = 2.5
    elif 'rating-allmusic-5' in classes: rating = 3.0
    elif 'rating-allmusic-6' in classes: rating = 3.5
    elif 'rating-allmusic-7' in classes: rating = 4.0
    elif 'rating-allmusic-8' in classes: rating = 4.5
    elif 'rating-allmusic-9' in classes: rating = 5.0
    return rating

In [501]:
def get_track_data(url):
    """
    This function expects a track URL as a parameter. It returns information about the track
    as a dictionary"""
    overview_page = get_page_data(url)
    attributes_page = get_page_data(str(url +  '/attributes'))
        
    track_title = overview_page.find('h1').text.strip()
    track_composers = [x.a.text for x in overview_page.find_all('p', class_ = 'song-composer')]
    if attributes_page.find('div', class_ = 'attribute-tab-genres'):
        genres = [x.text[:-4] for x in attributes_page.find('div', class_ = 'attribute-tab-genres').find_all('a')]
    styles = [x.text[:-4] for x in attributes_page.find('div', class_ = 'attribute-tab-styles').find_all('a')] 
    moods = [x.text[:-4] for x in attributes_page.find('div', class_ = 'attribute-tab-moods').find_all('a')] 
    themes = [x.text[:-4] for x in attributes_page.find('div', class_ = 'attribute-tab-themes').find_all('a')] 
        
    return {
        'track_title' : track_title,
        'track_composers' : track_composers,
        'track_genres':genres, 
        'track_styles':styles, 
        'track_moods':moods, 
        'track_themes':themes
    }
        

In [594]:
def get_data(url):
    """
    This function expects an album page URL. It will scrape all data about the album and all tracks from that album.
    It will return a list of all tracks from that album with its corresponding information as a dictionary"""

    # ALBUM data
    album_page = get_page_data(url)
    album_title = album_page.find('h1', class_ = 'album-title').text.strip()
    if album_title not in ['Ten', 'Vs.', 'Vitalogy', 'No Code', 'Yield', 'Binaural', 'Riot Act', 'Pearl Jam', 'Backspacer', 'Lightning Bolt' ,'Gigaton', 'Last Kiss', 'Merkinball']:
        return []
    album_release_date = [album_page.find('div', class_ = 'release-date').span.text if album_page.find('div', class_ = 'release-date') else '']
    album_duration = [album_page.find('div', class_ = 'duration').text.strip() if album_page.find('div', class_ = 'duration') else None ]
    album_genre = [album_page.find('div', class_ = 'genre').div.a.text if album_page.find('div', class_ = 'genre') else '']
    album_styles = [[a.text for a in album_page.find('div', class_ = 'styles').find_all('a')] if album_page.find('div', class_ = 'styles') else '']
    album_recording_date = [album_page.find('div', class_ = 'recording-date').div.text if album_page.find('div', class_ = 'recording-date') else '']
    album_recording_location = [album_page.find('div', class_ = 'recording-location').li.text.strip() if album_page.find('div', class_ = 'recording-location') else '']
    album_moods = [x.text.strip() for x in album_page.find_all('span', class_ = 'mood')]
    album_themes = [x.text.strip() for x in album_page.find_all('span', class_ = 'theme')]
    album_rating = calculate_rating(album_page.find('div', class_ = 'allmusic-rating').get('class'))

    # TRACK data
    all_tracks_data = []
    track_links = get_track_page_links(album_page)
    if len(track_links) > 0:
        for track_link in track_links:
            all_tracks_data.append(get_track_data(track_link))

    # FINAL data
    final_data = []
    
    for track in all_tracks_data:
        final_data.append({
            'track_title' : track['track_title'],
            'track_composers' : track['track_composers'],
            'track_genres': track['track_genres'], 
            'track_styles': track['track_styles'], 
            'track_moods': track['track_moods'], 
            'track_themes': track['track_themes'],
            'album_title': album_title,
            'album_release_date': album_release_date,
            'album_duration' : album_duration,
            'album_genres' : album_genre,
            'album_styles' : album_styles,
            'album_recording_date' : album_recording_date,
            'album_recording_location' : album_recording_location,
            'album_moods' : album_moods,
            'album_themes' : album_themes,
            'album_rating' : album_rating
        })
    return final_data

### Fetching all data from AllMusic

In [597]:
all_tracks  = []
for i in range(len(album_links)):
    all_tracks.extend(get_data(album_links[i]))

### Cleaning The Data

In [598]:
df = pd.DataFrame(all_tracks)
allmusic = df

In [599]:
allmusic.drop('album_genres', axis=1, inplace=True)

In [600]:
print(f'Columns:\t{allmusic.shape[1]}\nRows:\t\t{allmusic.shape[0]}')

Columns:	15
Rows:		144


In [601]:
allmusic['track_composers'] = allmusic['track_composers'].apply(lambda x: ', '.join(x))
allmusic['track_genres'] = allmusic['track_genres'].apply(lambda x: ', '.join(x))
allmusic['track_styles'] = allmusic['track_styles'].apply(lambda x: ', '.join(x))
allmusic['track_moods'] = allmusic['track_moods'].apply(lambda x: ', '.join(x))
allmusic['track_themes'] = allmusic['track_themes'].apply(lambda x: ', '.join(x))
allmusic['album_duration'] = allmusic['album_duration'].apply(lambda x: str(x[0])[9:])
allmusic['album_styles'] = allmusic['album_styles'].apply(lambda x: ''.join(x[0]))
allmusic['album_recording_date'] = allmusic['album_recording_date'].apply(lambda x: ', '.join(x))
allmusic['album_release_date'] = allmusic['album_release_date'].apply(lambda x: str(x)[-6:-2])
allmusic['album_recording_location'] = allmusic['album_recording_location'].apply(lambda x: ', '.join(x))
allmusic['album_moods'] = allmusic['album_moods'].apply(lambda x: ', '.join(x))
allmusic['album_themes'] = allmusic['album_themes'].apply(lambda x: ', '.join(x))

In [602]:
# Cleaning the missing data format
allmusic.loc[df['track_genres'] == 'Would you like to contrib', 'track_genres'] = ''
allmusic.loc[df['track_styles'] == 'Would you like to contrib', 'track_styles'] = ''
allmusic.loc[df['track_moods'] == 'Would you like to contrib', 'track_moods'] = ''
allmusic.loc[df['track_themes'] == 'Would you like to contrib', 'track_themes'] = ''

In [603]:
allmusic

Unnamed: 0,track_title,track_composers,track_genres,track_styles,track_moods,track_themes,album_title,album_release_date,album_duration,album_styles,album_recording_date,album_recording_location,album_moods,album_themes,album_rating
0,Once,"Stone Gossard, Eddie Vedder",Pop/Rock,"Alternative Pop/Rock, Alternative/Indie Rock, ...","Aggressive, Angry, Angst-Ridden, Bitter, Bitte...","Introspection, Empowerment, Revolution",Ten,1991,50:46,Alternative Pop/RockAlternative/Indie RockGrun...,"March 27, 1991 - April 26, 1991","London Bridge Studios, Seattle, WA","Angst-Ridden, Brooding, Aggressive, Angry, Bit...","Empowerment, Revolution, Solitude",5.0
1,Even Flow,"Stone Gossard, Eddie Vedder",Pop/Rock,"Alternative Pop/Rock, Grunge, Alternative/Indi...","Earnest, Melancholy, Reflective, Searching, An...","Freedom, Cool & Cocky, Empowerment, Guys Night...",Ten,1991,50:46,Alternative Pop/RockAlternative/Indie RockGrun...,"March 27, 1991 - April 26, 1991","London Bridge Studios, Seattle, WA","Angst-Ridden, Brooding, Aggressive, Angry, Bit...","Empowerment, Revolution, Solitude",5.0
2,Alive,"Stone Gossard, Eddie Vedder",Pop/Rock,"Grunge , Alternative Pop/Rock, Alternative/Ind...","Aggressive, Angst-Ridden, Bittersweet, Broodin...","Empowerment, Revolution, Reminiscing",Ten,1991,50:46,Alternative Pop/RockAlternative/Indie RockGrun...,"March 27, 1991 - April 26, 1991","London Bridge Studios, Seattle, WA","Angst-Ridden, Brooding, Aggressive, Angry, Bit...","Empowerment, Revolution, Solitude",5.0
3,Why Go,"Jeff Ament, Eddie Vedder",Pop/Rock,"Alternative Pop/Rock, Alternative/Indie Rock, ...","Aggressive, Angry, Angst-Ridden, Bitter, Brood...","Empowerment, Revolution",Ten,1991,50:46,Alternative Pop/RockAlternative/Indie RockGrun...,"March 27, 1991 - April 26, 1991","London Bridge Studios, Seattle, WA","Angst-Ridden, Brooding, Aggressive, Angry, Bit...","Empowerment, Revolution, Solitude",5.0
4,Black,"Stone Gossard, Eddie Vedder",Pop/Rock,"Alternative Pop/Rock, Alternative/Indie Rock, ...","Angst-Ridden, Brooding, Earnest, Fiery, Intens...","Feeling Blue, Heartache, Introspection, Regret...",Ten,1991,50:46,Alternative Pop/RockAlternative/Indie RockGrun...,"March 27, 1991 - April 26, 1991","London Bridge Studios, Seattle, WA","Angst-Ridden, Brooding, Aggressive, Angry, Bit...","Empowerment, Revolution, Solitude",5.0
5,Jeremy,"Jeff Ament, Eddie Vedder",Pop/Rock,"Grunge , Hard Rock , Alternative Pop/Rock, Alt...","Angst-Ridden, Bitter, Brooding, Cathartic, Fie...",,Ten,1991,50:46,Alternative Pop/RockAlternative/Indie RockGrun...,"March 27, 1991 - April 26, 1991","London Bridge Studios, Seattle, WA","Angst-Ridden, Brooding, Aggressive, Angry, Bit...","Empowerment, Revolution, Solitude",5.0
6,Oceans,"Jeff Ament, Stone Gossard, Eddie Vedder",Pop/Rock,"Alternative Pop/Rock, Alternative/Indie Rock, ...","Angst-Ridden, Atmospheric, Bittersweet, Dramat...",,Ten,1991,50:46,Alternative Pop/RockAlternative/Indie RockGrun...,"March 27, 1991 - April 26, 1991","London Bridge Studios, Seattle, WA","Angst-Ridden, Brooding, Aggressive, Angry, Bit...","Empowerment, Revolution, Solitude",5.0
7,Porch,Eddie Vedder,Pop/Rock,"Alternative Pop/Rock, Alternative/Indie Rock, ...","Aggressive, Angst-Ridden, Autumnal, Cathartic,...","Empowerment, Revolution, In Love",Ten,1991,50:46,Alternative Pop/RockAlternative/Indie RockGrun...,"March 27, 1991 - April 26, 1991","London Bridge Studios, Seattle, WA","Angst-Ridden, Brooding, Aggressive, Angry, Bit...","Empowerment, Revolution, Solitude",5.0
8,Garden,"Jeff Ament, Stone Gossard, Eddie Vedder",Pop/Rock,"Alternative Pop/Rock, Alternative/Indie Rock, ...","Angst-Ridden, Bittersweet, Brooding, Cathartic...",,Ten,1991,50:46,Alternative Pop/RockAlternative/Indie RockGrun...,"March 27, 1991 - April 26, 1991","London Bridge Studios, Seattle, WA","Angst-Ridden, Brooding, Aggressive, Angry, Bit...","Empowerment, Revolution, Solitude",5.0
9,Deep,"Jeff Ament, Stone Gossard, Eddie Vedder",Pop/Rock,"Alternative Pop/Rock, Alternative/Indie Rock, ...","Aggressive, Angst-Ridden, Bitter, Brooding, Ca...",,Ten,1991,50:46,Alternative Pop/RockAlternative/Indie RockGrun...,"March 27, 1991 - April 26, 1991","London Bridge Studios, Seattle, WA","Angst-Ridden, Brooding, Aggressive, Angry, Bit...","Empowerment, Revolution, Solitude",5.0


In [604]:
# Cleaning the track names

indices = [71, 98, 113, 116, 117, 128, 134, 139]
allmusic.loc[indices, 'track_title'] = allmusic.loc[indices, 'track_title'].str.title()

allmusic.loc[91, 'track_title'] = 'Bu$hleaguer'
allmusic.loc[93, 'track_title'] = 'All or None'
allmusic.loc[68, 'track_title'] = 'Gods\' Dice'
allmusic.loc[66, 'track_title'] = 'Soldier of Love'
allmusic.loc[53, 'track_title'] = 'Faithful'
allmusic.loc[59, 'track_title'] = 'Red Bar'
allmusic.loc[25, 'track_title'] = 'Not for You'
allmusic.loc[36, 'track_title'] = 'Stupidmop'
allmusic.loc[122, 'track_title'] = 'My Father\'s Son'
allmusic.loc[37, 'track_title'] = 'I Got Id'

# Cleaning album names

allmusic.loc[allmusic['album_title'] == 'Merkinball', 'album_title'] = 'Merkin Ball'

## Saving The Data

In [605]:
allmusic.to_csv('pearl_allmusic.csv', index=False)

In [606]:
pearl_v2 = pearl_v1.merge(allmusic, how='left', left_on=['track_name', 'album_name'], right_on=['track_title', 'album_title']).drop(['track_title', 'album_title'], axis=1)
pearl_v2.to_csv('pearl_v2.csv', index=False)

# Billboard Data Extraction

In [512]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import pandas as pd

In [513]:
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))

driver.get("https://www.billboard.com/artist/pearl-jam/")

In [514]:
def extract_data(chart_name = ""):
    data = []
    
    table = driver.find_element(By.CLASS_NAME, 'artist-chart-history-items')
    rows = table.find_elements(By.XPATH, '*')
    
    for row in rows:
        track = {}
        track_name = row.find_element(By.ID, 'title-of-a-story')
        debut_date = row.find_element(By.CLASS_NAME, 'artist-chart-row-debut-date')
        peak_pos = row.find_element(By.CLASS_NAME, 'artist-chart-row-peak-pos')
        peak_date = row.find_element(By.CLASS_NAME, 'artist-chart-row-peak-date')
        weaks_on_chart = row.find_element(By.CLASS_NAME, 'artist-chart-row-week-on-chart')

        track['chart'] = chart_name
        track['track_name'] = track_name.text
        track['debut_date']  = debut_date.text
        track['peak_pos'] = peak_pos.text
        track['peak_date'] = peak_date.text
        track['weaks_on_chart'] = weaks_on_chart.text
        data.append(track)
    
    return data

In [522]:
import time

driver.refresh()

filter = driver.find_element(By.XPATH, '//*[@id="artist-chart-selector"]')
current_li = 1

all_data = []

while(True):
    if current_li == 55:
        break
        
    filter.click()

    time.sleep(1)
    
    li_path = f'//*[@id="main-wrapper"]/main/div[2]/div[2]/div/div[2]/div[1]/nav/ul/li[{current_li}]/a'
    new_li = driver.find_element(By.XPATH, li_path)
    new_li.click()

    time.sleep(1)

    print(f"Chart: {new_li.get_attribute('text').strip()}", end='\r')
    
    all_data.extend(extract_data(chart_name=new_li.get_attribute('text').strip()))
    
    current_li += 1
driver.refresh()

Chart: Rock & Alternative Producersrtm Rock Songsndex

In [751]:
import pandas as pd

bill = pd.DataFrame(all_data)

In [752]:
# Taking into consideration only song charts

bill['chart'].value_counts()
useful_charts = [
    'Mainstream Rock Airplay',
    'Alternative Airplay',
    'Radio Songs', 
    'LyricFind U.S.',
    'Billboard Hot 100',
    'Rock Digital Song Sales',
    'Hot Alternative Songs',
    'LyricFind Global',
    'Alternative Digital Song Sales',
    'Billboard Global 200',
    'Canadian Digital Song Sales',
    'Hot Rock Songs',
    'Hot Rock & Alternative Songs',
    'Digital Song Sales',
    'Rock Streaming Songs',
    'Alternative Streaming Songs'
    ]
indices = bill.loc[bill['chart'].isin(useful_charts),].index

bill = bill.loc[indices,]
bill.reset_index(drop=True, inplace=True)

## Cleaning Billboard Data

In [753]:
# Indices of 'I Got Id/Long Road'
indices = bill.loc[bill['track_name'] == 'I Got Id/Long Road', ['chart', 'debut_date', 'peak_pos', 'peak_date', 'weaks_on_chart']].index
for i in indices:
    new_row = pd.DataFrame(bill.loc[i,]).transpose()
    track_names = new_row['track_name'].str.split('/')
    bill.loc[i, 'track_name'] = list(track_names)[0][0]
    new_row['track_name'] = list(track_names)[0][1]
    bill = pd.concat([bill, new_row])
  

In [754]:
bill.reset_index(inplace=True)

In [755]:
# Cleaning Song Names

# bill.loc[bill['track_name'].str.lower().str.contains('given to fly'), 'track_name'] = 'Given to Fly'
bill.loc[[33, 37], 'track_name'] = 'Last Kiss'

## Saving Billboard Data

In [756]:
bill.to_csv('pearl_billboard.csv', index=False)

## Creating Billboard Charts Columns

In [757]:
bill['chart'] = bill['chart'].apply(lambda x: x.replace('\xa0',' '))
pearl = pd.read_csv('pearl_v2.csv')
pearl.drop_duplicates(subset=['track_name', 'album_name', 'release_year'], inplace=True)

In [758]:
for i in range(bill.shape[0]):

    if pearl.loc[pearl['track_name'].str.title() == bill.loc[i,'track_name'],].index.size == 1:
        # If song has only one occurence in pearl_v2 then input Billboard data there
        chart = bill.loc[i,'chart'].lower().replace(' ','_')
        track = bill.loc[i,'track_name']
        index = pearl.loc[pearl['track_name'].str.title() == bill.loc[i,'track_name'],].index[0]

        if chart + '_debut_date' not in pearl.columns: pearl[chart + '_debut_date'] = ''
        pearl[chart + '_debut_date'][index] = bill.loc[i,'debut_date']

        if chart + '_peak_pos' not in pearl.columns: pearl[chart + '_peak_pos'] = ''
        pearl[chart + '_peak_pos'][index] = bill.loc[i,'peak_pos']

        if chart + '_peak_date' not in pearl.columns: pearl[chart + '_peak_date'] = ''
        pearl[chart + '_peak_date'][index] = bill.loc[i,'peak_date']

        if chart + '_weaks_on_chart' not in pearl.columns: pearl[chart + '_weaks_on_chart'] = ''
        pearl[chart + '_weaks_on_chart'][index] = bill.loc[i,'weaks_on_chart']


In [759]:
pearl.to_csv('pearl_v3.csv', index=False)