# MusicBrainz artist lookup

To see this analysis live, check out my article ["Analyzing Last.fm Listening History"](http://geoffboeing.com/2016/05/analyzing-lastfm-history/)

Get artist information, including place name, for each artist that has a MusicBrainz ID in the data set generated by the [lastfm_downloader](lastfm_downloader.ipynb) notebook.

Documentation:
 - Web service: https://wiki.musicbrainz.org/Development/XML_Web_Service/Version_2/Search
 - Artist entities: https://musicbrainz.org/doc/Artist
 - Area entities: https://musicbrainz.org/doc/Area

Sample queries:
 - Artist: https://musicbrainz.org/ws/2/artist/d4659efb-b8eb-4f03-95e9-f69ce35967a9
 - Area: https://musicbrainz.org/ws/2/area/0a70f24b-1263-4341-8d70-17b8df84154f?inc=area-rels

In [1]:
import pandas as pd, requests, time, json
import logging as lg, datetime as dt

# seconds to sleep after the API reports that the rate limit was exceeded
pause_exceeded_rate = 19

# seconds to sleep before every API request; make_request raises this by 0.1
# each time the API reports that the rate limit was exceeded
pause_standard = 0.1

In [2]:
# URL templates for the musicbrainz web service (each takes one .format() argument)
# and the HTTP headers sent with every request
artist_name_url = u'https://musicbrainz.org/ws/2/artist/?query=artist:{}&fmt=json'  # search by artist name
artist_id_url = u'https://musicbrainz.org/ws/2/artist/{}?fmt=json'  # look up an artist by id
area_id_url = u'https://musicbrainz.org/ws/2/area/{}?inc=area-rels&fmt=json'  # look up an area and its related areas
headers = {
    'User-Agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/39.0.2171.95 Safari/537.36'),
}

In [3]:
# create a logger to capture progress
import os

log = lg.getLogger('mb')

# the handler_set flag keeps a re-run of this cell from attaching a second
# file handler (which would write every message twice)
if not getattr(log, 'handler_set', None):
    # FileHandler does not create missing folders, so make sure logs/ exists
    if not os.path.isdir('logs'):
        os.makedirs('logs')

    # one log file per run, named with the start timestamp
    todays_date = dt.datetime.today().strftime('%Y_%m_%d_%H_%M_%S')
    log_filename = 'logs/mb_{}.log'.format(todays_date)
    handler = lg.FileHandler(log_filename)
    formatter = lg.Formatter('%(asctime)s %(levelname)s %(name)s %(message)s')
    handler.setFormatter(formatter)
    log.addHandler(handler)
    log.setLevel(lg.INFO)
    log.handler_set = True

In [4]:
# mark the start of this run in the log file
start_message = 'music brainz downloader script started'
log.info(start_message)

## Define functions

In [5]:
# make a http request to musicbrainz api and return the result
def make_request(url, headers=headers, attempt_count=0):
    """
    Send a GET request to the musicbrainz api, retrying while it answers 503.

    Sleeps `pause_standard` seconds before every attempt. When a 503 response
    says the rate limit was exceeded, the module-level `pause_standard` is
    raised by 0.1 seconds and the function sleeps `pause_exceeded_rate`
    seconds before the next attempt.

    url: the address to request
    headers: dict of HTTP headers, sent with every attempt (including retries)
    attempt_count: number of attempts already made for this url

    Returns a dict {'status_code': ..., 'json': ...} on HTTP 200. Returns None
    (after logging an error) for any other non-503 status, or once the
    attempt limit has been used up.
    """
    global pause_standard

    # bound the retries so a long outage ends with a logged failure for this
    # url instead of unbounded retrying
    max_attempts = 100

    while attempt_count < max_attempts:
        attempt_count += 1

        time.sleep(pause_standard)
        log.info('request: {}'.format(url))
        response = requests.get(url, headers=headers)

        if response.status_code == 200: #if status OK
            return {'status_code':response.status_code, 'json':response.json()}

        if response.status_code != 503: #if other status code, log info and return None for caller to handle
            # an error body is not guaranteed to be JSON, so fall back to the raw text
            try:
                details = response.json()
            except ValueError:
                details = repr(response.text)
            log.error('make_request error code {} {}'.format(response.status_code, details))
            return None

        # status 503: server busy or rate limit exceeded
        try:
            rate_limited = 'exceeding the allowable rate limit' in response.json()['error']
        except (ValueError, KeyError, TypeError):
            # body is not JSON, or has no usable 'error' entry: treat as plain server-busy
            rate_limited = False

        if rate_limited:
            pause_standard = pause_standard + 0.1
            log.warning('exceeded allowable rate limit, increased pause_standard to {} seconds'.format(pause_standard))
            time.sleep(pause_exceeded_rate)

        log.warning('request failed with status_code 503, so we will try it again')

    log.error('make_request gave up on {} after {} attempts'.format(url, attempt_count))
    return None

In [6]:
# query the musicbrainz api for an artist's name and return the resulting id
def get_artist_id_by_name(name):
    """
    Search the musicbrainz api for `name` and return the id of the first match.

    name: the artist name to search for

    Returns the 'id' of the first entry in the response's 'artists' list, or
    None if the request failed or the response held no usable match (the
    latter is logged as an error).
    """
    try:
        from urllib.parse import quote  # Python 3
    except ImportError:
        from urllib import quote  # Python 2

    # percent-encode the name so characters such as '&', '#' or '+' stay part
    # of the query value instead of being read as URL syntax
    if not isinstance(name, bytes):
        name = u'{}'.format(name).encode('utf-8')
    response = make_request(artist_name_url.format(quote(name, safe='')))

    if response is None:
        return None

    try:
        return response['json']['artists'][0]['id']
    except (KeyError, IndexError, TypeError):
        # no 'artists' key, an empty result list, or an unexpected payload shape
        log.error('get_artist_id_by_name error: {}'.format(response))
        return None

In [7]:
# get an artist object from the musicbrainz api by the musicbrainz artist id
def get_artist_by_id(artist_id):
    """
    Fetch one artist from the musicbrainz api and flatten it into a dict.

    artist_id: the musicbrainz artist id to look up

    Returns a dict with the keys id, name, type, gender, country, begin_date,
    end_date, area_id, area_name, area_name_full, area_latlng, begin_area_id,
    begin_area_name, begin_area_name_full, begin_area_latlng, place, place_id,
    place_full and place_latlng. Fields that are absent from the response, and
    the *_full and *_latlng fields (which this function never fills), are None.
    Returns None if the request failed or the response could not be read (the
    latter is logged as an error).
    """
    response = make_request(artist_id_url.format(artist_id))
    if response is None:
        return None

    try:
        result = response['json']

        # optional sections: treat a missing or null section as empty so that
        # one absent field does not discard the whole artist
        life_span = result.get('life-span') or {}
        area = result.get('area') or {}
        begin_area = result.get('begin_area') or {}

        data = {'id':artist_id,
                'name':result['name'],
                'type':result.get('type'),
                'gender':result.get('gender'),
                'country':result.get('country'),
                'begin_date':life_span.get('begin'),
                'end_date':life_span.get('end'),
                'area_id':None,
                'area_name':None,
                'area_name_full':None,
                'area_latlng':None,
                'begin_area_id':None,
                'begin_area_name':None,
                'begin_area_name_full':None,
                'begin_area_latlng':None,
                'place':None,
                'place_id':None,
                'place_full':None,
                'place_latlng':None}

        # an area is only used when both its id and its name are available
        if 'id' in area and 'name' in area:
            data['area_id'] = area['id']
            data['area_name'] = area['name']
        if 'id' in begin_area and 'name' in begin_area:
            data['begin_area_id'] = begin_area['id']
            data['begin_area_name'] = begin_area['name']

        # populate place with begin_area_name if it's not null, else area_name if it's not null, else None
        if data['begin_area_name'] is not None:
            data['place'] = data['begin_area_name']
            data['place_id'] = data['begin_area_id']
        elif data['area_name'] is not None:
            data['place'] = data['area_name']
            data['place_id'] = data['area_id']

        return data

    except (KeyError, TypeError, AttributeError):
        # response without a 'name', or a payload that is not the expected dict shape
        log.error('get_artist_by_id error: {}'.format(response))
        return None

In [8]:
# get details of an 'area' from the musicbrainz api by area id
def get_area(area_id, area_str=''):
    """
    Look up one area and find the area it is 'part of'.

    area_id: the musicbrainz area id to look up
    area_str: the place name built so far; when empty it is started with this
        area's own name

    Returns a tuple (parent_area_id, area_str):
      - when a relation with direction 'backward' and type 'part of' exists,
        the id of that related area and area_str extended with ', <its name>'
        (the first such relation wins)
      - otherwise (None, area_str); a response with no 'relations' key at all
        is also logged as an error
    Returns None (not a tuple) if the request failed or the response could not
    be read, so a caller that unpacks the result raises in that case.
    """
    response = make_request(area_id_url.format(area_id))
    if response is None:
        return None

    try:
        result = response['json']

        if area_str == '':
            area_str = result['name']

        if 'relations' not in result:
            log.error('get_area no relations error: {}'.format(result))
            return None, area_str

        for relation in result['relations']:
            if relation['direction']=='backward' and relation['type']=='part of':
                parent_area = relation['area']
                return parent_area['id'], u'{}, {}'.format(area_str, parent_area['name'])

        # relations exist but none points to a containing area
        return None, area_str

    except (KeyError, TypeError):
        # a field this function reads is missing, or the payload has an unexpected shape
        log.error('get_area error: {}'.format(response))
        return None

In [9]:
# walk up the api's area hierarchy one get_area call at a time, extending the place name at each step
def get_place_full_name_by_id(area_id):
    """
    Build the full place name for an area by repeatedly calling get_area.

    area_id: id of the area to start from; None yields an empty string

    Each get_area call returns the next area id to visit and the name built so
    far; the loop stops when that id is None and returns the accumulated name.
    """
    full_name = ''
    current_id = area_id
    while True:
        if current_id is None:
            return full_name
        current_id, full_name = get_area(current_id, full_name)

In [10]:
# create a dataframe of artist details and place info from a list of artist IDs
def make_artists_df(artist_ids, row_labels=None, df=None, status_interval=10, save_path='data/mb.csv'):
    """
    Look up every artist id and collect the results as rows of a dataframe.

    artist_ids: sequence of musicbrainz artist ids
    row_labels: row label to use for each artist id (same order); defaults to
        0..len(artist_ids)-1
    df: existing dataframe to add/overwrite rows in; a new one is created when
        this is not a DataFrame
    status_interval: the dataframe is saved whenever a row label is a multiple
        of this number
    save_path: CSV file the dataframe is saved to, both periodically and at the end

    A row that fails is logged and skipped, so the returned dataframe can be
    missing labels or hold rows whose place_full was never filled in.

    Returns the dataframe.
    """
    # create a list of row labels if caller didn't pass one in
    if row_labels is None:
        row_labels = range(len(artist_ids))

    # create a new dataframe if caller didn't pass an existing one in
    cols = ['id', 'name', 'type', 'gender', 'country', 'begin_date', 'end_date', 'begin_area_name',
            'begin_area_name_full', 'begin_area_latlng', 'area_name', 'area_name_full', 'area_latlng',
            'place_id', 'place', 'place_full', 'place_latlng']
    if not isinstance(df, pd.DataFrame):
        df = pd.DataFrame(columns=cols)

    start_time = time.time()
    for artist_id, n in zip(artist_ids, row_labels):

        if n % status_interval == 0:
            # save the dataframe every nth row
            df.to_csv(save_path, index=False, encoding='utf-8')

        try:
            # get the artist info object
            artist = get_artist_by_id(artist_id)
            if artist is None:
                # fail with a readable message rather than a TypeError from indexing None
                raise ValueError('no artist details returned for artist_id={}'.format(artist_id))

            # create (or update) a df row containing the data from this artist object
            df.loc[n] = [ artist[col] for col in cols ]
            log.info('successfully got artist details for label={}, artist_id={}'.format(n, artist_id))

            # update the row's place_full cell with the full place name
            df.loc[n, 'place_full'] = get_place_full_name_by_id(artist['place_id'])
            log.info('successfully created place_full for label={}, artist_id={}'.format(n, artist_id))

        except Exception as e:
            # best effort: log the failure and move on to the next artist
            log.error('row #{} failed: {}'.format(n, e))

    df.to_csv(save_path, index=False, encoding='utf-8')
    finish_time = time.time()
    message = 'processed {:,} artists in {:,} seconds'.format(len(artist_ids), round(finish_time-start_time, 2))
    log.info(message)
    print(message)

    return df

## Test it with a demo

In [11]:
# demo: resolve an artist name to an id, fetch the artist, then build its full place name
name = 'david bowie'
start_time = time.time()

artist_id = get_artist_id_by_name(name)
artist = get_artist_by_id(artist_id)
place_full = get_place_full_name_by_id(artist['place_id'])
artist['place_full'] = place_full

elapsed = round(time.time()-start_time, 2)
log.info('demo test finished in {:,} seconds'.format(elapsed))
artist['place_full']

u'Brixton, Lambeth, London, England, United Kingdom'

## Now run it

In [12]:
# read the lastfm scrobble history and keep each distinct, non-null artist mbid
scrobbles = pd.read_csv('data/lastfm_scrobbles.csv', encoding='utf-8')
artist_mbids = scrobbles['artist_mbid']
artist_ids = artist_mbids.dropna().unique()
len(artist_ids)

12511

In [13]:
# look up every artist id; rows are labeled 0..len(artist_ids)-1
df = make_artists_df(artist_ids=artist_ids)

processed 12,511 artists in 75,052.51 seconds


In [14]:
# preview the first five rows
df.head(n=5)

Unnamed: 0,id,name,type,gender,country,begin_date,end_date,begin_area_name,begin_area_name_full,begin_area_latlng,area_name,area_name_full,area_latlng,place_id,place,place_full,place_latlng
0,cdc0fff7-54cf-4052-a283-319b648670fd,Prince,,,JP,,,,,,Japan,,,2db42837-c832-3c27-b4a3-08198f75693c,Japan,Japan,
1,bc86d48c-5393-4436-bb5d-2c214d07a676,Brenton Wood,Person,Male,US,1941-07-26,,,,,United States,,,489ce91b-6658-3307-9877-795b68554c98,United States,United States,
2,4a8d9623-4d6c-4b7c-8dc5-5d5319ab8a20,Jean Knight,Person,Female,US,1943-01-26,,,,,United States,,,489ce91b-6658-3307-9877-795b68554c98,United States,United States,
3,d8df96ae-8fcf-4997-b3e6-e5d1aaf0f69e,The Temptations,Group,,US,1960,,Detroit,,,United States,,,b03ff310-d8e2-45cf-9455-769f76641eb2,Detroit,"Detroit, Wayne County, Michigan, United States",
4,172e1f1a-504d-4488-b053-6344ba63e6d0,Nick Cave & The Bad Seeds,Group,,AU,1983,,Melbourne,,,Australia,,,b4e9352c-8edf-4911-8fa3-e852afa30501,Melbourne,"Melbourne, Victoria, Australia",


## Re-try any failed rows one more time

In [15]:
# first get all the rows missing place_full (ie, row was created but couldn't get full place name)
rows_missing_place_full = list(df[pd.isnull(df['place_full'])].index)

# then get all the row labels missing in the df (due to errors that prevented row creation)
missing_row_labels = [ label for label in range(len(artist_ids)) if label not in df.index ]

# combine the two lists then get the artist mbid for each
row_labels_to_retry = sorted(rows_missing_place_full + missing_row_labels)
artist_ids_to_retry = [ artist_ids[label] for label in row_labels_to_retry ]

message = '{} artists to retry'.format(len(artist_ids_to_retry))
log.info(message)
# print() with a single argument behaves the same on Python 2 and Python 3
print(message)

8 artists to retry


In [16]:
# retry the failed artists, writing into the existing dataframe under their original row labels
df = make_artists_df(artist_ids=artist_ids_to_retry,
                     row_labels=row_labels_to_retry,
                     df=df,
                     status_interval=1)

processed 8 artists in 38.63 seconds


In [17]:
# preview the first five rows after the retry pass
df.head(n=5)

Unnamed: 0,id,name,type,gender,country,begin_date,end_date,begin_area_name,begin_area_name_full,begin_area_latlng,area_name,area_name_full,area_latlng,place_id,place,place_full,place_latlng
0,cdc0fff7-54cf-4052-a283-319b648670fd,Prince,,,JP,,,,,,Japan,,,2db42837-c832-3c27-b4a3-08198f75693c,Japan,Japan,
1,bc86d48c-5393-4436-bb5d-2c214d07a676,Brenton Wood,Person,Male,US,1941-07-26,,,,,United States,,,489ce91b-6658-3307-9877-795b68554c98,United States,United States,
2,4a8d9623-4d6c-4b7c-8dc5-5d5319ab8a20,Jean Knight,Person,Female,US,1943-01-26,,,,,United States,,,489ce91b-6658-3307-9877-795b68554c98,United States,United States,
3,d8df96ae-8fcf-4997-b3e6-e5d1aaf0f69e,The Temptations,Group,,US,1960,,Detroit,,,United States,,,b03ff310-d8e2-45cf-9455-769f76641eb2,Detroit,"Detroit, Wayne County, Michigan, United States",
4,172e1f1a-504d-4488-b053-6344ba63e6d0,Nick Cave & The Bad Seeds,Group,,AU,1983,,Melbourne,,,Australia,,,b4e9352c-8edf-4911-8fa3-e852afa30501,Melbourne,"Melbourne, Victoria, Australia",


In [18]:
# OK, one final check - see how many artist ids did not make it into the final dataframe
# first get all the rows missing place_full (ie, row was created but couldn't get full place name)
rows_missing_place_full = list(df[pd.isnull(df['place_full'])].index)

# then get all the row labels missing in the df (due to errors that prevented row creation)
missing_row_labels = [ label for label in range(len(artist_ids)) if label not in df.index ]

log.info('{} rows are missing place_full'.format(len(rows_missing_place_full)))
log.info('{} labels are missing in the df'.format(len(missing_row_labels)))
# print() with a single argument behaves the same on Python 2 and Python 3
print(len(rows_missing_place_full))
print(len(missing_row_labels))

0
4
