# <font color=red>DATA GATHERING I: MUSIC RELEASES AND THEIR GEOGRAPHICAL ORIGIN</font>

In [145]:
import pandas as pd
import numpy as np
import time
import tqdm
import warnings
import reverse_geocoder #pip install reverse geocoder in console
warnings.filterwarnings('ignore')

## <font color=blue>1) Artists' origin information</font>

### Data from Musicbrainz.org

In [88]:
artists= pd.read_csv('Musicbrainz/Tables_used/artist.txt',sep='\t', header=None, engine='c', usecols=[0,1,3,11,17])
artists.columns = ['artist_id','artist_mbid','artist_name','start_area1', 'start_area2']
artists.head()

Unnamed: 0,artist_id,artist_mbid,artist_name,start_area1,start_area2
0,805192,8972b1c1-6482-4750-b51f-596d2edea8b1,WIK▲N,,
1,371203,49add228-eac5-4de8-836c-d75cde7369c3,"Moutso, Pete",,
2,273232,c112a400-af49-4665-8bba-741531d962a1,Zachary,,
3,101060,ca3f3ee1-c4a7-4bac-a16a-0b888a396c6b,"Silhouettes, The",222.0,7707.0
4,145773,7b4a548e-a01a-49b7-82e7-b49efeb9732c,"Leavitt, Aric",,


In [89]:
#Let's see how many artists we have:
artists['artist_id'].nunique()

1476425

In [90]:
#How much info we have for each artist?
artists.isnull().sum(axis=0)

artist_id            0
artist_mbid          0
artist_name         11
start_area1     808442
start_area2    1274001
dtype: int64

What are the "start_area1" and "start_area2"? If we look at Musicbrainz's field description for each artist (https://musicbrainz.org/doc/Artist), we can see that:

Area: The artist area, as the name suggests, indicates the area with which an artist is primarily identified with. It is often, but not always, its birth/formation country.

We will keep this information as the artist's origin for later.

We need to incorporate as well the table called "artist credit", which gives us the artist credit_id. We will use this field to join later on each release with its artist:

In [91]:
artists_credit= pd.read_csv('Musicbrainz/Tables_used/artist_credit_name.txt',sep='\t', header=None, engine='c', usecols=[0,2,3])
artists_credit.columns = ['credit_id','artist_id','artist_name']
artists_credit.head()

Unnamed: 0,credit_id,artist_id,artist_name
0,578352,578352,Gustav Ruppke
1,273232,273232,Zachary
2,153193,153193,The High Level Ranters
3,32262,32262,Georges Brassens
4,1389968,1171184,Harvard of the South


In [282]:
artists_credit.drop_duplicates(keep='first', inplace=True)

In [285]:
artists_credit.duplicated(subset='credit_id').value_counts()

False    1588033
True     1072366
dtype: int64

In [284]:
len(artists_credit)

2660399

In [93]:
#Let's join the artists with their credit id and verify that the matching is good:
df = pd.merge(artists_credit, artists, how='left', on='artist_id')
df.head()

Unnamed: 0,credit_id,artist_id,artist_name_x,artist_mbid,artist_name_y,start_area1,start_area2
0,578352,578352,Gustav Ruppke,b4f76788-7e6f-41b7-ac7b-dfb67f66282e,"Ruppke, Gustav",,
1,273232,273232,Zachary,c112a400-af49-4665-8bba-741531d962a1,Zachary,,
2,153193,153193,The High Level Ranters,c42eed94-e233-44e2-82b8-3ed6dd9bf318,"High Level Ranters, The",,
3,32262,32262,Georges Brassens,b58165ba-ac55-49a1-8855-caf16c68f5f2,"Brassens, Georges",73.0,46130.0
4,1389968,1171184,Harvard of the South,bf8fefa4-c02d-4ad0-91d9-26f94169488f,Harvard of the South,,


In [94]:
#It looks like it makes sense. Please note that the credit id is sometimes equal to the artist_id, but not always:
df['check'] = df['artist_id'] - df['credit_id']
df['check'].nunique()

1270628

In [95]:
df.isnull().sum(axis=0)

credit_id              0
artist_id              0
artist_name_x         12
artist_mbid            0
artist_name_y         16
start_area1       884971
start_area2      1702108
check                  0
dtype: int64

In [96]:
#We can now get rid of check and the duplicate artist_name column:
df.drop(labels=['check','artist_name_y'], axis=1, inplace=True)
df.head()

Unnamed: 0,credit_id,artist_id,artist_name_x,artist_mbid,start_area1,start_area2
0,578352,578352,Gustav Ruppke,b4f76788-7e6f-41b7-ac7b-dfb67f66282e,,
1,273232,273232,Zachary,c112a400-af49-4665-8bba-741531d962a1,,
2,153193,153193,The High Level Ranters,c42eed94-e233-44e2-82b8-3ed6dd9bf318,,
3,32262,32262,Georges Brassens,b58165ba-ac55-49a1-8855-caf16c68f5f2,73.0,46130.0
4,1389968,1171184,Harvard of the South,bf8fefa4-c02d-4ad0-91d9-26f94169488f,,


In [97]:
df.duplicated(subset='artist_id').any()

True

As we can see, the artist name and first name are separated by comma. In order to be able to follow the next steps, we will reverse the order of the names (ie: first name, family name) and remove the comma:

In [98]:
def reverse(word):
    
    if ',' in word:
        b = word.split(',')
        rev = b[-1::-1]
        out = ' '.join(rev)
        return out
    
    else:
        return word

In [99]:
#We remove the punctuation:
df['artist_name_x'] = df['artist_name_x'].str.replace('#!?()*-%"/\,<>:$@.','')
df.head()

Unnamed: 0,credit_id,artist_id,artist_name_x,artist_mbid,start_area1,start_area2
0,578352,578352,Gustav Ruppke,b4f76788-7e6f-41b7-ac7b-dfb67f66282e,,
1,273232,273232,Zachary,c112a400-af49-4665-8bba-741531d962a1,,
2,153193,153193,The High Level Ranters,c42eed94-e233-44e2-82b8-3ed6dd9bf318,,
3,32262,32262,Georges Brassens,b58165ba-ac55-49a1-8855-caf16c68f5f2,73.0,46130.0
4,1389968,1171184,Harvard of the South,bf8fefa4-c02d-4ad0-91d9-26f94169488f,,


In [100]:
#And we reverse the names:
df['artist_name'] = df['artist_name_x'].apply(lambda x: reverse(str(x)))
df.head()

Unnamed: 0,credit_id,artist_id,artist_name_x,artist_mbid,start_area1,start_area2,artist_name
0,578352,578352,Gustav Ruppke,b4f76788-7e6f-41b7-ac7b-dfb67f66282e,,,Gustav Ruppke
1,273232,273232,Zachary,c112a400-af49-4665-8bba-741531d962a1,,,Zachary
2,153193,153193,The High Level Ranters,c42eed94-e233-44e2-82b8-3ed6dd9bf318,,,The High Level Ranters
3,32262,32262,Georges Brassens,b58165ba-ac55-49a1-8855-caf16c68f5f2,73.0,46130.0,Georges Brassens
4,1389968,1171184,Harvard of the South,bf8fefa4-c02d-4ad0-91d9-26f94169488f,,,Harvard of the South


In [101]:
df.drop(labels=['artist_name_x'], axis=1, inplace=True)

The idea of the visualization is to see where each music genre comes from, so, ideally, we would have to look at the artists origins (start area 1 and 2).

We will try to retrieve, for each artist, the most detailed area we have, by using the subdivisions_all dataframe we generated in the notebook "Areas_Musicbrainz". That dataframe provides us with the coordinates for each area_id that is not a country.

In [102]:
areas = pd.read_csv('Musicbrainz/Tables_used/area.txt',sep='\t', header=None, engine='python', usecols=[0,2,3])
areas.columns = ['area_id','area_name','code_type']
areas.head()

Unnamed: 0,area_id,area_name,code_type
0,15449,Greccio,4.0
1,38,Canada,1.0
2,43,Chile,1.0
3,44,China,1.0
4,36,Cambodia,1.0


In [103]:
sub_coordinates = pd.read_csv('subdivisions_all.csv', sep='\t', header=0, encoding='utf-8')
sub_coordinates.head()

Unnamed: 0,area_id,area_name,country_name,latitude,longitude
0,262,Alaska,United States,64.200841,-149.493673
1,339,Sachsen-Anhalt,Germany,51.950265,11.692273
2,263,Alabama,United States,32.318231,-86.902298
3,261,Maryland,United States,39.045755,-76.641271
4,264,Arkansas,United States,35.20105,-91.831833


In [104]:
#We merge both dataframes, to have all our area_id's (countries and other subdivisions that we couldn't retrieve in
#the previous notebook won't have coordinates in this dataframe):
areas_coordinates = pd.merge(areas, sub_coordinates, how='left', on='area_id')
areas_coordinates.head()

Unnamed: 0,area_id,area_name_x,code_type,area_name_y,country_name,latitude,longitude
0,15449,Greccio,4.0,Greccio,Italy,41.655242,12.989615
1,38,Canada,1.0,,,,
2,43,Chile,1.0,,,,
3,44,China,1.0,,,,
4,36,Cambodia,1.0,,,,


In [105]:
len(areas_coordinates)

117674

In [106]:
areas_coordinates.duplicated(subset='area_id').any()

False

#### Selecting the origin for each artist credit:

1) Merging by start_area1:

In [107]:
df1 = pd.merge(df, areas_coordinates, how='left', left_on='start_area1', right_on='area_id')
df1.head()

Unnamed: 0,credit_id,artist_id,artist_mbid,start_area1,start_area2,artist_name,area_id,area_name_x,code_type,area_name_y,country_name,latitude,longitude
0,578352,578352,b4f76788-7e6f-41b7-ac7b-dfb67f66282e,,,Gustav Ruppke,,,,,,,
1,273232,273232,c112a400-af49-4665-8bba-741531d962a1,,,Zachary,,,,,,,
2,153193,153193,c42eed94-e233-44e2-82b8-3ed6dd9bf318,,,The High Level Ranters,,,,,,,
3,32262,32262,b58165ba-ac55-49a1-8855-caf16c68f5f2,73.0,46130.0,Georges Brassens,73.0,France,1.0,,,,
4,1389968,1171184,bf8fefa4-c02d-4ad0-91d9-26f94169488f,,,Harvard of the South,,,,,,,


In [115]:
#How many coordinates do we have for each artist credit?
df1.notnull().sum(axis=0)

credit_id       2662425
artist_id       2662425
artist_mbid     2662425
start_area1     1777454
start_area2      960317
artist_name     2662425
area_id         1777454
area_name_x     1777454
code_type       1774216
area_name_y      268898
country_name     268898
latitude         268898
longitude        268898
dtype: int64

In [112]:
len(df1)

2662425

In [113]:
retrieved_credit1 = df1[df1['country_name'].notnull()]
len(retrieved_credit1)

268898

In [114]:
pending_credit1 = df1[df1['country_name'].isnull()]
#drop empty columns:
to_drop = ['area_id', 'area_name_x', 'code_type', 'area_name_y', 'country_name', 'latitude', 'longitude']
pending_credit1.drop(labels=to_drop, axis=1, inplace=True)

In [116]:
pending_credit1.head()

Unnamed: 0,credit_id,artist_id,artist_mbid,start_area1,start_area2,artist_name
0,578352,578352,b4f76788-7e6f-41b7-ac7b-dfb67f66282e,,,Gustav Ruppke
1,273232,273232,c112a400-af49-4665-8bba-741531d962a1,,,Zachary
2,153193,153193,c42eed94-e233-44e2-82b8-3ed6dd9bf318,,,The High Level Ranters
3,32262,32262,b58165ba-ac55-49a1-8855-caf16c68f5f2,73.0,46130.0,Georges Brassens
4,1389968,1171184,bf8fefa4-c02d-4ad0-91d9-26f94169488f,,,Harvard of the South


2) Merging by start_area2:

In [117]:
df2 = pd.merge(pending_credit1, areas_coordinates, how='left', left_on='start_area2', right_on='area_id')
df2.head()

Unnamed: 0,credit_id,artist_id,artist_mbid,start_area1,start_area2,artist_name,area_id,area_name_x,code_type,area_name_y,country_name,latitude,longitude
0,578352,578352,b4f76788-7e6f-41b7-ac7b-dfb67f66282e,,,Gustav Ruppke,,,,,,,
1,273232,273232,c112a400-af49-4665-8bba-741531d962a1,,,Zachary,,,,,,,
2,153193,153193,c42eed94-e233-44e2-82b8-3ed6dd9bf318,,,The High Level Ranters,,,,,,,
3,32262,32262,b58165ba-ac55-49a1-8855-caf16c68f5f2,73.0,46130.0,Georges Brassens,46130.0,Sète,3.0,Sète,France,43.892723,3.282763
4,1389968,1171184,bf8fefa4-c02d-4ad0-91d9-26f94169488f,,,Harvard of the South,,,,,,,


In [118]:
#How many coordinates do we have for each artist credit in this second merge?
df2.notnull().sum(axis=0)

credit_id       2393527
artist_id       2393527
artist_mbid     2393527
start_area1     1508556
start_area2      841932
artist_name     2393527
area_id          841932
area_name_x      841932
code_type        841774
area_name_y      806693
country_name     806693
latitude         806693
longitude        806693
dtype: int64

In [119]:
retrieved_credit2 = df2[df2['country_name'].notnull()]

In [120]:
pending_credit2 = df2[df2['country_name'].isnull()]
#drop empty columns:
to_drop = ['area_id', 'area_name_x', 'code_type', 'area_name_y', 'country_name', 'latitude', 'longitude']
pending_credit2.drop(labels=to_drop, axis=1, inplace=True)

In [121]:
pending_credit2.head()

Unnamed: 0,credit_id,artist_id,artist_mbid,start_area1,start_area2,artist_name
0,578352,578352,b4f76788-7e6f-41b7-ac7b-dfb67f66282e,,,Gustav Ruppke
1,273232,273232,c112a400-af49-4665-8bba-741531d962a1,,,Zachary
2,153193,153193,c42eed94-e233-44e2-82b8-3ed6dd9bf318,,,The High Level Ranters
4,1389968,1171184,bf8fefa4-c02d-4ad0-91d9-26f94169488f,,,Harvard of the South
5,145773,145773,7b4a548e-a01a-49b7-82e7-b49efeb9732c,,,Aric Leavitt


### Data from the 1 million songs dataset

Between 2011 and 2012, there was a Music Information Retrieval challenge called "Million Song Dataset". The majority of the data contained was provided by The Echo Nest (today known as Spotify).

At the bottom of the following website, there are links to download the Dataset:

https://labrosa.ee.columbia.edu/millionsong/pages/getting-dataset

As we won't use the whole dataset (just some of the tables), you don't need to download them: they will be attached in the repo.

In [122]:
artists_locations = pd.read_csv('1M_songs/artist_location.csv',sep='<SEP>', header=None, engine='python')
artists_locations.columns = ['artist_id','lat','long','artist_name','location_name']
artists_locations.head()

Unnamed: 0,artist_id,lat,long,artist_name,location_name
0,ARZGXZG1187B9B56B6,-16.96595,-61.14804,Endless Blue,Santa Cruz
1,AR8K6F31187B99C2BC,46.44231,-93.36586,Go Fish,"Twin Cities, MN"
2,ARHJJ771187FB5B581,51.59678,-0.33556,Screaming Lord Sutch,"Harrow, Middlesex, England"
3,ARJ8YLL1187FB3CA93,40.69626,-73.83301,Morton Gould,"Richmond Hill, NY"
4,ARYBAGV11ECC836DAC,43.58828,-79.64372,Crash Parallel,Mississauga


In [123]:
#How many artist are there?
artists_locations['artist_id'].nunique()

13850

In this dataset, there is also another table which provides us with some extra information, especially the artist Musicbrainz's id (which will be very helpful to link with Musicbrainz's dataframe).

In [125]:
metadata = pd.read_csv('1M_songs/track_metadata.csv',sep=',', header=0, engine='python', usecols=['artist_id','artist_mbid'])
metadata.head()

Unnamed: 0,artist_id,artist_mbid
0,ARYZTJS1187B98C555,357ff05d-848a-44cf-b608-cb34b5701ae5
1,ARMVN3U1187FB3A1EB,8d7ef530-a6fd-4f8f-b2e2-74aec765e0f9
2,ARGEKB01187FB50750,3d403d44-36ce-465c-ad43-ae877e65adc4
3,ARNWYLR1187B9B2F9C,12be7648-7094-495f-90e6-df4189d68615
4,AREQDTE1269FB37231,


In [135]:
#We drop the rows without artist_mbid (as we can't link them with our df)
metadata.dropna(subset=['artist_mbid'],axis=0, inplace=True)
#We merge artist_locations and metadata dataframes:
a = pd.merge(artists_locations,metadata,how='left',on='artist_id')
a.dropna(subset=['artist_mbid'],axis=0, inplace=True)
a.head()

Unnamed: 0,artist_id,lat,long,artist_name,location_name,artist_mbid
0,ARZGXZG1187B9B56B6,-16.96595,-61.14804,Endless Blue,Santa Cruz,0bd9755c-c86d-431c-bc28-ef908b8a9821
1,ARZGXZG1187B9B56B6,-16.96595,-61.14804,Endless Blue,Santa Cruz,0bd9755c-c86d-431c-bc28-ef908b8a9821
2,AR8K6F31187B99C2BC,46.44231,-93.36586,Go Fish,"Twin Cities, MN",d4620364-82ec-4c34-9265-a2b72dfa8e3e
3,AR8K6F31187B99C2BC,46.44231,-93.36586,Go Fish,"Twin Cities, MN",d4620364-82ec-4c34-9265-a2b72dfa8e3e
4,AR8K6F31187B99C2BC,46.44231,-93.36586,Go Fish,"Twin Cities, MN",d4620364-82ec-4c34-9265-a2b72dfa8e3e


In [136]:
#We delete the useless columns:
a.drop(labels=['artist_id','artist_name'], axis=1, inplace=True)
#We get rid of the duplicate rows:
a.drop_duplicates(subset='artist_mbid', inplace=True)
a.head()

Unnamed: 0,lat,long,location_name,artist_mbid
0,-16.96595,-61.14804,Santa Cruz,0bd9755c-c86d-431c-bc28-ef908b8a9821
2,46.44231,-93.36586,"Twin Cities, MN",d4620364-82ec-4c34-9265-a2b72dfa8e3e
33,51.59678,-0.33556,"Harrow, Middlesex, England",e1079a78-75d4-4a1a-aef1-0be051386598
64,40.69626,-73.83301,"Richmond Hill, NY",4db4e744-3007-4386-b87d-9653acfe0464
78,43.58828,-79.64372,Mississauga,b0d85cf7-b73b-4a5d-bf31-a82493c3a8a8


Before starting to retrieve geographical information about these artists, we need to know for which ones we don't already have the information in our main dataframe:

In [137]:
list1 = retrieved_credit1.artist_mbid.values.tolist()
list2 = retrieved_credit2.artist_mbid.values.tolist()
final_list = list1 + list2

In [138]:
to_retrieve = a[~a.artist_mbid.isin(final_list)]
len(to_retrieve)

5819

In [139]:
to_retrieve.head()

Unnamed: 0,lat,long,location_name,artist_mbid
0,-16.96595,-61.14804,Santa Cruz,0bd9755c-c86d-431c-bc28-ef908b8a9821
2,46.44231,-93.36586,"Twin Cities, MN",d4620364-82ec-4c34-9265-a2b72dfa8e3e
78,43.58828,-79.64372,Mississauga,b0d85cf7-b73b-4a5d-bf31-a82493c3a8a8
89,59.91228,10.74998,Oslo,276f08c2-4b4b-4e45-bab8-3ebe6066e080
221,38.8991,-77.029,"Washington, D.C.",9320918d-5143-4846-bab8-7ad48ec17d58


As we can see above, the column "location_name" provides us with some geographical information but, for instance, in the first row, we don't really know the country where Santa Cruz is located.

Luckily, we have a pair of coordinates that we can use to retrieve more geographical detail for each row, using our reverse_geocoder:

In [141]:
#We first create a new column called "coords" in which we'll gather both latitude and longitude:
to_retrieve['coords'] = list(zip(to_retrieve.lat, to_retrieve.long))
coords = tuple(to_retrieve['coords'].values.tolist())
#And now we use the reverse_geocoder utility to retrieve info for each pair of coordinates:
address = reverse_geocoder.search(coords)
to_retrieve['address'] = address
to_retrieve.head()

Loading formatted geocoded file...


Unnamed: 0,lat,long,location_name,artist_mbid,coords,address
0,-16.96595,-61.14804,Santa Cruz,0bd9755c-c86d-431c-bc28-ef908b8a9821,"(-16.96595, -61.14804)","{'lat': '-16.43333', 'lon': '-60.9', 'name': '..."
2,46.44231,-93.36586,"Twin Cities, MN",d4620364-82ec-4c34-9265-a2b72dfa8e3e,"(46.44231, -93.36586)","{'lat': '46.53301', 'lon': '-93.71025', 'name'..."
78,43.58828,-79.64372,Mississauga,b0d85cf7-b73b-4a5d-bf31-a82493c3a8a8,"(43.58828, -79.64372)","{'lat': '43.5789', 'lon': '-79.6583', 'name': ..."
89,59.91228,10.74998,Oslo,276f08c2-4b4b-4e45-bab8-3ebe6066e080,"(59.91228, 10.74998)","{'lat': '59.91273', 'lon': '10.74609', 'name':..."
221,38.8991,-77.029,"Washington, D.C.",9320918d-5143-4846-bab8-7ad48ec17d58,"(38.8991, -77.029)","{'lat': '38.89511', 'lon': '-77.03637', 'name'..."


In [143]:
#Let's see how is this new infor formatted:
to_retrieve['address'][0]

OrderedDict([('lat', '-16.43333'),
             ('lon', '-60.9'),
             ('name', 'Concepcion'),
             ('admin1', 'Santa Cruz'),
             ('admin2', ''),
             ('cc', 'BO')])

We'll retrieve the country ISO code for each row (the field in "cc):

In [149]:
#We create an empty column:

to_retrieve['ISO_code'] = np.nan
to_retrieve.reset_index(drop=True, inplace=True)

#And fill it with the info we need:

for i in tqdm.tqdm(range(len(to_retrieve))):
    address = list(to_retrieve['address'][i].items())
    to_retrieve['ISO_code'][i] = address[5][1]


#We check the result:
to_retrieve.head()

100%|██████████| 5819/5819 [35:13<00:00,  2.80it/s]


Unnamed: 0,lat,long,location_name,artist_mbid,coords,address,ISO_code
0,-16.96595,-61.14804,Santa Cruz,0bd9755c-c86d-431c-bc28-ef908b8a9821,"(-16.96595, -61.14804)","{'lat': '-16.43333', 'lon': '-60.9', 'name': '...",BO
1,46.44231,-93.36586,"Twin Cities, MN",d4620364-82ec-4c34-9265-a2b72dfa8e3e,"(46.44231, -93.36586)","{'lat': '46.53301', 'lon': '-93.71025', 'name'...",US
2,43.58828,-79.64372,Mississauga,b0d85cf7-b73b-4a5d-bf31-a82493c3a8a8,"(43.58828, -79.64372)","{'lat': '43.5789', 'lon': '-79.6583', 'name': ...",CA
3,59.91228,10.74998,Oslo,276f08c2-4b4b-4e45-bab8-3ebe6066e080,"(59.91228, 10.74998)","{'lat': '59.91273', 'lon': '10.74609', 'name':...",NO
4,38.8991,-77.029,"Washington, D.C.",9320918d-5143-4846-bab8-7ad48ec17d58,"(38.8991, -77.029)","{'lat': '38.89511', 'lon': '-77.03637', 'name'...",US


In [150]:
#Retrieving the country name via Musicbrainz's ISO codes and areas:
areas_countries = pd.read_csv('Musicbrainz/Tables_used/area.txt',sep='\t', header=None, engine='python', usecols=[0,2,3])
areas_countries.columns = ['area_id','area_name','code_type']

In [151]:
#Let's see the area types we have:
area_types = pd.read_csv('Musicbrainz/Tables_used/area_type.txt',sep='\t', header=None, engine='python', usecols=[1,3,4], error_bad_lines=False)
area_types.columns = ['type','code_type','definition']
area_types.head()

Unnamed: 0,type,code_type,definition
0,Country,1,Country is used for areas included (or previou...
1,Subdivision,2,Subdivision is used for the main administrativ...
2,County,7,County is used for smaller administrative divi...
3,Municipality,4,Municipality is used for small administrative ...
4,City,3,"City is used for settlements of any size, incl..."


We need to keep only the countries, so code_type equal to 1:

In [152]:
countries = areas_countries[areas_countries['code_type'] ==1]
countries.head()

Unnamed: 0,area_id,area_name,code_type
1,38,Canada,1.0
2,43,Chile,1.0
3,44,China,1.0
4,36,Cambodia,1.0
5,78,Gabon,1.0


ISO tables: In order to retrieve the ISO code for the countries, Musicbrainz provides us with a table which contains: area_id and their ISO code. These are international standard codes set by the International organization for Standardization (www.iso.org) and will help us to retrieve the country name via ISO code:

In [153]:
#First, we load the first ISO file:
ISO1 = pd.read_csv('Musicbrainz/Tables_used/iso_3166_1.txt',sep='\t', header=None, engine='python')
ISO1.columns = ['area_id','ISO_code']
ISO1.head()

Unnamed: 0,area_id,ISO_code
0,1,AF
1,2,AL
2,3,DZ
3,4,AS
4,5,AD


In [154]:
#We combine both dataframes:
countries_ISO = pd.merge(countries, ISO1, how='left', on='area_id')
countries_ISO.head()

Unnamed: 0,area_id,area_name,code_type,ISO_code
0,38,Canada,1.0,CA
1,43,Chile,1.0,CL
2,44,China,1.0,CN
3,36,Cambodia,1.0,KH
4,78,Gabon,1.0,GA


In [171]:
#And now we can merge the country name into our "to_retrieve" dataframe:
retrieved_artist = pd.merge(to_retrieve, countries_ISO, how='left', on='ISO_code')
retrieved_artist.head()

Unnamed: 0,lat,long,location_name,artist_mbid,coords,address,ISO_code,area_id,area_name,code_type
0,-16.96595,-61.14804,Santa Cruz,0bd9755c-c86d-431c-bc28-ef908b8a9821,"(-16.96595, -61.14804)","{'lat': '-16.43333', 'lon': '-60.9', 'name': '...",BO,26.0,Bolivia,1.0
1,46.44231,-93.36586,"Twin Cities, MN",d4620364-82ec-4c34-9265-a2b72dfa8e3e,"(46.44231, -93.36586)","{'lat': '46.53301', 'lon': '-93.71025', 'name'...",US,222.0,United States,1.0
2,43.58828,-79.64372,Mississauga,b0d85cf7-b73b-4a5d-bf31-a82493c3a8a8,"(43.58828, -79.64372)","{'lat': '43.5789', 'lon': '-79.6583', 'name': ...",CA,38.0,Canada,1.0
3,59.91228,10.74998,Oslo,276f08c2-4b4b-4e45-bab8-3ebe6066e080,"(59.91228, 10.74998)","{'lat': '59.91273', 'lon': '10.74609', 'name':...",NO,160.0,Norway,1.0
4,38.8991,-77.029,"Washington, D.C.",9320918d-5143-4846-bab8-7ad48ec17d58,"(38.8991, -77.029)","{'lat': '38.89511', 'lon': '-77.03637', 'name'...",US,222.0,United States,1.0


In [172]:
#Did we miss any country?
retrieved_artist.isnull().sum(axis=0)

lat              0
long             0
location_name    3
artist_mbid      0
coords           0
address          0
ISO_code         0
area_id          6
area_name        6
code_type        6
dtype: int64

In [173]:
retrieved_artist[retrieved_artist.area_name.isnull()]

Unnamed: 0,lat,long,location_name,artist_mbid,coords,address,ISO_code,area_id,area_name,code_type
649,25.02698,121.49713,108,4fdd526e-3c9f-43d2-aef6-d8bf7dbe4fd8,"(25.02698, 121.49713)","{'lat': '25.01427', 'lon': '121.46719', 'name'...",TW,,,
1008,-22.96566,18.48617,Detroit,3c34ad84-7628-459e-949d-c27e2ae73231,"(-22.96566, 18.48617)","{'lat': '-22.45', 'lon': '18.96667', 'name': '...",,,,
2692,24.20666,120.85153,Taichung,4a965b5a-1177-4ead-a2a8-932d6f12e734,"(24.20666, 120.85153000000001)","{'lat': '24.25', 'lon': '120.71694', 'name': '...",TW,,,
3306,23.599751,121.023811,Taiwan,300db63b-7614-4642-b306-400c6b54bb20,"(23.599751, 121.02381100000001)","{'lat': '23.74639', 'lon': '120.7525', 'name':...",TW,,,
4426,25.02698,121.49713,108,7332456e-c76e-4362-ad31-d84fd34a4942,"(25.02698, 121.49713)","{'lat': '25.01427', 'lon': '121.46719', 'name'...",TW,,,
5081,23.59975,121.02381,Taiwan,90c879b6-b4d1-4632-b66a-cdd4ed766cde,"(23.59975, 121.02381000000001)","{'lat': '23.74639', 'lon': '120.7525', 'name':...",TW,,,


The file didn't recognize the country codes TW and NA, so we'll input this information manually (please note that Taiwan belongs to China, therefore we'll input that information):

In [174]:
retrieved_artist['area_name'][649] = 'China'
retrieved_artist['area_name'][2692] = 'China'
retrieved_artist['area_name'][3306] = 'China'
retrieved_artist['area_name'][4426] = 'China'
retrieved_artist['area_name'][5081] = 'China'
retrieved_artist['area_name'][1008] = 'Taiwan'

In [175]:
retrieved_artist[retrieved_artist.area_name.isnull()]

Unnamed: 0,lat,long,location_name,artist_mbid,coords,address,ISO_code,area_id,area_name,code_type


In [176]:
#Dropping unnecessary columns before merging with our pending dataframe:
to_drop = ['coords', 'address', 'ISO_code']
retrieved_artist.drop(labels=to_drop, axis=1, inplace=True)
#Renaming columns:
retrieved_artist.rename(columns={'lat':'latitude','long':'longitude','area_name':'country_name', 'location_name':'area_name'}, inplace=True)

In [177]:
#And now we cand merge it with our pending2 dataframe:
df3 = pd.merge(pending_credit2, retrieved_artist, how='left', on='artist_mbid')
df3.head()

Unnamed: 0,credit_id,artist_id,artist_mbid,start_area1,start_area2,artist_name,latitude,longitude,area_name,area_id,country_name,code_type
0,578352,578352,b4f76788-7e6f-41b7-ac7b-dfb67f66282e,,,Gustav Ruppke,,,,,,
1,273232,273232,c112a400-af49-4665-8bba-741531d962a1,,,Zachary,,,,,,
2,153193,153193,c42eed94-e233-44e2-82b8-3ed6dd9bf318,,,The High Level Ranters,,,,,,
3,1389968,1171184,bf8fefa4-c02d-4ad0-91d9-26f94169488f,,,Harvard of the South,,,,,,
4,145773,145773,7b4a548e-a01a-49b7-82e7-b49efeb9732c,,,Aric Leavitt,,,,,,


In [178]:
#How many coordinates do we have for each artist credit in this third merge?
df3.notnull().sum(axis=0)

credit_id       1586844
artist_id       1586844
artist_mbid     1586844
start_area1      720141
start_area2       35239
artist_name     1586844
latitude          29705
longitude         29705
area_name         29703
area_id           29668
country_name      29705
code_type         29668
dtype: int64

Thanks to the 1 Million Songs Dataset, we have been able to retrieve the origin for 29.705 artist credits.

We'll save it in retrieved_credit3 and see what's pending:

In [179]:
retrieved_credit3 = df3[df3['country_name'].notnull()]

In [180]:
pending_credit3 = df3[df3['country_name'].isnull()]
#drop empty columns:
to_drop = ['area_id', 'code_type', 'area_name', 'country_name', 'latitude', 'longitude']
pending_credit3.drop(labels=to_drop, axis=1, inplace=True)

In [181]:
#How many artists do we have without origin information?
pending_credit3['artist_id'].nunique()

809640

### Data from Wikidata Query with SPARQL

https://query.wikidata.org/

1) Musicians

SELECT ?musician ?musicianLabel ?genre ?genreLabel ?MusicBrainz_artist_ID ?place_of_birth ?place_of_birthLabel WHERE {
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
  ?musician wdt:P106 wd:Q639669.
  OPTIONAL { ?musician wdt:P136 ?genre. }
  OPTIONAL { ?musician wdt:P434 ?MusicBrainz_artist_ID. }
  OPTIONAL { ?musician wdt:P19 ?place_of_birth. }
}


--> Export to csv file: query_wikidata_musicians.csv

2) Singers

SELECT ?musician ?musicianLabel ?genre ?genreLabel ?MusicBrainz_artist_ID ?place_of_birth ?place_of_birthLabel WHERE {
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
  ?musician wdt:P106 wd:Q177220.
  OPTIONAL { ?musician wdt:P136 ?genre. }
  OPTIONAL { ?musician wdt:P434 ?MusicBrainz_artist_ID. }
  OPTIONAL { ?musician wdt:P19 ?place_of_birth. }
}

--> Export to csv file: query_wikidata_singers.csv

3) Bands

SELECT ?band ?bandLabel ?genre ?genreLabel ?MusicBrainz_artist_ID ?location_of_formation ?location_of_formationLabel WHERE {
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
  ?band wdt:P31 wd:Q215380.
  OPTIONAL { ?band wdt:P136 ?genre. }
  OPTIONAL { ?band wdt:P434 ?MusicBrainz_artist_ID. }
  OPTIONAL { ?band wdt:P740 ?location_of_formation. }
}

--> Export to csv file: query_wikidata_bands.csv

In [191]:
#Open the files and load them into dataframes with the same column names (to match with our main dataframe later):
musicians = pd.read_csv('wikidata/query_wikidata_musicians.csv',sep=',', encoding='utf-8', usecols=[4,6])
musicians.rename(columns={'MusicBrainz_artist_ID':'artist_mbid','place_of_birthLabel':'area_name'}, inplace=True)
singers = pd.read_csv('wikidata/query_wikidata_singers.csv',sep=',', encoding='utf-8', usecols=[4,6])
singers.rename(columns={'MusicBrainz_artist_ID':'artist_mbid','place_of_birthLabel':'area_name'}, inplace=True)
bands = pd.read_csv('wikidata/query_wikidata_bands.csv',sep=',', encoding='utf-8', usecols=[4,6])
bands.rename(columns={'MusicBrainz_artist_ID':'artist_mbid','location_of_formationLabel':'area_name'}, inplace=True)

In [192]:
bands.head()

Unnamed: 0,artist_mbid,area_name
0,f26c72d3-e52c-467b-b651-679c73d8e1a7,Sacramento
1,f26c72d3-e52c-467b-b651-679c73d8e1a7,Sacramento
2,f26c72d3-e52c-467b-b651-679c73d8e1a7,Sacramento
3,a3cb23fc-acd3-4ce0-8f36-1e5aa6a18432,Dublin
4,a3cb23fc-acd3-4ce0-8f36-1e5aa6a18432,Dublin


In [193]:
#Now we can concatenate the 3 dataframes into one:
wiki_df = pd.concat([musicians, singers, bands])
wiki_df.head()

Unnamed: 0,artist_mbid,area_name
0,,Cherbourg-en-Cotentin
1,b972f589-fb0e-474e-b64a-803b0364fa75,Salzburg
2,b972f589-fb0e-474e-b64a-803b0364fa75,Salzburg
3,b972f589-fb0e-474e-b64a-803b0364fa75,Salzburg
4,b972f589-fb0e-474e-b64a-803b0364fa75,Salzburg


In [194]:
#We can directly drop the rows which don't have a musicbrainz's id (we need and ID to join with our main df):
wiki_df.dropna(subset=['artist_mbid'], axis=0, inplace=True)

In [195]:
len(wiki_df)

116020

In [196]:
#Let's see how many artists we have:
wiki_df['artist_mbid'].nunique()

96110

In [197]:
#Drop duplicated artist_mbid:
wiki_df.drop_duplicates(subset=['artist_mbid'],keep='first', inplace=True)

In [199]:
#What kind of information do we have in the column origin_name?:
wiki_df.area_name.value_counts()

New York City          1273
Los Angeles            1209
London                  995
Tokyo                   750
Paris                   440
Chicago                 436
Brooklyn                390
Seoul                   376
Philadelphia            348
Toronto                 333
Berlin                  324
Stockholm               323
San Francisco           289
Seattle                 282
Boston                  268
Moscow                  267
California              266
Montreal                248
Detroit                 248
Oslo                    219
Rome                    215
Helsinki                212
Rio de Janeiro          212
Vienna                  201
Istanbul                200
Atlanta                 197
Liverpool               196
Buenos Aires            188
Nashville               187
Manchester              187
                       ... 
Rättvik                   1
Bersillies                1
Kutno                     1
An Cựu                    1
Altstätten          

In [200]:
wiki_df.head()

Unnamed: 0,artist_mbid,area_name
1,b972f589-fb0e-474e-b64a-803b0364fa75,Salzburg
12,b58165ba-ac55-49a1-8855-caf16c68f5f2,Sète
13,d135874d-9cae-4fef-97e3-36acbd9f5a26,Chicago
14,75167b8b-44e4-407b-9d35-effe87b223cf,Toronto
24,4b585938-f271-45e2-b19a-91c634b5e396,Bexleyheath


It looks like we have city names, so we can use our sub_coordinates dataframe to match by area_name. However, we'll have to drop duplicates in both retrieved areas and wiki_df, to avoid mismatching.

In [202]:
#We first delete the duplicate area names in our wiki_df:
wiki_df.drop_duplicates(subset='area_name', keep=False, inplace=True)
#We do the same with our retrieved_areas dataframe:
areas_dedup = sub_coordinates.drop_duplicates(subset='area_name', keep=False)

In [205]:
#Now we can merge wiki_df with areas_dedup:
retrieved_wiki = pd.merge(wiki_df, areas_dedup, how='left', on='area_name')
retrieved_wiki.head()

Unnamed: 0,artist_mbid,area_name,area_id,country_name,latitude,longitude
0,4b585938-f271-45e2-b19a-91c634b5e396,Bexleyheath,,,,
1,dc0640db-f5db-4fde-a5ca-ab1331f94a43,Anchiano,,,,
2,6fdd3b3e-1ea6-4da9-8d6f-8f8de01c133a,Ciboure,49448.0,France,45.708718,0.626891
3,09ff1fe8-d61c-4b98-bb82-18487c74d7b7,Żelazowa Wola,88730.0,Poland,51.892718,21.002168
4,2a33d974-ad9e-4b51-917e-fb9397bf82c6,Puente Alto,115951.0,Chile,-33.437554,-70.65049


In [207]:
retrieved_wiki.country_name.isnull().value_counts()

False    3731
True     3451
Name: country_name, dtype: int64

In [209]:
#Before the merging, we drop the rows we didn't identify:
retrieved_wiki.dropna(subset=['country_name'], axis=0, inplace=True)

In [210]:
#And now we cand merge it with our pending3 dataframe:
df4 = pd.merge(pending_credit3, retrieved_wiki, how='left', on='artist_mbid')
df4.head()

Unnamed: 0,credit_id,artist_id,artist_mbid,start_area1,start_area2,artist_name,area_name,area_id,country_name,latitude,longitude
0,578352,578352,b4f76788-7e6f-41b7-ac7b-dfb67f66282e,,,Gustav Ruppke,,,,,
1,273232,273232,c112a400-af49-4665-8bba-741531d962a1,,,Zachary,,,,,
2,153193,153193,c42eed94-e233-44e2-82b8-3ed6dd9bf318,,,The High Level Ranters,,,,,
3,1389968,1171184,bf8fefa4-c02d-4ad0-91d9-26f94169488f,,,Harvard of the South,,,,,
4,145773,145773,7b4a548e-a01a-49b7-82e7-b49efeb9732c,,,Aric Leavitt,,,,,


In [211]:
#How many coordinates do we have for each artist credit in this third merge?
df4.notnull().sum(axis=0)

credit_id       1557139
artist_id       1557139
artist_mbid     1557139
start_area1      695988
start_area2       34544
artist_name     1557139
area_name          6271
area_id            6271
country_name       6271
latitude           6271
longitude          6271
dtype: int64

Thanks to the 1 Million Songs Dataset, we have been able to retrieve the origin for 6271 artist credits.

We'll save it in retrieved_credit4 and see what's pending:

In [212]:
retrieved_credit4 = df4[df4['country_name'].notnull()]

In [213]:
pending_credit4 = df4[df4['country_name'].isnull()]
#drop empty columns:
to_drop = ['area_id', 'area_name', 'country_name', 'latitude', 'longitude']
pending_credit4.drop(labels=to_drop, axis=1, inplace=True)

In [214]:
#How many artists do we have without origin information?
pending_credit4['artist_id'].nunique()

808602

We can now merge all our retrieved_credit dataframes into 1 and start merging that information into the releases:

In [227]:
#Changing columns names for the first one:
retrieved_credit1.rename(columns={'area_name_x':'area_name'}, inplace=True)
#Dropping useless columns:
to_drop = ['start_area1', 'start_area2', 'code_type', 'area_name_y']
retrieved_credit1.drop(labels=to_drop, axis=1, inplace=True)

In [230]:
#Changing columns names for the second one:
retrieved_credit2.rename(columns={'area_name_x':'area_name'}, inplace=True)
#Dropping useless columns:
to_drop = ['start_area1', 'start_area2', 'code_type', 'area_name_y']
retrieved_credit2.drop(labels=to_drop, axis=1, inplace=True)

In [233]:
#Dropping useless columns for the third one:
to_drop = ['start_area1', 'start_area2', 'code_type']
retrieved_credit3.drop(labels=to_drop, axis=1, inplace=True)

In [236]:
#Dropping useless columns for the fourth one:
to_drop = ['start_area1', 'start_area2']
retrieved_credit4.drop(labels=to_drop, axis=1, inplace=True)

In [238]:
all_credits_retrieved = pd.concat([retrieved_credit1, retrieved_credit2, retrieved_credit3, retrieved_credit4], ignore_index=True)
all_credits_retrieved.head()

Unnamed: 0,area_id,area_name,artist_id,artist_mbid,artist_name,country_name,credit_id,latitude,longitude
0,3868.0,Kingston upon Hull,1075202,d8d9139a-51b9-4983-bbaa-4853a1485c94,Delfino Square,United Kingdom,1258383,52.355518,-1.17432
1,150.0,Netherlands,471271,ff1072ec-8c36-4b01-879b-47166cab1536,Duo De Munck,Kingdom of the Netherlands,471271,52.132633,5.291266
2,99597.0,Ventnor,295560,16fec377-b791-4443-9e44-269ddcd9e299,Death Throes,United Kingdom,295560,52.355518,-1.17432
3,1178.0,London,180899,9f0b3f9d-72b8-49b3-8b1b-5a0ec3b7c577,Cmetric,United Kingdom,180899,52.355518,-1.17432
4,3840.0,Dundee,1171183,955861e9-9822-402d-94c0-52ab31ed42ed,The Shithawks,United Kingdom,1389970,56.490671,-4.202646


In [239]:
all_credits_retrieved['artist_id'].nunique()

206582

# NOTA: HASTA AQUI, NO TENEMOS EN CUENTA NINGUN ARTISTA CUYO ORIGEN SEA SOLO PAIS. SE PODRIA INTEGRAR SI FALTAN DATOS:

## <font color=blue>2) Release information</font>

### Data from Musicbrainz.org

The objective of this project is to visualize when each artist released for the first time a certain CD/Album/Single etc.

If we look at the "releases" table:

In [240]:
releases = pd.read_csv('Musicbrainz/Tables_used/release.txt',sep='\t', header=None, engine='c', usecols=[0,2,3,4])
releases.columns = ['release_id','release_group','credit_id','group_id']
releases.head()

Unnamed: 0,release_id,release_group,credit_id,group_id
0,9,A Sorta Fairytale,60,896742
1,10,A Sorta Fairytale,60,896742
2,11,Glory of the 80's,60,95360
3,12,Silent All These Years,60,104189
4,26,Demons,20211,94299


We can see, in the first 2 rows, that the same CD/Album can be released/remastered many times. According to Musicbrainz's field description for each release (https://musicbrainz.org/doc/Release):

"A MusicBrainz release represents the unique release (i.e. issuing) of a product on a specific date with specific release information such as the country, label, barcode and packaging. If you walk into a store and purchase an album or single, they are each represented in MusicBrainz as one release".

If we look at another release-related field in Musicbrainz, we find the "release group" (https://musicbrainz.org/doc/Release_Group):

"A release group, just as the name suggests, is used to group several different releases into a single logical entity. Every release belongs to one, and only one release group.

Both release groups and releases are "albums" in a general sense, but with an important difference: a release is something you can buy as media such as a CD or a vinyl record, while a release group embraces the overall concept of an album -- it doesn't matter how many CDs or editions/versions it had."

By reading these descriptions, we can clearly see that the release group is the table we are looking for as it represents a single creation, no matter how many times it has been edited or released afterwards. So we will have to keep the first release id for each release group.

In [241]:
release_country = pd.read_csv('Musicbrainz/Tables_used/release_country.txt',sep='\t', header=None, engine='c', usecols=[0,1,2])
release_country.columns = ['release_id','release_country','release_year']
release_country.head()

Unnamed: 0,release_id,release_country,release_year
0,3,81,1997.0
1,1427792,107,2014.0
2,9,81,2002.0
3,10,221,2002.0
4,11,81,1999.0


In [300]:
df5 = pd.merge(releases, release_country, how='left', on='release_id')
df5.head()

Unnamed: 0,release_id,release_group,credit_id,group_id,release_country,release_year
0,9,A Sorta Fairytale,60,896742,81.0,2002.0
1,10,A Sorta Fairytale,60,896742,221.0,2002.0
2,11,Glory of the 80's,60,95360,81.0,1999.0
3,12,Silent All These Years,60,104189,81.0,1997.0
4,26,Demons,20211,94299,107.0,1998.0


In [301]:
#Let's see how many releases we have:
df5['release_id'].nunique()

2198457

In [245]:
df5.isnull().sum(axis=0)

release_id              0
release_group           7
credit_id               0
group_id                0
release_country    287376
release_year       341983
dtype: int64

In [302]:
#We want to keep only the releases which have a release year, so we can drop the others:
df5.dropna(subset=['release_year'], axis=0, inplace=True)
df5['release_year'] = df5.release_year.astype(int,inplace=True)
df5['release_id'].nunique()

1859982

In [303]:
#Let's analyze the year column:
pd.options.display.max_rows = 2000
df5.groupby('release_year').count()

Unnamed: 0_level_0,release_id,release_group,credit_id,group_id,release_country
release_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,2,2,2,2,2
4,1,1,1,1,1
5,5,5,5,5,5
7,1,1,1,1,1
8,2,2,2,2,2
10,3,3,3,3,3
14,1,1,1,1,1
17,4,4,4,4,4
18,1,1,1,1,1
19,3,3,3,3,3


By looking at the different year values, and, in order to have enough values per year, we could drop the rows whose year is below 1890 and above 2019. Our visualization would have 130 years, which is pretty good.

In [304]:
df5.drop(df5[df5['release_year'] < 1890].index , inplace=True)
df5.drop(df5[df5['release_year'] >2019].index , inplace=True)
df5.sort_values(by=['release_year']).head()

Unnamed: 0,release_id,release_group,credit_id,group_id,release_country,release_year
1266766,386919,Visions of Paradise Waltz,97546,712605,222.0,1890
1266956,386830,German Ballad with Variations,97546,712514,222.0,1890
1266958,386829,German Ballad with Variations,97546,712514,222.0,1890
1266960,386828,Mountain Bells Polka,97546,712513,222.0,1890
1266961,386827,Mountain Bells Polka,97546,712513,222.0,1890


In [305]:
#Converting the year column to datetime for later:
df5['release_year'] = pd.to_datetime(df5['release_year'].astype(str), format='%Y')
df5.dtypes

release_id                  int64
release_group              object
credit_id                   int64
group_id                    int64
release_country           float64
release_year       datetime64[ns]
dtype: object

In [306]:
#We sort by release id and year (we could have 2 release groups with the same name but produced by different artists):
df5.sort_values(['release_group','release_year','credit_id'], ascending=[True,True,True], inplace=True)
df5.head()

Unnamed: 0,release_id,release_group,credit_id,group_id,release_country,release_year
2026273,2163750,,2205562,1962329,240.0,2014-01-01
1648516,1846605,,1503027,1713833,240.0,2015-01-01
1250325,1714060,Beaux Soirs De Paris,1324142,1609358,73.0,1995-01-01
2116340,2265346,Le 1,2291833,2042812,240.0,2018-01-01
1748061,1895266,M2Music HitDisc Vol. 1,1,1751021,222.0,2006-01-01


In [307]:
df5[df5['release_group'] == 'Artaxerxes']

Unnamed: 0,release_id,release_group,credit_id,group_id,release_country,release_year
1836724,2378622,Artaxerxes,2392005,2132682,240.0,1996-01-01
1910376,2379252,Artaxerxes,2392005,2132682,221.0,2009-01-01
1909444,2379244,Artaxerxes,2392011,2133192,222.0,2011-01-01


In [308]:
#Now we can delete the duplicate release group and keep the ones who were first released:
df5.drop_duplicates(subset=['release_group'],keep='first', inplace=True)
df5['release_id'].nunique()

1207453

In [309]:
#Just to double-check:
df5[df5['release_group'] == 'Artaxerxes']

Unnamed: 0,release_id,release_group,credit_id,group_id,release_country,release_year
1836724,2378622,Artaxerxes,2392005,2132682,240.0,1996-01-01


## <font color=blue>3) Matching releases with artists</font>

Now that we have both artist and releases dataframes, we can join them, knowing that, for all the releases whose credit belongs to more than one artist, will appear as many times as the number of artists in the credit.

As we don't want to show duplicate releases, we need to keep only one artist per release. We will keep the first artist appearing for each release (even though we know this is not 100% accurate, but we have to avoid duplicates). 

In [310]:
#We first merge with the artists_credits dataframe, to retrieve all the artists names:
df6 = pd.merge(df5, artists_credit, how='left', on='credit_id')
df6.head()

Unnamed: 0,release_id,release_group,credit_id,group_id,release_country,release_year,artist_id,artist_name
0,2163750,,2205562,1962329,240.0,2014-01-01,1654312.0,Soul Glo
1,1714060,Beaux Soirs De Paris,1324142,1609358,73.0,1995-01-01,1122795.0,Soixante Étages
2,2265346,Le 1,2291833,2042812,240.0,2018-01-01,1720981.0,TedeuzeM
3,1895266,M2Music HitDisc Vol. 1,1,1751021,222.0,2006-01-01,1.0,Various Artists
4,1772538,devil jokes,1653884,1656147,240.0,2016-01-01,1363025.0,yzome


In [311]:
#And now we merge with all_credits_retrieved to retrieve the artists for which we have coordinates:
df7 = pd.merge(df6, all_credits_retrieved, how='left', on='credit_id')
df7.head()

Unnamed: 0,release_id,release_group,credit_id,group_id,release_country,release_year,artist_id_x,artist_name_x,area_id,area_name,artist_id_y,artist_mbid,artist_name_y,country_name,latitude,longitude
0,2163750,,2205562,1962329,240.0,2014-01-01,1654312.0,Soul Glo,7707.0,Philadelphia,1654312.0,d10d6441-dcc1-4202-93bf-0c0acf72913a,Soul Glo,United States,41.203322,-77.194525
1,1714060,Beaux Soirs De Paris,1324142,1609358,73.0,1995-01-01,1122795.0,Soixante Étages,,,,,,,,
2,2265346,Le 1,2291833,2042812,240.0,2018-01-01,1720981.0,TedeuzeM,68613.0,Aix-en-Provence,1720981.0,a69efb5f-0b28-4328-8ff0-44d8d6f39755,TedeuzeM,France,43.935169,6.067919
3,1895266,M2Music HitDisc Vol. 1,1,1751021,222.0,2006-01-01,1.0,Various Artists,,,,,,,,
4,1772538,devil jokes,1653884,1656147,240.0,2016-01-01,1363025.0,yzome,9655.0,Seattle,1363025.0,c941ad72-8b13-4940-8d99-0ed9becad2d7,yzome,United States,47.751074,-120.740139


In [312]:
len(df7)

1714455

In [313]:
#As we did two merges by credit, there are duplicate entries now:
df7.sort_values(by=['release_id','latitude'], inplace=True)
df7.drop_duplicates(subset='release_id', keep='first', inplace=True)
len(df7)

1207453

In [314]:
df7.isnull().sum(axis=0)

release_id              0
release_group           1
credit_id               0
group_id                0
release_country         0
release_year            0
artist_id_x           124
artist_name_x         127
area_id            612405
area_name          612389
artist_id_y        612385
artist_mbid        612385
artist_name_y      612385
country_name       612385
latitude           612385
longitude          612385
dtype: int64

What kind of artists do we have in our dataframe? Which artists produced the most releases?

In [315]:
df7.groupby('artist_name_x').count().sort_values('release_id',ascending=False).head(100)

Unnamed: 0_level_0,release_id,release_group,credit_id,group_id,release_country,release_year,artist_id_x,area_id,area_name,artist_id_y,artist_mbid,artist_name_y,country_name,latitude,longitude
artist_name_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Various Artists,128419,128419,128419,128419,128419,128419,128419,38,38,38,38,38,38,38,38
Johann Sebastian Bach,1653,1653,1653,1653,1653,1653,1653,1653,1653,1653,1653,1653,1653,1653,1653
[unknown],1223,1223,1223,1223,1223,1223,1223,1222,1222,1222,1222,1222,1222,1222,1222
Mozart,1148,1148,1148,1148,1148,1148,1148,1147,1147,1147,1147,1147,1147,1147,1147
Wolfgang Amadeus Mozart,1117,1117,1117,1117,1117,1117,1117,1117,1117,1117,1117,1117,1117,1117,1117
Bruce Springsteen,1099,1099,1099,1099,1099,1099,1099,1099,1099,1099,1099,1099,1099,1099,1099
Beethoven,1051,1051,1051,1051,1051,1051,1051,1049,1049,1049,1049,1049,1049,1049,1049
Ludwig van Beethoven,971,971,971,971,971,971,971,971,971,971,971,971,971,971,971
Phish,622,622,622,622,622,622,622,622,622,622,622,622,622,622,622
Pearl Jam,566,566,566,566,566,566,566,566,566,566,566,566,566,566,566


In [316]:
#From what we can see above, the category "Various Artists" has many releases assigned:
df7[df7['artist_name_x']=='Various Artists']

Unnamed: 0,release_id,release_group,credit_id,group_id,release_country,release_year,artist_id_x,artist_name_x,area_id,area_name,artist_id_y,artist_mbid,artist_name_y,country_name,latitude,longitude
943987,80,Now That's What I Call Music! 4,1,131289,57.0,2003-01-01,1.0,Various Artists,,,,,,,,
122358,111,Atlantic Jazz: Best of the '70s,1,173495,221.0,1994-01-01,1.0,Various Artists,,,,,,,,
270544,113,Classic FM: Hall of Fame,1,173206,221.0,1996-01-01,1.0,Various Artists,,,,,,,,
666402,135,"It’s a Cool, Cool Christmas",1,56983,221.0,2000-01-01,1.0,Various Artists,,,,,,,,
966749,253,One Way: A Showcase for New Music (Dec/Jan 2003),1,173765,222.0,2003-01-01,1.0,Various Artists,,,,,,,,
86729,407,Alligator Records 30th Anniversary Collection,1,62108,222.0,2001-01-01,1.0,Various Artists,,,,,,,,
1328417,410,The Alligator Records 20th Anniversary Collection,1,107037,222.0,1991-01-01,1.0,Various Artists,,,,,,,,
1329163,456,The Annual 2004,1,157903,221.0,2003-01-01,1.0,Various Artists,,,,,,,,
944193,551,Now That’s What I Call Music! 56,1,162109,221.0,2003-01-01,1.0,Various Artists,,,,,,,,
225226,613,"CMJ New Music Monthly, Volume 50: October 1997",1,169128,222.0,1997-01-01,1.0,Various Artists,,,,,,,,


If we look in detail into these releases, we can see that most of them are music compilations (hence the generic category "Various Artists"). As they are music compilations, that means that the tracks included were originally released before by their genuine author, so we shouldn't take them into account. Also, as we don't have an artist name for them, it will be impossible to retrieve the origin.

We will delete those rows from our dataframe later.

Let's analyze more in detail who are the rest of artists that have many releases, and decide what to do with them.

In [318]:
#"Language instruction" artist:
df7[df7['artist_name_x']=='[language instruction]']

Unnamed: 0,release_id,release_group,credit_id,group_id,release_country,release_year,artist_id_x,artist_name_x,area_id,area_name,artist_id_y,artist_mbid,artist_name_y,country_name,latitude,longitude
1220609,179513,Spanish: Beyond the Basics,597116,539148,222.0,2005-01-01,597116.0,[language instruction],,,,,,,,
1369013,196550,The French Experience 1,597116,552957,221.0,2003-01-01,597116.0,[language instruction],,,,,,,,
1498938,253239,Ultimate Spanish: Beginner-Intermediate (A2),597116,602912,222.0,2004-01-01,597116.0,[language instruction],,,,,,,,
1498937,253241,Ultimate Spanish: Beginner-Intermediate (A1),597116,602915,222.0,2004-01-01,597116.0,[language instruction],,,,,,,,
1498939,253246,Ultimate Spanish: Beginner-Intermediate (A3),597116,602920,222.0,2004-01-01,597116.0,[language instruction],,,,,,,,
1498940,253248,Ultimate Spanish: Beginner-Intermediate (A4),597116,602922,222.0,2004-01-01,597116.0,[language instruction],,,,,,,,
1498941,253255,Ultimate Spanish: Beginner-Intermediate (B1),597116,602925,222.0,2004-01-01,597116.0,[language instruction],,,,,,,,
1498942,253269,Ultimate Spanish: Beginner-Intermediate (B2),597116,602941,222.0,2004-01-01,597116.0,[language instruction],,,,,,,,
1498943,253277,Ultimate Spanish: Beginner-Intermediate (B3),597116,602947,222.0,2004-01-01,597116.0,[language instruction],,,,,,,,
1498944,253282,Ultimate Spanish: Beginner-Intermediate (B4),597116,602949,222.0,2004-01-01,597116.0,[language instruction],,,,,,,,


As its name suggests, these releases are language courses recorded, so they are not music and they are out of our scope too.

The same would apply to the categories [unknown],[nature sounds], [dialogue], [christmas music], [no artist] and [church chimes].

We can now delete from our dataframes all these cathegories, and see what we have left.

In [319]:
#In our main dataframe:
labels = ['[unknown]','[nature sounds]','[dialogue]','[christmas music]', '[no artist]', '[church chimes]','Various Artists','[language instruction]']
df7.drop(df7[df7['artist_name_x'].isin(labels)].index, axis=0, inplace=True)

In [320]:
len(df7)

1077168

In [324]:
#Wich are the unknow artists that produce the most releases?
unknown_artists = df7[df7['country_name'].isnull()].groupby(by='artist_name_x', axis=0).count()
unknown_artists.sort_values(by='release_id',axis=0, ascending=False)

Unnamed: 0_level_0,release_id,release_group,credit_id,group_id,release_country,release_year,artist_id_x,area_id,area_name,artist_id_y,artist_mbid,artist_name_y,country_name,latitude,longitude
artist_name_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
The Cherry Blues Project,267,267,267,267,267,267,267,0,0,0,0,0,0,0,0
Vitamin String Quartet,248,248,248,248,248,248,248,0,0,0,0,0,0,0,0
Die drei ???,244,244,244,244,244,244,244,0,0,0,0,0,0,0,0
Senmuth,231,231,231,231,231,231,231,0,0,0,0,0,0,0,0
Stefan Wolf,213,213,213,213,213,213,213,0,0,0,0,0,0,0,0
モーニング娘。,200,200,200,200,200,200,200,0,0,0,0,0,0,0,0
Glee Cast,200,200,200,200,200,200,200,0,0,0,0,0,0,0,0
Duke Ellington & His Orchestra,183,183,183,183,183,183,183,0,0,0,0,0,0,0,0
Dwelling of Duels,180,180,180,180,180,180,180,0,0,0,0,0,0,0,0
Enid Blyton,164,164,164,164,164,164,164,0,0,0,0,0,0,0,0


It could be an idea to retrieve information for the remaining artists by searching into Wikipedia online, however, we will leave this task as a bonus point for later if we have time.

For now, we will consider that, for these releases whose artists' origin hasn't been identified yet, their origin is equal to their release area (ie: the area in which they were produced). We will have to fill in the origin columns with the release_area info.

In [160]:
#We do the merging by the release country to extract its details:
#df8 = pd.merge(release_pending4, retrieved_areas, how='left', left_on='release_country', right_on='area_id')

In [161]:
#We save the rows for which we retrieved the coordinates:
#release_retrieved5 = df8[df8['lat'].notnull()]

In [162]:
#How much information did we retrieve here?
#len(release_retrieved5)

250747

We can see that this last step has provided us with information for 250.747 releases, which leaves us with 94% of our dataframe completed (as far as the geographical origin is concerned).

We'll now export our main dataframe as the result of this Notebook and we'll follow-up in the next one called "Data_gathering_music_genre":

In [329]:
df7.columns

Index(['release_id', 'release_group', 'credit_id', 'group_id',
       'release_country', 'release_year', 'artist_id', 'artist_name',
       'area_id', 'area_name', 'artist_mbid', 'country_name', 'latitude',
       'longitude'],
      dtype='object')

In [326]:
df7.rename(columns={'artist_id_x':'artist_id','artist_name_x':'artist_name'}, inplace=True)

In [328]:
to_drop = ['artist_id_y', 'artist_name_y']
df7.drop(labels=to_drop, axis=1, inplace=True)

In [330]:
origin_retrieved = df7[df7['country_name'].notnull()]

In [331]:
#And how much do we have left?
unknown_origin = df7[df7['country_name'].isnull()]
columns = ['area_id','area_name', 'country_name', 'latitude', 'longitude']
unknown_origin.drop(labels=columns, axis=1, inplace=True)
len(unknown_origin)

483509

In [332]:
unknown_origin.to_csv('unknown_artist_origin_2.csv', sep='\t', index=False, encoding='utf-8')

In [333]:
#Ordering the columns for better visibility:
order = ['release_id','release_group', 'group_id','release_year','artist_id','artist_mbid', 'credit_id','artist_name','area_id', 'area_name','country_name', 'latitude', 'longitude']
final = origin_retrieved.reindex(columns=order)

In [334]:
origin_retrieved.to_csv('Dataframe_with_origin_2.csv', sep='\t', index=False, encoding='utf-8')