# Gathering the Data

In [1]:
import pandas as pd
import numpy as np
#!pip install pygeocoder
from pygeocoder import Geocoder #If you want to follow the geocoding later, you will need your own Google Maps API key

In [2]:
areas = pd.read_csv('Musicbrainz/Tables_used/area.txt',sep='\t', header=None, engine='python', usecols=[0,1,2,3])
areas.columns = ['area_id','area_mbid','area_name','area_type']
areas.head()

Unnamed: 0,area_id,area_mbid,area_name,area_type
0,15449,2913ad77-cec8-4d2f-98d3-d4aa46ab73bc,Greccio,4.0
1,38,71bbafaa-e825-3e15-8ca9-017dcad1748b,Canada,1.0
2,43,82d5f4d6-aed4-3ff5-81d1-5363ac6e97a7,Chile,1.0
3,44,7c81bb69-a99b-3487-b6d4-0f76d7a29ca0,China,1.0
4,36,ee26e886-87f5-33a2-8e8e-f9591490426d,Cambodia,1.0


In [3]:
#Let's see the area types we have:
area_types = pd.read_csv('Musicbrainz/Tables_used/area_type.txt',sep='\t', header=None, engine='python', usecols=[1,3,4], error_bad_lines=False)
area_types.columns = ['type','code_type','definition']
area_types.head(20)

Unnamed: 0,type,code_type,definition
0,Country,1,Country is used for areas included (or previou...
1,Subdivision,2,Subdivision is used for the main administrativ...
2,County,7,County is used for smaller administrative divi...
3,Municipality,4,Municipality is used for small administrative ...
4,City,3,"City is used for settlements of any size, incl..."
5,District,5,District is used for a division of a large cit...
6,Island,6,Island is used for islands and atolls which do...


In [4]:
artists= pd.read_csv('Musicbrainz/Tables_used/artist.txt',sep='\t', header=None, engine='c', usecols=[0,1,2,4,11,17])
artists.columns = ['artist_id','artist_mbid','artist_name','start_year','start_area1', 'start_area2']
artists.head()

Unnamed: 0,artist_id,artist_mbid,artist_name,start_year,start_area1,start_area2
0,805192,8972b1c1-6482-4750-b51f-596d2edea8b1,WIK▲N,,,
1,371203,49add228-eac5-4de8-836c-d75cde7369c3,Pete Moutso,,,
2,273232,c112a400-af49-4665-8bba-741531d962a1,Zachary,,,
3,101060,ca3f3ee1-c4a7-4bac-a16a-0b888a396c6b,The Silhouettes,1956.0,222.0,7707.0
4,145773,7b4a548e-a01a-49b7-82e7-b49efeb9732c,Aric Leavitt,,,


In [2]:
1476425-808442

667983

In [5]:
#Let's see how many artists we have:
artists['name'].nunique()

1476425

In [6]:
artists.isnull().sum(axis=0)

artist_id            0
artist_mbid          0
artist_name          8
start_year     1162804
start_area1     808442
start_area2    1274001
dtype: int64

In [7]:
artists_credit= pd.read_csv('Musicbrainz/Tables_used/artist_credit_name.txt',sep='\t', header=None, engine='c', usecols=[0,2,3])
artists_credit.columns = ['credit_id','artist_id','artist_name']
artists_credit.head()

Unnamed: 0,credit_id,artist_id,artist_name
0,578352,578352,Gustav Ruppke
1,273232,273232,Zachary
2,153193,153193,The High Level Ranters
3,32262,32262,Georges Brassens
4,1389968,1171184,Harvard of the South


In [8]:
#Let's join the artists with their credit id and verify that the matching is good:
df = pd.merge(artists, artists_credit, how='left', on='artist_id')
df.head()

Unnamed: 0,artist_id,artist_mbid,artist_name_x,start_year,start_area1,start_area2,credit_id,artist_name_y
0,805192,8972b1c1-6482-4750-b51f-596d2edea8b1,WIK▲N,,,,822846.0,WIK▲N
1,371203,49add228-eac5-4de8-836c-d75cde7369c3,Pete Moutso,,,,,
2,273232,c112a400-af49-4665-8bba-741531d962a1,Zachary,,,,273232.0,Zachary
3,101060,ca3f3ee1-c4a7-4bac-a16a-0b888a396c6b,The Silhouettes,1956.0,222.0,7707.0,101060.0,The Silhouettes
4,145773,7b4a548e-a01a-49b7-82e7-b49efeb9732c,Aric Leavitt,,,,145773.0,Aric Leavitt


In [9]:
#It looks like it makes sense. Please note that the credit id is sometimes equal to the artist_id, but not always:
df['check'] = df['artist_id'] - df['credit_id']
df['check'].nunique()

1270628

The reason why we need this credit_id for each artist is that, later in the notebook, we'll be able to match each release with its artist.

In [10]:
#We can now get rid of check and the duplicate artist_name column:
df.drop(labels=['check','artist_name_y'], axis=1, inplace=True)
df.head()

Unnamed: 0,artist_id,artist_mbid,artist_name_x,start_year,start_area1,start_area2,credit_id
0,805192,8972b1c1-6482-4750-b51f-596d2edea8b1,WIK▲N,,,,822846.0
1,371203,49add228-eac5-4de8-836c-d75cde7369c3,Pete Moutso,,,,
2,273232,c112a400-af49-4665-8bba-741531d962a1,Zachary,,,,273232.0
3,101060,ca3f3ee1-c4a7-4bac-a16a-0b888a396c6b,The Silhouettes,1956.0,222.0,7707.0,101060.0
4,145773,7b4a548e-a01a-49b7-82e7-b49efeb9732c,Aric Leavitt,,,,145773.0


In [11]:
df['artist_id'].nunique()

1476425

In [12]:
df.isnull().sum(axis=0)

artist_id              0
artist_mbid            0
artist_name_x         15
start_year       1799557
start_area1      1120376
start_area2      2109027
credit_id         461241
dtype: int64

In [2]:
releases = pd.read_csv('Musicbrainz/Tables_used/release.txt',sep='\t', header=None, engine='c', usecols=[0,1,2,4])
releases.columns = ['release_id','release_mbid','release_name','credit_id']
releases.head()

Unnamed: 0,release_id,release_mbid,release_name,credit_id
0,9,425cf29a-1490-43ab-abfa-7b17a2cec351,A Sorta Fairytale,896742
1,10,a96e1d03-e685-3627-8cba-f5b96be7158f,A Sorta Fairytale,896742
2,11,dd1c5726-5a38-47e1-9747-18b919b9948a,Glory of the 80's,95360
3,12,9660928f-7cd8-4fef-852d-5599dc4ad3ec,Silent All These Years,104189
4,26,dd245091-b21e-48a3-b59a-f9b8ed8a0469,Demons,94299


In [3]:
releases['credit_id'].nunique()

1744139

In [15]:
release_country = pd.read_csv('Musicbrainz/Tables_used/release_country.txt',sep='\t', header=None, engine='c', usecols=[0,1,2])
release_country.columns = ['release_id','area_id','release_year']
release_country.head()

Unnamed: 0,release_id,area_id,release_year
0,3,81,1997.0
1,1427792,107,2014.0
2,9,81,2002.0
3,10,221,2002.0
4,11,81,1999.0


In [16]:
df2 = pd.merge(releases, release_country, how='left', on='release_id')
df2.head()

Unnamed: 0,release_id,release_mbid,release_name,credit_id,area_id,release_year
0,9,425cf29a-1490-43ab-abfa-7b17a2cec351,A Sorta Fairytale,896742,81.0,2002.0
1,10,a96e1d03-e685-3627-8cba-f5b96be7158f,A Sorta Fairytale,896742,221.0,2002.0
2,11,dd1c5726-5a38-47e1-9747-18b919b9948a,Glory of the 80's,95360,81.0,1999.0
3,12,9660928f-7cd8-4fef-852d-5599dc4ad3ec,Silent All These Years,104189,81.0,1997.0
4,26,dd245091-b21e-48a3-b59a-f9b8ed8a0469,Demons,94299,107.0,1998.0


In [17]:
df2.isnull().sum(axis=0)

release_id           0
release_mbid         0
release_name         7
credit_id            0
area_id         287376
release_year    341983
dtype: int64

In [18]:
df2['release_id'].nunique()

2198457

In [19]:
#We have another table which provides us with the release year for those whose release country is unkown:
release_unk = pd.read_csv('Musicbrainz/Tables_used/release_unknown_country.txt',sep='\t', header=None, engine='c', usecols=[0,1])
release_unk.columns = ['release_id','release_year']
release_unk.head()

Unnamed: 0,release_id,release_year
0,1372866,1998.0
1,1089598,2010.0
2,1147748,2011.0
3,1068236,2006.0
4,1148229,2008.0


In [20]:
df3 = pd.merge(df2, release_unk, how='left', on='release_id')
df3.head()

Unnamed: 0,release_id,release_mbid,release_name,credit_id,area_id,release_year_x,release_year_y
0,9,425cf29a-1490-43ab-abfa-7b17a2cec351,A Sorta Fairytale,896742,81.0,2002.0,
1,10,a96e1d03-e685-3627-8cba-f5b96be7158f,A Sorta Fairytale,896742,221.0,2002.0,
2,11,dd1c5726-5a38-47e1-9747-18b919b9948a,Glory of the 80's,95360,81.0,1999.0,
3,12,9660928f-7cd8-4fef-852d-5599dc4ad3ec,Silent All These Years,104189,81.0,1997.0,
4,26,dd245091-b21e-48a3-b59a-f9b8ed8a0469,Demons,94299,107.0,1998.0,


In [21]:
df3['release_year_x'].fillna(df3['release_year_y'], inplace=True)
df3.drop(labels=['release_year_y'], axis=1, inplace=True)
df3.isnull().sum(axis=0)

release_id             0
release_mbid           0
release_name           7
credit_id              0
area_id           287376
release_year_x    210161
dtype: int64

In [22]:
df3.dropna(subset=['release_year_x'], axis=0, inplace=True)
df3['release_year_x'] = df3.release_year_x.astype(int,inplace=True)

In [23]:
df3.describe()

Unnamed: 0,release_id,credit_id,area_id,release_year_x
count,2009584.0,2009584.0,1877788.0,2009584.0
mean,1237545.0,1118064.0,167.1807,2003.481
std,703936.0,608073.6,72.78065,16.27693
min,1.0,2.0,1.0,1.0
25%,589548.8,615122.8,99.0,1998.0
50%,1308062.0,1151382.0,221.0,2006.0
75%,1851028.0,1641454.0,222.0,2012.0
max,2379919.0,2133727.0,258.0,2201.0


In [24]:
df3.isnull().sum(axis=0)

release_id             0
release_mbid           0
release_name           5
credit_id              0
area_id           131796
release_year_x         0
dtype: int64

In [25]:
df3.head()

Unnamed: 0,release_id,release_mbid,release_name,credit_id,area_id,release_year_x
0,9,425cf29a-1490-43ab-abfa-7b17a2cec351,A Sorta Fairytale,896742,81.0,2002
1,10,a96e1d03-e685-3627-8cba-f5b96be7158f,A Sorta Fairytale,896742,221.0,2002
2,11,dd1c5726-5a38-47e1-9747-18b919b9948a,Glory of the 80's,95360,81.0,1999
3,12,9660928f-7cd8-4fef-852d-5599dc4ad3ec,Silent All These Years,104189,81.0,1997
4,26,dd245091-b21e-48a3-b59a-f9b8ed8a0469,Demons,94299,107.0,1998


# Understanding the data

### 1) Temporal Data

In [26]:
df3.sort_values(by=['release_year_x']).head()

Unnamed: 0,release_id,release_mbid,release_name,credit_id,area_id,release_year_x
132016,2166840,aae8d806-8694-43fe-9446-3882a33ad8db,Premonition,1964778,194.0,1
1998044,2146235,b0c4deae-d0cb-4cc8-9751-d1e72ebb3f44,Ostrich - Part One,1947908,240.0,1
2005981,2150739,cb4f2efa-fa92-464e-af1b-6c6d1ebc0e20,Tangential,1951552,240.0,4
1967306,2080139,8a3e8541-f89f-4e20-a914-a1ff8b75fde3,Insane Love,1895983,221.0,5
2130834,2273607,3c0c17b5-cc92-407d-99cf-6fcb3e02e3ec,Black Desert Blues,2049134,240.0,5


It's clear that some of the years are not correct. Let's see how many we have and decide what to do with them:

In [27]:
pd.options.display.max_rows = 2000
df3.groupby('release_year_x').count()

Unnamed: 0_level_0,release_id,release_mbid,release_name,credit_id,area_id
release_year_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,2,2,2,2,2
4,1,1,1,1,1
5,5,5,5,5,5
7,1,1,1,1,1
8,2,2,2,2,2
10,3,3,3,3,3
14,1,1,1,1,1
16,1,1,1,1,0
17,4,4,4,4,4
18,1,1,1,1,1


By looking at the different year values, and, in order to have enough values per year, we could drop the rows whose year is below 1890 and above 2019. Our visualization would have 130 years, which is pretty good.

In [28]:
df3.drop(df3[df3['release_year_x'] < 1890].index , inplace=True)
df3.drop(df3[df3['release_year_x'] >2019].index , inplace=True)
df3.sort_values(by=['release_year_x']).head()

Unnamed: 0,release_id,release_mbid,release_name,credit_id,area_id,release_year_x
1266706,386917,239b696f-3368-4204-beb7-09c638cd9558,Visions of Paradise Waltz,712605,222.0,1890
1266939,386964,6ce8e3b6-4d62-4e14-aa16-b42858cd5f16,Listen to My Tale of Woe,712631,222.0,1890
1266937,386965,93ba9a94-d139-4f11-9606-684417758ff1,Suwanee River,712633,222.0,1890
1266908,386847,2d8a67af-69b2-4724-8232-4ee0122cf246,Nadjy Waltz,712522,222.0,1890
1266907,386848,8281c07f-da7c-35f7-8f91-a4092a89bf3e,Nadjy Waltz,712522,222.0,1890


In [29]:
#Now, let's try to match each release with their artist:
df4 = pd.merge(df3, df, how='left', on='credit_id')
df4.head(10)

Unnamed: 0,release_id,release_mbid,release_name,credit_id,area_id,release_year_x,artist_id,artist_mbid,artist_name_x,start_year,start_area1,start_area2
0,9,425cf29a-1490-43ab-abfa-7b17a2cec351,A Sorta Fairytale,896742,81.0,2002,,,,,,
1,10,a96e1d03-e685-3627-8cba-f5b96be7158f,A Sorta Fairytale,896742,221.0,2002,,,,,,
2,11,dd1c5726-5a38-47e1-9747-18b919b9948a,Glory of the 80's,95360,81.0,1999,,,,,,
3,12,9660928f-7cd8-4fef-852d-5599dc4ad3ec,Silent All These Years,104189,81.0,1997,,,,,,
4,26,dd245091-b21e-48a3-b59a-f9b8ed8a0469,Demons,94299,107.0,1998,,,,,,
5,49,bc9afecd-685e-482c-9412-27f2d267b1dd,Out Spaced,94125,221.0,1998,94125.0,bdcfb781-68cd-4178-b10c-e9f6fb01da18,Fikret Kızılok,1946.0,214.0,5065.0
6,372235,71d9b457-08d6-4fa2-b932-ff56f03f9e70,Open Up Your Mind,596076,222.0,2002,596076.0,8f76f21e-dcd5-4a5a-bdc3-8adfcd0d5427,n.t.lie,,,
7,2356295,8a73db92-1396-4969-a938-0dab0ca30b97,Augusta Taurinorum,2113953,105.0,2008,446005.0,28742ef4-d86e-4fd6-b5b4-3b4917c73613,Rolando Panerai,1924.0,105.0,21258.0
8,2356295,8a73db92-1396-4969-a938-0dab0ca30b97,Augusta Taurinorum,2113953,105.0,2008,290346.0,edba9279-b24b-4e0c-9172-2b016cdcbc1e,Rafael Kubelík,1914.0,56.0,4236.0
9,2356295,8a73db92-1396-4969-a938-0dab0ca30b97,Augusta Taurinorum,2113953,105.0,2008,260278.0,59cf9a49-c3b2-4388-8717-4fb836cc7269,Symphonieorchester des Bayerischen Rundfunks,1949.0,81.0,


We want to know more about the meaning of the columns "release_year_x" and the artist's "start_year". Let's take a look, for example, to the release id 71: the artist is Yasuaki Shimizu (written in Japanese in our dataframe). By searching in Wikipedia, we can see that he was born in 1954 (which is the value appearing in "start_year").

If we take anothe example, release_id number 2356295. The composer, Rafael Kubelík, was born in 1914 and tat value also appears as the artist's start year. 

Therefore, we don't want to use the start year for our visualization, as we want the year each release was produced. It would seem more accurate then, to use "release_year_x" from now on.

### 2) Geographical Data

Let's take a deeper look at the column "area_id" (which will be very useful for our visualization).

If we look at the first row (release "A Sorta Fairytale"), our dataframe says it was released in area number 81.

Now, let's bring back our "areas" dataframe we had at the beginning, and see wich country is that:

In [30]:
print(areas.loc[areas['area_id'] == 81])

    area_id                             area_mbid area_name  area_type
14       81  85752fda-13c4-31a3-bee5-0e5cb1f51dad   Germany        1.0


However, there are some cases where the country code is not informative: for the release id 1966951, for instance, the country code is 240:

In [31]:
print(df4.loc[df4['release_id'] == 1966951])

    release_id                          release_mbid        release_name  \
13     1966951  6b81783e-688c-4550-a698-c7b7def52c47  Songs for Marianne   
14     1966951  6b81783e-688c-4550-a698-c7b7def52c47  Songs for Marianne   
15     1966951  6b81783e-688c-4550-a698-c7b7def52c47  Songs for Marianne   
16     1966951  6b81783e-688c-4550-a698-c7b7def52c47  Songs for Marianne   

    credit_id  area_id  release_year_x  artist_id  \
13    1806611    240.0            2017   452830.0   
14    1806611    240.0            2017   270752.0   
15    1806611    240.0            2017   270751.0   
16    1806611    240.0            2017   505754.0   

                             artist_mbid               artist_name_x  \
13  18825671-e037-4a37-aef4-45ed57838fbf               Wynford Evans   
14  2403f8c6-8ccc-48d6-977f-de0baa2d6fed        Sir Roger Norrington   
15  0e84e616-8390-4dbb-ac7f-0cad612c4842  The Schütz Choir of London   
16  b87a668b-54df-47ff-b3d9-63f82676d7ee                  John El

In [32]:
print(areas.loc[areas['area_id'] == 240])

       area_id                             area_mbid    area_name  area_type
16331      240  525d4e18-3d00-31b9-a58b-a146a916de8f  [Worldwide]        NaN


That is probably because this is a collaborative release (we can see that many artists contributed to it), and therefore it was probably released in more than one place.

In order not to loose information for our visualization later, we could replace all the rows which have that 240 value by the value of the artist's start_area. This it's not 100% accurate (an artist can move and live somewhere else during his life), but it will be helpful in this case.

In [33]:
#We replace the values and check if they were removed:
df4['area_id'] = np.where(df4['area_id']==240, df4['start_area1'], df4['area_id'])

In [34]:
df4.loc[df4['area_id'] == 240]

Unnamed: 0,release_id,release_mbid,release_name,credit_id,area_id,release_year_x,artist_id,artist_mbid,artist_name_x,start_year,start_area1,start_area2
3812,1280108,33da9608-a1c7-47fc-af20-3c4bfd480525,Born A Trip,1260851,240.0,2012,97546.0,125ec42a-7229-4250-afc5-e057484327fe,[unknown],,240.0,
74577,1146759,00e7ac4b-bd8e-42bc-a407-8c53c4d2271e,"Star Trek: Deep Space Nine, Box 02: The Paul B...",1153656,240.0,2010,974069.0,8489cff0-87b5-41cc-9845-4aefd433a810,Princessa Avenue,2007.0,240.0,
98405,2079085,dac70d75-76d5-453b-9d0e-cce73650e712,"2013-08-30: Dick's Sporting Goods Park, Commer...",1895099,240.0,2013,97546.0,125ec42a-7229-4250-afc5-e057484327fe,[unknown],,240.0,
137700,1145395,d3fa2563-22e7-4271-a06f-517d073bd0a8,idle talk,1152554,240.0,2012,97546.0,125ec42a-7229-4250-afc5-e057484327fe,[unknown],,240.0,
139383,1609243,17eecde0-566c-4620-a102-fbeee71dcd68,Temporal: Bonus Tracks + Sawblade EP,1209150,240.0,2012,978717.0,57fbea93-1727-4301-968d-91d0a38c7b0e,Silk Road Ensemble,,240.0,
145962,2092207,43c2b460-0d9b-4671-9f21-d7df1d5188c2,Ludomania,1905588,240.0,2008,97546.0,125ec42a-7229-4250-afc5-e057484327fe,[unknown],,240.0,
161465,2166876,2173862a-6099-4712-bbdc-1828f3517084,Party InMaHead,1964811,240.0,2018,97546.0,125ec42a-7229-4250-afc5-e057484327fe,[unknown],,240.0,
168041,2324555,3b3c18e5-4ce0-41a9-8279-3a5ec7d6a966,Kyrie,2088565,240.0,2014,97546.0,125ec42a-7229-4250-afc5-e057484327fe,[unknown],,240.0,
181162,2358770,2889a322-696b-4f90-97e8-47a1b4073930,WILSONWAVE,2116055,240.0,2017,97546.0,125ec42a-7229-4250-afc5-e057484327fe,[unknown],,240.0,
203504,1145910,0d956d18-8731-462a-ae16-1d0b43f88b9b,The Sum of Your Disarray,1153006,240.0,2011,97546.0,125ec42a-7229-4250-afc5-e057484327fe,[unknown],,240.0,


We can see that, those 240 values that are still remaining come from the fact that the artist's start area was also 240. How many values do we have?

In [35]:
df4.loc[df4['area_id'] == 240].count()

release_id        334
release_mbid      334
release_name      334
credit_id         334
area_id           334
release_year_x    334
artist_id         334
artist_mbid       334
artist_name_x     334
start_year         44
start_area1       334
start_area2        22
dtype: int64

In [36]:
#As they are only 334, we can easily remove them and it won't have much impact in the future:
df4.drop(df4[df4['area_id'] == 240].index , inplace=True)

In [37]:
#Also, as we saw before that we have duplicate entries for the same release (caused by the collaborative releases),
#we should remove the duplicates:
len(df4)

2856811

In [38]:
df4.drop_duplicates(subset='release_id', inplace=True)
len(df4)

1991556

Finally, we can merge the dataframe with the areas, in order to have the country name too:

In [39]:
df5 = pd.merge(df4, areas, how='left', on='area_id')
df5.head()

Unnamed: 0,release_id,release_mbid,release_name,credit_id,area_id,release_year_x,artist_id,artist_mbid,artist_name_x,start_year,start_area1,start_area2,area_mbid,area_name,area_type
0,9,425cf29a-1490-43ab-abfa-7b17a2cec351,A Sorta Fairytale,896742,81.0,2002,,,,,,,85752fda-13c4-31a3-bee5-0e5cb1f51dad,Germany,1.0
1,10,a96e1d03-e685-3627-8cba-f5b96be7158f,A Sorta Fairytale,896742,221.0,2002,,,,,,,8a754a16-0027-3a29-b6d7-2b40ea0481ed,United Kingdom,1.0
2,11,dd1c5726-5a38-47e1-9747-18b919b9948a,Glory of the 80's,95360,81.0,1999,,,,,,,85752fda-13c4-31a3-bee5-0e5cb1f51dad,Germany,1.0
3,12,9660928f-7cd8-4fef-852d-5599dc4ad3ec,Silent All These Years,104189,81.0,1997,,,,,,,85752fda-13c4-31a3-bee5-0e5cb1f51dad,Germany,1.0
4,26,dd245091-b21e-48a3-b59a-f9b8ed8a0469,Demons,94299,107.0,1998,,,,,,,2db42837-c832-3c27-b4a3-08198f75693c,Japan,1.0


Do we have enough geographical data?

In [40]:
df5.isnull().sum(axis=0)

release_id              0
release_mbid            0
release_name            5
credit_id               0
area_id            269922
release_year_x          0
artist_id          716062
artist_mbid        716062
artist_name_x      716066
start_year        1514236
start_area1       1255020
start_area2       1660576
area_mbid          269922
area_name          269922
area_type          313728
dtype: int64

In [41]:
df5.area_name.value_counts()

United States                                     484410
United Kingdom                                    249511
Germany                                           189941
Japan                                             168492
France                                             88296
Europe                                             43804
Netherlands                                        37853
Italy                                              37062
Canada                                             36820
Australia                                          34821
Finland                                            28769
Sweden                                             28339
Spain                                              26407
Russia                                             22783
Brazil                                             17911
Belgium                                            16307
Poland                                             13084
Austria                        

In [42]:
df5.area_type.value_counts()

1.0    1660809
3.0      13055
2.0       3230
5.0        348
4.0        326
7.0         40
6.0         20
Name: area_type, dtype: int64

It looks like the majority of the data is classiffied by country, but we also have cities and more geographical subdivisions. 

For our visualization to work, we need to transform all the rows to country.

In [43]:
#Let's bring back our area_types dataframe:
area_types.head(10)

Unnamed: 0,type,code_type,definition
0,Country,1,Country is used for areas included (or previou...
1,Subdivision,2,Subdivision is used for the main administrativ...
2,County,7,County is used for smaller administrative divi...
3,Municipality,4,Municipality is used for small administrative ...
4,City,3,"City is used for settlements of any size, incl..."
5,District,5,District is used for a division of a large cit...
6,Island,6,Island is used for islands and atolls which do...


In [44]:
#Let's analyze the names of the countries:
df5.loc[df5['area_type'] != 1].area_name.value_counts()

Europe                                            43804
London                                              648
Los Angeles                                         554
New York                                            441
Seattle                                             387
Berlin                                              375
Quebec                                              341
Atlanta                                             280
Paris                                               269
Chicago                                             224
California                                          185
England                                             176
Houston                                             176
Wien                                                148
Melbourne                                           146
Toronto                                             142
Brooklyn                                            139
San Francisco                                   

In [45]:
df5.loc[df5['area_type'] != 1].area_id.value_counts()

241.0       43804
1178.0        642
7703.0        554
9655.0        387
326.0         375
7020.0        352
322.0         341
5210.0        279
4434.0        268
5099.0        224
266.0         185
432.0         175
7326.0        159
653.0         148
5121.0        146
5076.0        142
7022.0        139
10861.0       138
7279.0        130
5196.0        124
5179.0        123
7288.0        121
5126.0        118
434.0         115
397.0         113
5147.0        106
115582.0      100
3925.0         99
7707.0         99
5213.0         96
5244.0         91
5212.0         90
435.0          90
295.0          89
5075.0         89
7706.0         86
7295.0         82
684.0          75
331.0          75
695.0          75
2429.0         73
7954.0         71
3821.0         71
5114.0         71
2450.0         70
12021.0        66
12030.0        60
11954.0        58
9279.0         56
292.0          55
5197.0         55
30897.0        55
5241.0         54
7713.0         53
3855.0         53
1812.0    

Now, what we want is to transform all the area_ids that are not 1 (country), into country, to be able to do our visualization by country.

If we look at the Musicbrainz Database schema, we can see that they have some tables that convert the different areas into ISO3166 standard codes:

In [46]:
#The first ISO file that we have gives us the country ISO code for the area_id that are countries:
iso_countries = pd.read_csv('Musicbrainz/Tables_used/iso_3166_1.txt',sep='\t', header=None, engine='c')
iso_countries.columns = ['area_id','country_code']
iso_countries.head()

Unnamed: 0,area_id,country_code
0,1,AF
1,2,AL
2,3,DZ
3,4,AS
4,5,AD


In [47]:
#The second ISO file that we have, gives us the country ISO code and the region for the area_id that are not countries:
iso_areas = pd.read_csv('Musicbrainz/Tables_used/iso_3166_2.txt',sep=',', header=None, engine='c')
iso_areas.columns = ['area_id','country_code','region']
iso_areas.head()

Unnamed: 0,area_id,country_code,region
0,261,US,MD
1,262,US,AK
2,263,US,AL
3,264,US,AR
4,265,US,AZ


We can now merge our dataframe to retrieve the country code for all the areas:

In [48]:
df6 = pd.merge(df5, iso_countries, how='left', on='area_id')
df6.head()

Unnamed: 0,release_id,release_mbid,release_name,credit_id,area_id,release_year_x,artist_id,artist_mbid,artist_name_x,start_year,start_area1,start_area2,area_mbid,area_name,area_type,country_code
0,9,425cf29a-1490-43ab-abfa-7b17a2cec351,A Sorta Fairytale,896742,81.0,2002,,,,,,,85752fda-13c4-31a3-bee5-0e5cb1f51dad,Germany,1.0,DE
1,10,a96e1d03-e685-3627-8cba-f5b96be7158f,A Sorta Fairytale,896742,221.0,2002,,,,,,,8a754a16-0027-3a29-b6d7-2b40ea0481ed,United Kingdom,1.0,GB
2,11,dd1c5726-5a38-47e1-9747-18b919b9948a,Glory of the 80's,95360,81.0,1999,,,,,,,85752fda-13c4-31a3-bee5-0e5cb1f51dad,Germany,1.0,DE
3,12,9660928f-7cd8-4fef-852d-5599dc4ad3ec,Silent All These Years,104189,81.0,1997,,,,,,,85752fda-13c4-31a3-bee5-0e5cb1f51dad,Germany,1.0,DE
4,26,dd245091-b21e-48a3-b59a-f9b8ed8a0469,Demons,94299,107.0,1998,,,,,,,2db42837-c832-3c27-b4a3-08198f75693c,Japan,1.0,JP


In [49]:
#Now, we can merge with the other ISO file:
df7 = pd.merge(df6, iso_areas, how='left', on='area_id')
df7.head()

Unnamed: 0,release_id,release_mbid,release_name,credit_id,area_id,release_year_x,artist_id,artist_mbid,artist_name_x,start_year,start_area1,start_area2,area_mbid,area_name,area_type,country_code_x,country_code_y,region
0,9,425cf29a-1490-43ab-abfa-7b17a2cec351,A Sorta Fairytale,896742,81.0,2002,,,,,,,85752fda-13c4-31a3-bee5-0e5cb1f51dad,Germany,1.0,DE,,
1,10,a96e1d03-e685-3627-8cba-f5b96be7158f,A Sorta Fairytale,896742,221.0,2002,,,,,,,8a754a16-0027-3a29-b6d7-2b40ea0481ed,United Kingdom,1.0,GB,,
2,11,dd1c5726-5a38-47e1-9747-18b919b9948a,Glory of the 80's,95360,81.0,1999,,,,,,,85752fda-13c4-31a3-bee5-0e5cb1f51dad,Germany,1.0,DE,,
3,12,9660928f-7cd8-4fef-852d-5599dc4ad3ec,Silent All These Years,104189,81.0,1997,,,,,,,85752fda-13c4-31a3-bee5-0e5cb1f51dad,Germany,1.0,DE,,
4,26,dd245091-b21e-48a3-b59a-f9b8ed8a0469,Demons,94299,107.0,1998,,,,,,,2db42837-c832-3c27-b4a3-08198f75693c,Japan,1.0,JP,,


In [50]:
df7.isnull().sum(axis=0)

release_id              0
release_mbid            0
release_name            5
credit_id               0
area_id            269922
release_year_x          0
artist_id          716062
artist_mbid        716062
artist_name_x      716066
start_year        1514278
start_area1       1255020
start_area2       1660616
area_mbid          269922
area_name          269922
area_type          313728
country_code_x     287010
country_code_y    1973486
region            1973486
dtype: int64

If we look at the column "country_code_x", we can see that 287010 rows are missing. If we bring the value of the column "country_code_y" for those rows, we shoul have more rows filled.

In [51]:
df7.country_code_x = np.where(df7.country_code_x.isnull(), df7.country_code_y, df7.country_code_x )

In [52]:
df7.isnull().sum(axis=0)

release_id              0
release_mbid            0
release_name            5
credit_id               0
area_id            269922
release_year_x          0
artist_id          716062
artist_mbid        716062
artist_name_x      716066
start_year        1514278
start_area1       1255020
start_area2       1660616
area_mbid          269922
area_name          269922
area_type          313728
country_code_x     281761
country_code_y    1973486
region            1973486
dtype: int64

We have now 281761 rows missing in "country_code_x", so 5249 less than before.

We are now going to remove the rows and columns that we don't need, in order to have a cleaner Dataframe for later.

In [53]:
#Drop the rows that don't have "area_id":
df7.dropna(subset=['area_id'], axis=0, inplace=True)
#Drop the columns that we don't need anymore for our analysis:
df7.drop(labels=['artist_mbid','start_year','start_area1','start_area2','area_mbid','country_code_y','region'], axis=1, inplace=True)
df7 = df7.rename(columns={'country_code_x':'country_code'})
df7.head()

Unnamed: 0,release_id,release_mbid,release_name,credit_id,area_id,release_year_x,artist_id,artist_name_x,area_name,area_type,country_code
0,9,425cf29a-1490-43ab-abfa-7b17a2cec351,A Sorta Fairytale,896742,81.0,2002,,,Germany,1.0,DE
1,10,a96e1d03-e685-3627-8cba-f5b96be7158f,A Sorta Fairytale,896742,221.0,2002,,,United Kingdom,1.0,GB
2,11,dd1c5726-5a38-47e1-9747-18b919b9948a,Glory of the 80's,95360,81.0,1999,,,Germany,1.0,DE
3,12,9660928f-7cd8-4fef-852d-5599dc4ad3ec,Silent All These Years,104189,81.0,1997,,,Germany,1.0,DE
4,26,dd245091-b21e-48a3-b59a-f9b8ed8a0469,Demons,94299,107.0,1998,,,Japan,1.0,JP


In [54]:
len(df7)

1721687

In [55]:
#Do we have any duplicate release?
df7['release_id'].nunique()

1721634

In [56]:
#Let's get rid of the duplicates:
df7.drop_duplicates(subset='release_id', inplace=True)
len(df7)

1721634

Now, we want to see the names of the different countries remaining, to see if there is any inconsistency:

In [57]:
#We build a temporary dataframe which converts country code to its name:
countries = areas.copy()
countries.drop(countries[countries['area_type'] != 1].index , inplace=True)
countries.drop(labels=['area_mbid','area_type'], axis=1, inplace=True)
a = pd.merge(countries, iso_countries, how='left', on='area_id')
a = a.rename(columns={'area_name':'country_name'})
a.head()

Unnamed: 0,area_id,country_name,country_code
0,38,Canada,CA
1,43,Chile,CL
2,44,China,CN
3,36,Cambodia,KH
4,78,Gabon,GA


In [58]:
#Now, we can match our dataframe with the temporary conversion dataframe:
df8 = pd.merge(df7, a, how='left', on='country_code')
df8.drop(labels=['area_id_y'], axis=1, inplace=True)
df8.head()

Unnamed: 0,release_id,release_mbid,release_name,credit_id,area_id_x,release_year_x,artist_id,artist_name_x,area_name,area_type,country_code,country_name
0,9,425cf29a-1490-43ab-abfa-7b17a2cec351,A Sorta Fairytale,896742,81.0,2002,,,Germany,1.0,DE,Germany
1,10,a96e1d03-e685-3627-8cba-f5b96be7158f,A Sorta Fairytale,896742,221.0,2002,,,United Kingdom,1.0,GB,United Kingdom
2,11,dd1c5726-5a38-47e1-9747-18b919b9948a,Glory of the 80's,95360,81.0,1999,,,Germany,1.0,DE,Germany
3,12,9660928f-7cd8-4fef-852d-5599dc4ad3ec,Silent All These Years,104189,81.0,1997,,,Germany,1.0,DE,Germany
4,26,dd245091-b21e-48a3-b59a-f9b8ed8a0469,Demons,94299,107.0,1998,,,Japan,1.0,JP,Japan


In [59]:
#Let's analyze the names of the countries:
df8.country_name.value_counts()

United States                                   485290
United Kingdom                                  250755
Germany                                         190463
Japan                                           168659
France                                           88635
Netherlands                                      37872
Canada                                           37217
Italy                                            37148
Australia                                        34926
Finland                                          28773
Sweden                                           28360
Spain                                            26546
Russia                                           22944
Brazil                                           17967
Belgium                                          16319
Poland                                           13086
Namibia                                          11839
Kingdom of the Netherlands                       11839
Austria   

If we have a quick look at the names listed above, we can see that some of them are not 100% accurate or need some updates. Here are a few examples:

- There is one country called "Netherlands" and another one "Kingdom of the Netherlands"
- There is a country named "Czech Republic" and another one "Czechoslovakia"
- There is a country named "Soviet Union" and another one "Russia"

In order to update the countries' names and have accurate information, we have copied the official list of countries and their capitals into a .txt file (the list of countries is available in: https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_and_their_capitals_in_native_languages).

In [60]:
wiki_countries = pd.read_csv('Musicbrainz/Tables_used/wiki_countries.txt',sep=',', header=0)
wiki_countries.head()

Unnamed: 0,Country,Capital
0,Afghanistan,Kabul
1,Åland Islands,Mariehamn
2,Albania,Tirana
3,Algeria,Algiers
4,American Samoa,Pago Pago


After that, I have compared one by one each of the countries appearing in our df8 and do the following replacements:

In [61]:
df8.replace('Bahamas','The Bahamas',inplace=True)
df8.replace('Bonaire, Sint Eustatius and Saba','Netherlands',inplace=True)
df8.replace('Congo','Republic of the Congo',inplace=True)
df8.replace('Czechoslovakia','Czech Republic',inplace=True)
df8.replace('East Germany','Germany',inplace=True)
df8.replace('Gambia','The Gambia',inplace=True)
df8.replace('Kingdom of the Netherlands','Netherlands',inplace=True)
df8.replace('Macao','Macau',inplace=True)
df8.replace('Macedonia','North Macedonia',inplace=True)
df8.replace('Netherlands Antilles','Netherlands',inplace=True)
df8.replace('Saint Helena, Ascension and Tristan da Cunha','Saint Helena',inplace=True)
df8.replace("Saint Martin (French part)",'Saint Martin',inplace=True)
df8.replace("Sao Tome and Principe",'SÃ£o TomÃ©and PrÃ­ncipe',inplace=True)
df8.replace("Serbia and Montenegro",'Serbia',inplace=True)
df8.replace('Soviet Union','Russia',inplace=True)
df8.replace('Swaziland','Eswatini',inplace=True)
df8.replace('Timor-Leste','East Timor',inplace=True)
df8.replace('Tokelau','New Zealand',inplace=True)
df8.replace('U.S. Virgin Islands','United States Virgin Islands',inplace=True)
df8.replace('Western Sahara','Sahrawi Arab Democratic Republic',inplace=True)
df8.replace('Yugoslavia','Serbia',inplace=True) #I had to chose one for Yugoslavia
df8.drop(df8[df8['country_name'] == 'Antarctica'].index , inplace=True) #Doesn't exist as a country

Now that we have the country names normalized, we can match each of them with their capitals and their coordinates using Pygeocoder with the Google Maps API key (you don't need to do the Geocoding, I have saved a copy of the resulting file with coordinates: wiki_coords.txt)

If you want to follow the geocoding, please run the following commands:
    
API_key = "YOUR_API_KEY"

capitals = wiki_countries['Capital'].values.tolist()

coordinates = []

for i in capitals:

    result = Geocoder(api_key=API_key).geocode(i).coordinates
    coordinates.append(result)

wiki_countries['coordinates'] = coordinates


wiki_countries.head()


And, if you want to save your own file:

wiki_countries.to_csv('wiki_coords.txt',index=None, sep="\t")

In [62]:
wiki_coords = pd.read_csv('Musicbrainz/Tables_used/wiki_coords.txt',sep=',', header=0)
wiki_coords.head()

Unnamed: 0,Country,Capital,coordinates
0,Afghanistan,Kabul,"(34.5553494, 69.207486)"
1,Åland Islands,Mariehamn,"(60.0970945, 19.9348339)"
2,Albania,Tirana,"(41.3275459, 19.8186982)"
3,Algeria,Algiers,"(36.753768, 3.0587561)"
4,American Samoa,Pago Pago,"(-14.2756319, -170.7020359)"


Now, what we want is to create two columns, "latitude" and "longitude", as they will be our x and y axis in our visualization:

In [116]:
lat = []
lng = []

for row in wiki_coords['coordinates']:
    a = tuple(wiki_coords['coordinates'][row])
    lat.append(float(a.split(',')[0]))
    lng.append(float(a.split(',')[1]))

wiki_coords['latitude'] = lat
wiki_coords['longitude'] = lng

wiki_coords.head()

KeyError: '(34.5553494, 69.207486)'

In [114]:
type(tuple(wiki_coords['coordinates'][1]))

tuple

In [63]:
df8.to_csv('df8.csv',index=None)