# DATA GATHERING

# Data from Musicbrainz.org

## 1) Artist information

In [2]:
import pandas as pd
import numpy as np
#import time
#!pip install pygeocoder
#from pygeocoder import Geocoder #If you want to follow the geocoding later, you will need your own Google Maps API key
#import matplotlib.pyplot as plt
#%matplotlib inline
from tqdm import tqdm

In [3]:
artists= pd.read_csv('Musicbrainz/Tables_used/artist.txt',sep='\t', header=None, engine='c', usecols=[0,1,2,11,17])
artists.columns = ['artist_id','artist_mbid','artist_name','start_area1', 'start_area2']
artists.head()

Unnamed: 0,artist_id,artist_mbid,artist_name,start_area1,start_area2
0,805192,8972b1c1-6482-4750-b51f-596d2edea8b1,WIK▲N,,
1,371203,49add228-eac5-4de8-836c-d75cde7369c3,Pete Moutso,,
2,273232,c112a400-af49-4665-8bba-741531d962a1,Zachary,,
3,101060,ca3f3ee1-c4a7-4bac-a16a-0b888a396c6b,The Silhouettes,222.0,7707.0
4,145773,7b4a548e-a01a-49b7-82e7-b49efeb9732c,Aric Leavitt,,


In [4]:
#Let's see how many artists we have:
artists['artist_id'].nunique()

1476425

In [5]:
#How much info we have for each artist?
artists.isnull().sum(axis=0)

artist_id            0
artist_mbid          0
artist_name          8
start_area1     808442
start_area2    1274001
dtype: int64

What are the "start_area1" and "start_area2"? If we look at Musicbrainz's field description for each artist (https://musicbrainz.org/doc/Artist), we can see that:

Area: The artist area, as the name suggests, indicates the area with which an artist is primarily identified with. It is often, but not always, its birth/formation country.

We will keep this information as the artist's origin for later.

We need to incorporate as well the table called "artist credit", which gives us the artist credit_id. We will use this field to join later on each release with its artist:

In [6]:
artists_credit= pd.read_csv('Musicbrainz/Tables_used/artist_credit_name.txt',sep='\t', header=None, engine='c', usecols=[0,2,3])
artists_credit.columns = ['credit_id','artist_id','artist_name']
artists_credit.head()

Unnamed: 0,credit_id,artist_id,artist_name
0,578352,578352,Gustav Ruppke
1,273232,273232,Zachary
2,153193,153193,The High Level Ranters
3,32262,32262,Georges Brassens
4,1389968,1171184,Harvard of the South


In [7]:
#Let's join the artists with their credit id and verify that the matching is good:
df = pd.merge(artists, artists_credit, how='left', on='artist_id')
df.head()

Unnamed: 0,artist_id,artist_mbid,artist_name_x,start_area1,start_area2,credit_id,artist_name_y
0,805192,8972b1c1-6482-4750-b51f-596d2edea8b1,WIK▲N,,,822846.0,WIK▲N
1,371203,49add228-eac5-4de8-836c-d75cde7369c3,Pete Moutso,,,,
2,273232,c112a400-af49-4665-8bba-741531d962a1,Zachary,,,273232.0,Zachary
3,101060,ca3f3ee1-c4a7-4bac-a16a-0b888a396c6b,The Silhouettes,222.0,7707.0,101060.0,The Silhouettes
4,145773,7b4a548e-a01a-49b7-82e7-b49efeb9732c,Aric Leavitt,,,145773.0,Aric Leavitt


In [8]:
#It looks like it makes sense. Please note that the credit id is sometimes equal to the artist_id, but not always:
df['check'] = df['artist_id'] - df['credit_id']
df['check'].nunique()

1270628

In [9]:
df.isnull().sum(axis=0)

artist_id              0
artist_mbid            0
artist_name_x         15
start_area1      1120376
start_area2      2109027
credit_id         461241
artist_name_y     461253
check             461241
dtype: int64

In [10]:
#We can now get rid of check and the duplicate artist_name column:
df.drop(labels=['check','artist_name_y'], axis=1, inplace=True)
df.head()

Unnamed: 0,artist_id,artist_mbid,artist_name_x,start_area1,start_area2,credit_id
0,805192,8972b1c1-6482-4750-b51f-596d2edea8b1,WIK▲N,,,822846.0
1,371203,49add228-eac5-4de8-836c-d75cde7369c3,Pete Moutso,,,
2,273232,c112a400-af49-4665-8bba-741531d962a1,Zachary,,,273232.0
3,101060,ca3f3ee1-c4a7-4bac-a16a-0b888a396c6b,The Silhouettes,222.0,7707.0,101060.0
4,145773,7b4a548e-a01a-49b7-82e7-b49efeb9732c,Aric Leavitt,,,145773.0


## 2) Release information

The objective of this project is to visualize when each artist released for the first time a certain CD/Album/Single etc.

If we look at the "releases" table:

In [11]:
releases = pd.read_csv('Musicbrainz/Tables_used/release.txt',sep='\t', header=None, engine='c', usecols=[0,2,3,4])
releases.columns = ['release_id','release_group','credit_id','group_id']
releases.head()

Unnamed: 0,release_id,release_group,credit_id,group_id
0,9,A Sorta Fairytale,60,896742
1,10,A Sorta Fairytale,60,896742
2,11,Glory of the 80's,60,95360
3,12,Silent All These Years,60,104189
4,26,Demons,20211,94299


We can see, in the first 2 rows, that the same CD/Album can be released/remastered many times. According to Musicbrainz's field description for each release (https://musicbrainz.org/doc/Release):

"A MusicBrainz release represents the unique release (i.e. issuing) of a product on a specific date with specific release information such as the country, label, barcode and packaging. If you walk into a store and purchase an album or single, they are each represented in MusicBrainz as one release".

If we look at another release-related field in Musicbrainz, we find the "release group" (https://musicbrainz.org/doc/Release_Group):

"A release group, just as the name suggests, is used to group several different releases into a single logical entity. Every release belongs to one, and only one release group.

Both release groups and releases are "albums" in a general sense, but with an important difference: a release is something you can buy as media such as a CD or a vinyl record, while a release group embraces the overall concept of an album -- it doesn't matter how many CDs or editions/versions it had."

By reading these descriptions, we can clearly see that the release group is the table we are looking for as it represents a single creation, no matter how many times it has been edited or released afterwards. So we will have to keep the first release id for each release group.

In [12]:
release_country = pd.read_csv('Musicbrainz/Tables_used/release_country.txt',sep='\t', header=None, engine='c', usecols=[0,1,2])
release_country.columns = ['release_id','area_id','release_year']
release_country.head()

Unnamed: 0,release_id,area_id,release_year
0,3,81,1997.0
1,1427792,107,2014.0
2,9,81,2002.0
3,10,221,2002.0
4,11,81,1999.0


In [13]:
df2 = pd.merge(releases, release_country, how='left', on='release_id')
df2.head()

Unnamed: 0,release_id,release_group,credit_id,group_id,area_id,release_year
0,9,A Sorta Fairytale,60,896742,81.0,2002.0
1,10,A Sorta Fairytale,60,896742,221.0,2002.0
2,11,Glory of the 80's,60,95360,81.0,1999.0
3,12,Silent All These Years,60,104189,81.0,1997.0
4,26,Demons,20211,94299,107.0,1998.0


In [14]:
#Let's see how many releases we have:
df2['release_id'].nunique()

2198457

In [15]:
df2.isnull().sum(axis=0)

release_id            0
release_group         7
credit_id             0
group_id              0
area_id          287376
release_year     341983
dtype: int64

In [16]:
#We want to keep only the releases which have a release year, so we can drop the others:
df2.dropna(subset=['release_year'], axis=0, inplace=True)
df2['release_year'] = df2.release_year.astype(int,inplace=True)
df2['release_id'].nunique()

1859982

In [17]:
#Let's analyze the year column:
pd.options.display.max_rows = 2000
df2.groupby('release_year').count()

Unnamed: 0_level_0,release_id,release_group,credit_id,group_id,area_id
release_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,2,2,2,2,2
4,1,1,1,1,1
5,5,5,5,5,5
7,1,1,1,1,1
8,2,2,2,2,2
10,3,3,3,3,3
14,1,1,1,1,1
17,4,4,4,4,4
18,1,1,1,1,1
19,3,3,3,3,3


By looking at the different year values, and, in order to have enough values per year, we could drop the rows whose year is below 1890 and above 2019. Our visualization would have 130 years, which is pretty good.

In [18]:
df2.drop(df2[df2['release_year'] < 1890].index , inplace=True)
df2.drop(df2[df2['release_year'] >2019].index , inplace=True)
df2.sort_values(by=['release_year']).head()

Unnamed: 0,release_id,release_group,credit_id,group_id,area_id,release_year
1266766,386919,Visions of Paradise Waltz,97546,712605,222.0,1890
1266956,386830,German Ballad with Variations,97546,712514,222.0,1890
1266958,386829,German Ballad with Variations,97546,712514,222.0,1890
1266960,386828,Mountain Bells Polka,97546,712513,222.0,1890
1266961,386827,Mountain Bells Polka,97546,712513,222.0,1890


In [19]:
#Converting the year column to datetime for later:
df2['release_year'] = pd.to_datetime(df2['release_year'].astype(str), format='%Y')
df2.dtypes

release_id                int64
release_group            object
credit_id                 int64
group_id                  int64
area_id                 float64
release_year     datetime64[ns]
dtype: object

In [20]:
#We sort by release id and year (we could have 2 release groups with the same name but produced by different artists):
df2.sort_values(['release_group','release_year','credit_id'], ascending=[True,True,True], inplace=True)
df2.head()

Unnamed: 0,release_id,release_group,credit_id,group_id,area_id,release_year
2026273,2163750,,2205562,1962329,240.0,2014-01-01
1648516,1846605,,1503027,1713833,240.0,2015-01-01
1250325,1714060,Beaux Soirs De Paris,1324142,1609358,73.0,1995-01-01
2116340,2265346,Le 1,2291833,2042812,240.0,2018-01-01
1748061,1895266,M2Music HitDisc Vol. 1,1,1751021,222.0,2006-01-01


In [21]:
df2[df2['release_group'] == 'Artaxerxes']

Unnamed: 0,release_id,release_group,credit_id,group_id,area_id,release_year
1836724,2378622,Artaxerxes,2392005,2132682,240.0,1996-01-01
1910376,2379252,Artaxerxes,2392005,2132682,221.0,2009-01-01
1909444,2379244,Artaxerxes,2392011,2133192,222.0,2011-01-01


In [22]:
#Now we can delete the duplicate releases and keep the ones who were first released:
df2.drop_duplicates(subset=['release_group','credit_id'],keep='first', inplace=True)
df2['release_id'].nunique()

1499614

In [23]:
#Just to double-check:
df2[df2['release_group'] == 'Artaxerxes']

Unnamed: 0,release_id,release_group,credit_id,group_id,area_id,release_year
1836724,2378622,Artaxerxes,2392005,2132682,240.0,1996-01-01
1909444,2379244,Artaxerxes,2392011,2133192,222.0,2011-01-01


## 3) Matching releases with artists

Now that we have both artist and releases dataframes, we can join them:

In [24]:
df3 = pd.merge(df2, df, how='left', on='credit_id')
df3.head()

Unnamed: 0,release_id,release_group,credit_id,group_id,area_id,release_year,artist_id,artist_mbid,artist_name_x,start_area1,start_area2
0,2163750,,2205562,1962329,240.0,2014-01-01,1654312.0,d10d6441-dcc1-4202-93bf-0c0acf72913a,Soul Glo,7707.0,
1,1846605,,1503027,1713833,240.0,2015-01-01,1112115.0,7b52c77b-1a34-439d-a285-3a7c69cb5b1a,Ben Bennett,,
2,1714060,Beaux Soirs De Paris,1324142,1609358,73.0,1995-01-01,1122795.0,71b8451c-c10a-400e-9544-101f34ab2522,Soixante Étages,,
3,2265346,Le 1,2291833,2042812,240.0,2018-01-01,1720981.0,a69efb5f-0b28-4328-8ff0-44d8d6f39755,TedeuzeM,68613.0,
4,1895266,M2Music HitDisc Vol. 1,1,1751021,222.0,2006-01-01,1.0,89ad4ac3-39f7-470e-963a-56509c546377,Various Artists,,


In [25]:
df3.isnull().sum(axis=0)

release_id            0
release_group         4
credit_id             0
group_id              0
area_id               0
release_year          0
artist_id           151
artist_mbid         151
artist_name_x       155
start_area1      430452
start_area2      959581
dtype: int64

In [26]:
df3['release_id'].nunique()

1499614

In [27]:
len(df3)

1724524

In [28]:
df3[df3['release_group']=='Artaxerxes']

Unnamed: 0,release_id,release_group,credit_id,group_id,area_id,release_year,artist_id,artist_mbid,artist_name_x,start_area1,start_area2
119493,2378622,Artaxerxes,2392005,2132682,240.0,1996-01-01,391603.0,e3062782-ab7b-41bc-8e65-aeea16dc1a89,Ian Partridge,221.0,1178.0
119494,2378622,Artaxerxes,2392005,2132682,240.0,1996-01-01,124232.0,4e7f1926-8704-4545-a1a1-ded91651c884,Thomas Arne,221.0,1178.0
119495,2378622,Artaxerxes,2392005,2132682,240.0,1996-01-01,688791.0,f34e9da4-2ee7-4f27-aa34-adc5db791bec,Christopher Robson,,
119496,2378622,Artaxerxes,2392005,2132682,240.0,1996-01-01,1129787.0,c33f733e-2bf4-402b-9455-1a293601a1cd,Patricia Spence,,
119497,2378622,Artaxerxes,2392005,2132682,240.0,1996-01-01,1104538.0,5680c729-615b-47e2-969e-27a087c572fb,Philippa Hyde,221.0,
119498,2378622,Artaxerxes,2392005,2132682,240.0,1996-01-01,402986.0,70af5d9a-c6e0-4fcf-9cde-4d3d00e0fcb0,The Parley of Instruments,221.0,1178.0
119499,2378622,Artaxerxes,2392005,2132682,240.0,1996-01-01,183632.0,954d1c83-259f-4a25-8878-10c19bb097af,Catherine Bott,221.0,
119500,2378622,Artaxerxes,2392005,2132682,240.0,1996-01-01,87510.0,857588a5-b7aa-4f72-a87b-8f03dca60e30,Roy Goodman,221.0,30926.0
119501,2378622,Artaxerxes,2392005,2132682,240.0,1996-01-01,1078968.0,93da7aaa-250b-46e1-b5ef-0ad78d46dc3f,Richard Edgar‐Wilson,,
119502,2379244,Artaxerxes,2392011,2133192,222.0,2011-01-01,854064.0,a87f2b39-84c7-4888-935c-d41943bd7971,Classical Opera Company,221.0,


If we look at the above, we can see that there is one line per each artist that participated for each release ID.

As we don't want to show duplicate releases, we need to keep only one artist per release. We will keep the first artist appearing for each release (even though we know this is not 100% accurate, but we have to avoid duplicates). This will afftect 224.910 rows under a total of 1.499.614 unique releases, so 14% of our dataset.

In [29]:
#Now we can delete the duplicate releases and keep the ones who were first released:
df3.drop_duplicates(subset=['release_id'],keep='first', inplace=True)
df3['release_id'].nunique()

1499614

In [30]:
len(df3)

1499614

## 4) Geographical data

The idea of the visualization is to see where each gender comes from, so, ideally, we would have to look at the artists origins (start area: last 2 columns of our dataframe).

In our dataframe df3, the 5th column "area_id" is related to the area where the release was produced. This isn't directly related to the origin of an artist/band, as many artists have to record their works in different countries/or areas.

Let's see for how many releases we have that information:

In [31]:
df3.isnull().sum(axis=0)

release_id            0
release_group         4
credit_id             0
group_id              0
area_id               0
release_year          0
artist_id           151
artist_mbid         151
artist_name_x       155
start_area1      404503
start_area2      876562
dtype: int64

In Musicbrainz's database, we have some tables related to the areas. Let's see how we can use them to input more geographical information into our dataframe:

In [32]:
areas = pd.read_csv('Musicbrainz/Tables_used/area.txt',sep='\t', header=None, engine='python', usecols=[0,2,3])
areas.columns = ['area_id','area_name','code_type']
areas.head()

Unnamed: 0,area_id,area_name,code_type
0,15449,Greccio,4.0
1,38,Canada,1.0
2,43,Chile,1.0
3,44,China,1.0
4,36,Cambodia,1.0


In [33]:
#Let's see the area types we have:
area_types = pd.read_csv('Musicbrainz/Tables_used/area_type.txt',sep='\t', header=None, engine='python', usecols=[1,3,4], error_bad_lines=False)
area_types.columns = ['type','code_type','definition']
area_types.head(20)

Unnamed: 0,type,code_type,definition
0,Country,1,Country is used for areas included (or previou...
1,Subdivision,2,Subdivision is used for the main administrativ...
2,County,7,County is used for smaller administrative divi...
3,Municipality,4,Municipality is used for small administrative ...
4,City,3,"City is used for settlements of any size, incl..."
5,District,5,District is used for a division of a large cit...
6,Island,6,Island is used for islands and atolls which do...


In [34]:
#Add the area name and type to our main dataframe for the column "area_id":
df4 = pd.merge(df3, areas, how='left', on='area_id')
df4.head()

Unnamed: 0,release_id,release_group,credit_id,group_id,area_id,release_year,artist_id,artist_mbid,artist_name_x,start_area1,start_area2,area_name,code_type
0,2163750,,2205562,1962329,240.0,2014-01-01,1654312.0,d10d6441-dcc1-4202-93bf-0c0acf72913a,Soul Glo,7707.0,,[Worldwide],
1,1846605,,1503027,1713833,240.0,2015-01-01,1112115.0,7b52c77b-1a34-439d-a285-3a7c69cb5b1a,Ben Bennett,,,[Worldwide],
2,1714060,Beaux Soirs De Paris,1324142,1609358,73.0,1995-01-01,1122795.0,71b8451c-c10a-400e-9544-101f34ab2522,Soixante Étages,,,France,1.0
3,2265346,Le 1,2291833,2042812,240.0,2018-01-01,1720981.0,a69efb5f-0b28-4328-8ff0-44d8d6f39755,TedeuzeM,68613.0,,[Worldwide],
4,1895266,M2Music HitDisc Vol. 1,1,1751021,222.0,2006-01-01,1.0,89ad4ac3-39f7-470e-963a-56509c546377,Various Artists,,,United States,1.0


In [35]:
#Rearranging dataframe columns to have a clearer dataframe:
df4 = df4[['release_id','group_id','release_group','credit_id','area_id','area_name','code_type','release_year','artist_id','artist_mbid','artist_name_x','start_area1','start_area2']]
df4.rename(columns={'area_id':'release_area','area_name':'release_area_name','code_type':'release_code_type','start_area1':'area_id'}, inplace=True)
df4.head()

Unnamed: 0,release_id,group_id,release_group,credit_id,release_area,release_area_name,release_code_type,release_year,artist_id,artist_mbid,artist_name_x,area_id,start_area2
0,2163750,1962329,,2205562,240.0,[Worldwide],,2014-01-01,1654312.0,d10d6441-dcc1-4202-93bf-0c0acf72913a,Soul Glo,7707.0,
1,1846605,1713833,,1503027,240.0,[Worldwide],,2015-01-01,1112115.0,7b52c77b-1a34-439d-a285-3a7c69cb5b1a,Ben Bennett,,
2,1714060,1609358,Beaux Soirs De Paris,1324142,73.0,France,1.0,1995-01-01,1122795.0,71b8451c-c10a-400e-9544-101f34ab2522,Soixante Étages,,
3,2265346,2042812,Le 1,2291833,240.0,[Worldwide],,2018-01-01,1720981.0,a69efb5f-0b28-4328-8ff0-44d8d6f39755,TedeuzeM,68613.0,
4,1895266,1751021,M2Music HitDisc Vol. 1,1,222.0,United States,1.0,2006-01-01,1.0,89ad4ac3-39f7-470e-963a-56509c546377,Various Artists,,


In [36]:
#Add the start area name and type to our main dataframe for the column "area id"(which was "start area 1" before):
df5 = pd.merge(df4, areas, how='left', on='area_id')
df5.head()

Unnamed: 0,release_id,group_id,release_group,credit_id,release_area,release_area_name,release_code_type,release_year,artist_id,artist_mbid,artist_name_x,area_id,start_area2,area_name,code_type
0,2163750,1962329,,2205562,240.0,[Worldwide],,2014-01-01,1654312.0,d10d6441-dcc1-4202-93bf-0c0acf72913a,Soul Glo,7707.0,,Philadelphia,3.0
1,1846605,1713833,,1503027,240.0,[Worldwide],,2015-01-01,1112115.0,7b52c77b-1a34-439d-a285-3a7c69cb5b1a,Ben Bennett,,,,
2,1714060,1609358,Beaux Soirs De Paris,1324142,73.0,France,1.0,1995-01-01,1122795.0,71b8451c-c10a-400e-9544-101f34ab2522,Soixante Étages,,,,
3,2265346,2042812,Le 1,2291833,240.0,[Worldwide],,2018-01-01,1720981.0,a69efb5f-0b28-4328-8ff0-44d8d6f39755,TedeuzeM,68613.0,,Aix-en-Provence,3.0
4,1895266,1751021,M2Music HitDisc Vol. 1,1,222.0,United States,1.0,2006-01-01,1.0,89ad4ac3-39f7-470e-963a-56509c546377,Various Artists,,,,


In [37]:
#Rearranging dataframe columns to have a clearer dataframe:
df5 = df5[['release_id','group_id','release_group','credit_id','release_area','release_area_name','release_code_type','release_year','artist_id','artist_mbid','artist_name_x','area_id','area_name','code_type','start_area2']]
df5.rename(columns={'area_id':'artist_area1','area_name':'artist_area_name1','code_type':'artist_code_type1','start_area2':'area_id'}, inplace=True)
df5.head()

Unnamed: 0,release_id,group_id,release_group,credit_id,release_area,release_area_name,release_code_type,release_year,artist_id,artist_mbid,artist_name_x,artist_area1,artist_area_name1,artist_code_type1,area_id
0,2163750,1962329,,2205562,240.0,[Worldwide],,2014-01-01,1654312.0,d10d6441-dcc1-4202-93bf-0c0acf72913a,Soul Glo,7707.0,Philadelphia,3.0,
1,1846605,1713833,,1503027,240.0,[Worldwide],,2015-01-01,1112115.0,7b52c77b-1a34-439d-a285-3a7c69cb5b1a,Ben Bennett,,,,
2,1714060,1609358,Beaux Soirs De Paris,1324142,73.0,France,1.0,1995-01-01,1122795.0,71b8451c-c10a-400e-9544-101f34ab2522,Soixante Étages,,,,
3,2265346,2042812,Le 1,2291833,240.0,[Worldwide],,2018-01-01,1720981.0,a69efb5f-0b28-4328-8ff0-44d8d6f39755,TedeuzeM,68613.0,Aix-en-Provence,3.0,
4,1895266,1751021,M2Music HitDisc Vol. 1,1,222.0,United States,1.0,2006-01-01,1.0,89ad4ac3-39f7-470e-963a-56509c546377,Various Artists,,,,


In [38]:
#Add the start area 2 name and type to our main dataframe for the column "area id"(which was "start area 2" before):
df6 = pd.merge(df5, areas, how='left', on='area_id')
df6.head()

Unnamed: 0,release_id,group_id,release_group,credit_id,release_area,release_area_name,release_code_type,release_year,artist_id,artist_mbid,artist_name_x,artist_area1,artist_area_name1,artist_code_type1,area_id,area_name,code_type
0,2163750,1962329,,2205562,240.0,[Worldwide],,2014-01-01,1654312.0,d10d6441-dcc1-4202-93bf-0c0acf72913a,Soul Glo,7707.0,Philadelphia,3.0,,,
1,1846605,1713833,,1503027,240.0,[Worldwide],,2015-01-01,1112115.0,7b52c77b-1a34-439d-a285-3a7c69cb5b1a,Ben Bennett,,,,,,
2,1714060,1609358,Beaux Soirs De Paris,1324142,73.0,France,1.0,1995-01-01,1122795.0,71b8451c-c10a-400e-9544-101f34ab2522,Soixante Étages,,,,,,
3,2265346,2042812,Le 1,2291833,240.0,[Worldwide],,2018-01-01,1720981.0,a69efb5f-0b28-4328-8ff0-44d8d6f39755,TedeuzeM,68613.0,Aix-en-Provence,3.0,,,
4,1895266,1751021,M2Music HitDisc Vol. 1,1,222.0,United States,1.0,2006-01-01,1.0,89ad4ac3-39f7-470e-963a-56509c546377,Various Artists,,,,,,


In [39]:
#Renaming columns:
df6.rename(columns={'area_id':'artist_area2','area_name':'artist_area_name2','code_type':'artist_code_type2'}, inplace=True)
df6.head()

Unnamed: 0,release_id,group_id,release_group,credit_id,release_area,release_area_name,release_code_type,release_year,artist_id,artist_mbid,artist_name_x,artist_area1,artist_area_name1,artist_code_type1,artist_area2,artist_area_name2,artist_code_type2
0,2163750,1962329,,2205562,240.0,[Worldwide],,2014-01-01,1654312.0,d10d6441-dcc1-4202-93bf-0c0acf72913a,Soul Glo,7707.0,Philadelphia,3.0,,,
1,1846605,1713833,,1503027,240.0,[Worldwide],,2015-01-01,1112115.0,7b52c77b-1a34-439d-a285-3a7c69cb5b1a,Ben Bennett,,,,,,
2,1714060,1609358,Beaux Soirs De Paris,1324142,73.0,France,1.0,1995-01-01,1122795.0,71b8451c-c10a-400e-9544-101f34ab2522,Soixante Étages,,,,,,
3,2265346,2042812,Le 1,2291833,240.0,[Worldwide],,2018-01-01,1720981.0,a69efb5f-0b28-4328-8ff0-44d8d6f39755,TedeuzeM,68613.0,Aix-en-Provence,3.0,,,
4,1895266,1751021,M2Music HitDisc Vol. 1,1,222.0,United States,1.0,2006-01-01,1.0,89ad4ac3-39f7-470e-963a-56509c546377,Various Artists,,,,,,


Now that we have the names of the different areas, let's check what kind of information we have in those columns.

As we said before, we prefer to keep the artist area preferably, as it represents more the real origin of the music.

1) Artist area 1:

In [40]:
df6.artist_area_name1.value_counts()

United States                           273415
United Kingdom                          133067
Japan                                    83908
Germany                                  67463
France                                   45927
Italy                                    27215
Sweden                                   24982
Canada                                   23619
Finland                                  21981
Netherlands                              18101
Australia                                17738
Spain                                    16090
Russia                                   13821
Brazil                                   11142
Belgium                                  10249
Austria                                   8975
Poland                                    8913
Norway                                    8775
South Korea                               8004
Denmark                                   7140
Argentina                                 6576
Greece       

In [41]:
df6.artist_code_type1.value_counts()

1.0    949862
3.0    112208
2.0     24835
4.0      3058
5.0      2429
7.0       254
6.0       114
Name: artist_code_type1, dtype: int64

As we can see, the majority of the artists' start area type we have is related to countries. This would be good for our visualization except for big countries like USA, Canada or Australia, for which we would prefer to retrieve at least the artist's state, to have a clearer view of the music's origin.

Also, we noticed that we have some area names that don't give us much information: "Worldwide", "Europe", "South Australia", etc.

2) Artist area 2:

In [42]:
df6.artist_area_name2.value_counts()

London                                 23087
Los Angeles                            14173
New York                               12434
Chicago                                 8353
Tokyo                                   7784
Paris                                   6395
Brooklyn                                6258
Berlin                                  5941
Philadelphia                            5274
Detroit                                 4659
San Francisco                           4574
Toronto                                 4068
Boston                                  3959
Seattle                                 3938
Seoul                                   3800
Stockholm                               3448
Melbourne                               3308
Hamburg                                 3259
United Kingdom                          3170
Montreal                                3106
Wien                                    2827
Sydney                                  2798
Buenos Air

In [43]:
df6.artist_code_type2.value_counts()

3.0    481180
2.0     61532
1.0     31001
5.0     25596
4.0     20562
7.0      2487
6.0       556
Name: artist_code_type2, dtype: int64

It looks like this second column could be giving us more detailed information about the artist's origin (only 31K rows have countries). 

We will keep the detail in "artist_area_name2" and "artist_code_type2" as the origin for the rows who have that information, and fill the other rows with "artist_area_name1" and "artist_code_type1"

In [44]:
#First, we rename our columns:
df6.rename(columns={'artist_area_name2':'origin_name','artist_code_type2':'origin_code'}, inplace=True)
df6.head()

Unnamed: 0,release_id,group_id,release_group,credit_id,release_area,release_area_name,release_code_type,release_year,artist_id,artist_mbid,artist_name_x,artist_area1,artist_area_name1,artist_code_type1,artist_area2,origin_name,origin_code
0,2163750,1962329,,2205562,240.0,[Worldwide],,2014-01-01,1654312.0,d10d6441-dcc1-4202-93bf-0c0acf72913a,Soul Glo,7707.0,Philadelphia,3.0,,,
1,1846605,1713833,,1503027,240.0,[Worldwide],,2015-01-01,1112115.0,7b52c77b-1a34-439d-a285-3a7c69cb5b1a,Ben Bennett,,,,,,
2,1714060,1609358,Beaux Soirs De Paris,1324142,73.0,France,1.0,1995-01-01,1122795.0,71b8451c-c10a-400e-9544-101f34ab2522,Soixante Étages,,,,,,
3,2265346,2042812,Le 1,2291833,240.0,[Worldwide],,2018-01-01,1720981.0,a69efb5f-0b28-4328-8ff0-44d8d6f39755,TedeuzeM,68613.0,Aix-en-Provence,3.0,,,
4,1895266,1751021,M2Music HitDisc Vol. 1,1,222.0,United States,1.0,2006-01-01,1.0,89ad4ac3-39f7-470e-963a-56509c546377,Various Artists,,,,,,


In [45]:
#And now we can fill the NaNs with the values in "artist_area_name1" and "artist_code_type1":
df6['origin_name'].fillna(df6['artist_area_name1'], inplace=True)
df6['origin_code'].fillna(df6['artist_code_type1'], inplace=True)
#We can also delete some columns that we don't need anymore:
df6.drop(labels=['artist_area1','artist_area_name1','artist_code_type1','artist_area2'], axis=1, inplace=True)

In [46]:
#Now, let's see what information we have for these new columns:
df6.origin_name.value_counts()

United States                       81824
United Kingdom                      52861
Japan                               42326
Germany                             34085
London                              26075
France                              26057
Los Angeles                         16492
Sweden                              15460
Italy                               15210
New York                            14259
Finland                             13767
Netherlands                         10886
Chicago                              9463
Spain                                9225
Canada                               8819
Australia                            8720
Russia                               8592
Tokyo                                8367
Berlin                               7558
Paris                                7546
Brooklyn                             7031
Poland                               6282
Philadelphia                         5823
Seattle                           

In [47]:
df6.origin_code.value_counts()

3.0    544417
1.0    433886
2.0     74659
5.0     26638
4.0     22369
7.0      2623
6.0       612
Name: origin_code, dtype: int64

In [48]:
#Now, let's see how many empty rows we have:
df6.isnull().sum(axis=0)

release_id                0
group_id                  0
release_group             4
credit_id                 0
release_area              0
release_area_name         0
release_code_type    241017
release_year              0
artist_id               151
artist_mbid             151
artist_name_x           155
origin_name          392302
origin_code          394410
dtype: int64

As we mentioned earlier, we want our visualization to be as detailed as possible, especially where the country of origin is very large, or has a big volume of releases.

What we will do is extract from our dataframe all the rows for which the "origin area" is either empty or represents a large country or a large volume.

Then, we will group by artist and try to find their origin in other sources of information.

In [49]:
#We create a new dataframe containing the target rows:

target = ['United States', 'United Kingdom', 'Japan', 'Germany', 'France', 'Canada', 'Australia', 'Russia',
          '[Worldwide]', 'Europe']
a = df6.loc[df6['origin_name'].isnull()]
b = df6[df6['origin_name'].isin(target)]

unknown_area = pd.concat([a, b], ignore_index=True)
unknown_area.origin_name.value_counts()

United States     81824
United Kingdom    52861
Japan             42326
Germany           34085
France            26057
Canada             8819
Australia          8720
Russia             8592
[Worldwide]        1822
Europe              300
Name: origin_name, dtype: int64

In [50]:
#How many unique artists are there with no area info?
unknown_area['artist_id'].nunique()

215736

In [51]:
#And how many releases are affected?
unknown_area['release_id'].nunique()

657708

So, according to the above line, we have 215.736 artists with unknown or vague origin. Let's have a closer look:

In [52]:
unknown_artist = unknown_area.groupby('artist_name_x').count().sort_values('release_id',ascending=False)
unknown_artist.head(1000)

Unnamed: 0_level_0,release_id,group_id,release_group,credit_id,release_area,release_area_name,release_code_type,release_year,artist_id,artist_mbid,origin_name,origin_code
artist_name_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Various Artists,134857,134857,134857,134857,134857,134857,122637,134857,134857,134857,10,10
[unknown],1363,1363,1363,1363,1363,1363,1319,1363,1363,1363,1363,0
Glee Cast,474,474,474,474,474,474,470,474,474,474,474,474
Duke Ellington & His Orchestra,288,288,288,288,288,288,271,288,288,288,288,288
[language instruction],278,278,278,278,278,278,189,278,278,278,0,0
Vitamin String Quartet,252,252,252,252,252,252,228,252,252,252,252,252
Senmuth,249,249,249,249,249,249,12,249,249,249,249,249
Die drei ???,244,244,244,244,244,244,243,244,244,244,244,244
モーニング娘。,219,219,219,219,219,219,219,219,219,219,219,219
Stefan Wolf,211,211,211,211,211,211,211,211,211,211,211,211


In [53]:
#From what we can see above, the category "Various Artists" has many releases assigned:
df6[df6['artist_name_x']=='Various Artists']

Unnamed: 0,release_id,group_id,release_group,credit_id,release_area,release_area_name,release_code_type,release_year,artist_id,artist_mbid,artist_name_x,origin_name,origin_code
4,1895266,1751021,M2Music HitDisc Vol. 1,1,222.0,United States,1.0,2006-01-01,1.0,89ad4ac3-39f7-470e-963a-56509c546377,Various Artists,,
26,356044,14028,!!!Here Ain't the Sonics!!!,1,222.0,United States,1.0,1993-01-01,1.0,89ad4ac3-39f7-470e-963a-56509c546377,Various Artists,,
41,1623578,1539062,!Go Hit,1298824,81.0,Germany,1.0,1998-01-01,1.0,89ad4ac3-39f7-470e-963a-56509c546377,Various Artists,,
44,475440,785494,"!JBL, Volume 2: PROGRESSIVE",1,194.0,Spain,1.0,2004-01-01,1.0,89ad4ac3-39f7-470e-963a-56509c546377,Various Artists,,
45,62055,28102,!K7,1,194.0,Spain,1.0,2000-01-01,1.0,89ad4ac3-39f7-470e-963a-56509c546377,Various Artists,,
46,1053661,1078102,!K7 2011 Sampler,1,240.0,[Worldwide],,2011-01-01,1.0,89ad4ac3-39f7-470e-963a-56509c546377,Various Artists,,
47,62061,147591,!K7 Compilation,1,81.0,Germany,1.0,2003-01-01,1.0,89ad4ac3-39f7-470e-963a-56509c546377,Various Artists,,
48,1447852,147591,!K7 Compilation,1298824,81.0,Germany,1.0,2003-01-01,1.0,89ad4ac3-39f7-470e-963a-56509c546377,Various Artists,,
49,1012140,955241,!K7 Spring 2002,1,221.0,United Kingdom,1.0,2002-01-01,1.0,89ad4ac3-39f7-470e-963a-56509c546377,Various Artists,,
50,2319247,2084339,!Kollections 02: Classics,1,240.0,[Worldwide],,2017-01-01,1.0,89ad4ac3-39f7-470e-963a-56509c546377,Various Artists,,


If we look in detail into these releases, we can see that most of them are music compilations (hence the generic category "Various Artists"). As they are music compilations, that means that the tracks included were originally released before by their genuine author, so we shouldn't take them into account (to avoid duplicates). Also, as we don't have an artist name for them, it will be impossible to retrieve the origin.

We will delete those rows from our datasets.

Let's analyze more in detail who are the rest of artists that have many releases, and decide what to do with them.

In [54]:
#Unknown artist:
df6[df6['artist_name_x']=='[unknown]']

Unnamed: 0,release_id,group_id,release_group,credit_id,release_area,release_area_name,release_code_type,release_year,artist_id,artist_mbid,artist_name_x,origin_name,origin_code
7287,546135,843736,100 Beste Kinderliedjes (disc 1),97546,150.0,Netherlands,1.0,1998-01-01,97546.0,125ec42a-7229-4250-afc5-e057484327fe,[unknown],[Worldwide],
8817,557295,852580,101 Children's Songs and Nursery Rhymes,97546,221.0,United Kingdom,1.0,2008-01-01,97546.0,125ec42a-7229-4250-afc5-e057484327fe,[unknown],[Worldwide],
9663,1232110,1222115,12 Chart Buster Hits: Volume 11,97546,221.0,United Kingdom,1.0,1974-01-01,97546.0,125ec42a-7229-4250-afc5-e057484327fe,[unknown],[Worldwide],
9664,665743,936259,12 Chartbuster Hits,97546,221.0,United Kingdom,1.0,1974-01-01,97546.0,125ec42a-7229-4250-afc5-e057484327fe,[unknown],[Worldwide],
9817,1235032,1224582,12 Makamda Yaylı Tanbur Taksimleri,97546,214.0,Turkey,1.0,2004-01-01,97546.0,125ec42a-7229-4250-afc5-e057484327fe,[unknown],[Worldwide],
9918,638247,914460,12 Tops: Volume 20,97546,221.0,United Kingdom,1.0,1974-01-01,97546.0,125ec42a-7229-4250-afc5-e057484327fe,[unknown],[Worldwide],
10852,1208592,1202487,14 Makamda Keman Taksimleri,97546,214.0,Turkey,1.0,2004-01-01,97546.0,125ec42a-7229-4250-afc5-e057484327fe,[unknown],[Worldwide],
11319,686485,952465,15 chansons et comptines pour votre bébé,97546,73.0,France,1.0,2004-01-01,97546.0,125ec42a-7229-4250-afc5-e057484327fe,[unknown],[Worldwide],
11705,1247647,1234682,16 Makamda Ud Taksimleri,97546,214.0,Turkey,1.0,2004-01-01,97546.0,125ec42a-7229-4250-afc5-e057484327fe,[unknown],[Worldwide],
16973,1015709,616974,20 Golden Guitar Hits,97546,221.0,United Kingdom,1.0,1988-01-01,97546.0,125ec42a-7229-4250-afc5-e057484327fe,[unknown],[Worldwide],


The category "unknown" seems to contain music compilations too.

In [55]:
#"Language instruction" artist:
df6[df6['artist_name_x']=='[language instruction]']

Unnamed: 0,release_id,group_id,release_group,credit_id,release_area,release_area_name,release_code_type,release_year,artist_id,artist_mbid,artist_name_x,origin_name,origin_code
11186,821781,1057780,15 Minute French,597116,222.0,United States,1.0,2005-01-01,597116.0,80a8851f-444c-4539-892b-ad2a49292aa9,[language instruction],,
11187,822699,1058470,15 Minute Italian,597116,222.0,United States,1.0,2006-01-01,597116.0,80a8851f-444c-4539-892b-ad2a49292aa9,[language instruction],,
30861,536732,836303,450 Nouveaux Exercices Grammaire Niveau Avancé,1964330,73.0,France,1.0,2005-01-01,597116.0,80a8851f-444c-4539-892b-ad2a49292aa9,[language instruction],,
51254,1921485,1771345,A break in/The Police,597116,240.0,[Worldwide],,2007-01-01,597116.0,80a8851f-444c-4539-892b-ad2a49292aa9,[language instruction],,
51811,1920797,1770754,A new telephone number,597116,240.0,[Worldwide],,2006-01-01,597116.0,80a8851f-444c-4539-892b-ad2a49292aa9,[language instruction],,
73778,2107126,1916950,All Audio Spanish - Basic-Intermediate Disc 1,597116,222.0,United States,1.0,1999-01-01,597116.0,80a8851f-444c-4539-892b-ad2a49292aa9,[language instruction],,
78295,685879,731447,All-Audio Spanish,597116,222.0,United States,1.0,1997-01-01,597116.0,80a8851f-444c-4539-892b-ad2a49292aa9,[language instruction],,
81802,1381788,1341586,Alter ego 2,1340398,73.0,France,1.0,2006-01-01,597116.0,80a8851f-444c-4539-892b-ad2a49292aa9,[language instruction],,
82122,1921562,1771391,Alternative forms of energy,597116,240.0,[Worldwide],,2011-01-01,597116.0,80a8851f-444c-4539-892b-ad2a49292aa9,[language instruction],,
100674,1921270,1771164,Apologies and excuses,597116,240.0,[Worldwide],,2007-01-01,597116.0,80a8851f-444c-4539-892b-ad2a49292aa9,[language instruction],,


As its name suggests, these releases are language courses recorded, so they are not music and they are out of our scope too.

The same would apply to the categories [nature sounds], [dialogue] or [christmas music].

We can now delete from our dataframes all these cathegories, and see what we have left.

In [56]:
#In our main dataframe:
df7 = df6.replace(['[nature sounds]','[dialogue]','[christmas music]', 'Various Artists','[unknown]','[language instruction]'], pd.np.nan).dropna(axis=0, subset=['artist_name_x'])

In [57]:
#In our unknown_area dataframe:
unknown_area = unknown_area.replace(['[nature sounds]','[dialogue]','[christmas music]', 'Various Artists','[unknown]','[language instruction]'], pd.np.nan).dropna(axis=0, subset=['artist_name_x'])

In [58]:
#So how many artists do we have to identify their origin for?
unknown_area['artist_id'].nunique()

215727

In [59]:
#We export the unique artists to a dataframe in order to retrieve the information later:
unknown_artist = unknown_area.drop_duplicates(subset='artist_id')
unknown_artist.drop(labels=['release_id','release_group','credit_id','release_area','release_area_name','release_code_type', 'release_year'], axis=1,inplace=True)
unknown_artist.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Unnamed: 0,group_id,artist_id,artist_mbid,artist_name_x,origin_name,origin_code
0,1713833,1112115.0,7b52c77b-1a34-439d-a285-3a7c69cb5b1a,Ben Bennett,,
1,1609358,1122795.0,71b8451c-c10a-400e-9544-101f34ab2522,Soixante Étages,,
3,1435283,1154943.0,2b0e7ee2-a1d0-45d9-9291-2d269bea9160,三田村管打団?,,
6,1596377,1304306.0,27f68a42-6f60-4628-9a01-47a62d7f7cdc,Gus Rachels,,
8,1438559,1170204.0,595b6f86-d893-4cee-8df2-104920ea2a37,SDR,,


# Data from the 1 million songs dataset

Between 2011 and 2012, there was a Music Information Retrieval challenge called "Million Song Dataset". The majority of the data contained was provided by The Echo Nest (today known as Spotify).

At the bottom of the following website, there are links to download the Dataset:

https://labrosa.ee.columbia.edu/millionsong/pages/getting-dataset

As we won't use the whole dataset (just some of the tables), you don't need to download them: they will be attached in the repo.

In [60]:
artists_locations = pd.read_csv('1M_songs/artist_location.csv',sep='<SEP>', header=None, engine='python')

In [61]:
artists_locations.columns = ['artist_id','lat','long','artist_name','location_name']
artists_locations.head()

Unnamed: 0,artist_id,lat,long,artist_name,location_name
0,ARZGXZG1187B9B56B6,-16.96595,-61.14804,Endless Blue,Santa Cruz
1,AR8K6F31187B99C2BC,46.44231,-93.36586,Go Fish,"Twin Cities, MN"
2,ARHJJ771187FB5B581,51.59678,-0.33556,Screaming Lord Sutch,"Harrow, Middlesex, England"
3,ARJ8YLL1187FB3CA93,40.69626,-73.83301,Morton Gould,"Richmond Hill, NY"
4,ARYBAGV11ECC836DAC,43.58828,-79.64372,Crash Parallel,Mississauga


In [62]:
len(artists_locations)

13850

In [63]:
tracks_metadata = pd.read_csv('1M_songs/track_metadata.csv',sep=',', header=0, engine='python', usecols=['artist_id','artist_mbid'])
tracks_metadata.head()

Unnamed: 0,artist_id,artist_mbid
0,ARYZTJS1187B98C555,357ff05d-848a-44cf-b608-cb34b5701ae5
1,ARMVN3U1187FB3A1EB,8d7ef530-a6fd-4f8f-b2e2-74aec765e0f9
2,ARGEKB01187FB50750,3d403d44-36ce-465c-ad43-ae877e65adc4
3,ARNWYLR1187B9B2F9C,12be7648-7094-495f-90e6-df4189d68615
4,AREQDTE1269FB37231,


In [64]:
tracks_metadata.dropna(subset=['artist_mbid'],axis=0, inplace=True)
a = pd.merge(artists_locations,tracks_metadata,how='left',on='artist_id')
a.dropna(subset=['artist_mbid'],axis=0, inplace=True)
a.head()

Unnamed: 0,artist_id,lat,long,artist_name,location_name,artist_mbid
0,ARZGXZG1187B9B56B6,-16.96595,-61.14804,Endless Blue,Santa Cruz,0bd9755c-c86d-431c-bc28-ef908b8a9821
1,ARZGXZG1187B9B56B6,-16.96595,-61.14804,Endless Blue,Santa Cruz,0bd9755c-c86d-431c-bc28-ef908b8a9821
2,AR8K6F31187B99C2BC,46.44231,-93.36586,Go Fish,"Twin Cities, MN",d4620364-82ec-4c34-9265-a2b72dfa8e3e
3,AR8K6F31187B99C2BC,46.44231,-93.36586,Go Fish,"Twin Cities, MN",d4620364-82ec-4c34-9265-a2b72dfa8e3e
4,AR8K6F31187B99C2BC,46.44231,-93.36586,Go Fish,"Twin Cities, MN",d4620364-82ec-4c34-9265-a2b72dfa8e3e


In [65]:
a.drop_duplicates(subset='artist_id', inplace=True)
a.drop(labels=['artist_id','lat','long'], axis=1, inplace=True)
a.head()

Unnamed: 0,artist_name,location_name,artist_mbid
0,Endless Blue,Santa Cruz,0bd9755c-c86d-431c-bc28-ef908b8a9821
2,Go Fish,"Twin Cities, MN",d4620364-82ec-4c34-9265-a2b72dfa8e3e
33,Screaming Lord Sutch,"Harrow, Middlesex, England",e1079a78-75d4-4a1a-aef1-0be051386598
64,Morton Gould,"Richmond Hill, NY",4db4e744-3007-4386-b87d-9653acfe0464
78,Crash Parallel,Mississauga,b0d85cf7-b73b-4a5d-bf31-a82493c3a8a8


Can we retrieve some of our "unknown_artist" locations with this dataframe?

In [66]:
b = pd.merge(unknown_artist,a,how='left',on='artist_mbid')
b.notnull().sum(axis=0)

group_id         215728
artist_id        215728
artist_mbid      215728
artist_name_x    215728
origin_name       75476
origin_code       75218
artist_name        3487
location_name      3486
dtype: int64

If we look at the field "location name", we can see that we have just retrieved the location for 3486 artists with the Million Song Dataset help.

In [67]:
#What kind on information is contained in that field?
b.location_name.value_counts()

California - LA                                               130
NY - New York City                                            113
California                                                     67
California - SF                                                66
New York, NY                                                   62
Texas                                                          56
London, England                                                47
Los Angeles, CA                                                46
GERMANY                                                        42
Great Britain / UK                                             42
Tennessee                                                      39
United States                                                  39
Chicago, IL                                                    36
New Jersey                                                     36
CANADA - Ontario                                               34
Washington

It seems pretty detailed so we can keep it for the rest of our project. Let's put that information in the column "origin name" and delete the useless columns:

In [68]:
b[['location_name']].replace(np.nan, 0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  method=method)


In [69]:
b.replace(np.nan, 0, inplace=True) #by doing this the following loop goes much faster
for i in tqdm(range(len(b))):
    if b['location_name'][i] != 0:
        b['origin_name'][i] = b['location_name'][i]
    else:
        pass

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
100%|██████████| 215728/215728 [00:57<00:00, 3771.93it/s] 


In [70]:
b.replace(0,pd.np.nan, inplace=True)
b.drop(labels=['artist_name','location_name'],axis=1, inplace=True)
b.head()

Unnamed: 0,group_id,artist_id,artist_mbid,artist_name_x,origin_name,origin_code
0,1713833,1112115.0,7b52c77b-1a34-439d-a285-3a7c69cb5b1a,Ben Bennett,,
1,1609358,1122795.0,71b8451c-c10a-400e-9544-101f34ab2522,Soixante Étages,,
2,1435283,1154943.0,2b0e7ee2-a1d0-45d9-9291-2d269bea9160,三田村管打団?,,
3,1596377,1304306.0,27f68a42-6f60-4628-9a01-47a62d7f7cdc,Gus Rachels,,
4,1438559,1170204.0,595b6f86-d893-4cee-8df2-104920ea2a37,SDR,,


In [71]:
b.head(1000)

Unnamed: 0,group_id,artist_id,artist_mbid,artist_name_x,origin_name,origin_code
0,1713833,1112115.0,7b52c77b-1a34-439d-a285-3a7c69cb5b1a,Ben Bennett,,
1,1609358,1122795.0,71b8451c-c10a-400e-9544-101f34ab2522,Soixante Étages,,
2,1435283,1154943.0,2b0e7ee2-a1d0-45d9-9291-2d269bea9160,三田村管打団?,,
3,1596377,1304306.0,27f68a42-6f60-4628-9a01-47a62d7f7cdc,Gus Rachels,,
4,1438559,1170204.0,595b6f86-d893-4cee-8df2-104920ea2a37,SDR,,
5,1334751,689714.0,66dfce65-8775-45be-9752-2e0ca77aad13,Aaron Roche,,
6,1205176,607730.0,29cd0eb8-b431-4b63-a536-8100c74f0ee0,Grand Groove Bunch,,
7,765607,504980.0,ee1e8412-a686-4e70-a44b-09ba68c2a5ca,Juxta Phona,,
8,683689,448248.0,c83caa4b-aa3c-47db-9a61-3be56942d980,Tama,,
9,747414,511943.0,81427c2e-7c74-42bc-bcbf-756536e52c30,Longina,,


In [72]:
b.isnull().sum(axis=0)

group_id              0
artist_id             0
artist_mbid           0
artist_name_x         0
origin_name      139015
origin_code      140510
dtype: int64

In [73]:
len(b)

215728

In [74]:
df7.head()

Unnamed: 0,release_id,group_id,release_group,credit_id,release_area,release_area_name,release_code_type,release_year,artist_id,artist_mbid,artist_name_x,origin_name,origin_code
0,2163750,1962329,,2205562,240.0,[Worldwide],,2014-01-01,1654312.0,d10d6441-dcc1-4202-93bf-0c0acf72913a,Soul Glo,Philadelphia,3.0
1,1846605,1713833,,1503027,240.0,[Worldwide],,2015-01-01,1112115.0,7b52c77b-1a34-439d-a285-3a7c69cb5b1a,Ben Bennett,,
2,1714060,1609358,Beaux Soirs De Paris,1324142,73.0,France,1.0,1995-01-01,1122795.0,71b8451c-c10a-400e-9544-101f34ab2522,Soixante Étages,,
3,2265346,2042812,Le 1,2291833,240.0,[Worldwide],,2018-01-01,1720981.0,a69efb5f-0b28-4328-8ff0-44d8d6f39755,TedeuzeM,Aix-en-Provence,3.0
5,1772538,1656147,devil jokes,1653884,240.0,[Worldwide],,2016-01-01,1363025.0,c941ad72-8b13-4940-8d99-0ed9becad2d7,yzome,Seattle,3.0


In [95]:
#We can now merge our dataframe with the information we retrieved from 1 Million Songs Dataset:
df8 = pd.merge(df7, b, how='left', on='artist_id')
df8.head()

Unnamed: 0,release_id,group_id_x,release_group,credit_id,release_area,release_area_name,release_code_type,release_year,artist_id,artist_mbid_x,artist_name_x_x,origin_name_x,origin_code_x,group_id_y,artist_mbid_y,artist_name_x_y,origin_name_y,origin_code_y
0,2163750,1962329,,2205562,240.0,[Worldwide],,2014-01-01,1654312.0,d10d6441-dcc1-4202-93bf-0c0acf72913a,Soul Glo,Philadelphia,3.0,,,,,
1,1846605,1713833,,1503027,240.0,[Worldwide],,2015-01-01,1112115.0,7b52c77b-1a34-439d-a285-3a7c69cb5b1a,Ben Bennett,,,1713833.0,7b52c77b-1a34-439d-a285-3a7c69cb5b1a,Ben Bennett,,
2,1714060,1609358,Beaux Soirs De Paris,1324142,73.0,France,1.0,1995-01-01,1122795.0,71b8451c-c10a-400e-9544-101f34ab2522,Soixante Étages,,,1609358.0,71b8451c-c10a-400e-9544-101f34ab2522,Soixante Étages,,
3,2265346,2042812,Le 1,2291833,240.0,[Worldwide],,2018-01-01,1720981.0,a69efb5f-0b28-4328-8ff0-44d8d6f39755,TedeuzeM,Aix-en-Provence,3.0,,,,,
4,1772538,1656147,devil jokes,1653884,240.0,[Worldwide],,2016-01-01,1363025.0,c941ad72-8b13-4940-8d99-0ed9becad2d7,yzome,Seattle,3.0,,,,,


In [96]:
#Now, fill the values in the column "origin name x" and "origin code x" with the ones from b:
df8['origin_name_x'].fillna(df8['origin_name_y'], inplace=True)
df8['origin_code_x'].fillna(df8['origin_code_y'], inplace=True)
#We can also delete the extra columns:
df8.drop(labels=['artist_mbid_y','artist_name_x_y','origin_name_y','origin_code_y'], axis=1, inplace=True)
#And change the name of some of the remaining columns:
df8.rename(columns={'artist_mbid_x':'artist_mbid','artist_name_x_x':'artist_name','origin_name_x':'origin_name','origin_code_x':'origin_code'}, inplace=True)
df8.head()

Unnamed: 0,release_id,group_id_x,release_group,credit_id,release_area,release_area_name,release_code_type,release_year,artist_id,artist_mbid,artist_name,origin_name,origin_code,group_id_y
0,2163750,1962329,,2205562,240.0,[Worldwide],,2014-01-01,1654312.0,d10d6441-dcc1-4202-93bf-0c0acf72913a,Soul Glo,Philadelphia,3.0,
1,1846605,1713833,,1503027,240.0,[Worldwide],,2015-01-01,1112115.0,7b52c77b-1a34-439d-a285-3a7c69cb5b1a,Ben Bennett,,,1713833.0
2,1714060,1609358,Beaux Soirs De Paris,1324142,73.0,France,1.0,1995-01-01,1122795.0,71b8451c-c10a-400e-9544-101f34ab2522,Soixante Étages,,,1609358.0
3,2265346,2042812,Le 1,2291833,240.0,[Worldwide],,2018-01-01,1720981.0,a69efb5f-0b28-4328-8ff0-44d8d6f39755,TedeuzeM,Aix-en-Provence,3.0,
4,1772538,1656147,devil jokes,1653884,240.0,[Worldwide],,2016-01-01,1363025.0,c941ad72-8b13-4940-8d99-0ed9becad2d7,yzome,Seattle,3.0,


In [77]:
#Now, let's see what information we have:
df8.origin_name.value_counts()

United States                               81909
United Kingdom                              52863
Japan                                       42326
Germany                                     34079
London                                      26113
France                                      26063
Los Angeles                                 16496
Sweden                                      15467
Italy                                       15210
New York                                    14284
Finland                                     13768
Netherlands                                 10894
Chicago                                      9463
Spain                                        9236
Canada                                       8820
Australia                                    8726
Russia                                       8592
Tokyo                                        8367
Berlin                                       7560
Paris                                        7546


In [78]:
#And the Null values:
df8.isnull().sum(axis=0)

release_id                0
group_id_x                0
release_group             7
credit_id                 0
release_area              0
release_area_name         0
release_code_type    228606
release_year              0
artist_id                 0
artist_mbid               0
artist_name               0
origin_name          252988
origin_code          257417
group_id_y           841903
dtype: int64

In [79]:
len(df8)

1362614

As we mentioned earlier, the information that we would ideally need to represent the origin of music is the artist origin. Unfortunately, we have 252.988 releases without that information.

In order to try to keep as much data as possible, we will fill these rows with the release area name: for these releases, instead of taking into account the artist/band origin, we will input the location where the release was produced.

In [97]:
df8['origin_code'].replace(np.nan, 0, inplace=True) 
df8['release_code_type'].replace(np.nan, 0, inplace=True)

for i in tqdm(range(len(df8))):
    if (df8['origin_code'][i] == 1 and df8['release_code_type'][i] > 1) or df8['origin_code'][i] == 0:
        df8['origin_name'][i] = df8['release_area_name'][i]
        df8['origin_code'][i] = df8['release_code_type'][i]       
        
    else:
        pass

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
100%|██████████| 1362614/1362614 [3:14:32<00:00, 116.74it/s]  


In [98]:
df8.head()

Unnamed: 0,release_id,group_id_x,release_group,credit_id,release_area,release_area_name,release_code_type,release_year,artist_id,artist_mbid,artist_name,origin_name,origin_code,group_id_y
0,2163750,1962329,,2205562,240.0,[Worldwide],0.0,2014-01-01,1654312.0,d10d6441-dcc1-4202-93bf-0c0acf72913a,Soul Glo,Philadelphia,3.0,
1,1846605,1713833,,1503027,240.0,[Worldwide],0.0,2015-01-01,1112115.0,7b52c77b-1a34-439d-a285-3a7c69cb5b1a,Ben Bennett,[Worldwide],0.0,1713833.0
2,1714060,1609358,Beaux Soirs De Paris,1324142,73.0,France,1.0,1995-01-01,1122795.0,71b8451c-c10a-400e-9544-101f34ab2522,Soixante Étages,France,1.0,1609358.0
3,2265346,2042812,Le 1,2291833,240.0,[Worldwide],0.0,2018-01-01,1720981.0,a69efb5f-0b28-4328-8ff0-44d8d6f39755,TedeuzeM,Aix-en-Provence,3.0,
4,1772538,1656147,devil jokes,1653884,240.0,[Worldwide],0.0,2016-01-01,1363025.0,c941ad72-8b13-4940-8d99-0ed9becad2d7,yzome,Seattle,3.0,


In [101]:
df8.origin_name.value_counts()

United States                          152550
United Kingdom                          78494
Japan                                   59394
Germany                                 58890
[Worldwide]                             48636
France                                  36161
London                                  26075
Sweden                                  19322
Italy                                   18680
Netherlands                             16545
Los Angeles                             16491
Finland                                 15073
New York                                14259
Canada                                  13883
Spain                                   12902
Australia                               12541
Russia                                  12351
Chicago                                  9463
Tokyo                                    8367
Poland                                   7826
Brazil                                   7749
Berlin                            

Now that we have input the release area information into the origin columns, we cna change back the zero values into NaNs, to be able to analyze them later:

In [102]:
df8['origin_code'].replace(0,np.nan, inplace=True) 
df8['release_code_type'].replace(0,np.nan, inplace=True)

In [103]:
#For the location types, how much volume do we have for each category?
df8.origin_code.value_counts()

1.0    641233
3.0    544416
2.0     74660
5.0     26636
4.0     22369
7.0      2623
6.0       612
Name: origin_code, dtype: int64

In [104]:
#How many Null values do we have now in the origin columns after the change?
df8.isnull().sum(axis=0)

release_id                0
group_id_x                0
release_group             7
credit_id                 0
release_area              0
release_area_name         0
release_code_type    228606
release_year              0
artist_id                 0
artist_mbid               0
artist_name               0
origin_name               0
origin_code           50065
group_id_y           841903
dtype: int64

In [105]:
#We have reduced the Null values in the origin quite a lot with this last change.
#As the above loop took more than 3h to complete, in order to save time we'll save our dataframe in a csv file:
df8.to_csv('dataframe.csv', sep='\t', encoding='utf-8', index=False)

## 5) Adding music genres to our dataframe

According to Musicbrainz's Genre description in https://wiki.musicbrainz.org/Genre:

"Genres are currently supported in MusicBrainz as part of the tag system.

Some tags (the ones in the genre list) are automatically read and presented as genres."

What we want for our visualization is to have, for each release, its main genre and eventually its subgenre. To do so, I have copied Musicbrainz's "genre list" into a csv file. There are 419 elements considered as genres by Musicbrainz but for our study we'll consider them as our subgenres.

I have manually classified all of these subgenres into 14 categories or "Main genres":

- Blues
- Classical
- Country
- Electronic
- Folk
- Heavy Metal
- Hip Hop
- Jazz
- Latin
- Pop
- Punk
- Rythm & Blues (R&B)
- Rock
- Others (This category contains all the subgenres I haven't been able to classify in the previous categories)

Of course, I wasn't familiar with all the genres appearing in the list so, in order to classify those, I looked at their definition in wikipedia and chose the best main genre for them. If no definition was provided by wikipedia, I searched for them in Google and listened to a representative song in order to make a decision.

In [106]:
#Let's see how the genres and subgenres look like:
genres = pd.read_csv('Musicbrainz/Tables_used/genres.csv',sep='\t', encoding='utf-8')
genres.head()

Unnamed: 0,Main_genre,Subgenre
0,Electronic,acid house
1,Electronic,acid jazz
2,Electronic,acid techno
3,Blues,acoustic blues
4,Rock,acoustic rock


As we read before, Musicbrainz's genre list (subgenre for us) is part of their tag system. Let's import the Musicbrainz's "tags" table and try to identify, from its elements, the ones that are genres.

In [107]:
tags = pd.read_csv('Musicbrainz/Tables_used/tags.txt',sep='\t', header=None, engine='c', usecols=[0,1])
tags.columns = ['tag_id','tag_name']
tags.head()

Unnamed: 0,tag_id,tag_name
0,95,finnish
1,23,slovak
2,801,iowa
3,4,groundbreaking
4,130,taiwanese


In [108]:
#How many tags are there?
tags['tag_id'].nunique()

86806

In [109]:
#What do the tags look like?
tags.tag_name.value_counts()

pacific                                                                                                                                                2
herb recordings                                                                                                                                        2
acid folk                                                                                                                                              2
zelimir kulisic                                                                                                                                        2
new age                                                                                                                                                2
west wales                                                                                                                                             2
new age music                                                                     

As we can see, the tags list contains the genres but also other (more subjective) expressions that some users have chosen as representative for the music entity. 

We will add columns to this tags dataframe to distinguish which of them are actually genres/subgenres:

In [110]:
#First, we change the Subgenre column name to tag_name in our genre file, to be able to join both dataframes:
genres.rename(columns={'Subgenre':'tag_name'}, inplace=True)
tags_genres = pd.merge(tags, genres, how='left', on='tag_name')
tags_genres.head()

Unnamed: 0,tag_id,tag_name,Main_genre
0,95,finnish,
1,23,slovak,
2,801,iowa,
3,4,groundbreaking,
4,130,taiwanese,


In [111]:
#Did we identify all the 419 genres in our dataframe?
pd.notna(tags_genres['Main_genre']).value_counts()

False    86380
True       426
Name: Main_genre, dtype: int64

In [112]:
#We retrieved 7 more, are there duplicates?
table = tags_genres.dropna(subset=['Main_genre'], axis=0).groupby('tag_name').count()
table[table['tag_id'] != 1]

Unnamed: 0_level_0,tag_id,Main_genre
tag_name,Unnamed: 1_level_1,Unnamed: 2_level_1
alternative rock,2,2
hard rock,2,2
hip hop,2,2
indie rock,2,2
new age,2,2
pop punk,2,2
pop rap,2,2
pop rock,2,2
progressive rock,2,2
psychedelic rock,2,2


It seems that we have 12 subgenres repeated twice in our tags_genres dataframe. That means they probably have 2 different tag_id's each:

In [113]:
list_duplicates = table[table['tag_id'] != 1].index.tolist()
tags_genres[tags_genres['tag_name'].isin(list_duplicates)]

Unnamed: 0,tag_id,tag_name,Main_genre
13595,1182,pop rap,Hip Hop
14217,133,punk rock,Rock
14238,235,hip hop,Hip Hop
14373,7,rock,Rock
15338,1100,pop punk,Punk
15380,618,new age,Others
15534,29,progressive rock,Rock
16100,284,indie rock,Rock
16528,271,hard rock,Rock
16616,1091,pop rock,Rock


Indeed, they have two tag_id each so we need to keep both tag_id's in order not to lose information later on.

Musicbrainz provides a table with all the release groups which have been tagged by their users. What we'll do next, is to retrieve those tags and select the ones that are part of the genres list.

In [114]:
release_groups = pd.read_csv('Musicbrainz/Tables_used/release_group.txt',sep='\t', header=None, engine='c', usecols=[0,1,2,3])
release_groups.columns = ['group_id','group_mbid','release_group_name','artist_credit']
release_groups.head()

Unnamed: 0,group_id,group_mbid,release_group_name,artist_credit
0,1964563,f59da930-70ba-4992-a346-7ed2d8e3cda8,Wande,627364
1,12,2b10653e-655d-34fe-9db4-77242d817a17,Chore of Enchantment,12
2,13,0eac6659-d590-3eb7-8c13-ed8b3fdf4ef7,The Inevitable,11
3,28,c554da1a-c1aa-30c3-b0bb-44b1b837de33,Piece and Love,26
4,60,06729175-db17-3443-add7-921739a92762,Ultimate Alternative Wavers,44


In [115]:
release_groups['group_id'].nunique()

1745126

In [116]:
len(release_groups)

1745126

In [117]:
group_tag = pd.read_csv('Musicbrainz/Tables_used/release_group_tag.txt',sep='\t', header=None, engine='c', usecols=[0,1,2])
group_tag.columns = ['group_id','tag_id','tag_counts']
group_tag.head()

Unnamed: 0,group_id,tag_id,tag_counts
0,93688,150,1
1,906692,1371,1
2,906692,6948,1
3,617615,11,1
4,617615,545,1


In [118]:
#We can now merge the release groups with the tag ids and tag counts:
Table = pd.merge(release_groups, group_tag, how='left', on='group_id')
Table.head()

Unnamed: 0,group_id,group_mbid,release_group_name,artist_credit,tag_id,tag_counts
0,1964563,f59da930-70ba-4992-a346-7ed2d8e3cda8,Wande,627364,,
1,12,2b10653e-655d-34fe-9db4-77242d817a17,Chore of Enchantment,12,41017.0,2.0
2,13,0eac6659-d590-3eb7-8c13-ed8b3fdf4ef7,The Inevitable,11,1053.0,2.0
3,13,0eac6659-d590-3eb7-8c13-ed8b3fdf4ef7,The Inevitable,11,1230.0,1.0
4,13,0eac6659-d590-3eb7-8c13-ed8b3fdf4ef7,The Inevitable,11,71.0,3.0


In [119]:
#And finally have our release groups associated with their genres:
release_group_genre = pd.merge(Table, tags_genres, how='left', on='tag_id')
release_group_genre.head()

Unnamed: 0,group_id,group_mbid,release_group_name,artist_credit,tag_id,tag_counts,tag_name,Main_genre
0,1964563,f59da930-70ba-4992-a346-7ed2d8e3cda8,Wande,627364,,,,
1,12,2b10653e-655d-34fe-9db4-77242d817a17,Chore of Enchantment,12,41017.0,2.0,alternative/indie rock,
2,13,0eac6659-d590-3eb7-8c13-ed8b3fdf4ef7,The Inevitable,11,1053.0,2.0,swing,Jazz
3,13,0eac6659-d590-3eb7-8c13-ed8b3fdf4ef7,The Inevitable,11,1230.0,1.0,dixieland,
4,13,0eac6659-d590-3eb7-8c13-ed8b3fdf4ef7,The Inevitable,11,71.0,3.0,jazz,Jazz


Let's stop here for a while and check one of the releases that has several genre tags associated. Let's do this with one of the most popular releases of all times: the album "Thriller", by the king of Pop music: Michael Jackson. 

In [120]:
release_group_genre[release_group_genre['group_mbid']=='f32fab67-77dd-3937-addc-9062e28e4c37']

Unnamed: 0,group_id,group_mbid,release_group_name,artist_credit,tag_id,tag_counts,tag_name,Main_genre
1429052,61656,f32fab67-77dd-3937-addc-9062e28e4c37,Thriller,519,7282.0,2.0,vendu,
1429053,61656,f32fab67-77dd-3937-addc-9062e28e4c37,Thriller,519,642.0,2.0,disco,Electronic
1429054,61656,f32fab67-77dd-3937-addc-9062e28e4c37,Thriller,519,7935.0,1.0,discothèque,
1429055,61656,f32fab67-77dd-3937-addc-9062e28e4c37,Thriller,519,24521.0,0.0,80 s and 90 s pop,
1429056,61656,f32fab67-77dd-3937-addc-9062e28e4c37,Thriller,519,1060.0,1.0,dance-pop,Electronic
1429057,61656,f32fab67-77dd-3937-addc-9062e28e4c37,Thriller,519,303.0,3.0,funk,Others
1429058,61656,f32fab67-77dd-3937-addc-9062e28e4c37,Thriller,519,11.0,0.0,electronic,Electronic
1429059,61656,f32fab67-77dd-3937-addc-9062e28e4c37,Thriller,519,41021.0,2.0,club/dance,
1429060,61656,f32fab67-77dd-3937-addc-9062e28e4c37,Thriller,519,76.0,1.0,dance,Electronic
1429061,61656,f32fab67-77dd-3937-addc-9062e28e4c37,Thriller,519,41027.0,3.0,contemporary r&b,R&B


As we can see, "Pop" is the most used tag for this group so we should keep it as the release's genre.

As music genre is a very subjective feature, in order to be as "objective" as possible, we'll take into consideration the majority of the votes to chose the subgenre and main genre of each release group.

To do so, we will sort the release_group_genre dataframe by number of counts and keep the top tag for each release group.

In [121]:
#We sort by group_id and tag_counts:
release_group_genre.sort_values(['group_id','tag_counts'], ascending=[True,False], inplace=True)
release_group_genre.head()

Unnamed: 0,group_id,group_mbid,release_group_name,artist_credit,tag_id,tag_counts,tag_name,Main_genre
312152,2,e8bee759-9efc-35c2-93d7-09ace9123467,Eclectic Electric,1,1186.0,2.0,acid rap,
312153,2,e8bee759-9efc-35c2-93d7-09ace9123467,Eclectic Electric,1,92310.0,1.0,oldest release group #2,
737291,4,8b6f133a-2fdf-3cc2-b84d-1c889adc0939,Blue Lines,4,1498.0,7.0,trip hop,Hip Hop
737302,4,8b6f133a-2fdf-3cc2-b84d-1c889adc0939,Blue Lines,4,12.0,6.0,downtempo,Electronic
737293,4,8b6f133a-2fdf-3cc2-b84d-1c889adc0939,Blue Lines,4,11.0,5.0,electronic,Electronic


In [122]:
#And now we can drop the duplicate group_ids, keeping the top tags:
release_group_genre.drop_duplicates(subset=['group_id'],keep='first', inplace=True)
release_group_genre.head(20)

Unnamed: 0,group_id,group_mbid,release_group_name,artist_credit,tag_id,tag_counts,tag_name,Main_genre
312152,2,e8bee759-9efc-35c2-93d7-09ace9123467,Eclectic Electric,1,1186.0,2.0,acid rap,
737291,4,8b6f133a-2fdf-3cc2-b84d-1c889adc0939,Blue Lines,4,1498.0,7.0,trip hop,Hip Hop
1756939,11,c6fe6a2b-0ed6-3d2c-b9ce-ddd5421a3452,Hot,11,71.0,3.0,jazz,Jazz
1,12,2b10653e-655d-34fe-9db4-77242d817a17,Chore of Enchantment,12,41017.0,2.0,alternative/indie rock,
4,13,0eac6659-d590-3eb7-8c13-ed8b3fdf4ef7,The Inevitable,11,71.0,3.0,jazz,Jazz
1206877,21,bdd77e94-7917-3aa4-97de-501c53b1d343,The Best of the Art of Noise: Art Works,20,11.0,2.0,electronic,Electronic
312321,24,555dce82-41bf-397a-a487-997b54bee515,Emusic: The Extreme Collection,1,,,,
5,28,c554da1a-c1aa-30c3-b0bb-44b1b837de33,Piece and Love,26,,,,
271,30,2c644807-3b5d-39d4-8c65-dec603bf3f3a,Let It Be,28,41017.0,1.0,alternative/indie rock,
1089677,37,857c3dff-efec-387e-8e07-5b6bdb746afa,Liz Story,1097111,507.0,1.0,piano,


What we want now is to combine our main dataframe (which we exported at the end of part 4 of this notebook) with this new genre information we just retrieved:

In [123]:
#We open our main dataframe:
dataframe = pd.read_csv('dataframe.csv',sep='\t', encoding='utf-8')
dataframe.head()

Unnamed: 0,release_id,group_id_x,release_group,credit_id,release_area,release_area_name,release_code_type,release_year,artist_id,artist_mbid,artist_name,origin_name,origin_code,group_id_y
0,2163750,1962329,,2205562,240.0,[Worldwide],,2014-01-01,1654312.0,d10d6441-dcc1-4202-93bf-0c0acf72913a,Soul Glo,Philadelphia,3.0,
1,1846605,1713833,,1503027,240.0,[Worldwide],,2015-01-01,1112115.0,7b52c77b-1a34-439d-a285-3a7c69cb5b1a,Ben Bennett,[Worldwide],,1713833.0
2,1714060,1609358,Beaux Soirs De Paris,1324142,73.0,France,1.0,1995-01-01,1122795.0,71b8451c-c10a-400e-9544-101f34ab2522,Soixante Étages,France,1.0,1609358.0
3,2265346,2042812,Le 1,2291833,240.0,[Worldwide],,2018-01-01,1720981.0,a69efb5f-0b28-4328-8ff0-44d8d6f39755,TedeuzeM,Aix-en-Provence,3.0,
4,1772538,1656147,devil jokes,1653884,240.0,[Worldwide],,2016-01-01,1363025.0,c941ad72-8b13-4940-8d99-0ed9becad2d7,yzome,Seattle,3.0,


# SEGUIR DESDE AQUI

If we look at the above table, the first line for instance, has the tag "acid rap" associated but this tag isn't considered as a genre by Musicbrainz so, if we decide not to take into consideration this information, we would lose many items for our visualization:

In [46]:
release_group_genre['group_id'].nunique()

1745126

In [56]:
#How many release groups did we retrieved the Main genre for?
pd.notna(release_group_genre['Main_genre']).value_counts()

False    1712671
True      323011
Name: Main_genre, dtype: int64

In [60]:
#Wah are the tag names that don't have a genre?
genreless = release_group_genre[pd.notna(release_group_genre.tag_name) & pd.isnull(release_group_genre.Main_genre)]
genreless.head()

Unnamed: 0,group_id,group_mbid,release_group_name,artist_credit,tag_id,tag_counts,tag_name,Main_genre
1,12,2b10653e-655d-34fe-9db4-77242d817a17,Chore of Enchantment,12,41017.0,2.0,alternative/indie rock,
3,13,0eac6659-d590-3eb7-8c13-ed8b3fdf4ef7,The Inevitable,11,1230.0,1.0,dixieland,
7,60,06729175-db17-3443-add7-921739a92762,Ultimate Alternative Wavers,44,23174.0,1.0,rock_indie rock,
16,454,c997762b-9719-3db3-8681-4e8731324cdf,Solitude/Solitaire,327,5949.0,0.0,hi nrg,
19,454,c997762b-9719-3db3-8681-4e8731324cdf,Solitude/Solitaire,327,1085.0,2.0,adult contemporary,


In [61]:
len(genreless)

154656

As we can see above, some of the tags that don't have a Main genre associated could be easily classified (for instance: "alternative/indie rock", or "rock_indie rock"). 

What I will do now is to group this genreless subset by tag_name, export it to a csv file and manually classify the tags I can, using the same criteria I did before.

In [74]:
genreless_grouped = genreless.groupby('tag_name').count().reset_index()[['tag_name','tag_counts','Main_genre']].copy()
genreless_grouped.to_csv('genreless.csv', sep='\t', index=None, encoding='utf-8')

In [48]:
type(release_genre[release_genre['release_id']==8340])

pandas.core.frame.DataFrame

In [24]:
tags[tags['tag_id']==7]

Unnamed: 0,tag_id,tag_name,b
14373,7,rock,65222


In [9]:
tags[tags['tag_id']==40899]

Unnamed: 0,tag_id,tag_name,b
19638,40899,rock & roll,0


In [17]:
tags[tags['tag_id']==714]

Unnamed: 0,tag_id,tag_name,b
16527,714,britpop,236


In [18]:
artist_tag = pd.read_csv('Musicbrainz/Tables_used/artist_tag.txt',sep='\t', header=None, engine='c', usecols=[0,1,2])
artist_tag.columns = ['artist_id','tag_id','tag_count']
artist_tag.head()

Unnamed: 0,artist_id,tag_id,tag_count
0,468800,29,2
1,522545,63294,1
2,31390,173,1
3,108404,271,1
4,108404,7,1
