# DATA CLEANING

In [80]:
import pandas as pd
import numpy as np
import string
import re

In [2]:
#Importing our main dataframe:
df = pd.read_csv('main_dataframe.csv', sep=',', header=0, encoding='utf-8')
df.head()

Unnamed: 0,release_id,group_id,release_group,credit_id,release_area,release_area_name,release_code_type,release_year,artist_id,artist_mbid,artist_name,origin_name,origin_code,tag_id,tag_name,main_genre
0,2163750,1962329,,2205562,240.0,[Worldwide],,2014-01-01,1654312.0,d10d6441-dcc1-4202-93bf-0c0acf72913a,Soul Glo,Philadelphia,3.0,,,
1,1846605,1713833,,1503027,240.0,[Worldwide],,2015-01-01,1112115.0,7b52c77b-1a34-439d-a285-3a7c69cb5b1a,Ben Bennett,,,,,
2,1714060,1609358,Beaux Soirs De Paris,1324142,73.0,France,1.0,1995-01-01,1122795.0,71b8451c-c10a-400e-9544-101f34ab2522,Soixante Étages,France,1.0,,,
3,2265346,2042812,Le 1,2291833,240.0,[Worldwide],,2018-01-01,1720981.0,a69efb5f-0b28-4328-8ff0-44d8d6f39755,TedeuzeM,Aix-en-Provence,3.0,,,
4,1772538,1656147,devil jokes,1653884,240.0,[Worldwide],,2016-01-01,1363025.0,c941ad72-8b13-4940-8d99-0ed9becad2d7,yzome,Seattle,3.0,,,


## 1) Genre information

In [3]:
#Wah are the tag names that don't have a Main genre for?
genreless = df[pd.notna(df.tag_name) & pd.isnull(df.main_genre)]
genreless.head(20)

Unnamed: 0,release_id,group_id,release_group,credit_id,release_area,release_area_name,release_code_type,release_year,artist_id,artist_mbid,artist_name,origin_name,origin_code,tag_id,tag_name,main_genre
5,1494610,1435283,!,1367808,107.0,Japan,1.0,2006-01-01,1154943.0,2b0e7ee2-a1d0-45d9-9291-2d269bea9160,三田村管打団?,Japan,1.0,71814.0,likedis auto,
16,1753475,1640848,! [雨だれ],1360282,107.0,Japan,1.0,2015-01-01,1149234.0,fa2446a5-8f31-4736-81a6-0732ca685d3e,Kidori Kidori,Japan,1.0,32232.0,likedis auto,
18,1494611,1435284,!!,1367808,107.0,Japan,1.0,2009-01-01,1154943.0,2b0e7ee2-a1d0-45d9-9291-2d269bea9160,三田村管打団?,Japan,1.0,71814.0,likedis auto,
20,1494612,1435285,!!!,1367808,107.0,Japan,1.0,2013-01-01,1154943.0,2b0e7ee2-a1d0-45d9-9291-2d269bea9160,三田村管打団?,Japan,1.0,71814.0,likedis auto,
23,2358112,2115512,!!!!YEAH!!!!,857260,107.0,Japan,1.0,2018-01-01,826465.0,eae5c7a7-5f94-4f1d-a2c4-da8db64dc832,グッドモーニングアメリカ,Hachiōji,3.0,1501.0,rock music,
26,112533,227620,!!Going Places!!,1722910,222.0,United States,1.0,1965-01-01,34720.0,012ef4f9-2d35-4000-86e5-bc761f87dab9,Herb Alpert & The Tijuana Brass,Los Angeles,3.0,32232.0,easylistening,
28,1497946,1438559,!(ビックラゲーション),1391421,107.0,Japan,1.0,2010-01-01,1170204.0,595b6f86-d893-4cee-8df2-104920ea2a37,SDR,Japan,1.0,71814.0,likedis auto,
30,1958907,1800619,!..تهموني,1952741,220.0,United Arab Emirates,1.0,2002-01-01,43358.0,2674597f-6c40-47cc-b980-67f94725f7a7,نجوى كرم,Zahlé,3.0,30022.0,pop music,
31,1958915,1209641,!..شو مغيره,1952741,220.0,United Arab Emirates,1.0,2004-01-01,43358.0,2674597f-6c40-47cc-b980-67f94725f7a7,نجوى كرم,Zahlé,3.0,30022.0,pop music,
40,377904,633567,!Ich kann,12087,81.0,Germany,1.0,2006-01-01,12087.0,3531715d-fee3-4c5b-82ff-5d85865b598b,Reinhard Mey,Berlin,3.0,37498.0,folk music,


As we can see above, some of the tags that don't have a Main genre associated could be easily classified (for instance: "pop music", or "experimental hip-hop"). 

Those tag names are not considered as a subgenre by Musicbrainz but they do provide us with some information about the release main genre. We will consider them as subgenre and identify their main genre.

What I will do now is to retrieve more information about these genreless tag_names in order to be able to classiffy them:

In [4]:
genreless_grouped = genreless.groupby('tag_name').count()[['release_id']].copy().reset_index()

In [5]:
genreless_grouped.sort_values(by=['release_id'], ascending=False, inplace=True)
genreless_grouped.rename(columns={'release_id':'release_count'}, inplace=True)
genreless_grouped.head()

Unnamed: 0,tag_name,release_count
6583,pop music,28121
5185,likedis auto,17027
7175,rock music,10356
370,J-pop,9363
4182,hip hop music,4834


In [6]:
#We create new columns to retrieve some information about the content of each tag:
genreless_grouped['Blues'] = np.nan
genreless_grouped['Classical'] = np.nan
genreless_grouped['Country'] = np.nan
genreless_grouped['Electronic'] = np.nan
genreless_grouped['Folk'] = np.nan
genreless_grouped['Heavy_Metal'] = np.nan
genreless_grouped['Hip_Hop'] = np.nan
genreless_grouped['Jazz'] = np.nan
genreless_grouped['Latin'] = np.nan
genreless_grouped['Pop'] = np.nan
genreless_grouped['Punk'] = np.nan
genreless_grouped['RB'] = np.nan
genreless_grouped['Rock'] = np.nan
genreless_grouped.head()

Unnamed: 0,tag_name,release_count,Blues,Classical,Country,Electronic,Folk,Heavy_Metal,Hip_Hop,Jazz,Latin,Pop,Punk,RB,Rock
6583,pop music,28121,,,,,,,,,,,,,
5185,likedis auto,17027,,,,,,,,,,,,,
7175,rock music,10356,,,,,,,,,,,,,
370,J-pop,9363,,,,,,,,,,,,,
4182,hip hop music,4834,,,,,,,,,,,,,


In [7]:
#We create a column tag_name_clean where the text is formatted (remove punctuation, concatenate all words):
punctuation = ['#','!','?','(',')','*','-','%',' ',',',"'",'.','"','/','<','>',':']
genreless_grouped['tag_name_clean'] = genreless_grouped['tag_name'].apply(lambda x: ''.join(c for c in x if c not in punctuation))
genreless_grouped.head()

Unnamed: 0,tag_name,release_count,Blues,Classical,Country,Electronic,Folk,Heavy_Metal,Hip_Hop,Jazz,Latin,Pop,Punk,RB,Rock,tag_name_clean
6583,pop music,28121,,,,,,,,,,,,,,popmusic
5185,likedis auto,17027,,,,,,,,,,,,,,likedisauto
7175,rock music,10356,,,,,,,,,,,,,,rockmusic
370,J-pop,9363,,,,,,,,,,,,,,Jpop
4182,hip hop music,4834,,,,,,,,,,,,,,hiphopmusic


In [8]:
#We create a pattern of words that could be associated with each genre:
Blues = 'blues'
Classical = 'classical|symphony|orchestra|stringquartet'
Country = 'country'
Electronic = 'electronic|electr|dance|house'
Folk = 'folk'
Heavy_Metal = 'metal'
Hip_Hop = 'hiphop|rap'
Jazz = 'jazz|jamband'
Latin = 'latin'
Pop = 'pop'
Punk = 'punk'
RB = 'rhythmandblues|rythmandblues|R&B'
Rock = 'rock'

## INTENTAR MEJORAR LA SIGUIENTE CELDA CON BUCLE FOR:

genres = {'Blues':Blues, 'Classical':Classical,'Country':Country,'Electronic':Electronic,'Folk':Folk,'Heavy_Metal':Heavy_Metal, 'Hip_Hop':Hip_Hop,'Jazz':Jazz, 'Latin':Latin, 'Pop':Pop, 'Punk':Punk, 'RB':RB, 'Rock':Rock}
columns = genreless_grouped.columns[2:15]

for i in columns:
    for genre in genres:
        if str(i) == str(genre):
            print(i)
            print(str(genre))
            print(genreless_grouped.i)
            print(genre)
            print(genres[genre])
            genreless_grouped.i = np.where(genreless_grouped.tag_name_clean.str.contains(str(genres[genre])), 1, np.nan)

genreless_grouped.head(100)




genres = {'Blues':Blues, 'Classical':Classical,'Country':Country,'Electronic':Electronic,'Folk':Folk,'Heavy_Metal':Heavy_Metal, 'Hip_Hop':Hip_Hop,'Jazz':Jazz, 'Latin':Latin, 'Pop':Pop, 'Punk':Punk, 'RB':RB, 'Rock':Rock}
columns = genreless_grouped.columns[2:15]

for i in columns:
    for genre in genres:
        if str(i) == str(genre):
            a = genres[genre]
            print(a)
            print(i)
            print(str(genre))
            print(genreless_grouped.i)
            print(genre)
            print(genres[genre])
            genreless_grouped.i = np.where(genreless_grouped.tag_name_clean.str.contains(a), 1, np.nan)

genreless_grouped.head(100)

In [9]:
#And now we fill each genre column by searching if the column tag_name_clean contains the patterns:
genreless_grouped.Blues = np.where(genreless_grouped.tag_name_clean.str.contains(Blues), 'Blues', np.nan)
genreless_grouped.Classical = np.where(genreless_grouped.tag_name_clean.str.contains(Classical), 'Classical', np.nan)
genreless_grouped.Country = np.where(genreless_grouped.tag_name_clean.str.contains(Country), 'Country', np.nan)
genreless_grouped.Electronic = np.where(genreless_grouped.tag_name_clean.str.contains(Electronic), 'Electronic', np.nan)
genreless_grouped.Folk = np.where(genreless_grouped.tag_name_clean.str.contains(Folk), 'Folk', np.nan)
genreless_grouped.Heavy_Metal = np.where(genreless_grouped.tag_name_clean.str.contains(Heavy_Metal), 'Heavy_Metal', np.nan)
genreless_grouped.Hip_Hop = np.where(genreless_grouped.tag_name_clean.str.contains(Hip_Hop), 'Hip_Hop', np.nan)
genreless_grouped.Jazz = np.where(genreless_grouped.tag_name_clean.str.contains(Jazz), 'Jazz', np.nan)
genreless_grouped.Latin = np.where(genreless_grouped.tag_name_clean.str.contains(Latin), 'Latin', np.nan)
genreless_grouped.Pop = np.where(genreless_grouped.tag_name_clean.str.contains(Pop), 'Pop', np.nan)
genreless_grouped.Punk = np.where(genreless_grouped.tag_name_clean.str.contains(Punk), 'Punk', np.nan)
genreless_grouped.RB = np.where(genreless_grouped.tag_name_clean.str.contains(RB), 'RB', np.nan)
genreless_grouped.Rock = np.where(genreless_grouped.tag_name_clean.str.contains(Rock), 'Rock', np.nan)

In [11]:
genreless_grouped.replace('nan', np.nan, inplace=True)
genreless_grouped.head()

Unnamed: 0,tag_name,release_count,Blues,Classical,Country,Electronic,Folk,Heavy_Metal,Hip_Hop,Jazz,Latin,Pop,Punk,RB,Rock,tag_name_clean
6583,pop music,28121,,,,,,,,,,Pop,,,,popmusic
5185,likedis auto,17027,,,,,,,,,,,,,,likedisauto
7175,rock music,10356,,,,,,,,,,,,,Rock,rockmusic
370,J-pop,9363,,,,,,,,,,Pop,,,,Jpop
4182,hip hop music,4834,,,,,,,Hip_Hop,,,,,,,hiphopmusic


What we want now, is to identify the tag_name which can contain more than 1 Main genre (e.g: "poprock"), and decide which is the main genre for them.

In [57]:
#We create a column "genre_counts" that sums the number of genres identified for each tag_name:
genreless_grouped['genre_counts'] = genreless_grouped.iloc[:,2:15].notnull().sum(axis=1)
genreless_grouped.head()

Unnamed: 0,tag_name,release_count,Blues,Classical,Country,Electronic,Folk,Heavy_Metal,Hip_Hop,Jazz,Latin,Pop,Punk,RB,Rock,tag_name_clean,genre_counts
6583,pop music,28121,,,,,,,,,,Pop,,,,popmusic,1
5185,likedis auto,17027,,,,,,,,,,,,,,likedisauto,0
7175,rock music,10356,,,,,,,,,,,,,Rock,rockmusic,1
370,J-pop,9363,,,,,,,,,,Pop,,,,Jpop,1
4182,hip hop music,4834,,,,,,,Hip_Hop,,,,,,,hiphopmusic,1


In [100]:
genreless_grouped['main_genre'] = np.nan
for i in range(len(genreless_grouped)):
    if genreless_grouped['genre_counts'][i] != 0:
        a = genreless_grouped.loc[i,"Blues":"Rock"].notna()
        z = [i for i, x in enumerate(a) if x]
        genreless_grouped['main_genre'][i] = str(a[z].index.values)
    else:
        pass

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [111]:
#We can now get rid of the intermediary columns:
genreless_grouped.drop(labels=['Blues', 'Classical', 'Country',
       'Electronic', 'Folk', 'Heavy_Metal', 'Hip_Hop', 'Jazz', 'Latin', 'Pop',
       'Punk', 'RB', 'Rock', 'tag_name_clean'], axis=1, inplace=True)
genreless_grouped.head()

Unnamed: 0,tag_name,release_count,genre_counts,main_genre
6583,pop music,28121,1,['Pop']
5185,likedis auto,17027,0,
7175,rock music,10356,1,['Rock']
370,J-pop,9363,1,['Pop']
4182,hip hop music,4834,1,['Hip_Hop']


What we want now is to analyze the cases where there is more than one main genre identified:

In [113]:
genreless_grouped.genre_counts.value_counts()

0    7293
1    1530
2     229
3      23
4       4
5       1
Name: genre_counts, dtype: int64

In [134]:
multiple_genre = genreless_grouped[genreless_grouped['genre_counts'] >1]

In [138]:
table = pd.pivot_table(multiple_genre, values=['release_count'], index=['main_genre'], aggfunc=np.sum)
table.sort_values(by=['release_count'], ascending=False, inplace=True)
table.head(1000)

Unnamed: 0_level_0,release_count
main_genre,Unnamed: 1_level_1
['Blues' 'RB'],2844
['Pop' 'Rock'],1856
['Blues' 'Jazz'],1569
['Electronic' 'Hip_Hop'],365
['Electronic' 'Pop'],185
['Hip_Hop' 'Pop'],105
['Folk' 'Rock'],87
['Classical' 'Electronic'],65
['Folk' 'Pop'],53
['Heavy_Metal' 'Hip_Hop'],47
