# Creando el grafo

Hemos leído ya el repertorio de la pagina https://www.allmusic.com/ para la lista de artistas que disponemos. Estos datos contienen 4 campos:

- Identificador del artista
- Nombre del artista
- Identificador del artista con el que comparte una cancion
- Cancion que comparten

El objetivo es obtener un documento en el cual tengamos unicamente los nombres de los artistas que son los extremos de la arista y el peso de la arista, del tipo.

|Artist|Collaborator|Number collaborations|
|--|--|--|
|Metallica|ACDC|6|

In [15]:
import pandas as pd
import unidecode
import numpy as np

In [16]:
# Primero leo el repertorio

repertorio = pd.read_json('./Data-Extraction/50_First_Artists.json',orient='records', typ = 'frame', lines=True)

# Todas las canciones a minúscula
repertorio['song'] = repertorio['song'].str.lower()

repertorio['song'] = repertorio['song'].str.normalize('NFD').str.encode('ascii', errors='ignore').str.decode('utf-8')
repertorio.head(5)

Unnamed: 0,artist_id,artist_name,channel_id,channel_name,feat_artist_id,feat_artist_name,song,subs_count,view_count
0,mn0000026061,Ana Belén,UCIbghfR6TYMvvncB1OcglDg,Ana Belén - Topic,mn0000805858,Estopa,y en todos los caminos,1933,525186
1,mn0000026061,Ana Belén,UCIbghfR6TYMvvncB1OcglDg,Ana Belén - Topic,mn0000805858,Estopa,y en todos los caminos,1933,525186
2,mn0000026061,Ana Belén,UCIbghfR6TYMvvncB1OcglDg,Ana Belén - Topic,mn0000805858,Estopa,y en todos los caminos,1933,525186
3,mn0000026061,Ana Belén,UCIbghfR6TYMvvncB1OcglDg,Ana Belén - Topic,mn0000805858,Estopa,y si manan,1933,525186
4,mn0000026061,Ana Belén,UCIbghfR6TYMvvncB1OcglDg,Ana Belén - Topic,mn0000805858,Estopa,y si manana,1933,525186


In [17]:
# Ordeno el repertorio por el nombre del artísta
repertorio = repertorio.sort_values(by='artist_name')
repertorio.head()

Unnamed: 0,artist_id,artist_name,channel_id,channel_name,feat_artist_id,feat_artist_name,song,subs_count,view_count
4060,mn0000516929,*NSYNC,UCsiSmzL5G7VS_pfKMXqmcjw,NSYNCVEVO,mn0000567809,Modern Talking,together again,474001,573163176
4117,mn0000516929,*NSYNC,UCsiSmzL5G7VS_pfKMXqmcjw,NSYNCVEVO,mn0000664817,Gloria Estefan,"merry christmas, happy holidays",474001,573163176
4116,mn0000516929,*NSYNC,UCsiSmzL5G7VS_pfKMXqmcjw,NSYNCVEVO,mn0000664817,Gloria Estefan,"merry christmas, happy holidays",474001,573163176
4115,mn0000516929,*NSYNC,UCsiSmzL5G7VS_pfKMXqmcjw,NSYNCVEVO,mn0000664817,Gloria Estefan,"merry christmas, happy holidays",474001,573163176
4114,mn0000516929,*NSYNC,UCsiSmzL5G7VS_pfKMXqmcjw,NSYNCVEVO,mn0000664817,Gloria Estefan,"merry christmas, happy holidays",474001,573163176


In [18]:
# Hay cancionoes duplicadas (featurings duplicados), los elimino
repertorio = repertorio.drop_duplicates(subset=repertorio.columns).reset_index(drop=True)

In [19]:
repertorio['Number collaborations'] = 1
repertorio.head()

Unnamed: 0,artist_id,artist_name,channel_id,channel_name,feat_artist_id,feat_artist_name,song,subs_count,view_count,Number collaborations
0,mn0000516929,*NSYNC,UCsiSmzL5G7VS_pfKMXqmcjw,NSYNCVEVO,mn0000567809,Modern Talking,together again,474001,573163176,1
1,mn0000516929,*NSYNC,UCsiSmzL5G7VS_pfKMXqmcjw,NSYNCVEVO,mn0000664817,Gloria Estefan,"merry christmas, happy holidays",474001,573163176,1
2,mn0000516929,*NSYNC,UCsiSmzL5G7VS_pfKMXqmcjw,NSYNCVEVO,mn0000664817,Gloria Estefan,megastore mania,474001,573163176,1
3,mn0000516929,*NSYNC,UCsiSmzL5G7VS_pfKMXqmcjw,NSYNCVEVO,mn0000850607,Rosie O'Donnell,love's in our hearts on christmas day,474001,573163176,1
4,mn0000516929,*NSYNC,UCsiSmzL5G7VS_pfKMXqmcjw,NSYNCVEVO,mn0000850607,Rosie O'Donnell,the lion sleeps tonight [a capella version],474001,573163176,1


In [20]:
f = {
    'artist_name': 'first', 
    'feat_artist_name': 'first', 
    'channel_id': 'first',
    'channel_name': 'first',
    'subs_count': 'first',
    'view_count': 'first',
    'Number collaborations': 'sum' 
}
test = repertorio.groupby(['artist_id', 'feat_artist_id'],
                          as_index=False).agg(f)


test.head()

Unnamed: 0,artist_id,feat_artist_id,artist_name,feat_artist_name,channel_id,channel_name,subs_count,view_count,Number collaborations
0,mn0000000534,mn0000344634,All Saints,Melanie Blatt,UCSf2D9Hl9_wLDVr-hp05BmA,ALLSAINTS,24007,101023,10
1,mn0000000534,mn0000642542,All Saints,Burt Bacharach,UCSf2D9Hl9_wLDVr-hp05BmA,ALLSAINTS,24007,101023,6
2,mn0000000534,mn0000815862,All Saints,All-Saints Ensemble,UCSf2D9Hl9_wLDVr-hp05BmA,ALLSAINTS,24007,101023,8
3,mn0000000534,mn0001233067,All Saints,Danny Thompson,UCSf2D9Hl9_wLDVr-hp05BmA,ALLSAINTS,24007,101023,50
4,mn0000005307,mn0000065618,Alicia Keys,Baka Boyz,UCK5X3f0fxO4YnVKVZP8p6hg,Alicia Keys,3264720,43813185,13


In [21]:
output = pd.DataFrame()
for _, row in test.iterrows():
    inverted = test[(test['artist_name'] == row['feat_artist_name']) & (test['feat_artist_name'] == row['artist_name'])]
    if inverted.empty:
        output = output.append(row)

output

Unnamed: 0,Number collaborations,artist_id,artist_name,channel_id,channel_name,feat_artist_id,feat_artist_name,subs_count,view_count
0,10.0,mn0000000534,All Saints,UCSf2D9Hl9_wLDVr-hp05BmA,ALLSAINTS,mn0000344634,Melanie Blatt,24007.0,1.010230e+05
1,6.0,mn0000000534,All Saints,UCSf2D9Hl9_wLDVr-hp05BmA,ALLSAINTS,mn0000642542,Burt Bacharach,24007.0,1.010230e+05
2,8.0,mn0000000534,All Saints,UCSf2D9Hl9_wLDVr-hp05BmA,ALLSAINTS,mn0000815862,All-Saints Ensemble,24007.0,1.010230e+05
3,50.0,mn0000000534,All Saints,UCSf2D9Hl9_wLDVr-hp05BmA,ALLSAINTS,mn0001233067,Danny Thompson,24007.0,1.010230e+05
4,13.0,mn0000005307,Alicia Keys,UCK5X3f0fxO4YnVKVZP8p6hg,Alicia Keys,mn0000065618,Baka Boyz,3264720.0,4.381318e+07
5,28.0,mn0000005307,Alicia Keys,UCK5X3f0fxO4YnVKVZP8p6hg,Alicia Keys,mn0000150962,Eve,3264720.0,4.381318e+07
6,3.0,mn0000005307,Alicia Keys,UCK5X3f0fxO4YnVKVZP8p6hg,Alicia Keys,mn0000197355,Freeway,3264720.0,4.381318e+07
7,2.0,mn0000005307,Alicia Keys,UCK5X3f0fxO4YnVKVZP8p6hg,Alicia Keys,mn0000201983,Westlife,3264720.0,4.381318e+07
8,21.0,mn0000005307,Alicia Keys,UCK5X3f0fxO4YnVKVZP8p6hg,Alicia Keys,mn0000239827,John Mayer,3264720.0,4.381318e+07
9,6.0,mn0000005307,Alicia Keys,UCK5X3f0fxO4YnVKVZP8p6hg,Alicia Keys,mn0000312890,Justin Timberlake,3264720.0,4.381318e+07


In [22]:
output['Number collaborations'] = output['Number collaborations'].astype(np.int32)
output['subs_count'] = output['subs_count'].astype(np.int32)
output['view_count'] = output['view_count'].astype(np.int32)

In [24]:
output = output.sort_values(by='artist_name')
output

Unnamed: 0,Number collaborations,artist_id,artist_name,channel_id,channel_name,feat_artist_id,feat_artist_name,subs_count,view_count
124,30,mn0000516929,*NSYNC,UCsiSmzL5G7VS_pfKMXqmcjw,NSYNCVEVO,mn0000567809,Modern Talking,474001,573163176
125,7,mn0000516929,*NSYNC,UCsiSmzL5G7VS_pfKMXqmcjw,NSYNCVEVO,mn0000664817,Gloria Estefan,474001,573163176
126,4,mn0000516929,*NSYNC,UCsiSmzL5G7VS_pfKMXqmcjw,NSYNCVEVO,mn0000759187,Blaque,474001,573163176
127,1,mn0000516929,*NSYNC,UCsiSmzL5G7VS_pfKMXqmcjw,NSYNCVEVO,mn0000799081,Full Force,474001,573163176
128,5,mn0000516929,*NSYNC,UCsiSmzL5G7VS_pfKMXqmcjw,NSYNCVEVO,mn0000850607,Rosie O'Donnell,474001,573163176
122,31,mn0000516929,*NSYNC,UCsiSmzL5G7VS_pfKMXqmcjw,NSYNCVEVO,mn0000101895,Joe,474001,573163176
123,5,mn0000516929,*NSYNC,UCsiSmzL5G7VS_pfKMXqmcjw,NSYNCVEVO,mn0000337119,Phil Collins,474001,573163176
129,15,mn0000516929,*NSYNC,UCsiSmzL5G7VS_pfKMXqmcjw,NSYNCVEVO,mn0000861351,Nelly,474001,573163176
103,11,mn0000480108,A-ha,UCeEi1My3KTXHT0SG1JkR9gA,a-ha,mn0000716816,Wolfgang Petry,339851,217453217
105,45,mn0000480108,A-ha,UCeEi1My3KTXHT0SG1JkR9gA,a-ha,mn0001489096,Electric Rudeboyz,339851,217453217


In [25]:
output.to_csv('./Data-Extraction/Lista_Grafo.csv', sep=";", encoding="utf-8", index=None)