# Creando el grafo

Hemos leído ya el repertorio de la pagina https://www.allmusic.com/ para la lista de artistas que disponemos. Estos datos contienen 4 campos:

- Identificador del artista
- Nombre del artista
- Identificador del artista con el que comparte una cancion
- Cancion que comparten

El objetivo es obtener un documento en el cual tengamos unicamente los nombres de los artistas que son los extremos de la arista y el peso de la arista, del tipo.

|Artist|Collaborator|Number collaborations|
|--|--|--|
|Metallica|ACDC|6|

In [1]:
import pandas as pd
import unidecode
import numpy as np

In [30]:
# Primero leo el repertorio
repertorio = pd.read_csv('./Data/Repertorio-Final.csv',
                         sep=',', 
                         encoding='utf-8',
                         index_col = None)

# Todas las canciones a minúscula
repertorio['song'] = repertorio['song'].str.lower()

""" Elimina los acentos en las canciones"""
repertorio['song'] = repertorio['song'].str.normalize('NFD').str.encode('ascii', errors='ignore').str.decode('utf-8')
repertorio.head(5)

Unnamed: 0,artist_id,artist_name,feat_artist_id,feat_artist_name,song
0,mn0000690254,ZZ Top,mn0000503563,The Moving Sidewalks,you make me shake
1,mn0000690254,ZZ Top,mn0000503563,The Moving Sidewalks,you were so close to me
2,mn0000246960,Wham!,mn0000545074,George Michael,young guns
3,mn0000246960,Wham!,mn0000545074,George Michael,young guns
4,mn0000246960,Wham!,mn0000545074,George Michael,young guns (go for it!)


In [31]:
# Ordeno el repertorio por el nombre del artísta
repertorio = repertorio.sort_values(by='artist_name')
repertorio.head()

Unnamed: 0,artist_id,artist_name,feat_artist_id,feat_artist_name,song
5671,mn0000516929,*NSYNC,mn0000567809,Modern Talking,you got it
5770,mn0000516929,*NSYNC,mn0000101895,Joe,i'll never stop(vcd)
5771,mn0000516929,*NSYNC,mn0000101895,Joe,if i'm not the one
5772,mn0000516929,*NSYNC,mn0000101895,Joe,if i'm not the one
5773,mn0000516929,*NSYNC,mn0000101895,Joe,if only in heaven's eyes


In [32]:
# Hay cancionoes duplicadas (featurings duplicados), los elimino
repertorio = repertorio.drop_duplicates(subset=repertorio.columns).reset_index(drop=True)

In [33]:
repertorio['Number collaborations'] = 1
repertorio.head()

Unnamed: 0,artist_id,artist_name,feat_artist_id,feat_artist_name,song,Number collaborations
0,mn0000516929,*NSYNC,mn0000567809,Modern Talking,you got it,1
1,mn0000516929,*NSYNC,mn0000101895,Joe,i'll never stop(vcd),1
2,mn0000516929,*NSYNC,mn0000101895,Joe,if i'm not the one,1
3,mn0000516929,*NSYNC,mn0000101895,Joe,if only in heaven's eyes,1
4,mn0000516929,*NSYNC,mn0000101895,Joe,in conversation,1


In [34]:
f = {'Number collaborations': 'sum', 'artist_name': 'first', 'feat_artist_name': 'first'}
test = repertorio.groupby(['artist_id', 'feat_artist_id'],
                          as_index=False).agg(f)


test.head()

Unnamed: 0,artist_id,feat_artist_id,Number collaborations,artist_name,feat_artist_name
0,mn0000000534,mn0000344634,10,All Saints,Melanie Blatt
1,mn0000000534,mn0000642542,6,All Saints,Burt Bacharach
2,mn0000000534,mn0000815862,8,All Saints,All-Saints Ensemble
3,mn0000000534,mn0001233067,50,All Saints,Danny Thompson
4,mn0000002578,mn0000236246,19,Tanita Tikaram,Mark Isham


In [35]:
output = pd.DataFrame()
for _, row in test.iterrows():
    inverted = test[(test['artist_name'] == row['feat_artist_name']) & (test['feat_artist_name'] == row['artist_name'])]
    if inverted.empty:
        output = output.append(row)

output

Unnamed: 0,Number collaborations,artist_id,artist_name,feat_artist_id,feat_artist_name
0,10.0,mn0000000534,All Saints,mn0000344634,Melanie Blatt
1,6.0,mn0000000534,All Saints,mn0000642542,Burt Bacharach
2,8.0,mn0000000534,All Saints,mn0000815862,All-Saints Ensemble
3,50.0,mn0000000534,All Saints,mn0001233067,Danny Thompson
4,19.0,mn0000002578,Tanita Tikaram,mn0000236246,Mark Isham
5,3.0,mn0000002578,Tanita Tikaram,mn0000495877,Moodswings
6,5.0,mn0000002748,Pata Negra,mn0000869068,Mercedes Sosa
7,6.0,mn0000002748,Pata Negra,mn0001589240,Rafael Amador
8,2.0,mn0000004332,Sharleen Spiteri,mn0000302774,Roger Sanchez
9,4.0,mn0000004332,Sharleen Spiteri,mn0000333648,Rammstein


In [42]:
output['Number collaborations'] = output['Number collaborations'].astype(np.int32)

In [43]:
output = output.sort_values(by='artist_name')
output.head(15)

Unnamed: 0,Number collaborations,artist_id,artist_name,feat_artist_id,feat_artist_name
2942,5,mn0000516929,*NSYNC,mn0000337119,Phil Collins
2948,15,mn0000516929,*NSYNC,mn0000861351,Nelly
2947,5,mn0000516929,*NSYNC,mn0000850607,Rosie O'Donnell
2946,1,mn0000516929,*NSYNC,mn0000799081,Full Force
2945,4,mn0000516929,*NSYNC,mn0000759187,Blaque
2944,7,mn0000516929,*NSYNC,mn0000664817,Gloria Estefan
2941,31,mn0000516929,*NSYNC,mn0000101895,Joe
2822,21,mn0000480108,A-ha,mn0001178546,Blue Zoo
2823,45,mn0000480108,A-ha,mn0001489096,Electric Rudeboyz
2820,5,mn0000480108,A-ha,mn0000325459,R.E.M.


In [44]:
output.to_csv('./Data/Lista_Grafo.csv', sep=";", encoding="utf-8", index=None)

In [45]:
artist_list = open('./Data/Lista_Artista.txt', 'r', encoding='utf-8').read().split('\n')

In [49]:
channels_data = pd.read_csv('./Data/Channels_Data.csv', sep=';', encoding="utf-8")
channels_data.head()

Unnamed: 0,channel_id,channel_title,view_count,subscriber_count,playlist_id
0,UCsiSmzL5G7VS_pfKMXqmcjw,NSYNCVEVO,572035818,474128,UUsiSmzL5G7VS_pfKMXqmcjw
1,UC-xTVl97pv9ftbOu8UfVy_Q,NatalieMerchantVideo,26199696,40106,UU-xTVl97pv9ftbOu8UfVy_Q
2,UCeEi1My3KTXHT0SG1JkR9gA,a-ha,216942980,338830,UUeEi1My3KTXHT0SG1JkR9gA
3,UCneR_qQ1NE_u9neBSEJJ7lg,MsAaliyahJay,87678984,1256852,UUneR_qQ1NE_u9neBSEJJ7lg
4,UCa_4DcdTB9QfK0LY9-7qWuQ,AbbaVEVO,1527781457,1008765,UUa_4DcdTB9QfK0LY9-7qWuQ


In [47]:
stats_list = channels_data['subscriber_count'].tolist()

In [23]:
for artist in artist_list:
    if item in 

663

687