# Read dataset

In [1]:
#MSD
!gdown --id 1-8Xc4luyjuOwykbShtTNBDG_3G9uYhzP
dataset='MSD_melspectrogram_vq.parquet'

Downloading...
From: https://drive.google.com/uc?id=1-8Xc4luyjuOwykbShtTNBDG_3G9uYhzP
To: /content/MSD_melspectrogram_vq.parquet
39.4MB [00:00, 149MB/s]


In [2]:
import pandas as pd
import numpy as np
df = pd.read_parquet(dataset)
df.reset_index(drop=True, inplace=True)
df.tail(3)

Unnamed: 0,music_id,melspectrogram,musicnn_tags,msd_tags
3705,TRAGPFL128EF342E1D,"[0.4828488730177744, 2.208650572822823, 4.6983...","[jazz, instrumental, ambient, electronic, chil...","[soul and reggae, new york, us, american, hip-..."
3706,TRAZASM128F932FBEE,"[0.0329444687642183, 0.7910001957829823, 5.080...","[electronic, ambient, instrumental, jazz, chil...","[scottish, uk, indie rock, british, art rock, ..."
3707,TRASQRN128F1464712,"[0.35234342057861534, 1.7942185304222873, 4.28...","[ambient, instrumental, jazz, electronic, expe...","[soundtrack composer, american]"


# Scenario 1 - All tags available (ground truth)
* Ideal scenario for tag-based information retrieval
* elasticsearch to provide the ranking

In [3]:
!wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.10.0-linux-x86_64.tar.gz -q
!tar -xzf elasticsearch-7.10.0-linux-x86_64.tar.gz
!chown -R daemon:daemon elasticsearch-7.10.0-linux-x86_64.tar.gz
!useradd elasticsearch
!chmod -R 777 elasticsearch-7.10.0
!echo "!#/bin/bash" > /content/run.sh
!echo "/content/elasticsearch-7.10.0/bin/elasticsearch &" >> /content/run.sh
!chmod 777 run.sh
!pip install elasticsearch

Collecting elasticsearch
[?25l  Downloading https://files.pythonhosted.org/packages/ab/b1/58cfb0bf54e29c20669d6e588496fb7fe8b54f53bc238be4cb0a185a1e76/elasticsearch-7.13.1-py2.py3-none-any.whl (354kB)
[K     |████████████████████████████████| 358kB 5.1MB/s 
Installing collected packages: elasticsearch
Successfully installed elasticsearch-7.13.1


In [4]:
!export JAVA_HOME="/content/elasticsearch-7.10.0/jdk"; sudo -E -u elasticsearch nohup /content/run.sh  > /content/elasticsearch.run.log &
!sleep 60

nohup: redirecting stderr to stdout


In [5]:
!gdown --id 1n58btLAJbNV_2e1wnLNxOEV4u5edD1fs

Downloading...
From: https://drive.google.com/uc?id=1n58btLAJbNV_2e1wnLNxOEV4u5edD1fs
To: /content/index_tags.json
  0% 0.00/300 [00:00<?, ?B/s]100% 300/300 [00:00<00:00, 256kB/s]


In [6]:
import json
import time
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

client = Elasticsearch()

INDEX_NAME = 'music_ir_all_tags'

client.indices.delete(index=INDEX_NAME, ignore=[404])

with open('/content/index_tags.json') as index_file:
  source = index_file.read().strip()
  client.indices.create(index=INDEX_NAME, body=source)

In [7]:
import json
import time
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
import pandas as pd
from tqdm import tqdm
client = Elasticsearch()

total = 0
requests = []
for row in tqdm(df.itertuples(index=False),total=len(df)):
    request = {}
    request['music_id'] = row.music_id
    request['tags'] = row.msd_tags
    request["_op_type"] = "index"
    request["_index"] = INDEX_NAME

    requests.append(request)
    if len(requests) >= 700:
      bulk(client, requests)
      total += len(requests)
      del requests
      requests = []

if len(requests) >= 1:
    bulk(client, requests)
    total += len(requests)
    del requests
    requests = []


100%|██████████| 3708/3708 [00:01<00:00, 2325.76it/s]


In [8]:
from elasticsearch import Elasticsearch


#### conecting to elastic search
client = Elasticsearch()

### simple search using snippet content
def get_script_query(tag_query):
  script_query = {      
          "bool": {
             "must": [ 
                       { "match":  { "tags": tag_query } }
                      ]
                  }
                 }       
  return script_query   
      

In [9]:
def get_ranking_complete_tags(query,search_size,INDEX_NAME = 'music_ir_all_tags'):
  SEARCH_SIZE = search_size
  script_query = get_script_query(query)
  response = client.search(index=INDEX_NAME, body={"size": SEARCH_SIZE, "query": script_query})

  L = []
  for hit in response["hits"]["hits"]:          
          doc = hit["_source"]
          doc['score'] = hit["_score"]
          L.append(doc)
          
  df_temp = pd.DataFrame(L)
  return df_temp

ranking_complete = get_ranking_complete_tags('rock',30)
ranking_complete

Unnamed: 0,music_id,tags,score
0,TRACKAK128F1458461,"[blues rock, hard rock, rock]",1.745026
1,TRAQSEG128F9327EB9,"[blues rock, hard rock, rock]",1.745026
2,TRBHLDQ128F423EF10,"[indie rock, american, rock, rock and indie, 2...",1.667709
3,TRAKDGR128F4294231,"[latin, rock, alternative rock]",1.627379
4,TRAIBXQ128F425E6A5,"[american, southern rock, garage rock, rock an...",1.609801
5,TRABNEX128F92C9DEA,"[american, southern rock, garage rock, rock an...",1.609801
6,TRALROP128F92CA9CC,"[rock, post-grunge, american, alternative rock...",1.594944
7,TRALPKG12903CDCA8A,"[alternative rock, british, indie rock, britan...",1.594944
8,TRARPXT128F425FDF9,"[soft rock, english, progressive rock, symphon...",1.573685
9,TRAMTIH128F933E8E4,"[classic pop and rock, rock]",1.551975


# Scenario 2 - [Baselines] - Few tags available
* Explore the long tail problem
** Computing music similarity via acoustic features
** From the tagged songs, get the closest untagged ones during the search

In [10]:
import numpy as np
from sklearn.neighbors import NearestNeighbors
X = np.array(df['melspectrogram'].to_list())

In [11]:
# tagged song samples 
def get_sample_tagged_music(frac=0.1,random_state=42):
  df_tagged = df.sample(frac=frac,random_state=random_state)
  return df_tagged

df_tagged = get_sample_tagged_music()
df_tagged

Unnamed: 0,music_id,melspectrogram,musicnn_tags,msd_tags
1783,TRAFNVU128F932361A,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[ambient, electronic, instrumental, jazz, chil...",[german]
1929,TRAWLGG128F42884CA,"[0.30511027716112704, 1.3742524130526241, 3.00...","[ambient, jazz, instrumental, electronic, chil...",[hip hop rnb and dance hall]
2916,TRBBHUW128F429476F,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[ambient, instrumental, electronic, jazz, expe...","[german, german hip-hop, hip-hop, rap]"
787,TRANHYE128F92DE232,"[1.033007442098565e-05, 1.522751980081915e-05,...","[ambient, electronic, instrumental, indie, exp...","[taylor-swift, country]"
798,TRAMTTD128F42246F3,"[2.4933844940728773e-05, 3.859209554876429e-05...","[ambient, instrumental, jazz, electronic, chil...",[classic pop and rock]
...,...,...,...,...
869,TRBBMDA12903CB688E,"[0.007138752629086592, 0.07300878004403102, 0....","[ambient, electronic, instrumental, jazz, chil...","[industrial rock, norwegian]"
3488,TRAYDGA128F427D05F,"[3.215731432748817e-05, 0.00027443380239737904...","[jazz, instrumental, electronic, folk, ambient...","[bristol, rock and indie, drum and bass, british]"
1789,TRAYFKN128F148C55A,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.36540729...","[jazz, instrumental, electronic, ambient, chil...",[hip hop rnb and dance hall]
2288,TRAHDCA128F92E8221,"[9.83408754455467e-06, 1.5267338674536554e-05,...","[jazz, ambient, instrumental, electronic, expe...","[french, french metal, metal]"


In [12]:
import json
import time
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
import pandas as pd
from tqdm.notebook import tqdm


def sample_music_tag_indexing(df_tagged):
  client = Elasticsearch()

  INDEX_NAME = 'music_ir_partial_tags'

  client.indices.delete(index=INDEX_NAME, ignore=[404])

  with open('/content/index_tags.json') as index_file:
    source = index_file.read().strip()
    client.indices.create(index=INDEX_NAME, body=source)

  total = 0
  requests = []
  for row in tqdm(df_tagged[['music_id','msd_tags']].itertuples(index=False),total=len(df_tagged)):
      request = {}
      request['music_id'] = row.music_id
      request['tags'] = row.msd_tags
      request["_op_type"] = "index"
      request["_index"] = INDEX_NAME

      requests.append(request)
      if len(requests) >= 700:
        bulk(client, requests)
        total += len(requests)
        del requests
        requests = []

  if len(requests) >= 1:
    bulk(client, requests)
    total += len(requests)
    del requests
    requests = []


sample_music_tag_indexing(df_tagged)

HBox(children=(FloatProgress(value=0.0, max=371.0), HTML(value='')))




In [14]:
def get_ranking_partial_tags(query,search_size,initial_search_size=5,INDEX_NAME = 'music_ir_partial_tags'):

  SEARCH_SIZE = initial_search_size
  script_query = get_script_query(query)
  response = client.search(index=INDEX_NAME, body={"size": SEARCH_SIZE, "query": script_query})

  L = []
  for hit in response["hits"]["hits"]:
          doc = hit["_source"]
          doc['score'] = hit["_score"]
          L.append(doc)
  
  df_temp = pd.DataFrame(L)
  
  if len(df_temp)==0:
    df_temp['music_id'] = ['music_none']*search_size
    return df_temp

  df_temp = df_temp.sort_values(by='music_id')
  

  L = []
  
  nbrs = NearestNeighbors(n_neighbors=int(search_size/len(df_temp))+1,metric='euclidean',algorithm='brute').fit(np.array(df.melspectrogram.to_list()))

  for row in df_temp.itertuples(index=False):
    selected_music = df[df.music_id==row.music_id]
    distances, indices = nbrs.kneighbors([selected_music.melspectrogram.to_list()[0]])
    
    L_temp = []
    for ix in indices[0]:
      L_temp.append(df[df.index==ix])

    df_sim = pd.concat(L_temp)    
    df_sim['distance'] = distances[0]
    L.append(df_sim)


  df_result = pd.concat(L)
  return df_result.sort_values(by='distance').head(search_size)



ranking_tag_spectrogram = get_ranking_partial_tags('rock',30)
ranking_tag_spectrogram

Unnamed: 0,music_id,melspectrogram,musicnn_tags,msd_tags,distance
1798,TRAAGPJ128F428CD1B,"[5.881624108152409e-05, 0.00010160937873632648...","[jazz, ambient, instrumental, indie, electroni...","[stoner rock, american, hard rock, rock, rock ...",0.0
3526,TRBDKOV128F4290943,"[0.03865253739070618, 0.549890592511512, 1.739...","[jazz, instrumental, ambient, electronic, chil...","[protopunk, folk-rock, noise rock, experimenta...",0.0
2522,TRAIMFD128F92ED3CD,"[0.12335867904427222, 1.1294468544073768, 2.80...","[jazz, instrumental, ambient, electronic, indi...","[southern rock, hard rock, rock, jam band, blu...",3e-06
354,TRAOPYV128F425B2ED,"[0.023948274659249674, 0.13063445750618843, 0....","[ambient, electronic, jazz, instrumental, chil...","[hard rock, protopunk, punk rock, garage rock,...",9e-06
3487,TRBAMES128F14947D4,"[6.624868442199256e-05, 0.00010097655006127525...","[ambient, electronic, instrumental, indie, chi...","[blues rock, hard rock, rock]",1.1e-05
6,TRAVQBP128F145BB9B,"[4.448061158518446e-06, 6.754489786848114e-06,...","[instrumental, ambient, jazz, electronic, folk...","[texas, san antonio, punk, usa]",105.795169
6,TRAVQBP128F145BB9B,"[4.448061158518446e-06, 6.754489786848114e-06,...","[instrumental, ambient, jazz, electronic, folk...","[texas, san antonio, punk, usa]",114.336421
3071,TRBBIPG128F92FA94D,"[1.4137449351902305e-05, 2.16493317665371e-05,...","[ambient, instrumental, electronic, jazz, chil...","[latin, pop]",116.469134
1030,TRAVXEP128F14539AE,"[3.392128830316296e-05, 5.077492582493025e-05,...","[ambient, instrumental, electronic, jazz, expe...","[side project, hip-hop, rap]",117.031514
307,TRAAKJJ128F4228346,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[ambient, electronic, instrumental, indie, chi...","[punk, usa, california, san francisco]",117.173019


# Scenario 2 - [Baselines] - Extraction of tags via autotagging




In [15]:
import json
import time
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk



In [16]:
import json
import time
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
import pandas as pd
from tqdm.notebook import tqdm


def sample_music_induced_tag_indexing(df_tagged):
  client = Elasticsearch()

  INDEX_NAME = 'music_ir_induced_tags'

  client.indices.delete(index=INDEX_NAME, ignore=[404])

  with open('/content/index_tags.json') as index_file:
    source = index_file.read().strip()
    client.indices.create(index=INDEX_NAME, body=source)

  total = 0
  requests = []
  for row in tqdm(df[['music_id','musicnn_tags']].itertuples(index=False),total=len(df)):
      request = {}
      request['music_id'] = row.music_id
      request['tags'] = row.musicnn_tags
      request['induced'] = 1
      request["_op_type"] = "index"
      request["_index"] = INDEX_NAME

      requests.append(request)
      if len(requests) >= 700:
        bulk(client, requests)
        total += len(requests)
        del requests
        requests = []

  if len(requests) >= 1:
    bulk(client, requests)
    total += len(requests)
    del requests
    requests = []

  for row in tqdm(df_tagged[['music_id','msd_tags']].itertuples(index=False),total=len(df_tagged)):
      request = {}
      request['music_id'] = row.music_id
      request['tags'] = row.msd_tags
      request['induced'] = 0
      request["_op_type"] = "index"
      request["_index"] = INDEX_NAME

      requests.append(request)
      if len(requests) >= 700:
        bulk(client, requests)
        total += len(requests)
        del requests
        requests = []
  
  if len(requests) >= 1:
    bulk(client, requests)
    total += len(requests)
    del requests
    requests = []  


sample_music_induced_tag_indexing(df_tagged)


HBox(children=(FloatProgress(value=0.0, max=3708.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=371.0), HTML(value='')))




In [17]:
def get_script_query_taggram(tag_query,induced=1):
  script_query = {
      
          "bool": {
              "must": [ 
                                  { "match":  { "tags": tag_query } },
                                  { "match":  { "induced": induced } }
                        ]
                  }
      }       
  return script_query   

def get_ranking_induced_tags(query,search_size,initial_search_size=5,INDEX_NAME = 'music_ir_induced_tags'):

  SEARCH_SIZE = initial_search_size
  script_query = get_script_query_taggram(query,induced=0)
  response = client.search(index=INDEX_NAME, body={"size": SEARCH_SIZE, "query": script_query})

  L1 = []
  for hit in response["hits"]["hits"]:
          doc = hit["_source"]
          doc['score'] = hit["_score"]
          L1.append(doc)
          
  df_temp1 = pd.DataFrame(L1)
  if len(df_temp1)==0:
    df_temp1['music_id'] = ['music_none']*search_size
    return df_temp1
  df_temp1 = df_temp1.sort_values(by='music_id')

  SEARCH_SIZE = search_size-len(df_temp1)
  script_query = get_script_query_taggram(query,induced=1)
  response = client.search(index=INDEX_NAME, body={"size": SEARCH_SIZE, "query": script_query})

  L2 = []
  for hit in response["hits"]["hits"]:
          doc = hit["_source"]
          doc['score'] = hit["_score"]
          L2.append(doc)
  df_temp2 = pd.DataFrame(L2)
  df_temp3 = pd.concat([df_temp1,df_temp2])
  if len(df_temp3) < search_size:
    L3 = df_temp1.music_id.to_list() + ['music_none']*(search_size-len(df_temp3))
    df_temp3 = pd.DataFrame()
    df_temp3['music_id'] = L3

  
  return df_temp3

ranking_taggram = get_ranking_induced_tags('rock',30)
ranking_taggram

Unnamed: 0,music_id,tags,induced,score
3,TRAAGPJ128F428CD1B,"[stoner rock, american, hard rock, rock, rock ...",0,3.676947
1,TRACBWP128C7196948,"[alternative rock, southern rock, hard rock, r...",0,3.692845
4,TRAOPYV128F425B2ED,"[hard rock, protopunk, punk rock, garage rock,...",0,3.643069
0,TRBAMES128F14947D4,"[blues rock, hard rock, rock]",0,3.693554
2,TRBDKOV128F4290943,"[protopunk, folk-rock, noise rock, experimenta...",0,3.691577
0,TRAHRMY128F427D064,"[jazz, instrumental, ambient, electronic, expe...",1,3.0936
1,TRAHCTH128F9330B4A,"[ambient, instrumental, electronic, jazz, chil...",1,3.0936
2,TRAMNGQ128F92F56EE,"[ambient, instrumental, electronic, rock, expe...",1,3.0936
3,TRBFQHY128F4242356,"[jazz, instrumental, ambient, experimental, ro...",1,3.0936
4,TRAFAID128F4243DAF,"[jazz, instrumental, folk, electronic, ambient...",1,3.0936


# Scenario 4 - Proposal
* Build heterogeneous network
** music <-> tags (available)
** music <-> cluster (acoustic features)
* tags embeddings propagation to get full modality
* search using embeddings (tag-based, but using embeddings)

In [18]:
!pip install git+https://github.com/rmarcacini/sentence-transformers
!pip install gdown
!gdown https://drive.google.com/uc?id=1NV5t1YhyyOzMF5zAovfbSLdZZLvqrfZ_
!unzip distiluse-base-multilingual-cased.zip -d language_model

Collecting git+https://github.com/rmarcacini/sentence-transformers
  Cloning https://github.com/rmarcacini/sentence-transformers to /tmp/pip-req-build-qyb3c0pa
  Running command git clone -q https://github.com/rmarcacini/sentence-transformers /tmp/pip-req-build-qyb3c0pa
Collecting transformers<3.2.0,>=3.1.0
[?25l  Downloading https://files.pythonhosted.org/packages/ae/05/c8c55b600308dc04e95100dc8ad8a244dd800fe75dfafcf1d6348c6f6209/transformers-3.1.0-py3-none-any.whl (884kB)
[K     |████████████████████████████████| 890kB 6.7MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 25.3MB/s 
[?25hCollecting tokenizers==0.8.1.rc2
[?25l  Downloading https://files.pythonhosted.org/packages/75/26/c02ba92ecb8b780bdae4a862d351433c2912fe49469dac7f87a5c85ccca6/tokenizers-0.8.1rc2-cp37-cp37m-manylinux1_x86

In [19]:
from sentence_transformers import SentenceTransformer, LoggingHandler
import numpy as np
import logging

np.set_printoptions(threshold=100)
logging.basicConfig(format='%(asctime)s - %(message)s',datefmt='%Y-%m-%d %H:%M:%S',level=logging.INFO,handlers=[LoggingHandler()])

language_model = SentenceTransformer('./language_model')

2021-06-29 14:49:17 - Load pretrained SentenceTransformer: ./language_model
2021-06-29 14:49:17 - Load SentenceTransformer from folder: ./language_model
2021-06-29 14:49:20 - Use pytorch device: cpu


In [22]:
import networkx as nx
from sklearn.neighbors import NearestNeighbors
import numpy as np
import pickle

def generate_music_tag_graph(df_tagged, n_neighbors_tagged=3, n_neighbors_untagged=5):
  G = nx.Graph()
  df_tagged.reset_index(inplace=True,drop=True)

  nbrs = NearestNeighbors(n_neighbors=n_neighbors_untagged,metric='euclidean').fit(df['melspectrogram'].to_list())
  nbrs_tagged = NearestNeighbors(n_neighbors=n_neighbors_tagged,metric='euclidean').fit(df_tagged['melspectrogram'].to_list())

  for index, row in tqdm(df.iterrows(),total=len(df)):
    music_node1 = row.music_id+':music'
    music_audio_features = row.melspectrogram
    distances, indices = nbrs.kneighbors([music_audio_features])
    indices = indices[0]
    for i in range(1,len(indices)):
      music_id_knn = df[df.index==indices[i]].music_id.to_list()[0]
      music_node2 = music_id_knn+':music'
      G.add_edge(music_node1,music_node2)
    if index not in df_tagged.index:
      distances, indices = nbrs_tagged.kneighbors([music_audio_features])
      indices = indices[0]      
      for i in range(1,len(indices)):
        music_id_knn = df_tagged[df_tagged.index==indices[i]].music_id.to_list()[0]
        music_node2 = music_id_knn+':music'
        G.add_edge(music_node1,music_node2)
  return G

G = generate_music_tag_graph(df_tagged)

HBox(children=(FloatProgress(value=0.0, max=3708.0), HTML(value='')))




In [23]:
from tqdm.notebook import tqdm

cache_tag_embedding = {}
def populate_initial_embeddings(G,df_tagged):
  for index,row in tqdm(df_tagged.iterrows(), total=len(df_tagged)):
    music_node = row['music_id']+':music'
    for tag in row['msd_tags']:
      tag_node = tag+':msd_tag'
      if tag not in cache_tag_embedding:
        cache_tag_embedding[tag] = language_model.encode(tag, show_progress_bar=False)
      embedding = cache_tag_embedding[tag]
      G.add_edge(music_node,tag_node)
      G.nodes[tag_node]['y'] = embedding

populate_initial_embeddings(G,df_tagged)

HBox(children=(FloatProgress(value=0.0, max=371.0), HTML(value='')))




In [24]:
# graph regularization
import numpy as np
from tqdm.notebook import tqdm
import random


def regularization(G, dim=512, iterations=30, alpha=1.0):

    nodes = []

    # inicialize vector f to all nodes 
    for node in G.nodes():
        G.nodes[node]['f'] = np.array([0.0]*dim)
        if 'y' in G.nodes[node]:
            G.nodes[node]['f'] = G.nodes[node]['y']*1.0
        nodes.append(node)

    pbar = tqdm(range(0, iterations))

    for iteration in pbar:
        random.shuffle(nodes)
        energy = 0.0

        # iteration on nodes
        for node in nodes:
            f_new = np.array([0.0]*dim)
            f_old = np.array(G.nodes[node]['f'])*1.0
            sum_w = 0.0

            # iteration on node neighbors
            for neighbor in G.neighbors(node):
                w = 1.0
                if 'weight' in G[node][neighbor]:
                    w = G[node][neighbor]['weight']

                w /= np.sqrt(G.degree[neighbor])

                f_new += w*G.nodes[neighbor]['f']

                sum_w += w

            f_new /= sum_w

            G.nodes[node]['f'] = f_new*1.0

            if 'y' in G.nodes[node]:
                G.nodes[node]['f'] = G.nodes[node]['y'] * alpha + G.nodes[node]['f']*(1.0-alpha)

            energy += np.linalg.norm(f_new-f_old)

        iteration += 1
        message = 'Iteration '+str(iteration)+' | Energy = '+str(energy)
        pbar.set_description(message)

    return G


In [25]:
regularization(G)

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




<networkx.classes.graph.Graph at 0x7fbb96fb8990>

In [26]:
def get_music_embeddings(G):
  L = []
  for row in df.itertuples(index=False):
    embedding_musics = G.nodes[row.music_id+':music']['f']
    L.append(embedding_musics)

  df['embedding_musics'] = L
  return df

get_music_embeddings(G)

Unnamed: 0,music_id,melspectrogram,musicnn_tags,msd_tags,embedding_musics
0,TRBCSSE12903CBE261,"[3.85365668975425e-05, 9.06009774978107e-05, 0...","[ambient, electronic, instrumental, chillout, ...","[grunge, alternative metal, hard rock, rock, m...","[-0.0018454722197426456, 0.0025892093836807416..."
1,TRAKHXX128E07877EB,"[1.7909441124022883e-05, 4.980093894518541e-05...","[ambient, instrumental, electronic, jazz, indi...","[alternative rock, funk rock, funk, rock, cros...","[0.002411981225087851, 0.0015354811485621196, ..."
2,TRAFURE12903D1135B,"[8.584539134438656e-05, 0.00015190471595207244...","[ambient, instrumental, electronic, jazz, chil...","[chinese, mandarin, taiwanese]","[0.0020144952068674583, 0.001763669886531224, ..."
3,TRAKKCL128F92FAE59,"[1.1533951954828387e-05, 1.7950241829831163e-0...","[jazz, ambient, instrumental, electronic, chil...",[folk],"[0.002251583996677475, 0.0009102123581323686, ..."
4,TRBHORY128F148D422,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[jazz, instrumental, electronic, ambient, expe...",[hip hop],"[0.003504453485705241, 0.0002954780460474615, ..."
...,...,...,...,...,...
3703,TRAOASQ128F429822C,"[1.6727553879161785e-05, 2.409692993870103e-05...","[ambient, electronic, instrumental, experiment...",[classic pop and rock],"[0.002759739607949006, 0.0018126294224453077, ..."
3704,TRADJTR128F423CEB8,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[jazz, instrumental, ambient, electronic, expe...",[classic pop and rock],"[0.0024315697174480365, -0.0018469248247029216..."
3705,TRAGPFL128EF342E1D,"[0.4828488730177744, 2.208650572822823, 4.6983...","[jazz, instrumental, ambient, electronic, chil...","[soul and reggae, new york, us, american, hip-...","[0.002637474627175736, 0.0016376893520920555, ..."
3706,TRAZASM128F932FBEE,"[0.0329444687642183, 0.7910001957829823, 5.080...","[electronic, ambient, instrumental, jazz, chil...","[scottish, uk, indie rock, british, art rock, ...","[-0.004224894079469056, 0.0030357163944166434,..."


In [27]:
def get_ranking_graph(query,search_size,initial_search_size=5,INDEX_NAME = 'music_ir_partial_tags'):

  SEARCH_SIZE = initial_search_size
  script_query = get_script_query(query)
  response = client.search(index=INDEX_NAME, body={"size": SEARCH_SIZE, "query": script_query})

  L = []
  for hit in response["hits"]["hits"]:
          doc = hit["_source"]
          doc['score'] = hit["_score"]
          L.append(doc)
          
  df_temp = pd.DataFrame(L)
  if len(df_temp)==0:
    df_temp['music_id'] = ['music_none']*search_size
    return df_temp
  df_temp = df_temp.sort_values(by='music_id')
  
  
  L = []

  nbrs = NearestNeighbors(n_neighbors=int(search_size/len(df_temp))+1,metric='euclidean',algorithm='brute').fit(df.embedding_musics.to_list())
  
  for row in df_temp.itertuples(index=False):
    selected_music = df[df.music_id==row.music_id]

    distances, indices = nbrs.kneighbors([selected_music.embedding_musics.to_list()[0]])
    L_temp = []
    for ix in indices[0]:
      L_temp.append(df[df.index==ix])

    df_sim = pd.concat(L_temp)
    df_sim['distance'] = distances[0]
    L.append(df_sim)


  df_result = pd.concat(L)
  return df_result.sort_values(by='distance').head(search_size)



ranking_graph_embeddings = get_ranking_graph('rock',30)
ranking_graph_embeddings

2021-06-29 14:58:06 - POST http://localhost:9200/music_ir_partial_tags/_search [status:200 request:0.016s]


Unnamed: 0,music_id,melspectrogram,musicnn_tags,msd_tags,embedding_musics,distance
3526,TRBDKOV128F4290943,"[0.03865253739070618, 0.549890592511512, 1.739...","[jazz, instrumental, ambient, electronic, chil...","[protopunk, folk-rock, noise rock, experimenta...","[0.002756575751468919, 0.001835921468177996, -...",0.0
2522,TRAIMFD128F92ED3CD,"[0.12335867904427222, 1.1294468544073768, 2.80...","[jazz, instrumental, ambient, electronic, indi...","[southern rock, hard rock, rock, jam band, blu...","[0.0034248010032058057, 0.00024911201177338176...",0.0
354,TRAOPYV128F425B2ED,"[0.023948274659249674, 0.13063445750618843, 0....","[ambient, electronic, jazz, instrumental, chil...","[hard rock, protopunk, punk rock, garage rock,...","[0.012496697335521537, 0.0068938988831063475, ...",0.0
3487,TRBAMES128F14947D4,"[6.624868442199256e-05, 0.00010097655006127525...","[ambient, electronic, instrumental, indie, chi...","[blues rock, hard rock, rock]","[0.006362915560016608, -0.005159277609958891, ...",1.053671e-08
1798,TRAAGPJ128F428CD1B,"[5.881624108152409e-05, 0.00010160937873632648...","[jazz, ambient, instrumental, indie, electroni...","[stoner rock, american, hard rock, rock, rock ...","[0.017127524842489746, -0.0037303801068851398,...",2.107342e-08
1796,TRBENWE128F424AE09,"[1.7246855866863902e-05, 2.5090101795318667e-0...","[jazz, instrumental, ambient, electronic, indi...","[ragtime, country blues, delta blues, chicago ...","[0.003648156344668619, 0.0009689248680853319, ...",0.01447461
1412,TRAOATJ128F147FC9B,"[2.9960421510247717e-05, 4.799073962852832e-05...","[jazz, instrumental, ambient, electronic, chil...","[british, soft rock, jazz, rock, adult contemp...","[0.0029412053729642312, 0.0009885946800815772,...",0.0153361
3306,TRBBWDJ128F42595D5,"[0.000966384592655192, 0.013518461226720983, 0...","[ambient, instrumental, jazz, indie, electroni...","[irish, rock, ireland, irlandais, classic pop ...","[0.0026455014177807173, 0.000992497455370926, ...",0.0173759
3484,TRARMRC128F424317A,"[0.21439422243857456, 2.4466480811057623, 9.65...","[jazz, instrumental, electronic, ambient, chil...",[deutschland],"[0.0034050086001812595, 0.000757370842940641, ...",0.0177947
2908,TRAPQNA128F93000EB,"[4.772501902555977e-06, 7.749645500986996e-06,...","[jazz, ambient, instrumental, electronic, chil...","[country, rock roll, rock, honky tonk, nashvil...","[0.0026092092114358774, 0.0015624692634163666,...",0.01796681


# Avaliação

## Gerando queries para a avaliação

In [39]:
queries = {}
for index,row in df.iterrows():
  for tag in row['msd_tags']:
    if tag not in queries: queries[tag]=0
    queries[tag]+=1

q = sorted(queries.items(), key=lambda item: item[1], reverse=True)
queries = q[0:10] #q[0:200]



## Definindo métricas

In [29]:
!pip install ml_metrics


Collecting ml_metrics
  Downloading https://files.pythonhosted.org/packages/c1/e7/c31a2dd37045a0c904bee31c2dbed903d4f125a6ce980b91bae0c961abb8/ml_metrics-0.1.4.tar.gz
Building wheels for collected packages: ml-metrics
  Building wheel for ml-metrics (setup.py) ... [?25l[?25hdone
  Created wheel for ml-metrics: filename=ml_metrics-0.1.4-cp37-none-any.whl size=7845 sha256=d97e4a9921af675788186b024e1a014ab496e6284ae3fbab67c061c8458323c0
  Stored in directory: /root/.cache/pip/wheels/b3/61/2d/776be7b8a4f14c5db48c8e5451451cabc58dc6aa7ee3801163
Successfully built ml-metrics
Installing collected packages: ml-metrics
Successfully installed ml-metrics-0.1.4


In [30]:
import ml_metrics as metrics
import numpy as np
from subprocess import check_output

def precison_recall(L_true,L_pred):
  precision = len(set(L_true).intersection(set(L_pred))) / len(L_pred)
  recall = len(set(L_true).intersection(set(L_pred))) / len(L_true)
  return precision,recall



## Parâmetros da avaliação

In [31]:
random_states = [8]#[1,2,3,4,5,6,7,8,9,10]
ranking_size = [10]#[10,20,30,40,50,60,70,80,90,100]
sampling_size = [0.5]#[0.1,0.2,0.3,0.4,0.5]
initial_search_sizes = [5]
graph_n_neighbors_taggeds = [3]#[3,5,7]
graph_n_neighbors_untaggeds = [3]#[3,5,7]#[3,5,7]

## Avaliando MelSpec e Taggrams

In [40]:
from tqdm.notebook import tqdm
from elasticsearch import logger as es_logger
import logging



es_logger = logging.getLogger('elasticsearch')
es_logger.setLevel(logging.WARNING)

L_results = []

for random_state in tqdm(random_states):
  for frac in tqdm(sampling_size):
    df_tagged = get_sample_tagged_music(frac=frac,random_state=random_state)
    sample_music_tag_indexing(df_tagged)

    pbar = tqdm(queries)
    for query in pbar:
      query = query[0]
      
      pbar.set_description('query='+query)
      for k in ranking_size:
        # oracle
        ranking_complete = get_ranking_complete_tags(query,k)
        L_true = ranking_complete['music_id'].to_list()
        total = len(L_true)


        for initial_search_size in initial_search_sizes:
          # baselines
          ranking_tag_spectrogram = get_ranking_partial_tags(query,k,initial_search_size=initial_search_size)
          L_melspec = ranking_tag_spectrogram['music_id'].to_list()
          apk_melspec = metrics.apk(L_true,L_melspec,k)
          ap10_melspec = metrics.apk(L_true,L_melspec,10)
          L_results.append(['melspec',frac,random_state,query,k,initial_search_size,'-','-',apk_melspec,ap10_melspec,total,len(L_melspec),L_true,L_melspec])

          ranking_taggram = get_ranking_induced_tags(query,k,initial_search_size=initial_search_size)
          L_taggram = ranking_taggram['music_id'].to_list()
          apk_taggram = metrics.apk(L_true,L_taggram,k)
          ap10_taggram = metrics.apk(L_true,L_taggram,10)
          L_results.append(['taggram',frac,random_state,query,k,initial_search_size,'-','-',apk_taggram,ap10_taggram,total,len(L_taggram),L_true,L_taggram])

    pd.DataFrame(L_results).to_pickle('df_exp_results_spec_taggram_'+str(random_state)+'.pkl')
  

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1854.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))






## Avaliando Proposta

In [43]:
from tqdm.notebook import tqdm
from elasticsearch import logger as es_logger
import datetime
import logging

es_logger = logging.getLogger('elasticsearch')
es_logger.setLevel(logging.WARNING)


for random_state in tqdm(random_states):
  for frac in tqdm(sampling_size):
    df_tagged = get_sample_tagged_music(frac=frac,random_state=random_state)
    sample_music_tag_indexing(df_tagged)

    for graph_n_neighbors_tagged in graph_n_neighbors_taggeds:
      for graph_n_neighbors_untagged in graph_n_neighbors_untaggeds:

        L_results2 = []

        # build graph
        G = generate_music_tag_graph(df_tagged, n_neighbors_tagged=graph_n_neighbors_tagged, n_neighbors_untagged=graph_n_neighbors_untagged)
        populate_initial_embeddings(G,df_tagged)
        regularization(G)


        pbar = tqdm(queries)
        for query in pbar:
          
          query = query[0]
          pbar.set_description('query='+query)
          for k in ranking_size:
            # oracle
            ranking_complete = get_ranking_complete_tags(query,k)
            L_true = ranking_complete['music_id'].to_list()
            total = len(L_true)


            for initial_search_size in initial_search_sizes:
                  get_music_embeddings(G)
                  
                  ranking_graph_embeddings = get_ranking_graph(query,k,initial_search_size=initial_search_size)
                  L_graph = ranking_graph_embeddings['music_id'].to_list()
                  apk_graph = metrics.apk(L_true,L_graph,k)
                  ap10_graph = metrics.apk(L_true,L_graph,10)
                  L_results2.append(['graph',frac,random_state,query,k,initial_search_size,graph_n_neighbors_tagged,graph_n_neighbors_untagged,apk_graph,ap10_graph,total,len(L_graph),L_true,L_graph])
        pd.DataFrame(L_results2).to_pickle('df_exp_results_graph_'+str(random_state)+'_'+str(graph_n_neighbors_tagged)+'_'+str(graph_n_neighbors_untagged)+'.pkl')



HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1854.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3708.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1854.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))






In [44]:
results = pd.concat([pd.read_pickle("df_exp_results_spec_taggram_8.pkl"),pd.read_pickle("df_exp_results_graph_8_3_3.pkl")])
results.columns = ['method','sample_size','run','query_tag','search_size','initial_search_size','p1','p2','apk','ap10','r1','r2','L1','L2'] #MSD 
results

Unnamed: 0,method,sample_size,run,query_tag,search_size,initial_search_size,p1,p2,apk,ap10,r1,r2,L1,L2
0,melspec,0.5,8,american,10,5,-,-,0.0,0.0,10,10,"[TRADHBZ128F932D9E9, TRAAJJV128F42A2F99, TRBIH...","[music_none, music_none, music_none, music_non..."
1,taggram,0.5,8,american,10,5,-,-,0.05,0.05,10,10,"[TRADHBZ128F932D9E9, TRAAJJV128F42A2F99, TRBIH...","[TRADRXC128F4230BE8, TRAGGHA128F4262D55, TRAOO..."
2,melspec,0.5,8,rock,10,5,-,-,0.0,0.0,10,10,"[TRACKAK128F1458461, TRAQSEG128F9327EB9, TRAZD...","[music_none, music_none, music_none, music_non..."
3,taggram,0.5,8,rock,10,5,-,-,0.02,0.02,10,10,"[TRACKAK128F1458461, TRAQSEG128F9327EB9, TRAZD...","[TRAAGPJ128F428CD1B, TRACBWP128C7196948, TRAOP..."
4,melspec,0.5,8,classic pop and rock,10,5,-,-,0.0,0.0,10,10,"[TRAGPMR128F930C0D8, TRBHHDU128F92FD6B5, TRAUC...","[music_none, music_none, music_none, music_non..."
5,taggram,0.5,8,classic pop and rock,10,5,-,-,0.0,0.0,10,10,"[TRAGPMR128F930C0D8, TRBHHDU128F92FD6B5, TRAUC...","[TRAIVMM128F4266056, TRAOTHS12903D07021, TRAWP..."
6,melspec,0.5,8,british,10,5,-,-,0.0,0.0,10,10,"[TRACFJE128F934C8D5, TRBFRZI128F427E827, TRAQF...","[music_none, music_none, music_none, music_non..."
7,taggram,0.5,8,british,10,5,-,-,0.0,0.0,10,10,"[TRACFJE128F934C8D5, TRBFRZI128F427E827, TRAQF...","[TRAGAVB128F932585E, TRATPDA12903CB2D90, TRATP..."
8,melspec,0.5,8,rock and indie,10,5,-,-,0.1,0.1,10,10,"[TRAMIUD128E07825DB, TRAXIBJ128F4238364, TRAAI...","[TRAMIUD128E07825DB, TRBGMOG128F92D75BD, TRAEN..."
9,taggram,0.5,8,rock and indie,10,5,-,-,0.0,0.0,10,10,"[TRAMIUD128E07825DB, TRAXIBJ128F4238364, TRAAI...","[TRADNOD128F4262F3D, TRAELRO128F92DD2F1, TRALH..."
