In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import jaccard_score
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import pickle
import os

In [3]:
# Read every tab-separated MMSR table and order its rows by the 'id' column.
id_information = pd.read_csv('id_information_mmsr.tsv', sep='\t').sort_values('id', ascending=True)
id_genres = pd.read_csv('id_genres_mmsr.tsv', sep='\t').sort_values('id', ascending=True)
id_bert = pd.read_csv('id_bert_mmsr.tsv', sep='\t').sort_values('id', ascending=True)
id_lyrics_tfidf = pd.read_csv('id_lyrics_tf-idf_mmsr.tsv', sep='\t').sort_values('id', ascending=True)
id_lyrics_word2vec = pd.read_csv('id_lyrics_word2vec_mmsr.tsv', sep='\t').sort_values('id', ascending=True)

In [48]:
# Two example queries; the second pair overrides the first, so it is the one in effect.
artist, song_query = 'Dua Lipa', 'New Rules (Initial Talk Remix)'
artist, song_query = 'Demi Lovato', 'Two Pieces'
#song = id_information.loc[(id_information['artist']==artist)&(id_information['song']==song_query)]

The retrieved song IDs are kept in a dictionary {ID: [Top1_ID, Top2_ID, ...]}, which is saved to a pickle (.pkl) file.

Search for the top-k songs most similar to a query song, identified by artist and song title.

In [53]:
def cosine_similarities(x, Y):
    '''
    Cosine similarity between a query x and every row of Y.

    x: Query vector; the callers in this notebook pass a (1, d) array
    Y: 2-D array with one candidate vector per row

    Returns the dot products of x with the rows of Y (transposed, so a (1, d)
    query yields an (n, 1) column) divided by the product of the norms. A small
    constant is added to the denominator, so all-zero vectors score 0 instead
    of triggering a division by zero.
    '''
    query_norm = np.linalg.norm(x)
    row_norms = np.linalg.norm(Y, axis=1, keepdims=True)
    denominator = (query_norm * row_norms) + 1e-10
    return np.dot(x, Y.T).T / denominator

def _query_and_matrix(table, query_id):
    '''Split a feature table into the query's (1, d) row and the (n, d) matrix of all rows.'''
    is_query = (table['id'] == query_id).to_numpy()
    if not is_query.any():
        raise ValueError(f'song id {query_id!r} has no row in the feature table')
    # assumes every column except 'id' is numeric -- TODO confirm against the data files
    matrix = table.drop(columns=['id']).to_numpy()
    return matrix[is_query][:1], matrix


def _binary_jaccard(query_vec, matrix):
    '''Jaccard index between the non-zero pattern of query_vec[0] and that of each row of matrix.'''
    query_terms = query_vec[0] != 0
    row_terms = matrix != 0
    shared = np.logical_and(row_terms, query_terms).sum(axis=1)
    combined = np.logical_or(row_terms, query_terms).sum(axis=1)
    # Rows where neither vector has a non-zero entry score 0 (the value
    # sklearn's jaccard_score falls back to by default).
    return np.divide(shared, combined, out=np.zeros(len(matrix)), where=combined != 0)


def search(artist:str, song_query:str, k:int=10, measure='cosine'):
    '''
    Find the k songs whose lyrics tf-idf vectors are most similar to a query song.

    artist: Artist name as string (exact match against id_information['artist'])
    song_query: Song title as string (exact match against id_information['song'])
    k: Number of Top elements, defaults to 10
    measure: Similarity measure, ["cosine", "jaccard"], default = cosine

    Returns a list holding, for every retrieved id (best match first), the rows
    of id_information with that id, restricted to the columns 'artist' and 'song'.

    Raises IndexError if no row of id_information matches artist and song_query,
    and ValueError if measure is not supported or the query has no tf-idf row.

    Rankings are cached as {query_id: [id, ...]} in '<measure>_retrieved_ids.pkl'.
    '''
    cache_path = measure + '_retrieved_ids.pkl'
    # NOTE(review): pickle.load can execute arbitrary code; only open cache
    # files that this notebook wrote itself.
    if os.path.exists(cache_path):
        with open(cache_path, 'rb') as f:
            retrieved_dict = pickle.load(f)
    else:
        retrieved_dict = {}

    # first we get the id of the query song
    song = id_information.loc[(id_information['artist']==artist)&(id_information['song']==song_query)]
    if song.empty:
        raise IndexError(f'no song found for artist={artist!r}, song={song_query!r}')
    query_id = song.iloc[0]['id']

    cached_ids = retrieved_dict.get(query_id)
    if cached_ids is not None and len(cached_ids) >= k:
        top_k_ids = cached_ids[:k]
    else:
        # Either nothing is cached or the cached ranking was built with a
        # smaller k, so it cannot answer this request: (re)compute it.
        if measure not in ('cosine', 'jaccard'):
            raise ValueError(f"unsupported measure {measure!r}; expected 'cosine' or 'jaccard'")
        query_vec, matrix = _query_and_matrix(id_lyrics_tfidf, query_id)

        # next we apply our similarity measure
        if measure == 'cosine':
            scores = cosine_similarities(query_vec, matrix).ravel()
        else:
            scores = _binary_jaccard(query_vec, matrix)

        # then we sort our values and get the top k; the query is removed by
        # id because it is not guaranteed to be the best-scoring row
        ranked = pd.DataFrame({'id': id_lyrics_tfidf['id'].to_numpy(), measure: scores})
        ranked = ranked[ranked['id'] != query_id].sort_values(measure, ascending=False)
        top_k_ids = ranked['id'].head(k).tolist()
        retrieved_dict[query_id] = top_k_ids
        with open(cache_path, 'wb') as f:
            pickle.dump(retrieved_dict, f)

    return [id_information.loc[id_information['id'] == idx, ['artist', 'song']] for idx in top_k_ids]

In [57]:
# Show the ten closest songs to the configured query under cosine similarity.
print(search(artist, song_query, k=10, measure='cosine'))

[            artist           song
17548  Saint Motel  Puzzle Pieces,       artist              song
34542  Spoon  The Fitted Shirt,             artist            song
19821  Mac DeMarco  Still Together,             artist          song
39185  Tulipa Ruiz  Proporcional,             artist              song
12044  ONE OK ROCK  Stand Out Fit In,             artist     song
23166  Benny Sings  Realize,             artist                 song
43196  The Streets  Fit But You Know It,               artist      song
25008  Avril Lavigne  Together,         artist                      song
18962  Dr. Dog  Jackie Wants a Black Eye,        artist      song
67340  The xx  Together]


In [51]:
#print(search(artist, song_query, 10, 'jaccard'))

In [52]:
# Load the cosine ranking cache from disk and print its contents.
with open('cosine_retrieved_ids.pkl', mode='rb') as f:
    retrieved_dict = pickle.load(f)
print(retrieved_dict)

{'zzznMjZAKnJJXQSj': ['m3bU7wEiG8i3QgLU', 'gSDEde5fekD0MWlk', 'VxUQdLc5okNKJStp', 'YeNBs5M5SgaiJwMO', 'fKMI8NwzTaL2psTG', 'E7uZQXT21LTv1pbw', '1raObPYSGFUis50J', 'aTzS0RJbn5d8Hfsm', '1Y6EGNZVrpANBgpf', 'y2LRzlis7LRc8Ty7', 'kYfCb26Tr5KHGNMa', 'TYBQEoxbwS46h6EI', 'bRVaqSYPB0gJs2ll', 'WghryleumgzVbBIC', 'hBoMFUTDkHh92FXQ', 'bFLxKxwIYL34fqHS', 'UZSRLO2i0viYqPOr', 'uayjip3AZujLNRyE', 'wMbpkrTBWxZMmLvz', 'tGTirpqfEaQTCRkB', 'bdw1UJJiVDwzWxTA', 'yfm7RER1PWTqgjIp', '7x0vdMZzvsZnTEeh', 'KZofx143Sisd1IPV', 'I3jZIAHsf0TJZl7r', 'y90hEqKUtKBNni6m', 'BmyGIIw6qI5iIRTJ', 'NCjuLPAErqECTCHc', '0CJwNNthAtbk3RR5', 'fSOnwyynrTF9ZOEW', 'kAxHb79vWr7yndEM', 'VbZCE45WUmPmZWMp', 'LFy7e4MyG6iK6wdd', 'NsfxwqApHJKu4yMQ', 't0I99OFIbnkvwxe5', 'elTaa88FF8GSQLgd', '9R1cIVbrUnZSTeXv', 'xeuqdL3dNeZCSH46', 'f5gTrdbFYEfAGajD', 'lMAeVZ9XdEiR0WpU', 'bdBmwV9kVgbRehwX', 'WeD43cVwc9LBAotY', 'pLKeEmwpCncgcbZG', 'lTJVFhGsiLggKCEq', 'E4u7VIBtVLpg3VJf', 'naUeu78RnRNx0dWq', 'jGIHp0MVsFnKuOr1', 'iNpZmU1ZWOexDfqj', 'IjJI6Sg9MNoge3U6'

Compute the cosine or Jaccard similarity on the tf-idf dataset, addressing songs by their IDs (used to get the top 100 songs for each song, for evaluation).

In [10]:
def _query_and_matrix(table, query_id):
    '''Split a feature table into the query's (1, d) row and the (n, d) matrix of all rows.'''
    is_query = (table['id'] == query_id).to_numpy()
    if not is_query.any():
        raise ValueError(f'song id {query_id!r} has no row in the feature table')
    # assumes every column except 'id' is numeric -- TODO confirm against the data files
    matrix = table.drop(columns=['id']).to_numpy()
    return matrix[is_query][:1], matrix


def _binary_jaccard(query_vec, matrix):
    '''Jaccard index between the non-zero pattern of query_vec[0] and that of each row of matrix.'''
    query_terms = query_vec[0] != 0
    row_terms = matrix != 0
    shared = np.logical_and(row_terms, query_terms).sum(axis=1)
    combined = np.logical_or(row_terms, query_terms).sum(axis=1)
    # Rows where neither vector has a non-zero entry score 0 (the value
    # sklearn's jaccard_score falls back to by default).
    return np.divide(shared, combined, out=np.zeros(len(matrix)), where=combined != 0)


def search_with_id(song_id:str, k:int=10, measure='cosine'):
    '''
    Cache the k songs whose lyrics tf-idf vectors are most similar to song_id.

    song_id: Id of the query song as string
    k: Number of Top elements, defaults to 10
    measure: Similarity measure, ["cosine", "jaccard"], default = cosine

    Returns nothing. The ranking (best match first) is stored as
    {song_id: [id, ...]} in '<measure>_retrieved_ids.pkl'.

    Raises ValueError if measure is not supported or song_id has no tf-idf row.
    '''
    cache_path = measure + '_retrieved_ids.pkl'
    # NOTE(review): pickle.load can execute arbitrary code; only open cache
    # files that this notebook wrote itself.
    if os.path.exists(cache_path):
        with open(cache_path, 'rb') as f:
            retrieved_dict = pickle.load(f)
    else:
        retrieved_dict = {}

    cached_ids = retrieved_dict.get(song_id)
    if cached_ids is not None and len(cached_ids) >= k:
        # Already cached with at least k entries; a shorter cached list is
        # recomputed below so it cannot cap a later, larger request.
        return

    if measure not in ('cosine', 'jaccard'):
        raise ValueError(f"unsupported measure {measure!r}; expected 'cosine' or 'jaccard'")
    query_vec, matrix = _query_and_matrix(id_lyrics_tfidf, song_id)

    # next we apply our similarity measure
    if measure == 'cosine':
        scores = cosine_similarities(query_vec, matrix).ravel()
    else:
        scores = _binary_jaccard(query_vec, matrix)

    # then we sort our values and get the top k; the query is removed by id
    # because it is not guaranteed to be the best-scoring row
    ranked = pd.DataFrame({'id': id_lyrics_tfidf['id'].to_numpy(), measure: scores})
    ranked = ranked[ranked['id'] != song_id].sort_values(measure, ascending=False)
    retrieved_dict[song_id] = ranked['id'].head(k).tolist()
    with open(cache_path, 'wb') as f:
        pickle.dump(retrieved_dict, f)

Get the top 100 songs for each song ID, ranked by the average of the cosine similarities (on tf-idf, BERT and word2vec) and the Jaccard score on tf-idf (not recommended because it takes forever).

In [17]:
def cosine_similarities(x, Y):
    '''
    Score every row of Y against the query x by cosine similarity.

    x: Query vector; the callers in this notebook pass a (1, d) array
    Y: 2-D array with one candidate vector per row

    Returns dot(x, Y.T) transposed and divided by the product of the norm of x
    and the per-row norms of Y. The 1e-10 added to the denominator keeps
    all-zero vectors from causing a division by zero.
    '''
    scale = (np.linalg.norm(x) * np.linalg.norm(Y, axis=1, keepdims=True)) + 1e-10
    projections = np.dot(x, Y.T).T
    return projections / scale

def _query_and_matrix(table, query_id):
    '''Split a feature table into the query's (1, d) row and the (n, d) matrix of all rows.'''
    is_query = (table['id'] == query_id).to_numpy()
    if not is_query.any():
        raise ValueError(f'song id {query_id!r} has no row in the feature table')
    # assumes every column except 'id' is numeric -- TODO confirm against the data files
    matrix = table.drop(columns=['id']).to_numpy()
    return matrix[is_query][:1], matrix


def _binary_jaccard(query_vec, matrix):
    '''Jaccard index between the non-zero pattern of query_vec[0] and that of each row of matrix.'''
    query_terms = query_vec[0] != 0
    row_terms = matrix != 0
    shared = np.logical_and(row_terms, query_terms).sum(axis=1)
    combined = np.logical_or(row_terms, query_terms).sum(axis=1)
    # Rows where neither vector has a non-zero entry score 0 (the value
    # sklearn's jaccard_score falls back to by default).
    return np.divide(shared, combined, out=np.zeros(len(matrix)), where=combined != 0)


def search_with_id(song_id:str, k:int=10, measure='cosine'):
    '''
    Cache the k songs most similar to song_id under a combined lyrics score.

    The score of a candidate is the mean of four values: its cosine similarity
    to the query on the tf-idf, BERT and word2vec features, and the Jaccard
    index between the sets of non-zero tf-idf terms.

    song_id: Id of the query song as string
    k: Number of Top elements, defaults to 10
    measure: Only "cosine" is supported; it also prefixes the cache file name

    Returns nothing. The ranking (best match first) is stored as
    {song_id: [id, ...]} in '<measure>_avg_j_retrieved_ids.pkl'.

    Raises ValueError for any other measure, or if song_id is missing from one
    of the feature tables.
    '''
    cache_path = measure + '_avg_j_retrieved_ids.pkl'
    # NOTE(review): pickle.load can execute arbitrary code; only open cache
    # files that this notebook wrote itself.
    if os.path.exists(cache_path):
        with open(cache_path, 'rb') as f:
            retrieved_dict = pickle.load(f)
    else:
        retrieved_dict = {}

    cached_ids = retrieved_dict.get(song_id)
    if cached_ids is not None and len(cached_ids) >= k:
        # Already cached with at least k entries; a shorter cached list is
        # recomputed below so it cannot cap a later, larger request.
        return

    if measure != 'cosine':
        raise ValueError(f"unsupported measure {measure!r}; the combined score only supports 'cosine'")

    tfidf_query, tfidf_matrix = _query_and_matrix(id_lyrics_tfidf, song_id)
    bert_query, bert_matrix = _query_and_matrix(id_bert, song_id)
    word2vec_query, word2vec_matrix = _query_and_matrix(id_lyrics_word2vec, song_id)

    # next we apply our similarity measures, one column per score.
    # Scores are combined row by row, so the three tables must list the same
    # songs in the same order.
    # NOTE(review): each table is sorted by 'id' when loaded; confirm that
    # they also contain identical sets of ids.
    per_score = pd.DataFrame({
        'jaccard': _binary_jaccard(tfidf_query, tfidf_matrix),
        'cosine_word2vec': cosine_similarities(word2vec_query, word2vec_matrix).ravel(),
        'cosine_bert': cosine_similarities(bert_query, bert_matrix).ravel(),
        'cosine_tfidf': cosine_similarities(tfidf_query, tfidf_matrix).ravel(),
    })
    avg_similarity = per_score.mean(axis=1).to_numpy()

    # then we sort our values and get the top k; the query is removed by id
    # because it is not guaranteed to be the best-scoring row
    ranked = pd.DataFrame({'id': id_lyrics_tfidf['id'].to_numpy(), 'avg_similarity': avg_similarity})
    ranked = ranked[ranked['id'] != song_id].sort_values('avg_similarity', ascending=False)
    retrieved_dict[song_id] = ranked['id'].head(k).tolist()
    with open(cache_path, 'wb') as f:
        pickle.dump(retrieved_dict, f)
# Jaccard takes far too long to compute

Compute the top 100 songs for each song ID, based either on the average cosine similarity (over tf-idf, BERT and word2vec) or on the Jaccard similarity (redundant here because it is the same as above).

In [14]:
def _query_and_matrix(table, query_id):
    '''Split a feature table into the query's (1, d) row and the (n, d) matrix of all rows.'''
    is_query = (table['id'] == query_id).to_numpy()
    if not is_query.any():
        raise ValueError(f'song id {query_id!r} has no row in the feature table')
    # assumes every column except 'id' is numeric -- TODO confirm against the data files
    matrix = table.drop(columns=['id']).to_numpy()
    return matrix[is_query][:1], matrix


def _binary_jaccard(query_vec, matrix):
    '''Jaccard index between the non-zero pattern of query_vec[0] and that of each row of matrix.'''
    query_terms = query_vec[0] != 0
    row_terms = matrix != 0
    shared = np.logical_and(row_terms, query_terms).sum(axis=1)
    combined = np.logical_or(row_terms, query_terms).sum(axis=1)
    # Rows where neither vector has a non-zero entry score 0 (the value
    # sklearn's jaccard_score falls back to by default).
    return np.divide(shared, combined, out=np.zeros(len(matrix)), where=combined != 0)


def search_with_id(song_id:str, k:int=10, measure='cosine'):
    '''
    Cache the k songs most similar to song_id.

    song_id: Id of the query song as string
    k: Number of Top elements, defaults to 10
    measure: Similarity measure, ["cosine", "jaccard"], default = cosine.
        "cosine" is the mean of the cosine similarities on the tf-idf, BERT
        and word2vec features; "jaccard" is the Jaccard index between the sets
        of non-zero tf-idf terms.

    Returns nothing. The ranking (best match first) is stored as
    {song_id: [id, ...]} in '<measure>_avg_retrieved_ids.pkl'.

    Raises ValueError if measure is not supported or song_id is missing from a
    feature table that the measure needs.
    '''
    cache_path = measure + '_avg_retrieved_ids.pkl'
    # NOTE(review): pickle.load can execute arbitrary code; only open cache
    # files that this notebook wrote itself.
    if os.path.exists(cache_path):
        with open(cache_path, 'rb') as f:
            retrieved_dict = pickle.load(f)
    else:
        retrieved_dict = {}

    cached_ids = retrieved_dict.get(song_id)
    if cached_ids is not None and len(cached_ids) >= k:
        # Already cached with at least k entries; a shorter cached list is
        # recomputed below so it cannot cap a later, larger request.
        return

    # next we apply our similarity measure
    if measure == 'cosine':
        # Scores are combined row by row, so the three tables must list the
        # same songs in the same order.
        # NOTE(review): each table is sorted by 'id' when loaded; confirm
        # that they also contain identical sets of ids.
        per_feature = {}
        for name, table in (('word2vec', id_lyrics_word2vec), ('bert', id_bert), ('tfidf', id_lyrics_tfidf)):
            query_vec, matrix = _query_and_matrix(table, song_id)
            per_feature['cosine_' + name] = cosine_similarities(query_vec, matrix).ravel()
        scores = pd.DataFrame(per_feature).mean(axis=1).to_numpy()
    elif measure == 'jaccard':
        query_vec, matrix = _query_and_matrix(id_lyrics_tfidf, song_id)
        scores = _binary_jaccard(query_vec, matrix)
    else:
        raise ValueError(f"unsupported measure {measure!r}; expected 'cosine' or 'jaccard'")

    # then we sort our values and get the top k; the query is removed by id
    # because it is not guaranteed to be the best-scoring row
    ranked = pd.DataFrame({'id': id_lyrics_tfidf['id'].to_numpy(), measure: scores})
    ranked = ranked[ranked['id'] != song_id].sort_values(measure, ascending=False)
    retrieved_dict[song_id] = ranked['id'].head(k).tolist()
    with open(cache_path, 'wb') as f:
        pickle.dump(retrieved_dict, f)

# Averaging just the three cosine values already gives better results than using only the tf-idf cosine

Function that ranks songs either by the average cosine similarity over tf-idf, BERT and word2vec, or by the average inner product over tf-idf, BERT and word2vec (also Jaccard over tf-idf, but again redundant).

In [21]:
def _query_and_matrix(table, query_id):
    '''Split a feature table into the query's (1, d) row and the (n, d) matrix of all rows.'''
    is_query = (table['id'] == query_id).to_numpy()
    if not is_query.any():
        raise ValueError(f'song id {query_id!r} has no row in the feature table')
    # assumes every column except 'id' is numeric -- TODO confirm against the data files
    matrix = table.drop(columns=['id']).to_numpy()
    return matrix[is_query][:1], matrix


def _binary_jaccard(query_vec, matrix):
    '''Jaccard index between the non-zero pattern of query_vec[0] and that of each row of matrix.'''
    query_terms = query_vec[0] != 0
    row_terms = matrix != 0
    shared = np.logical_and(row_terms, query_terms).sum(axis=1)
    combined = np.logical_or(row_terms, query_terms).sum(axis=1)
    # Rows where neither vector has a non-zero entry score 0 (the value
    # sklearn's jaccard_score falls back to by default).
    return np.divide(shared, combined, out=np.zeros(len(matrix)), where=combined != 0)


def search_with_id(song_id:str, k:int=10, measure='cosine'):
    '''
    Cache the k songs most similar to song_id.

    song_id: Id of the query song as string
    k: Number of Top elements, defaults to 10
    measure: Similarity measure, ["cosine", "jaccard", "inner_product"], default = cosine.
        "cosine" and "inner_product" are the mean of the respective score on
        the tf-idf, BERT and word2vec features; "jaccard" is the Jaccard index
        between the sets of non-zero tf-idf terms.

    Returns nothing. The ranking (best match first) is stored as
    {song_id: [id, ...]} in '<measure>_avg_retrieved_ids.pkl'.

    Raises ValueError if measure is not supported or song_id is missing from a
    feature table that the measure needs.
    '''
    cache_path = measure + '_avg_retrieved_ids.pkl'
    # NOTE(review): pickle.load can execute arbitrary code; only open cache
    # files that this notebook wrote itself.
    if os.path.exists(cache_path):
        with open(cache_path, 'rb') as f:
            retrieved_dict = pickle.load(f)
    else:
        retrieved_dict = {}

    cached_ids = retrieved_dict.get(song_id)
    if cached_ids is not None and len(cached_ids) >= k:
        # Already cached with at least k entries; a shorter cached list is
        # recomputed below so it cannot cap a later, larger request.
        return

    # next we apply our similarity measure
    if measure in ('cosine', 'inner_product'):
        # Scores are combined row by row, so the three tables must list the
        # same songs in the same order.
        # NOTE(review): each table is sorted by 'id' when loaded; confirm
        # that they also contain identical sets of ids.
        per_feature = {}
        for name, table in (('word2vec', id_lyrics_word2vec), ('bert', id_bert), ('tfidf', id_lyrics_tfidf)):
            query_vec, matrix = _query_and_matrix(table, song_id)
            if measure == 'cosine':
                per_feature[name] = cosine_similarities(query_vec, matrix).ravel()
            else:
                per_feature[name] = np.dot(query_vec, matrix.T).ravel()
        scores = pd.DataFrame(per_feature).mean(axis=1).to_numpy()
    elif measure == 'jaccard':
        query_vec, matrix = _query_and_matrix(id_lyrics_tfidf, song_id)
        scores = _binary_jaccard(query_vec, matrix)
    else:
        raise ValueError(f"unsupported measure {measure!r}; expected 'cosine', 'jaccard' or 'inner_product'")

    # then we sort our values and get the top k; the query is removed by id
    # because it need not be the best-scoring row (for the inner product any
    # longer vector pointing the same way scores higher than the query itself)
    ranked = pd.DataFrame({'id': id_lyrics_tfidf['id'].to_numpy(), measure: scores})
    ranked = ranked[ranked['id'] != song_id].sort_values(measure, ascending=False)
    retrieved_dict[song_id] = ranked['id'].head(k).tolist()
    with open(cache_path, 'wb') as f:
        pickle.dump(retrieved_dict, f)

In [22]:
# Compute and cache the top-100 inner-product ranking for the first 100 rows of id_information.
for song_id in tqdm(id_information['id'].head(100)):
    search_with_id(song_id, k=100, measure='inner_product')

100%|█████████████████████████████████████████| 100/100 [00:51<00:00,  1.93it/s]


In [23]:
# Load the inner-product ranking cache from disk and print its contents.
with open('inner_product_avg_retrieved_ids.pkl', mode='rb') as f:
    retrieved_dict = pickle.load(f)
print(retrieved_dict)

{'0009fFIM1eYThaPg': ['2HMIpSuCforil1f7', 'Bp2C0upIsbgDIj6g', 'HFLuvJXc6SjcJt7d', 'AnOdZ5pDIL2LhtCb', 'aY7VhvjZJ0vLZX5F', 'qhUahI7HuwtK1PxU', 'pxgFoLb5tSjKO0Rr', 'SpXoTY1cFhXQpIST', 'vrdiSqqIArskZJan', 'LZq9SJvs3onFjszl', 'wXcMv63aWS4KEPm8', 'cUtnKV2vkqB3hhkX', 'ARAbmSl33s9qdv3W', 'a3tBLpk3yOTcyCL8', 'Y1k787b0BxYP5Whh', 'OjwXTPCan4M238cI', 'ao7dLWmpTymdh4NP', 'lbTW6YAzKbARhB59', 'nMWxakz2ha5Qtlk8', 'WJRG3U9F9ugC8EGH', 'Nv1MG1CNMFIvY5Zc', 'sP1S3wvw4rfgUoG0', 'L81uiGRAAlvx6Rad', 'PJCZM1so3O1xzQPH', '6ml9Cbwu3x4lYSuP', '4GqEFv6gojmcXsKX', 'nroPbzmOcivht0Ff', 'UUT3MnnHgEmcJVPR', 'MKXZ24rbDRjuRcPR', 'It0fYgx5KlrHgz8b', 'fKhzVeUSWITbWSox', 'i2TKUAC4U1cFtSUa', 'knhW09phfx5SaXWZ', 'NGhsmk5BSCAoQijc', 'SLqWKJ6If76WShsD', 'wsoQaKtyjn9EUO2Z', 'GAeGNMTGrp6ky2um', '2uHnZJwYQhfrrhAm', 'XAkjDQwHOgm9fcCQ', 'Af926lrdYuaRdEQe', 'sp7bCKfR7gsV2a0i', 'PhiCkfcbSDGbmIW0', 'HHB09OecPCe3AVRh', 'DdLl8jGNxGKG9Y0t', 'xfowo70kQkyYMdv4', '7xXE7YCfUASLPmHN', 'yO8XfIppngDk8jhs', 'fjHKCe0xPvO3R7VT', 'WAFIWIziIPINi0MC'