In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics import jaccard_score
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

In [4]:
id_information = pd.read_csv('id_information_mmsr.tsv', sep='\t')
id_genres = pd.read_csv('id_genres_mmsr.tsv', sep='\t')
id_bert = pd.read_csv('id_bert_mmsr.tsv', sep='\t')
id_lyrics_tfidf = pd.read_csv('id_lyrics_tf-idf_mmsr.tsv', sep='\t')
id_lyrics_word2vec = pd.read_csv('id_lyrics_word2vec_mmsr.tsv', sep='\t')

In [5]:
artist = 'Dua Lipa'
song_query = 'New Rules (Initial Talk Remix)'
#song = id_information.loc[(id_information['artist']==artist)&(id_information['song']==song_query)]

In [6]:
def cosine_similarities(x, Y):
    nx = np.linalg.norm(x)
    nY = np.linalg.norm(Y, axis=1, keepdims=True)
    DotP = np.dot(x, Y.T).T
    
    return DotP/((nx*nY)+1e-10)

def search(artist:str, song_query:str, k:int=10, measure='cosine'):
    '''
    artist: Artist name as string
    song_query: Song title as string
    k: Number of Top elements, defaults to 10
    measure: Similarity measure, ["cosine", "jaccard"], default = cosine
    '''
    
    # first we get the id of the query song
    song = id_information.loc[(id_information['artist']==artist)&(id_information['song']==song_query)]
    query_id = song.iloc[0]['id']
    tfidf_query = id_lyrics_tfidf.loc[id_lyrics_tfidf['id']==song.iloc[0]['id']]
    tfidf_query = np.array(tfidf_query.drop(columns=['id']))
    tfidf_subset = id_lyrics_tfidf.copy()
    subset_ids = tfidf_subset[['id']].copy()
    tfidf_no_id = np.array(tfidf_subset.drop(columns=['id']))
    
    # next we apply our similarity measure
    if measure == 'cosine':
        cosine = cosine_similarities(tfidf_query, tfidf_no_id)
        subset_ids.insert(1,'cosine', cosine)
    elif measure == 'jaccard':
        tfidf_query[tfidf_query != 0.0]=1
        tfidf_no_id[tfidf_no_id != 0.0]=1
        scores = []
        for row in tqdm(tfidf_no_id):
            jaccard = jaccard_score(tfidf_query[0], row)
            scores.append(jaccard)
        subset_ids.insert(1,'jaccard', scores)

    # then we sort our values and get the top k 
    subset_ids= subset_ids.sort_values(measure, ascending=False)
    sorted_cosine = subset_ids.iloc[0: , :]
    top_10 = sorted_cosine.head(10)
    top_10_ids = np.array(top_10['id'])
    retr = []
    for idx in top_10_ids:
        retr.append(id_information.loc[(id_information['id'] == idx)][['artist', 'song']])
    return retr

In [7]:
print(search(artist, song_query, 10, 'cosine'))

[         artist                            song
76113  Dua Lipa  New Rules (Initial Talk Remix),          artist                            song
58942  Dua Lipa  New Rules - Initial Talk Remix,          artist       song
52034  Dua Lipa  New Rules,               artist                           song
39323  Lenny Kravitz  Let Love Rule (Justice Remix),         artist song
42582  J. Cole  ATM,                    artist         song
50654  Jefferson Starship  Count on Me,                 artist         song
17479  Whitney Houston  Count On Me,           artist         song
2424  Bruno Mars  Count on Me,         artist      song
44737  Ramones  Commando,      artist    song
2019   Kllo  Virtue]


In [8]:
print(search(artist, song_query, 10, 'jaccard'))

100%|███████████████████████████████████| 76115/76115 [01:08<00:00, 1111.53it/s]


[         artist                            song
58942  Dua Lipa  New Rules - Initial Talk Remix,          artist                            song
76113  Dua Lipa  New Rules (Initial Talk Remix),          artist       song
52034  Dua Lipa  New Rules,               artist                song
21158  The Veronicas  Someone Wake Me Up,         artist                      song
24861  DeBarge  Who's Holding Donna Now?,             artist                       song
2630  Cathy Dennis  Touch Me (All Night Long),                 artist       song
23627  Louis Tomlinson  Two of Us,             artist                                     song
56770  Carly Simon  His Friends Are More Than Fond Of Robin,                artist             song
55945  Jennifer Lopez  Never Satisfied,            artist      song
22235  Delegation  Oh Honey]


In [9]:
cosine_scores = pd.DataFrame(id_information['id'].copy())

In [11]:
idx = 1       
tfidf_subset = id_lyrics_tfidf.copy() 
subset_ids = tfidf_subset[['id']].copy()    
tfidf_no_id = np.array(tfidf_subset.drop(columns=['id']))
for query_id in tqdm(cosine_scores['id']):
    tfidf_query = id_lyrics_tfidf.loc[id_lyrics_tfidf['id']==query_id]
    tfidf_query = np.array(tfidf_query.drop(columns=['id']))
    cosine = cosine_similarities(tfidf_query, tfidf_no_id)
    subset_ids = subset_ids.copy()
    subset_ids.insert(idx, str(query_id), cosine)
    idx += 1
    
# versuch war dass für jede id de cosine similarity mit jeder id berechnet wird und in einer riesigen datei
# gespeichert wird. Dauert halt leider ewig.
# Mögliche lösung wäre die similarity von 2 songs gleich für beide songs zu speichern nur weiß ich grade nicht
# wie ich das effizient in einem pandas dataframe mache, (maybe als numpy array erstellen und dann unformen aber 
# dabei müss ma halt auf die indices achten) weil wir brauchen dann eben ein dataframe wo die columns auch die id namen haben
# ich hoffe das macht sinn... 

  0%|                                      | 64/76115 [00:11<3:42:11,  5.70it/s]


KeyboardInterrupt: 

In [2]:
def csm(A,B):
    num=np.dot(A,B.T)
    p1=np.sqrt(np.sum(A**2,axis=1))[:,np.newaxis]
    p2=np.sqrt(np.sum(B**2,axis=1))[np.newaxis,:]
    return num/(p1*p2)

# Funktion zum berechnen der Cosine similarity matrix von 2 matrixes, leider ist tfidf_no_id zu groß,
# vielleicht mit bert oder so, ist glaub ich kleiner, aber frage ist ob da cosine similarity funktioniert
# kenn mich mit bert embeddings leider zu wenig aus (mit word2vec funktioniert cosine auf jeden fall)