# Environment configuration

In [21]:
# Upgrading pip which will be used to install all libraries
!pip install -r requirements.txt
# !pip install pymongo



In [2]:
# to check if working on GPU
# !pip install DeepSaki
from tensorflow import keras
import DeepSaki
strategy, RUNTIME_ENVIRONMENT, hw_accelerator_handle = DeepSaki.utils.DetectHw()

Running on single GPU  /device:GPU:0
Number of accelerators:  1
____________________________________________________________________________________
Device List: 
[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 9027242169430391422
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 1721342363
locality {
  bus_id: 1
  links {
  }
}
incarnation: 17701183863651686981
physical_device_desc: "device: 0, name: NVIDIA GeForce RTX 3050 Ti Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6"
xla_global_id: 416903419
]


# Content-based filtering using TF-IDF score

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:

from typing import List
# from ../models/msd_song import MsdSongWithLyrics
from models.msd_song import MsdSongWithLyrics
from dao.dao_msd_songs_with_lyrics import DAOMsdSongsWithLyrics

dao_songs_with_lyrics: DAOMsdSongsWithLyrics = DAOMsdSongsWithLyrics()
songs: List[MsdSongWithLyrics] = dao_songs_with_lyrics.find_many_by_query({'lyrics': {'$ne':None}})

In [3]:
headers = songs[0].dict().keys()
songs_values = [song.dict().values() for song in songs]
songs_df = pd.DataFrame(songs_values, columns=headers)
songs_df.head()

Unnamed: 0,song_id,title,release,artist_name,year,ratings_amount,tag,features,lyrics
0,SOLJTLX12AB01890ED,El hijo del pueblo,32 Grandes Éxitos CD 2,Jorge Negrete,1997,,pop,[],Es mi orgullo haber nacido en el barrio más hu...
1,SOMPVQB12A8C1379BB,Pilots,The Loyal,Tiger Lou,2005,,pop,[],Raise the chandelier light the candels dear i ...
2,SOSDCFG12AB0184647,006,Lena 20 År,Lena Philipsson,1998,,pop,[],I had come in the name of love\nWith a mission...
3,SOKOVRQ12A8C142811,Ethos of Coercion,Descend Into Depravity,Dying Fetus,2009,,rock,[],"Castigation of the offenders, no punishment ou..."
4,SOIMMJJ12AF72AD643,Rock-N-Rule,I'm Only A Man (Bonus Track Version),Emery,2007,,rock,[],[Intro]\nThis is a waking up\nThis is your fin...


In [4]:
dao_songs: DAOMsdSongsWithLyrics = DAOMsdSongsWithLyrics()
songs_without: List[MsdSongWithLyrics] = dao_songs.find_all()

In [10]:
headers = songs[0].dict().keys()
songs_values = [song.dict().values() for song in songs_without]
songs_without_df = pd.DataFrame(songs_values, columns=headers)
songs_without_df.head()

Unnamed: 0,song_id,title,release,artist_name,year,ratings_amount,tag,features,lyrics
0,SOQMMHC12AB0180CB8,Silent Night,Monster Ballads X-Mas,Faster Pussy cat,2003,,,,
1,SOVFVAK12A8C1350D9,Tanssi vaan,Karkuteillä,Karkkiautomaatti,1995,,,,
2,SOGTUKN12AB017F4F1,No One Could Ever,Butter,Hudson Mohawke,2006,,,,
3,SOBNYVR12A8C13558C,Si Vos Querés,De Culo,Yerba Brava,2003,,,,
4,SOHSBXH12A8C13B0DF,Tangle Of Aspens,Rene Ablaze Presents Winter Sessions,Der Mystic,0,,,,


In [11]:
print(len(songs_df))
print(len(songs_without_df))
songs_without_df.dropna(subset='lyrics', inplace=True)
print(len(songs_without_df))

181334
1000000
181334


In [4]:
!pip install scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer





In [5]:
songs_df['lyrics'] = songs_df['lyrics'].str.replace(r'\n', ' ')

  songs_df['lyrics'] = songs_df['lyrics'].str.replace(r'\n', ' ')


In [6]:
songs_df['lyrics']

0         Es mi orgullo haber nacido en el barrio más hu...
1         Raise the chandelier light the candels dear i ...
2         I had come in the name of love With a mission ...
3         Castigation of the offenders, no punishment ou...
4         [Intro] This is a waking up This is your final...
                                ...                        
181329    Yesterday I went outside And all my grass had ...
181330    Big girls like telling boys that shove "fact i...
181331    [Verse 1] Think of me as your soldier The man ...
181332    Interrotte speranze, eterna fede Fiamme e stra...
181333    Where do you go with all of those scars They r...
Name: lyrics, Length: 181334, dtype: object

In [7]:
!pip install --user -U nltk
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
final_stopwords_list = stopwords.words('english') + stopwords.words('french') + stopwords.words('spanish') + stopwords.words('swedish')
tfidf = TfidfVectorizer(analyzer='word', stop_words=final_stopwords_list)





[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ewakae\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
songs_df = songs_df[:20000]
tfidf_matrix = tfidf.fit_transform(songs_df['lyrics'])


In [9]:
tfidf_matrix

<20000x106683 sparse matrix of type '<class 'numpy.float64'>'
	with 1170789 stored elements in Compressed Sparse Row format>

In [10]:
tfidf_matrix.shape

(20000, 106683)

## Content-based recommendation based on sigmoid kernel

In [11]:
from sklearn.metrics.pairwise import sigmoid_kernel

songs_sig = sigmoid_kernel(tfidf_matrix, tfidf_matrix)

In [12]:
songs_sig

array([[0.76159809, 0.76159416, 0.76159416, ..., 0.76159416, 0.76159416,
        0.76159432],
       [0.76159416, 0.76159809, 0.76159416, ..., 0.76159416, 0.76159416,
        0.76159416],
       [0.76159416, 0.76159416, 0.76159809, ..., 0.7615942 , 0.76159416,
        0.76159416],
       ...,
       [0.76159416, 0.76159416, 0.7615942 , ..., 0.76159809, 0.76159416,
        0.76159416],
       [0.76159416, 0.76159416, 0.76159416, ..., 0.76159416, 0.76159809,
        0.76159416],
       [0.76159432, 0.76159416, 0.76159416, ..., 0.76159416, 0.76159416,
        0.76159809]])

In [13]:
songs_sig[0]

array([0.76159809, 0.76159416, 0.76159416, ..., 0.76159416, 0.76159416,
       0.76159432])

In [14]:
indices = pd.Series(songs_df.index, index=songs_df['title'])

In [15]:
indices

title
El hijo del pueblo        0
Pilots                    1
006                       2
Ethos of Coercion         3
Rock-N-Rule               4
                      ...  
Tyttö metsässä        19995
Inis Mona             19996
I Can Tell            19997
The April Fools       19998
Daría                 19999
Length: 20000, dtype: int64

In [16]:
idx = indices['Before He Kissed Me']

In [17]:
idx

7

In [None]:
idx

In [24]:
idx2 = indices['Night And Day']
idx2

9

In [28]:
sig_scores = list(enumerate(songs_sig[idx]))
print(sig_scores[-1])
print(sig_scores[1])
sig_scores2 = list(enumerate(songs_sig[idx2]))
print(sig_scores2[-1])
print(sig_scores2[1])
print(type(sig_scores[1]))

(19999, 0.7615941559557649)
(1, 0.7615941798472373)
(19999, 0.7615942580695211)
(1, 0.7615941559557649)
<class 'tuple'>


In [32]:
sig_scores_df = pd.DataFrame(sig_scores, columns=['song_id', 'sig_score'])
sig_scores_df.sort_values(by='sig_score', ascending=False)

sig_scores_df2 = pd.DataFrame(sig_scores2, columns=['song_id', 'sig_score'])
sig_scores_df2.sort_values(by='sig_score', ascending=False)

Unnamed: 0,song_id,sig_score
9,9,0.761598
15480,15480,0.761596
16609,16609,0.761596
5526,5526,0.761596
8373,8373,0.761596
...,...,...
5298,5298,0.761594
12823,12823,0.761594
12816,12816,0.761594
5306,5306,0.761594


In [45]:
sig_scores_df = pd.DataFrame(sig_scores, columns=['song_id', 'sig_score'])
sig_scores_df.set_index('song_id', inplace=True)

sig_scores_df2 = pd.DataFrame(sig_scores2, columns=['song_id', 'sig_score'])
sig_scores_df2.set_index('song_id', inplace=True)
# sig_scores_df.sort_values(by='sig_score', ascending=False)

result_df = sig_scores_df.add(sig_scores_df2)
result_df = result_df.sort_values(by='sig_score', ascending=False)
result_df = result_df[1:10]

In [54]:
print(f'Recommendations for {idx}:')
for index, song_score in result_df.iterrows():
    # print(index)
    # print(song_score[0])
    # print(songs_df['title'].iloc[index])
    # print(song_score)
    print(songs_df['title'].iloc[index], songs_df['artist_name'].iloc[index])

Recommendations for 7:
Waterlogged Broken Hope
Night And Day The Maytals
The Life Alicia Keys
Skinhead Moonstomp Symarip
Babies Pulp
Will You Be OK 702
Inner Smile Texas
Philby Rory Gallagher
Tomorrow May Never Come The Spinners


In [18]:
sig_scores = list(enumerate(songs_sig[idx]))
sig_scores = sorted(sig_scores, key=lambda x: x[1], reverse=True)
sig_scores = sig_scores[1:11]
sig_scores

[(17140, 0.7615980925843139),
 (6679, 0.7615946934019091),
 (16945, 0.7615946934019091),
 (18372, 0.7615946753420102),
 (5809, 0.7615946434644573),
 (12405, 0.761594615341704),
 (12947, 0.7615945966095496),
 (19732, 0.7615945928406871),
 (12072, 0.7615945801236044),
 (4867, 0.7615945403184269)]

In [91]:
for song_score in sig_scores:
    print(songs_df['title'].iloc[song_score[0]], songs_df['artist_name'].iloc[song_score[0]])

Waterlogged Broken Hope
Gone Kissin Lunachicks
Gone Kissin Lunachicks
Flowers Grow Out of My Grave Dead Man's Bones
When Pain Comes To Surface Skinlab
Present Arrived Tom Verlaine
Peace Senser
Beautiful Mind The Verve
Envelopes Another Day Ariel Pink's Haunted Graffiti
Drifting Texas Sand Webb Pierce


## Content-based recommendation based on cosine similarity

In [55]:
from sklearn.metrics.pairwise import cosine_similarity

In [56]:
cosine_song_similarity = cosine_similarity(tfidf_matrix)

In [57]:
cosine_song_similarity

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.04216187],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.01048144, 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.01048144, ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.04216187, 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [58]:
len(cosine_song_similarity)

20000

In [59]:
indices = pd.Series(songs_df.index, index=songs_df['title'])

In [60]:
idx = indices['Before He Kissed Me']
idx

7

In [65]:
idx2 = indices['Night And Day']
idx2

9

In [66]:
cosine_song_similarity[idx2]

array([0.        , 0.        , 0.01800646, ..., 0.00988273, 0.        ,
       0.02593921])

In [77]:
(cosine_song_similarity[idx])
df = pd.DataFrame(cosine_song_similarity[idx], columns=['score'])
df2 = pd.DataFrame(cosine_song_similarity[idx2], columns=['score'])
# df = df.add(df2)
df.sort_values(by='score', ascending=False)

Unnamed: 0,score
7,1.000000
17140,1.000000
16945,0.136524
6679,0.136524
18372,0.131936
...,...
9440,0.000000
9442,0.000000
9446,0.000000
9449,0.000000


In [None]:
(cosine_song_similarity[idx])

In [64]:
type(cosine_song_similarity[idx])

numpy.ndarray

In [63]:
import numpy as np

In [75]:
results = cosine_song_similarity[idx].argsort()[:-50:-1]
results

array([    7, 17140,  6679, 16945, 18372,  5809, 12405, 12947, 19732,
       12072,  4867,  3948, 18692,   307,   794, 11696,   380, 18240,
        7338, 16291,  4990, 13953,  2519,  6977,  6467,  2883,  5644,
        1785,  1756,  4993,  5067,  9333,  4830,  7632, 13274, 18815,
        9284, 15293, 19154, 18474,  1380,  7228,  2184, 17989, 11769,
       15259, 17348, 18026,  6839], dtype=int64)

In [140]:
np.sort(cosine_song_similarity[idx])[::-1]

array([1.        , 1.        , 0.13652363, ..., 0.        , 0.        ,
       0.        ])

In [133]:
type(results)

numpy.ndarray

In [135]:
for result in results:
    print(songs_df['title'].iloc[result], songs_df['artist_name'].iloc[result])

Before He Kissed Me Lisa Brokop
Waterlogged Broken Hope
Gone Kissin Lunachicks
Gone Kissin Lunachicks
Flowers Grow Out of My Grave Dead Man's Bones
When Pain Comes To Surface Skinlab
Present Arrived Tom Verlaine
Peace Senser
Beautiful Mind The Verve
Envelopes Another Day Ariel Pink's Haunted Graffiti
Drifting Texas Sand Webb Pierce
The Abandoned Ava Inferi
Lay Some Flowers On My Grave Blind Willie McTell
Hold Fast Call To Preserve
Fish Mr. Scruff
Come Death Blood Red Throne
Home Again Beach House
Mouth Machine Gun Our Last Night
Fallen Angel Seelenkrank
Settling Down Jerry Cantrell
Gerontion A Silent Film
The Grove Chuck Ragan
Lucky Lips Gale Storm
Crushed Eighteen Visions
Upon Raging Waves Mithotyn
Deep Dark Side Cowboys
Under My Skin Paffendorf
Your Place In The World The Space Brothers
This Would Be Paradise Melissa Auf der Maur
Soft Lips Hank Thompson
Lay Your Body Down Divinyls
The Morning After Tankard
Bukkake Tsunami Cattle Decapitation
Not Now Coffin Break
Devotion Luscious Jac

## Get songs user has listened to

In [78]:
def database_data_to_dataframe(data):
    headers = data[0].dict().keys()
    data_values = [d.dict().values() for d in data]
    df = pd.DataFrame(data_values, columns=headers)
    return df

In [79]:
from models.msd_triplet import MsdTriplet
from dao.dao_msd_triplets import DAOMsdTriplets

dao_triplets: DAOMsdTriplets = DAOMsdTriplets()
triplets: List[MsdTriplet] = dao_triplets.find_all()
triplets_df = database_data_to_dataframe(triplets)

In [84]:
def get_songs_user_has_listened_to(triplets_df, user_id):
    return triplets_df.loc[triplets_df['user_id'] == user_id, 'song_id']

In [86]:
songs = get_songs_user_has_listened_to(triplets_df, 'b80344d063b5ccb3212f76538f3d9e43d87dca9e')

In [89]:
len(songs)

45

In [88]:
songs.to_list()

['SOAKIMP12A8C130995',
 'SOBBMDR12A8C13253B',
 'SOBXHDL12A81C204C0',
 'SOBYHAJ12A6701BF1D',
 'SODACBL12A8C13C273',
 'SODDNQT12A6D4F5F7E',
 'SODXRTY12AB0180F3B',
 'SOFGUAY12AB017B0A8',
 'SOFRQTD12A81C233C0',
 'SOHQWYZ12A6D4FA701',
 'SOIYTOA12A6D4F9A23',
 'SOIZAZL12A6701C53B',
 'SOJNNUA12A8AE48C7A',
 'SOJPFQG12A58A7833A',
 'SOKRIMP12A6D4F5DA3',
 'SOLLGNU12AF72A4D4F',
 'SOMGIYR12AB0187973',
 'SOMLMKI12A81C204BC',
 'SOMSQJY12A8C138539',
 'SONSAEZ12A8C138D7A',
 'SOOKGRB12A8C13CD66',
 'SOPCVQE12AC468AF36',
 'SOQIVUD12AB01821D2',
 'SOQJLDY12AAF3B456D',
 'SOQLCKR12A81C22440',
 'SORPMYJ12AF729EB90',
 'SORQHCG12A58A7EEBA',
 'SORUFVF12AB018230B',
 'SORWLTW12A670208FA',
 'SORZASF12A6D4F8CFA',
 'SOSYBEV12AB0182933',
 'SOTFATN12A6D4FA74D',
 'SOTLVCL12AB0182D22',
 'SOTRSFZ12A8C142BF6',
 'SOUKXIN12A8C133C7F',
 'SOVHRGF12A8C13852F',
 'SOVQEYZ12A8C1379D8',
 'SOVYIYI12A8C138D88',
 'SOWQLXP12AF72A08A2',
 'SOWSPUS12AC468BEE3',
 'SOXMIUS12A8C13CD59',
 'SOXRXDG12A8C131DE5',
 'SOXZQDE12A8C135833',
 'SOYHEPA12