In [6]:
!pip install -q fuzzywuzzy
!pip install scikit-surprise
!pip install python-Levenshtein



In [7]:
import numpy as np
import pandas as pd

from typing import List, Dict
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from fuzzywuzzy import fuzz

from surprise import SVD
from surprise import Dataset, Reader
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from surprise.model_selection import cross_validate

import warnings
warnings.filterwarnings('ignore')

In [8]:
songs_meta =  pd.read_csv("C:\\Users\\HP\\Desktop\\Datasets\\song_data.csv")
songs_meta.drop_duplicates(subset='song_id', inplace=True)
songs_meta.head()

Unnamed: 0,song_id,title,release,artist_name,year
0,SOQMMHC12AB0180CB8,Silent Night,Monster Ballads X-Mas,Faster Pussy cat,2003
1,SOVFVAK12A8C1350D9,Tanssi vaan,Karkuteillä,Karkkiautomaatti,1995
2,SOGTUKN12AB017F4F1,No One Could Ever,Butter,Hudson Mohawke,2006
3,SOBNYVR12A8C13558C,Si Vos Querés,De Culo,Yerba Brava,2003
4,SOHSBXH12A8C13B0DF,Tangle Of Aspens,Rene Ablaze Presents Winter Sessions,Der Mystic,0


In [9]:
interactions = pd.read_csv("C:\\Users\\HP\\Desktop\\Datasets\\triplets_file.csv")
interactions.drop_duplicates(subset=['song_id','user_id'], inplace=True, keep='last')
interactions.head()

Unnamed: 0,user_id,song_id,listen_count
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1


In [26]:



# Get how many user have listened to each song
user_counts = interactions.groupby('song_id')['user_id'].count()

# Get songs which have been listened at least 4 times
user_ten_id = user_counts[user_counts > 200].index.to_list()

# Filtered the dataset to keep only those users with more than 16 listened
df_song_reduced = interactions[
  (interactions['song_id'].isin(user_ten_id))].reset_index(drop=True)

df_song_reduced.shape

KeyError: 'Column not found: user_id'

In [17]:
bins = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 2214]

# Binning
df_song_reduced['listen_count'] = pd.cut(df_song_reduced['listen_count'], bins=bins, labels=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10])



In [18]:
# Initialize Reader class
# Our rating scale is from 1 to 10
reader = Reader(rating_scale=(1, 10))

# now we apply the binning
data = Dataset.load_from_df(df_song_reduced[['user_id', 'song_id', 'listen_count']], reader)

# We'll split into the trainset and testset
trainset, testset = train_test_split(data, test_size=.25)

In [19]:
param_grid = {'n_factors': [120, 160], 'n_epochs': [100, 110], 'lr_all': [0.001, 0.005], 'reg_all': [0.08, 0.12]}
              
grid_search_svd = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3, joblib_verbose=4, n_jobs=-2)
grid_search_svd.fit(data)
find_algo = grid_search_svd.best_estimator['rmse']
print(grid_search_svd.best_score['rmse'])
print(grid_search_svd.best_params['rmse'])

# Perform the cross validation
cross_validate(find_algo, data, measures=['RMSE'], cv=5, verbose=True)

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done  11 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-2)]: Done  48 out of  48 | elapsed: 21.5min remaining:    0.0s
[Parallel(n_jobs=-2)]: Done  48 out of  48 | elapsed: 21.5min finished


2.218153888976039
{'n_factors': 160, 'n_epochs': 110, 'lr_all': 0.005, 'reg_all': 0.12}
Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    2.1951  2.1974  2.1914  2.1928  2.1909  2.1935  0.0024  
Fit time          118.21  115.84  122.77  129.06  113.42  119.86  5.54    
Test time         3.03    5.05    3.56    3.84    3.08    3.71    0.74    


{'test_rmse': array([2.19512001, 2.1974167 , 2.19137717, 2.19280712, 2.19087216]),
 'fit_time': (118.20701956748962,
  115.83808088302612,
  122.7709584236145,
  129.064519405365,
  113.42310547828674),
 'test_time': (3.0250461101531982,
  5.053813457489014,
  3.56123685836792,
  3.8372411727905273,
  3.083331346511841)}

In [23]:

# After getting the best parameters, we fit the model again
final_algorithm = SVD(n_factors=160, n_epochs=100, lr_all=0.005, reg_all=0.1)
final_algorithm.fit(trainset)

# And we test it
test_predictions = final_algorithm.test(testset)

# Get the accuracy
print(f"The RMSE is {accuracy.rmse(test_predictions, verbose=True)}")

RMSE: 2.2036
The RMSE is 2.2035633800617096


In [24]:
df_song_reduced.head()

Unnamed: 0,user_id,song_id,listen_count
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODXRTY12AB0180F3B,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOFRQTD12A81C233C0,1


In [39]:

# pick a song
song = df_song_reduced['song_id'].iloc[220]
# pick an user
user = df_song_reduced['user_id'].iloc[1]
name = songs_meta[songs_meta['song_id'] =={song}]# get the prediction
prediction = final_algorithm.predict(1, 220)
name_song = name['title']
# print prediction
print(f"The estimate rating that the user {user} will give to the song {name_song} is {round(prediction.est, 2)}")

The estimate rating that the user b80344d063b5ccb3212f76538f3d9e43d87dca9e will give to the song Series([], Name: title, dtype: object) is 2.58
