**Music Recommendation System**

In [2]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

In [3]:
!kaggle competitions download -c kkbox-music-recommendation-challenge

Downloading kkbox-music-recommendation-challenge.zip to /content
 95% 329M/345M [00:02<00:00, 153MB/s]
100% 345M/345M [00:02<00:00, 149MB/s]


**Extracting the Main Folder**

In [4]:
import zipfile
zip=zipfile.ZipFile("/content/kkbox-music-recommendation-challenge.zip","r")
zip.extractall('/content')
zip.close()

**Extracting the 7z zip files**

In [5]:
!7z e "/content/members.csv.7z"
!7z e "/content/songs.csv.7z"
!7z e "/content/test.csv.7z"
!7z e "/content/train.csv.7z"
!7z e "/content/song_extra_info.csv.7z"


7-Zip [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,2 CPUs Intel(R) Xeon(R) CPU @ 2.30GHz (306F0),ASM,AES-NI)

Scanning the drive for archives:
  0M Scan /content/                   1 file, 1349856 bytes (1319 KiB)

Extracting archive: /content/members.csv.7z
--
Path = /content/members.csv.7z
Type = 7z
Physical Size = 1349856
Headers Size = 130
Method = LZMA2:3m
Solid = -
Blocks = 1

  0%    Everything is Ok

Size:       2503827
Compressed: 1349856

7-Zip [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,2 CPUs Intel(R) Xeon(R) CPU @ 2.30GHz (306F0),ASM,AES-NI)

Scanning the drive for archives:
  0M Scan /content/                   1 file, 105809525 bytes (101 MiB)

Extracting archive: /content/songs.csv.7z
--
Path = /content/songs.csv.7

**Reading the datasets**

In [106]:
import pandas as pd

songs_df = pd.read_csv("/content/songs.csv")
songs_extra_df = pd.read_csv("/content/song_extra_info.csv")
members_df = pd.read_csv("/content/members.csv")
train_df = pd.read_csv("/content/train.csv", nrows = 50000) #Reducing the dataset to 50000 rows
t_s = pd.merge(train_df, songs_df, on='song_id', how='left')
t_s_se = pd.merge(t_s, songs_extra_df, on='song_id', how='left')
songs = pd.merge(t_s_se, members_df, on='msno', how='left')
del songs_df, songs_extra_df, members_df, train_df, t_s, t_s_se #To avoid memory overflow

songs.head()

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,target,song_length,genre_ids,artist_name,composer,lyricist,language,name,isrc,city,bd,gender,registered_via,registration_init_time,expiration_date
0,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,explore,Explore,online-playlist,1,206471.0,359,Bastille,Dan Smith| Mark Crew,,52.0,Good Grief,GBUM71602854,1,0,,7,20120102,20171005
1,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=,my library,Local playlist more,local-playlist,1,284584.0,1259,Various Artists,,,52.0,Lords of Cardboard,US3C69910183,13,24,female,9,20110525,20170911
2,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=,my library,Local playlist more,local-playlist,1,225396.0,1259,Nas,N. Jones、W. Adams、J. Lordan、D. Ingle,,52.0,Hip Hop Is Dead(Album Version (Edited)),USUM70618761,13,24,female,9,20110525,20170911
3,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=,my library,Local playlist more,local-playlist,1,255512.0,1019,Soundway,Kwadwo Donkoh,,-1.0,Disco Africa,GBUQH1000063,13,24,female,9,20110525,20170911
4,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=,explore,Explore,online-playlist,1,187802.0,1011,Brett Young,Brett Young| Kelly Archer| Justin Ebach,,52.0,Sleep Without You,QM3E21606003,1,0,,7,20120102,20171005


In [156]:
#Checking any random user_id (Here in this dataset msno is user_id)
songs.iloc[9999]['msno']

'wx1pcB+XEaZxSohS/x79dYBdT79F4YDNFpHyuGGObNM='

In [110]:
#Installing Surprise Module
!pip install surprise



In [111]:
from surprise import Dataset, Reader,SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

In [112]:
# Converting 'target' column to binary implicit feedback (1 for positive, 0 for missing)
songs['target'] = songs['target'].apply(lambda x: 1 if x == 1 else 0)

In [113]:
# Creating a constant rating column (e.g., 0.5) since Surprise requires a rating column
songs['rating'] = 0.5

In [114]:
# Defining a reader object for explicit ratings (0.5 is the constant rating value)
reader = Reader(rating_scale=(0, 1))

In [115]:
# Loading the data into a Dataset object
data = Dataset.load_from_df(songs[['msno', 'song_id', 'rating']], reader)

In [116]:
# Splitting the data into training and testing sets
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

In [117]:
# Training the recommender model
model = SVD()
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x79a6a64b0b20>

In [157]:
# Making recommendations for a specific user
user_id = 'wx1pcB+XEaZxSohS/x79dYBdT79F4YDNFpHyuGGObNM='
n_rec = 10 # Number of recommendations to generate
recommendations = []

In [158]:
# Finding items that the user has not interacted with (implicit feedback: target=0)
user_items = set(songs[songs['msno'] == user_id]['song_id'])
all_items = set(songs['song_id'])
unrated_items = all_items - user_items

In [159]:
for item_id in unrated_items:
    predicted_score = model.predict(user_id, item_id).est
    song_name = songs[songs['song_id'] == item_id]['name'].iloc[0]  # Get the song name from the original 'songs' DataFrame
    recommendations.append((song_name, predicted_score))


In [160]:
# Sorting recommendations by predicted score
recommendations.sort(key=lambda x: x[1], reverse=True)
top_n_recommendations = recommendations[:n_rec]

In [161]:
# Print the top recommendations
print("********************** Top 10 Recommended Songs For the User *******************************")
for song_name, predicted_score in top_n_recommendations:
    print(f"Song Name: {song_name}, Predicted Score: {predicted_score}")

********************** Top 10 Recommended Songs For the User *******************************
Song Name: 熱血燃燒 (Burning Heart) [Live], Predicted Score: 0.9277629844158933
Song Name: Healing Love, Predicted Score: 0.9188486164657578
Song Name: Break Up In A Small Town, Predicted Score: 0.904699841636651
Song Name: 有人在嗎, Predicted Score: 0.8975807042116792
Song Name: 不再, Predicted Score: 0.896735146373207
Song Name: HOOT, Predicted Score: 0.8769117319362256
Song Name: 花的冥想(黃露儀), Predicted Score: 0.8729344976588198
Song Name: 已離不開, Predicted Score: 0.8665560787012924
Song Name: 你聖名 (At Your Name), Predicted Score: 0.8572245966785574
Song Name: 有事嗎 (Are You OK?), Predicted Score: 0.8557113079951044
