In [9]:
import pandas as pd
import numpy as np
from lightfm import LightFM
from lightfm.data import Dataset

In [10]:
import numpy as np
import scipy.sparse as sparse
from lightfm import LightFM
from sklearn.model_selection import train_test_split

class SongRecommender:
    """Song recommender that wraps a LightFM model and its id-to-index mappings.

    Typical use: build a user x song matrix with ``create_interaction_matrix``,
    train with ``fit``, then query with ``recommend_songs``.

    Attributes:
        model: The underlying ``LightFM`` instance.
        epochs: Number of training epochs passed to ``model.fit``.
        user_mapping: Dict mapping each user id to its row index in the matrix.
        song_mapping: Dict mapping each song id to its column index in the matrix.
    """

    def __init__(self, learning_rate=0.05, epochs=50, num_components=30, loss='warp',
                 random_state=None):
        """Create the LightFM model.

        Args:
            learning_rate: Forwarded to ``LightFM(learning_rate=...)``.
            epochs: Number of epochs used later by ``fit``.
            num_components: Forwarded to ``LightFM(no_components=...)``.
            loss: Forwarded to ``LightFM(loss=...)``.
            random_state: Optional seed forwarded to ``LightFM(random_state=...)``.
                The default ``None`` keeps the previous unseeded behaviour.
        """
        self.model = LightFM(learning_rate=learning_rate,
                             no_components=num_components,
                             loss=loss,
                             random_state=random_state)
        self.epochs = epochs
        # Start with empty mappings so that recommend_songs() raises the
        # documented ValueError (not AttributeError) when it is called before
        # create_interaction_matrix().
        self.user_mapping = {}
        self.song_mapping = {}

    def create_interaction_matrix(self, user_ids, song_ids, play_counts):
        """Build the user x song matrix of log-scaled play counts.

        Also (re)builds ``self.user_mapping`` and ``self.song_mapping``; indices
        follow the sorted order returned by ``np.unique``.

        Args:
            user_ids: 1-D sequence of user ids, one per interaction.
            song_ids: 1-D sequence of song ids, aligned with ``user_ids``.
            play_counts: 1-D sequence of play counts aligned with the ids.
                Values must be convertible to float; missing values (NaN) are
                treated as 0 plays.

        Returns:
            A ``scipy.sparse`` CSR matrix of shape ``(n_users, n_songs)`` holding
            ``log1p(play_count)`` as float32.

        Raises:
            ValueError: If a play count cannot be converted to a float.
        """
        # return_inverse yields, for every interaction, the position of its id
        # in the sorted unique array -- exactly the index stored in the mapping.
        unique_users, user_indices = np.unique(np.asarray(user_ids), return_inverse=True)
        unique_songs, song_indices = np.unique(np.asarray(song_ids), return_inverse=True)

        self.user_mapping = {user: idx for idx, user in enumerate(unique_users)}
        self.song_mapping = {song: idx for idx, song in enumerate(unique_songs)}

        counts = np.array(play_counts).astype(np.float32)
        # A missing play count would otherwise become NaN in the matrix;
        # count it as "never played" instead.
        counts[np.isnan(counts)] = 0.0
        # log1p dampens very large play counts while keeping 0 at 0.
        normalized_counts = np.log1p(counts)

        interaction_matrix = sparse.coo_matrix(
            (normalized_counts, (user_indices, song_indices)),
            shape=(len(unique_users), len(unique_songs))
        )

        # Per the scipy docs, duplicate (user, song) entries are summed on
        # conversion to CSR.
        return interaction_matrix.tocsr()

    def fit(self, interaction_matrix, num_threads=4):
        """Train the model on ``interaction_matrix`` for ``self.epochs`` epochs.

        Args:
            interaction_matrix: Matrix produced by ``create_interaction_matrix``.
            num_threads: Number of threads forwarded to ``model.fit``.
        """
        self.model.fit(interaction_matrix,
                       epochs=self.epochs,
                       num_threads=num_threads,
                       verbose=True)

    def recommend_songs(self, user_id, n_recommendations=10):
        """Return the highest-scoring songs for a known user.

        Every song in ``self.song_mapping`` is scored, including songs the user
        has already played.

        Args:
            user_id: A user id present in ``self.user_mapping``.
            n_recommendations: Maximum number of songs to return.

        Returns:
            A tuple ``(song_ids, scores)``: a list of song ids ordered from
            highest to lowest score, and the array of matching scores.

        Raises:
            ValueError: If ``user_id`` is unknown or ``n_recommendations`` is
                negative.
        """
        if user_id not in self.user_mapping:
            raise ValueError("User ID not found in training data")
        if n_recommendations < 0:
            # A negative slice bound would silently drop items from the tail.
            raise ValueError("n_recommendations must be non-negative")

        user_idx = self.user_mapping[user_id]
        n_songs = len(self.song_mapping)

        scores = self.model.predict(user_idx, np.arange(n_songs))

        # Negate so that argsort's ascending order puts the best score first.
        top_song_indices = np.argsort(-scores)[:n_recommendations]

        reverse_mapping = {idx: song for song, idx in self.song_mapping.items()}
        recommendations = [reverse_mapping[idx] for idx in top_song_indices]

        return recommendations, scores[top_song_indices]

In [11]:
# Load the listening history. The first line of the CSV is presumably a header
# row (the previous version read it as data and then sliced it off, which forced
# every column to string dtype). header=0 together with `names` replaces the
# header labels and lets pandas infer numeric dtypes for the remaining columns.
SONG_COLUMNS = ['user_id', 'song_id', 'play_count', 'title', 'album', 'artist', 'year']
df = pd.read_csv('song_dataset.csv', header=0, names=SONG_COLUMNS,
                 dtype={'user_id': str, 'song_id': str})  # ids must stay strings

# Keep the 1-based row labels the old `df[1:]` slice produced: later cells look
# rows up by label (e.g. df['user_id'][1]).
df.index = df.index + 1


In [12]:
# Building the interactions matrix https://making.lyst.com/lightfm/docs/lightfm.data.html
# Some play_count values are missing and the column may hold strings, so coerce
# it to numeric (unparseable values become NaN) and fill the gaps with 0.
# TODO: Decide what we do with string values (for now they count as 0 plays)
play_counts = pd.to_numeric(df['play_count'], errors='coerce').fillna(0)

# NOTE: the variable keeps its original spelling because later cells reference it.
recomender = SongRecommender()

interaction_matrix = recomender.create_interaction_matrix(df['user_id'], df['song_id'], play_counts)
recomender.fit(interaction_matrix)

# Export the trained model together with the id -> index mappings, which are
# needed to translate ids when the model is reloaded.
import pickle
model = recomender.model
user_mapping = recomender.user_mapping
song_mapping = recomender.song_mapping

with open('model.pkl', 'wb') as f:
    pickle.dump({
        'model': model,
        'user_mapping': user_mapping,
        'song_mapping': song_mapping
    }, f)

Epoch: 100%|██████████| 50/50 [00:02<00:00, 22.84it/s]


FileNotFoundError: [Errno 2] No such file or directory: 'recommender.pkl'

In [None]:
# Row label (not position) of the user to generate recommendations for.
USER_INDEX = 1
target_user = df.loc[USER_INDEX, 'user_id']
song_id, score = recomender.recommend_songs(target_user)

print(f"Recommendations for user {target_user}")
for rank, (song, song_score) in enumerate(zip(song_id, score), start=1):
    # Resolve the song id to a title using the first matching row.
    title = df.loc[df['song_id'] == song, 'title'].iloc[0]
    print(f"{rank}. {title} with score {song_score}")

Recommendations for user b80344d063b5ccb3212f76538f3d9e43d87dca9e
1. Chiro with score 0.6081976294517517
2. Nothing Gives Me Pleasure with score 0.2901929020881653
3. It's The Night Time with score 0.2830503582954407
4. Learn To Fly with score 0.2483392059803009
5. La Costa Blanca (Album Version) with score 0.20953169465065002
6. Silvery Sleds (Album Version) with score 0.19437994062900543
7. His Majesty Rides (Album Version) with score 0.1334516853094101
8. Soy with score 0.0908401682972908
9. Sin límites (I) with score 0.07975905388593674
10. It Hasn't Been Long Enough (Album Version) with score 0.03585544973611832


In [None]:
#https://stackoverflow.com/questions/68857138/predict-new-user-using-lightfm

In [None]:
# Synthetic listening history for a user that is not in the training data.
new_user = "new_user"
# Fixed seed so re-running the notebook picks the same songs.
NEW_USER_SEED = 42
# Sample from distinct song ids so the same song cannot be drawn twice.
five_random_songs = (
    df['song_id']
    .drop_duplicates()
    .sample(5, random_state=NEW_USER_SEED)
    .values
)
# One play count per sampled song.
playcounts = [1,2,4,5,6]


