# Projet : Système de recommandation de musiques

# Sélection et Implémentation de 2-3 Approches

In [1]:
import pandas as pd

# Read the user-item CSV; no index column is set here, so `user_id` is kept
# as an ordinary column of the frame.
file_path = 'MillionSongSubset/user_item_weighted_matrix.csv'
user_item_data = pd.read_csv(filepath_or_buffer=file_path)

# Show the first five rows as a quick look at the layout.
user_item_data.head(n=5)


Unnamed: 0,user_id,SOAAAQN12AB01856D3,SOAAEHR12A6D4FB060,SOAANKE12A8C13CF5C,SOAAQAB12A8AE4769F,SOAAXAK12A8C13C030,SOABFNB12AB0180FAA,SOABFQI12A58A7D162,SOABLAF12AB018E1D9,SOABPYH12A8C13CC48,...,SOZZBDC12A8C146917,SOZZEXN12AB0184E54,SOZZFUI12A8C133D6D,SOZZFVR12AB0188142,SOZZPYH12AB0187578,SOZZQBH12A6D4FAFD8,SOZZUJY12A6D4F5786,SOZZVMW12AB0183B52,SOZZWBA12A8C13F403,SOZZWWW12A58A8146A
0,user_1,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,user_10,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,user_100,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,user_1000,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,user_101,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [2]:
import pandas as pd
import numpy as np

# User-item matrix indexed by `user_id`, so a user's row can be fetched by label.
user_item_matrix = pd.read_csv(
    'MillionSongSubset/user_item_weighted_matrix.csv',
    index_col='user_id',
)

# Song table; the cells below read its `song_id`, `duration` and
# `song_hotttnesss` columns.
songs_data_path = 'MillionSongSubset/songs_data.csv'
df_songs = pd.read_csv(songs_data_path)

# Convert a duration to seconds
def convert_duration_to_seconds(duration):
    """Return a song duration expressed in seconds.

    Parameters
    ----------
    duration : int, float, NumPy scalar or str
        Numeric values (NaN included) are taken to be seconds already and are
        returned unchanged. Strings are parsed as one to three colon-separated
        integer fields: ``"ss"``, ``"mm:ss"`` or ``"hh:mm:ss"``.

    Returns
    -------
    int or float
        The input itself for numeric values, otherwise the parsed number of
        seconds as an ``int``.

    Raises
    ------
    ValueError
        If a string has more than three fields or a field is not an integer.
    """
    # NumPy integer scalars are not instances of `int`, so list them explicitly.
    if isinstance(duration, (int, float, np.integer, np.floating)):
        return duration

    fields = duration.strip().split(':')
    if len(fields) > 3:
        raise ValueError(f"Unsupported duration format: {duration!r}")

    # Fold the fields in base 60: hours -> minutes -> seconds.
    total_seconds = 0
    for field in fields:
        total_seconds = total_seconds * 60 + int(field)
    return total_seconds

df_songs['duration_in_seconds'] = df_songs['duration'].apply(convert_duration_to_seconds)

# Min-max scale the duration into [0, 1].
max_duration = df_songs['duration_in_seconds'].max()
min_duration = df_songs['duration_in_seconds'].min()
duration_range = max_duration - min_duration
if duration_range == 0:
    # Constant column: avoid 0/0, which would turn every relevance score into NaN.
    df_songs['normalized_duration'] = 0.0
else:
    df_songs['normalized_duration'] = (df_songs['duration_in_seconds'] - min_duration) / duration_range

# Min-max scale the 'hotness'; missing values are replaced by 0 first.
df_songs['song_hotttnesss'] = df_songs['song_hotttnesss'].fillna(0)
max_hotness = df_songs['song_hotttnesss'].max()
min_hotness = df_songs['song_hotttnesss'].min()
hotness_range = max_hotness - min_hotness
if hotness_range == 0:
    # Same guard as for the duration.
    df_songs['normalized_hotness'] = 0.0
else:
    df_songs['normalized_hotness'] = (df_songs['song_hotttnesss'] - min_hotness) / hotness_range

# Relevance of a song = plain average of its normalised duration and normalised hotness.
df_songs['combined_relevance'] = (df_songs['normalized_duration'] + df_songs['normalized_hotness']) / 2

# song_id -> relevance lookup table.
relevance = df_songs.set_index('song_id')['combined_relevance'].to_dict()

# For every user, map each song with a strictly positive weight in the
# user-item matrix to its relevance (0 when the song is absent from `relevance`).
actual_interactions = {
    uid: {
        song_id: relevance.get(song_id, 0)
        for song_id in user_item_matrix.columns[(user_item_matrix.loc[uid] > 0)]
    }
    for uid in user_item_matrix.index
}

# Show only a short preview: printing the whole dictionary floods the notebook output.
print(f"{len(relevance)} songs; first entries: {list(relevance.items())[:5]}")


{'SOGSOUE12A58A76443': 0.054694571538637646, 'SOEYIHF12AB017B5F4': 0.021163983897341344, 'SOKSZVC12A8C142004': 0.23369440699138266, 'SOKRHNY12AB01837DB': 0.08576188998780446, 'SOBTEHX12A6D4FBF18': 0.40044001669703894, 'SOXGDVW12AB01864E7': 0.36010161531310886, 'SOVVDCO12AB0187AF7': 0.29923933706574884, 'SOGSMXL12A81C23D88': 0.31458086981095534, 'SOMBCOW12AAF3B229F': 0.30708513722304004, 'SOZQSGL12AF72A9145': 0.4535207532702665, 'SODJYEC12A8C13D757': 0.044697874789934366, 'SORWTIF12A6D4FAA41': 0.07100383666394722, 'SOGOZYQ12AB0182359': 0.13351102484684607, 'SOERJUK12AF72A49F7': 0.3308465140283218, 'SOJVFTP12A58A7A4D4': 0.3043028399692799, 'SOYMUKA12A8C13C923': 0.04520058183453054, 'SOBCCVG12A8C13804C': 0.05343780255255655, 'SOQUOUX12A8C14439E': 0.10281803619304876, 'SOHMLNS12A58A7DA68': 0.37186335977385454, 'SODRZXF12A6D4F89FE': 0.04402999143459575, 'SOYCTRU12A6701DA91': 0.4270049132221106, 'SODCJNH12AB018A3A9': 0.06492107510121646, 'SOYOOPY12A58A7C4F8': 0.048496905194378465, 'SOLWNAR12

In [3]:
def calculate_dcg(recommended_items, relevance, k):
    """Discounted cumulative gain of the first ``k`` recommended items.

    Parameters
    ----------
    recommended_items : sequence
        Item ids in ranked order (best first). May hold fewer than ``k`` items;
        only the items present contribute.
    relevance : dict
        Maps an item id to its relevance; ids not in the dict count as 0.
    k : int
        Number of leading positions to score.

    Returns
    -------
    number
        ``sum(rel_i / log2(i + 2))`` over the scored positions ``i`` (0-based),
        or ``0`` when nothing is scored.
    """
    dcg = 0
    # Slice instead of indexing over range(k) so that a list shorter than k does
    # not raise IndexError; max() keeps "k <= 0 scores nothing".
    for position, item in enumerate(recommended_items[:max(k, 0)]):
        # position + 2: positions are 0-based and log2(1) would be 0.
        dcg += relevance.get(item, 0) / np.log2(position + 2)
    return dcg

def calculate_ndcg(recommended_items, relevance, k):
    """Normalised DCG at ``k`` of a ranked list.

    The ideal ordering used for normalisation is the recommended list itself
    re-sorted by decreasing relevance (ids missing from ``relevance`` count
    as 0). Returns ``dcg / ideal_dcg``, or ``0`` when the ideal DCG is not
    strictly positive.
    """
    def relevance_of(item):
        return relevance.get(item, 0)

    achieved = calculate_dcg(recommended_items, relevance, k)

    ideal_order = sorted(recommended_items, key=relevance_of, reverse=True)
    ideal = calculate_dcg(ideal_order, relevance, k)

    if ideal > 0:
        return achieved / ideal
    return 0

# Méthode SVD

In [4]:
from scipy.sparse.linalg import svds
import numpy as np

# Build the plain array fed to the SVD: every column of the frame except `user_id`.
matrix = user_item_data.drop(columns='user_id').values

# Truncated SVD with 50 singular triplets; the 1-D array of singular values is
# turned into a diagonal matrix so it can be used in the product below.
U, sigma, Vt = svds(matrix, k=50)
sigma = np.diag(sigma)

# Multiply the three factors back together to obtain the predicted scores.
svd_all_user_predicted_ratings = U @ sigma @ Vt

# Label the predictions: rows by user id, columns by the original item columns.
svd_all_user_predicted_ratings_df = pd.DataFrame(
    svd_all_user_predicted_ratings,
    columns=user_item_data.columns[1:],
    index=user_item_data['user_id'],
)

# Preview the first rows of the reconstructed matrix.
svd_all_user_predicted_ratings_df.head()


Unnamed: 0_level_0,SOAAAQN12AB01856D3,SOAAEHR12A6D4FB060,SOAANKE12A8C13CF5C,SOAAQAB12A8AE4769F,SOAAXAK12A8C13C030,SOABFNB12AB0180FAA,SOABFQI12A58A7D162,SOABLAF12AB018E1D9,SOABPYH12A8C13CC48,SOABRXK12A8C130A36,...,SOZZBDC12A8C146917,SOZZEXN12AB0184E54,SOZZFUI12A8C133D6D,SOZZFVR12AB0188142,SOZZPYH12AB0187578,SOZZQBH12A6D4FAFD8,SOZZUJY12A6D4F5786,SOZZVMW12AB0183B52,SOZZWBA12A8C13F403,SOZZWWW12A58A8146A
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
user_1,5.911147,-8.76463,-1.162249,13.737985,2.753857,0.0,4.325621,-2.771261,0.0,10.696927,...,0.0,0.0,10.667825,6.59341,2.528983,-5.594253,3.051086,2.075779,4.318475,3.275258
user_10,-1.593721,-8.988663,11.490689,-1.807686,0.036827,0.0,-12.683615,12.126999,0.0,-6.340841,...,0.0,0.0,-1.139425,0.475075,-4.079253,2.74653,4.392301,0.573129,-1.615019,4.604651
user_100,0.256056,-15.556241,0.613185,3.637314,-2.18523,0.0,1.63446,4.153377,0.0,4.291898,...,0.0,0.0,1.868863,0.806579,-5.028407,10.102301,5.05022,0.639794,0.384725,1.42616
user_1000,2.226486,41.101733,6.9387,0.180724,6.796696,0.0,-9.91546,-1.49499,0.0,-17.2133,...,0.0,0.0,-0.127079,-0.394473,0.668354,1.904908,1.972732,8.276547,-0.517649,-0.231448
user_101,1.864549,4.83269,1.583131,1.810774,0.168281,0.0,36.591733,15.297011,0.0,2.250732,...,0.0,0.0,10.996548,8.346875,-2.639284,3.233691,1.142704,5.986251,0.606038,6.379469


In [5]:
def recommend_items(user_id, original_matrix, predictions_matrix, num_recommendations=5):
    """Return the best-predicted items a user has no recorded rating for.

    Parameters
    ----------
    user_id : label
        Row label of the user in ``original_matrix``.
    original_matrix : pandas.DataFrame
        Observed ratings, indexed by user, one column per item.
    predictions_matrix : pandas.DataFrame
        Predicted ratings. The user's row is taken by position, using the
        user's position in ``original_matrix``.
    num_recommendations : int, default 5
        Maximum number of rows returned.

    Returns
    -------
    pandas.DataFrame
        Indexed by item, with columns ``original_rating`` and
        ``predicted_rating``; only items whose original rating equals 0,
        ordered by decreasing predicted rating.
    """
    # Positional lookup of the user's predictions, highest score first.
    row_position = original_matrix.index.get_loc(user_id)
    predicted = predictions_matrix.iloc[row_position].sort_values(ascending=False)

    # Observed ratings of the same user, placed side by side with the predictions.
    observed = original_matrix.loc[user_id, :]
    ratings = pd.concat([observed, predicted], axis=1)
    ratings.columns = ['original_rating', 'predicted_rating']

    # Keep items with an original rating of 0 and rank them by predicted score.
    not_rated = ratings['original_rating'] == 0
    ranked = ratings[not_rated].sort_values('predicted_rating', ascending=False)

    return ranked.head(num_recommendations)

# Example: top recommendations for 'user_1' from the SVD predictions; the
# observed matrix is re-indexed by `user_id` so the user can be found by label.
recommendations_for_user = recommend_items(
    user_id='user_1',
    original_matrix=user_item_data.set_index('user_id'),
    predictions_matrix=svd_all_user_predicted_ratings_df,
)
recommendations_for_user['predicted_rating']


SOHSENM12AB017FFF5    56.069660
SOMPGSD12A58A7F840    52.030597
SOXVGNX12A8C13D8A3    49.579409
SOKJUZQ12AB0185E37    46.541791
SOIHJSD12A6701EB04    44.343668
Name: predicted_rating, dtype: float64

In [6]:
# NDCG@k of the SVD ranking, one score per user.
# NOTE(review): every item column is ranked (nothing is filtered out) and
# `relevance` is a single song-level dictionary shared by all users.
k = 100
ndcg_scores = []

for user_id in svd_all_user_predicted_ratings_df.index:
    # Item labels ordered by decreasing predicted score, truncated to k.
    recommended_items = (
        svd_all_user_predicted_ratings_df.loc[user_id]
        .sort_values(ascending=False)
        .index[:k]
        .tolist()
    )
    ndcg_score = calculate_ndcg(recommended_items, relevance, k)
    ndcg_scores.append(ndcg_score)

# Mean and variance of the per-user scores.
svd_average_ndcg = np.mean(ndcg_scores)
svd_variance_ndcg = np.var(ndcg_scores)
print(f"Mean : {svd_average_ndcg:.5f} - Variance : {svd_variance_ndcg:.5f}")

Mean : 0.96640 - Variance : 0.00006


# Méthode ALS

In [7]:
import implicit
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

user_item_data = pd.read_csv('MillionSongSubset/user_item_weighted_matrix.csv')  # Replace with the path to your file

model = implicit.als.AlternatingLeastSquares(factors=50, regularization=0.1, iterations=50, use_gpu=False)

# Transpose the table (minus `user_id`) so that rows are items and columns are users.
sparse_item_user = csr_matrix(user_item_data.drop('user_id', axis=1).T.values)

# Scale the interaction weights by alpha before fitting.
alpha_val = 40
data_conf = (sparse_item_user * alpha_val).astype('double')
model.fit(data_conf)

user_factors = model.user_factors
item_factors = model.item_factors

# NOTE(review): the matrix given to fit() has one row per item, and the product
# below has to be transposed before it can be labelled with user ids as rows.
# So with the installed `implicit` version `user_factors` presumably holds the
# item factors and `item_factors` the user factors. Confirm against the
# documentation of the version in use.
als_predicted_scores = np.dot(user_factors, item_factors.T)

# Same matrix as above: reuse it instead of recomputing the identical product.
predicted_scores = als_predicted_scores
print(predicted_scores.shape)

# Rows = users, columns = items (hence the transpose).
als_predicted_scores_df = pd.DataFrame(als_predicted_scores.T, index=user_item_data['user_id'], columns=user_item_data.columns[1:])

  check_blas_config()


  0%|          | 0/50 [00:00<?, ?it/s]

(5648, 1000)


In [8]:
# NDCG@k of the ALS ranking, one score per user.
# NOTE(review): every item column is ranked (nothing is filtered out) and
# `relevance` is a single song-level dictionary shared by all users.
k = 100
ndcg_scores = []

for user_id in als_predicted_scores_df.index:
    # Item labels ordered by decreasing predicted score, truncated to k.
    recommended_items = (
        als_predicted_scores_df.loc[user_id]
        .sort_values(ascending=False)
        .index[:k]
        .tolist()
    )
    ndcg_score = calculate_ndcg(recommended_items, relevance, k)
    ndcg_scores.append(ndcg_score)

# Mean and variance of the per-user scores.
als_average_ndcg = np.mean(ndcg_scores)
als_variance_ndcg = np.var(ndcg_scores)
print(f"Mean : {als_average_ndcg:.5f} - Variance : {als_variance_ndcg:.5f}")

Mean : 0.90158 - Variance : 0.00024


# Méthode EASE

In [9]:
# Load the EASE recommendations; the first CSV column becomes the row index.
# NOTE(review): presumably rows are users and values are per-item scores -
# confirm against the code that wrote this file.
df_recommendations = pd.read_csv('MillionSongSubset/filtered_recommendations.csv', index_col=0)

# NDCG@k of the EASE ranking, one score per row of the table.
k = 100
ndcg_scores = []

for user_id in df_recommendations.index:
    # Column labels ordered by decreasing value, truncated to k.
    recommended_items = (
        df_recommendations.loc[user_id]
        .sort_values(ascending=False)
        .index[:k]
        .tolist()
    )
    ndcg_score = calculate_ndcg(recommended_items, relevance, k)
    ndcg_scores.append(ndcg_score)

# Mean and variance of the per-row NDCG scores.
ease_average_ndcg = np.mean(ndcg_scores)
ease_variance_ndcg = np.var(ndcg_scores)
print(f"Mean : {ease_average_ndcg:.5f} - Variance : {ease_variance_ndcg:.5f}")

Mean : 0.95743 - Variance : 0.00011


# Comparaison et Analyse des Résultats

In [10]:
# Round each metric to 5 decimals; the variables are overwritten with the
# rounded values.
svd_average_ndcg, svd_variance_ndcg = round(svd_average_ndcg, 5), round(svd_variance_ndcg, 5)
als_average_ndcg, als_variance_ndcg = round(als_average_ndcg, 5), round(als_variance_ndcg, 5)
ease_average_ndcg, ease_variance_ndcg = round(ease_average_ndcg, 5), round(ease_variance_ndcg, 5)

# One row per method, in the order SVD, ALS, EASE.
results_df = pd.DataFrame(
    [
        ('SVD', svd_average_ndcg, svd_variance_ndcg),
        ('ALS', als_average_ndcg, als_variance_ndcg),
        ('EASE', ease_average_ndcg, ease_variance_ndcg),
    ],
    columns=['Method', 'Average NDCG', 'Variance NDCG'],
)

results_df

Unnamed: 0,Method,Average NDCG,Variance NDCG
0,SVD,0.9664,6e-05
1,ALS,0.90158,0.00024
2,EASE,0.95743,0.00011
