In [14]:
import pandas as pd
from scipy.sparse.linalg import svds
import numpy as np

ratings = pd.read_csv('data/ratings.csv')
ratings.head()

Unnamed: 0,userId,activityId,rate
0,72,31,0
1,146,40,1
2,131,68,4
3,86,17,4
4,286,59,4


In [5]:
user_item = ratings.groupby(['userId', 'activityId'])['rate'].first().unstack(fill_value=0.0)
user_item.shape

(300, 100)

In [6]:
user_item.shape

(300, 100)

In [7]:
user_item.describe

<bound method NDFrame.describe of activityId  1    2    3    4    5    6    7    8    9    10   ...  91   92   \
userId                                                        ...             
1           2.0  2.0  4.0  3.0  4.0  3.0  0.0  2.0  5.0  4.0  ...  5.0  4.0   
2           1.0  4.0  5.0  2.0  3.0  2.0  4.0  4.0  0.0  5.0  ...  1.0  4.0   
3           0.0  2.0  1.0  0.0  3.0  4.0  2.0  1.0  2.0  0.0  ...  5.0  2.0   
4           5.0  1.0  2.0  2.0  4.0  5.0  4.0  4.0  4.0  0.0  ...  2.0  3.0   
5           5.0  5.0  0.0  2.0  1.0  3.0  0.0  1.0  0.0  4.0  ...  2.0  1.0   
...         ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
296         3.0  1.0  4.0  5.0  4.0  0.0  0.0  2.0  3.0  2.0  ...  5.0  5.0   
297         2.0  5.0  1.0  0.0  2.0  1.0  0.0  4.0  4.0  0.0  ...  2.0  0.0   
298         0.0  1.0  3.0  2.0  5.0  4.0  3.0  2.0  4.0  2.0  ...  4.0  2.0   
299         3.0  1.0  4.0  1.0  5.0  3.0  0.0  1.0  1.0  4.0  ...  2.0  0.0   
300         1.0  4

In [8]:
user_item.loc[42].sort_values(ascending=False).head()
# Trie des valeur pour l'user_item 42

activityId
29    5.0
85    5.0
36    5.0
79    5.0
91    5.0
Name: 42, dtype: float64

In [12]:
U, sigma, Vt = svds(user_item.to_numpy(), k=50)
#U, sigma et VT sont des matrices
#svds décompose partiellement en valeurs singulières d'une matrice creuse.
U.shape

(300, 50)

In [13]:
Vt.shape

(50, 100)

In [15]:
sigma_diag_matrix=np.diag(sigma)
# La méthode diag sert a créer / extraire une diagonale

In [16]:
all_user_predicted_ratings = np.dot(np.dot(U, sigma_diag_matrix), Vt)
# Ici la variable all_user est un produit scalaire des matrices / diagonale, la methode dot est la pour
preds_df = pd.DataFrame(all_user_predicted_ratings, columns = user_item.columns, index=user_item.index)
# On crée un Dataframe

In [17]:
preds_df.shape

(300, 100)

In [35]:
preds_df.head()

activityId,1,2,3,4,5,6,7,8,9,10,...,91,92,93,94,95,96,97,98,99,100
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.755334,1.479483,3.276907,3.114003,4.222705,2.901777,1.350925,2.882597,4.289786,4.598076,...,2.635799,3.312325,3.228195,2.582283,0.448442,3.438094,3.436295,3.774701,3.363541,3.164079
2,0.168665,1.980643,3.838244,1.772846,3.247659,1.910559,4.331522,3.068027,0.0281,3.090986,...,0.572777,4.128643,4.930914,3.997414,2.236307,2.385088,0.500017,2.481687,4.020347,1.623153
3,0.039696,1.203199,0.79363,-1.184727,3.449504,2.994979,1.949149,1.897347,2.565578,1.087569,...,4.406843,0.594777,4.733335,1.928929,4.638685,0.098549,2.548471,0.162239,4.139914,2.02929
4,4.572355,1.246771,2.680501,2.889297,3.490588,4.754823,3.300002,3.956127,4.588822,0.279282,...,0.337683,3.162584,5.567612,5.443045,5.266292,1.480316,3.274899,0.868203,-0.529539,4.469491
5,4.451593,3.837126,0.414484,3.393053,0.95387,2.224678,-0.057911,2.005477,0.354622,3.430439,...,2.135538,0.636388,1.826441,2.42536,3.868088,4.851948,3.859404,2.476079,2.631557,3.680789


In [37]:
user_item.loc[42].sort_values(ascending=False).head(10)

activityId
29    5.0
85    5.0
36    5.0
79    5.0
91    5.0
27    5.0
95    5.0
61    5.0
89    5.0
17    5.0
Name: 42, dtype: float64

In [39]:
activities_user_42 = user_item.loc[42]

In [40]:
high_rated_activities_42 = activities_user_42[activities_user_42 > 3].index

In [41]:
high_rated_activities_42

Int64Index([ 2,  4,  5,  7, 14, 17, 18, 19, 25, 27, 28, 29, 30, 31, 33, 36, 39,
            46, 53, 54, 55, 61, 62, 64, 79, 82, 85, 88, 89, 91, 93, 95, 97,
            99],
           dtype='int64', name='activityId')

In [42]:
activities_recommended_for_42 = preds_df.loc[42]

In [44]:
activities_high_recommend_for_42 = activities_recommended_for_42[activities_recommended_for_42 > 3].index

In [45]:
activities_high_recommend_for_42

Int64Index([ 2,  4,  5,  7, 11, 14, 17, 18, 19, 25, 27, 28, 29, 30, 31, 32, 33,
            36, 39, 46, 50, 53, 54, 55, 56, 57, 61, 62, 64, 69, 73, 76, 77, 79,
            81, 82, 85, 88, 89, 91, 93, 95, 96, 97, 99],
           dtype='int64', name='activityId')

In [46]:
set(activities_high_recommend_for_42) - set(high_rated_activities_42)

{11, 32, 50, 56, 57, 69, 73, 76, 77, 81, 96}

In [66]:
def get_high_recommended_activities(userId):
    activities_rated_by_user = user_item.loc[userId]
    activities_high_rated_by_user =  activities_rated_by_user[activities_rated_by_user > 3].index
    activities_recommended_for_user = preds_df.loc[userId]
    activities_high_recommend_for_user = activities_recommended_for_user[activities_recommended_for_user > 3].index
    res = dict()
    res["activityId"] = set(activities_high_recommend_for_user) - set(activities_high_rated_by_user)
    res["rate"] = preds_df.loc[userId, set(activities_high_recommend_for_user) - set(activities_high_rated_by_user)]
    print(res)
    return res