In [1]:
import numpy as np
import pandas as pd
import math
from helper.loader import load_ratings, load_movies, load_lists, load_corr
from helper.preprocesser import preprocess_ratings
import helper.network_explorer as ne
import helper.collaborative as coll
import cupy as cp
import cupyx
from tqdm import tqdm

## Preprocessing cf Jonas

In [2]:
ratings = load_ratings()
movies = load_movies()
lists = load_lists()

In [3]:
ratings_new, lists_new = preprocess_ratings(ratings, lists, min_ratings = 500)
print(f'{round((len(ratings) - len(ratings_new)) / len(ratings) * 100, 2)} % of ratings removed (out of {len(ratings)})')
print(f'{round((len(lists) - len(lists_new)) / len(lists) * 100, 2)} % lists removed (out of {len(lists)})')

47.91 % of ratings removed (out of 15459945)
58.93 % lists removed (out of 80311)


In [4]:
print("ratings new len {}, index user_id from {} to {}, movie_id from {} to {}".format(len(ratings_new),ratings_new["user_id"].min(),np.max(ratings_new["user_id"]),ratings_new["movie_id"].min(),np.max(ratings_new["movie_id"])))

ratings new len 8052995, index user_id from 2941 to 99988538, movie_id from 1 to 263709


In [5]:
print("{} unique users rate {} unique movies".format(ratings_new["user_id"].nunique(),ratings_new["movie_id"].nunique()))

6547 unique users rate 136555 unique movies


In [6]:
print("{} unique users created {} unique list".format(lists_new["user_id"].nunique(),lists_new["list_id"].nunique()))

3106 unique users created 32983 unique list


## User-item matrix 

The sparse version:

- faster
- lighter in memory
- cleaner
- no index/columns naming possibilities

The dense version:

- DataFrame so more used
- way more data...
- manipulate with care because that thing is killing kernel after kernel... :D

In [7]:
dense_user_item = coll.get_dense_user_item(ratings_new)

dense_user_item

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,263678,263682,263685,263688,263689,263692,263693,263701,263708,263709
2941,,,,,,,,,,,...,,,,,,,,,,
14464,,,,,,,,,,,...,,,,,,,,,,
15344,,,,,,,,,,,...,,,,,,,,,,
29237,,,,,,,,,,,...,,,,,,,,,,
78410,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99909118,,,,4.0,,,,,,,...,,,,,,,,,,
99927639,,,,,,,,,,,...,,,,,,,,,,
99942902,,,,,,,,,,,...,,,,,,,,,,
99950057,,,,,,,,,,4.0,...,,,,,,,,,,


In [8]:
sparse_user_item,user_id,movie_id = coll.get_sparse_user_item(ratings_new)

In [9]:
print(user_id)
print(movie_id)
sparse_user_item

[    2941    14464    15344 ... 99942902 99950057 99988538]
[     1      2      3 ... 263701 263708 263709]


<6547x136555 sparse matrix of type '<class 'numpy.float64'>'
	with 8052975 stored elements in Compressed Sparse Row format>

## Correlation

### Approach

Static (compute all the correlation once):

- long process
- memory consumption during matrix 
- not realistic (the ratings may change)
- extremely fast at inference (lookup table)

Dynamic (when asked, compute all the correlation between one of the user and the others):

- slower at inference
- must be computed each time we want a recommendation
- the user-item matrix can be modified with new ratings -> changes taken into account

### Implementation

Sparse:

- no correlation (pearsonr consider the 0 as a value of 0 instead of ignoring)

Pandas:

- corr is extremely slow 

Numpy:

- doesn't work with NaN, if replaced by 0, consider it as a value of 0 instead of ignoring it (impact the rating average and therefore the Pearson correlation coefficient)

### Decision

- Ran once the correlation with pandas to get the complete user-user correlation matrix which is saved and now available (WARNING: replace all the diagonal values of 1 with 0 to avoid taking into account itself). Extremely fast (basically a value look up)
- Implement the user-users correlation if we want to run for a specific user on a modified user-item matrix (still quite fast ~2sec for gaining the possibility to change the ratings and adapt the corr)

In [10]:
correlation_matrix = load_corr()

In [11]:
correlation_matrix

Unnamed: 0,2941,14464,15344,29237,78410,83575,94978,99812,114207,118503,...,99842199,99845748,99866338,99866452,99869481,99909118,99927639,99942902,99950057,99988538
2941,0.000000,0.180883,0.455090,0.324109,0.438966,0.142645,0.318194,0.343710,0.257014,0.306580,...,0.293498,0.113829,0.234693,0.339208,0.252182,0.327150,0.444337,0.342765,0.042680,0.400991
14464,0.180883,0.000000,0.168324,0.180878,0.034347,0.032793,0.095701,0.050917,0.103081,-0.037541,...,0.151123,0.071456,0.134331,0.184118,0.146321,0.146878,0.216887,0.290084,0.026588,0.077703
15344,0.455090,0.168324,0.000000,0.342190,0.439753,-0.038968,0.241974,0.284743,0.185410,0.270044,...,0.349389,0.215421,0.222203,0.335984,0.276928,0.148763,0.454404,0.351177,0.121245,0.388434
29237,0.324109,0.180878,0.342190,0.000000,0.310913,0.034585,0.310180,0.221524,0.139391,0.184467,...,0.305879,0.042415,0.358378,0.215187,0.177015,0.215797,0.436691,0.163031,0.148984,0.348098
78410,0.438966,0.034347,0.439753,0.310913,0.000000,0.094151,0.405524,0.261443,0.145734,0.247828,...,0.387691,0.223250,0.330389,0.309805,0.323445,0.337737,0.510442,0.349845,-0.030461,0.464165
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99909118,0.327150,0.146878,0.148763,0.215797,0.337737,0.064197,0.389983,0.362495,0.136374,0.344308,...,0.325487,0.071251,0.282040,0.285459,0.050702,0.000000,0.324746,0.265419,0.133017,0.332724
99927639,0.444337,0.216887,0.454404,0.436691,0.510442,0.256607,0.460343,0.456266,0.233540,0.044746,...,0.338277,0.344320,0.380061,0.527994,0.253761,0.324746,0.000000,0.393622,0.076607,0.534535
99942902,0.342765,0.290084,0.351177,0.163031,0.349845,0.000000,0.283869,0.301925,0.047731,-0.116750,...,0.031551,0.260236,0.199393,0.388187,0.138173,0.265419,0.393622,0.000000,0.247024,0.231132
99950057,0.042680,0.026588,0.121245,0.148984,-0.030461,-0.010737,0.206570,0.216791,0.145257,0.131435,...,0.098022,0.053073,0.088702,-0.109035,-0.099000,0.133017,0.076607,0.247024,0.000000,0.081648


In [12]:
corr_test_1 = coll.get_k_similar_users(correlation_matrix,2941,K = 200)

In [13]:
corr_test_1

Unnamed: 0,88953139,79410098,607820,76437799,50596997,52537817,14361993,79308895,14343424,37367509,...,13008832,29028584,76113303,27933874,24603322,9235437,53761570,71779215,90024115,5366442
2941,0.752096,0.662893,0.639772,0.637455,0.609387,0.597889,0.596845,0.592807,0.591678,0.589577,...,0.487219,0.486749,0.486683,0.486235,0.486059,0.485904,0.485871,0.485865,0.485756,0.485648


In [14]:
corr_test_2 = coll.get_k_dynamic_similar_users(dense_user_item,2941,K = 200)

In [15]:
corr_test_2

Unnamed: 0,88953139,79410098,607820,76437799,50596997,52537817,14361993,79308895,14343424,37367509,...,13008832,29028584,76113303,27933874,24603322,9235437,53761570,71779215,90024115,5366442
2941,0.752096,0.662893,0.639772,0.637455,0.609387,0.597889,0.596845,0.592807,0.591678,0.589577,...,0.487219,0.486749,0.486683,0.486235,0.486059,0.485904,0.485871,0.485865,0.485756,0.485648


## Popularity

Judge the popularity of user by the number of followers for its list

In [16]:
popularity = coll.get_popularity(lists_new,dense_user_item)

In [17]:
popularity

Unnamed: 0,popularity
2941,0.000000
14464,0.000000
15344,0.000000
29237,0.000000
78410,0.001207
...,...
99909118,0.013594
99927639,0.000000
99942902,0.000000
99950057,0.000000


In [18]:
k_popu = coll.get_k_popularity(popularity,corr_test_2)
k_popu

Unnamed: 0,88953139,79410098,607820,76437799,50596997,52537817,14361993,79308895,14343424,37367509,...,13008832,29028584,76113303,27933874,24603322,9235437,53761570,71779215,90024115,5366442
popularity,0.0,0.0,0.000269,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000724,0.0,0.0,0.000181,0.0


## Recommendation

In [19]:
popu_matrix = coll.get_popularity(lists_new,dense_user_item)
popu_matrix

Unnamed: 0,popularity
2941,0.000000
14464,0.000000
15344,0.000000
29237,0.000000
78410,0.001207
...,...
99909118,0.013594
99927639,0.000000
99942902,0.000000
99950057,0.000000


# test

In [22]:
test_weight = corr_test_1

test_user = 2941

test_matrix = dense_user_item

test_average = coll.compute_average_ratings(dense_user_item)

In [21]:
test_weight

Unnamed: 0,88953139,79410098,607820,76437799,50596997,52537817,14361993,79308895,14343424,37367509,...,13008832,29028584,76113303,27933874,24603322,9235437,53761570,71779215,90024115,5366442
2941,0.752096,0.662893,0.639772,0.637455,0.609387,0.597889,0.596845,0.592807,0.591678,0.589577,...,0.487219,0.486749,0.486683,0.486235,0.486059,0.485904,0.485871,0.485865,0.485756,0.485648


In [41]:
dense_user_item.loc[test_weight.columns.values]

number_of_nan = dense_user_item.loc[test_weight.columns.values].isna().sum()
print(len(number_of_nan))
possible_index = number_of_nan[number_of_nan <= 195]
possible_index

136555


Int64Index([     1,     10,     11,     12,     13,     14,     15,     21,
                25,     26,
            ...
            238671, 238988, 240428, 241025, 241524, 241947, 242224, 249720,
            254687, 255150],
           dtype='int64', length=10239)

In [42]:
prediction = []
norm = False
for movie in tqdm(possible_index.index.values):

    if np.isnan(dense_user_item.loc[test_user,movie]):

        if norm:
            prediction.append([movie,coll.predict_value_norm(test_user,movie,test_weight,dense_user_item,test_average)])
        else:
            prediction.append([movie,coll.predict_value(test_user,movie,test_weight,dense_user_item,test_average)])

100%|██████████| 10239/10239 [00:39<00:00, 259.63it/s]


In [24]:
test_weight.shape[1]

200

In [40]:
prediction = np.array(prediction)
pred = pd.DataFrame(prediction[:,1],index=prediction[:,0])
    
recommendation = pred.nlargest(10,columns=0)

recommendation.rename(columns={0:"prediction"},inplace=True)
recommendation.index = recommendation.index.astype(int)

recommendation

Unnamed: 0,prediction
23369,4.933355
441,4.85754
3027,4.855849
185418,4.843423
406,4.842156
771,4.835325
172222,4.834533
2142,4.829196
161305,4.828839
1919,4.824794


In [38]:
prediction

[[1, 4.008861010690103],
 [10, 4.28846862713263],
 [12, 4.597746892476789],
 [13, 3.9550913362270985],
 [14, 4.596961411793808],
 [15, 3.9018184878334443],
 [21, 4.470873116495435],
 [25, 3.662309123973032],
 [26, 3.6176605359955034],
 [30, 4.089000607998235],
 [44, 4.610710473116762],
 [46, 3.4089128963381836],
 [47, 3.6374676055461093],
 [50, 4.098156491795781],
 [55, 2.8078739007559963],
 [61, 3.0426363793838314],
 [69, 3.3296681203205636],
 [73, 3.840384765628484],
 [77, 3.514036116344589],
 [85, 4.3381487441530435],
 [89, 4.098657673553749],
 [91, 4.078939940555216],
 [93, 4.212805233304131],
 [94, 3.5197809483693665],
 [95, 3.2036495403750083],
 [99, 2.897245256826953],
 [103, 3.818322289304548],
 [104, 4.144998904585574],
 [105, 3.8277126716007914],
 [106, 3.5041367237581516],
 [109, 3.4384293734967883],
 [110, 3.6193221632036616],
 [111, 3.7489739309798313],
 [113, 3.933410172408555],
 [116, 4.334281750439405],
 [119, 2.801750869330594],
 [121, 3.8079953300261575],
 [122, 4.047

# end test

In [20]:
recomm_hybdrid = coll.hybdrid_RS(2941,dense_user_item,popu_matrix,neighboor_size=100,top_K=30,norm=True)
recomm_hybdrid.transpose()

136555
6004


100%|██████████| 6004/6004 [00:13<00:00, 454.36it/s]


Unnamed: 0,300,406,441,442,1385,1421,1507,1508,1919,2142,...,435,88967,228,298,39349,668,558,1505,44,150494
prediction,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,4.952519,4.95244,4.951538,4.946623,4.942616,4.932101,4.929935,4.92353,4.918668,4.915638


In [21]:
recomm_pop = coll.popularity_RS(2941,dense_user_item,popu_matrix,neighboor_size=100,top_K=30,norm=True)
recomm_pop.transpose()

136555
6004


100%|██████████| 6004/6004 [00:13<00:00, 452.83it/s]


Unnamed: 0,21,44,135,152,153,228,248,298,300,302,...,435,442,474,498,510,552,637,728,823,826
prediction,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0


In [22]:
recomm_classic = coll.classic_RS(2941,dense_user_item,neighboor_size=100,top_K=30,norm=True)
recomm_classic.transpose()

136555
6004


100%|██████████| 6004/6004 [00:13<00:00, 453.69it/s]


Unnamed: 0,300,406,441,442,1385,1507,1508,1919,2142,11990,...,228,435,298,88967,39349,668,558,1505,150494,44
prediction,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,4.950528,4.949051,4.946516,4.94607,4.941331,4.93239,4.929951,4.919641,4.915743,4.91279


(Too) many prediction are actually higher than 5 (it can happens if the average of U is higher than average of V, but V rated highly some movie in question -> around 6.1 value)

That's really annoying and didn't really found solution

So just redoing the formula without normalization

In [23]:
recomm_hybdrid_not = coll.hybdrid_RS(2941,dense_user_item,popu_matrix,neighboor_size=100,top_K=30)
recomm_hybdrid_not.transpose()

136555
6004


100%|██████████| 6004/6004 [00:12<00:00, 480.24it/s]


Unnamed: 0,1919,23369,406,20978,441,3062,442,1508,21430,11990,...,228,300,558,435,238,1421,743,2212,2091,367
prediction,5.0,5.0,4.861399,4.848184,4.845113,4.839777,4.836396,4.835697,4.807595,4.806765,...,4.727857,4.72623,4.724111,4.715798,4.711315,4.706946,4.705057,4.702941,4.696884,4.690935


In [24]:
recomm_pop_not = coll.popularity_RS(2941,dense_user_item,popu_matrix,neighboor_size=100,top_K=30)
recomm_pop_not.transpose()

136555
6004


100%|██████████| 6004/6004 [00:12<00:00, 492.66it/s]


Unnamed: 0,44,474,664,693,723,1328,1329,1919,1954,2090,...,8606,20832,21450,21477,21536,21969,22283,22470,23065,23340
prediction,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0


In [25]:
recomm_classic_not = coll.classic_RS(2941,dense_user_item,neighboor_size=100,top_K=30,norm=True)
recomm_classic_not.transpose()

136555
6004


100%|██████████| 6004/6004 [00:13<00:00, 454.50it/s]


Unnamed: 0,300,406,441,442,1385,1507,1508,1919,2142,11990,...,228,435,298,88967,39349,668,558,1505,150494,44
prediction,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,4.950528,4.949051,4.946516,4.94607,4.941331,4.93239,4.929951,4.919641,4.915743,4.91279
