# Ad hoc experiments with the MovieLens datasets

The MovieLens dataset corresponds to a Matrix Completion problem. We have about 6,000 MovieLens users and 4,000 movies. Each user and each movie comes with relevant (categorical) information. The goal is to infer users' ratings (an integer, from 1 star to 5 stars). There are approximately $6,000 \times 4,000 = 24,000,000$ possible ratings, and only $1,000,000$ of them are observed (provided).  

NB: the $k$NN$\times$KDE is not designed (yet...) to handle categorical data.  
NB2: the missing rate for this problem is around 96% (which is huge!)... We do not provide any solid guarantee on these results.

- - - - -

Method:  
We will only look for neighbours within users. Matching movies instead of users can be another option, not tested here. To match users, we use only two features, namely gender and age, which we treat as numerical features even though they are provided as categorical features.

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from utils import normalization
from knnxkde import KNNxKDE

In [10]:
# Datasets can be downloaded here: https://grouplens.org/datasets/movielens/1m/
#
# The three files below are read with the two-character separator '::' and no
# header row, so column names are supplied explicitly. pandas' C parser only
# supports single-character separators; engine='python' is requested
# explicitly so that pandas does not have to fall back to it with a
# ParserWarning on every call.
filename_movies = 'data/ml-1m/movies.dat'
names_movies = ['movie_id', 'name', 'genres']
data_movies = pd.read_csv(
    filename_movies, sep='::', header=None, names=names_movies,
    encoding='latin-1', engine='python',
)

filename_users = 'data/ml-1m/users.dat'
names_users = ['user_id', 'gender', 'age', 'job', 'zipcode']
data_users = pd.read_csv(
    filename_users, sep='::', header=None, names=names_users,
    encoding='latin-1', engine='python',
)

filename_ratings = 'data/ml-1m/ratings.dat'
names_ratings = ['user_id', 'movie_id', 'rating', 'time']
data_ratings = pd.read_csv(
    filename_ratings, sep='::', header=None, names=names_ratings,
    encoding='latin-1', engine='python',
)

In [11]:
# Leave-one-out evaluation: for each of the first `nb_repeat` observed ratings,
# hide that rating, impute it from the other users who rated the same movie
# (described by gender, age and their own rating), and record both the true
# and the imputed value.
nb_repeat = 100000
true_ratings = np.zeros(nb_repeat)
imputed_ratings = np.zeros(nb_repeat)

# Hoisted out of the loop: plain arrays make the per-iteration row access and
# the movie mask cheap, and indexing the users by 'user_id' makes the user
# lookup label-based instead of relying on user_id == row position + 1.
movie_ids = data_ratings['movie_id'].to_numpy()
user_ids = data_ratings['user_id'].to_numpy()
ratings = data_ratings['rating'].to_numpy()
users_by_id = data_users.set_index('user_id')

for n in range(nb_repeat):
    if (n+1)%100==0:
        print(f'{n+1}/{nb_repeat}', end='\r', flush=True)

    cur_movie_id = movie_ids[n]
    cur_user_id = user_ids[n]
    true_ratings[n] = ratings[n]

    # All observed ratings of the current movie (including the one under test).
    cur_mask = (movie_ids == cur_movie_id)
    if cur_mask.sum() == 1:  # sometimes, just one rating per movie --> skip
        imputed_ratings[n] = np.nan  # NaN marks this sample as not imputed
        continue

    all_users_id = user_ids[cur_mask]
    # Label-based lookup: raises KeyError for an unknown user_id rather than
    # silently returning the wrong row.
    cur_data_users = users_by_id.loc[all_users_id]

    # One row per user who rated this movie: [gender, age, rating].
    my_data = np.zeros((cur_data_users.shape[0], 3))
    my_data[:, 0] = np.array(cur_data_users['gender'] == 'M', dtype='float32')  # Gender as a numerical feature
    my_data[:, 1] = np.array(cur_data_users['age'], dtype='float32')  # Age as a numerical feature
    my_data[:, 2] = ratings[cur_mask]  # Rating

    # Hide the rating of the current user so that it has to be imputed.
    idx_to_hide = np.flatnonzero(all_users_id == cur_user_id)[0]
    my_data[idx_to_hide, 2] = np.nan
    norm_miss_data, norm_params = normalization(my_data)

    knnxkde = KNNxKDE(h=0.03, tau=1.0/10.0, metric='nan_std_eucl')
    norm_imputed_data = knnxkde.impute_mean(norm_miss_data)

    # Map the imputed rating back to the original scale.
    # NOTE(review): assumes normalization() computes
    # (x - min_val) / (max_val + 1e-6) per column -- confirm against utils.
    m1 = norm_params['min_val'][2]
    m2 = norm_params['max_val'][2]
    imputed_ratings[n] = norm_imputed_data[idx_to_hide, 2] * (m2 + 1e-6) + m1

100000/100000

In [13]:
# Root-mean-square error between true and imputed ratings; np.nanmean leaves
# out the entries whose squared error is NaN.
squared_errors = np.square(true_ratings - imputed_ratings)
rmse = np.sqrt(np.nanmean(squared_errors))
print(f'RMSE = {rmse:0.4f}', end='')

RMSE = 0.9746