# Notebook du projet : Apprentissage et factorisation matricielle
--------------
* ISSELNANE Hacene
* HADDAD Ayale 


-------
### Ce notebook contient les trois parties : 
 * Fichier python décrivant la compilation et le formattage des données d'entraînement et de test
 * Les fichiers des paramètres associés aux deux modèles entraînés;
 * Fichier python permettant de charger et d'appliquer les 2 modèles sur les données de test. 

## Import des librairies et fichiers d'implémentation

In [1]:
import numpy as np
import pandas as pd
from tqdm._tqdm_notebook import tqdm_notebook as tqdm
from dataset import Dataset
from dataset import Reader

## Chargement des données du projet

In [13]:
bookmarks = pd.read_csv('../Dataset/bookmarks.csv')

In [15]:
asset_genre = pd.read_csv('../Dataset/asset_genres.csv',encoding="utf8")
assets = pd.read_csv('../Dataset/assets.csv',encoding="utf8")
favorites = pd.read_csv('../Dataset/favorites.csv',encoding="utf8")
genres = pd.read_csv('../Dataset/genres.csv',encoding="utf8")
ratings = pd.read_csv('../Dataset/ratings.csv',encoding="utf8")

## Création d'un dataframe regroupant les données utiles au projet

In [16]:
bookmarks_merged= bookmarks.merge(right=ratings,on=['id_profile','id_asset'],how='left')

In [17]:
bookmarks_merged= bookmarks_merged.merge(right=favorites,on=['id_profile','id_asset'],how='left')

In [18]:
bookmarks_merged = bookmarks_merged.drop(columns=['time_x','time_y'])

In [19]:
bookmarks_merged= bookmarks_merged.rename(columns ={'added_date':'favorite_date'})

### Calcul des ${R_{ui}}$ pour chaque entrée  

In [27]:
def intereset_generator_optimize(dataset):
    dim = dataset.shape[0]
    dataset_numpy = np.nan_to_num(dataset.to_numpy())
    w = np.array([1] * dim)
    n = np.array([dataset_numpy[x,2] for x in tqdm(range(0,dim))])
    f = np.array([5 if (dataset_numpy[x,3] != 0) else 0 for x in tqdm(range(0,dim))])
    return w + n + f

In [28]:
bookmarks_merged['rui'] = intereset_generator_optimize(bookmarks_merged)

HBox(children=(IntProgress(value=0, max=73380629), HTML(value='')))




HBox(children=(IntProgress(value=0, max=73380629), HTML(value='')))




## Split des données en Train & Test

In [30]:
bookmarks_idx_train = np.load('../Dataset/bookmarks_idx_train.npy')
bookmarks_idx_test = np.load('../Dataset/bookmarks_idx_test.npy')

In [32]:
trainset = bookmarks_merged.iloc[bookmarks_idx_train]
testset = bookmarks_merged.iloc[bookmarks_idx_test]

## Création des datasets d'entrainement et de test

In [34]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(trainset[['id_profile', 'id_asset', 'rui']], reader)
testdata = Dataset.load_from_df(testset[['id_profile', 'id_asset', 'rui']], reader)
trainset = data.build_full_trainset()
test = testdata.build_full_trainset().build_testset()

## Aproche Baseline

In [37]:
from baseline import Baseline

baseline_approche = Baseline(learning_rate= .00005, reg=.002, n_epochs=20)

In [55]:
baseline_approche.fit(trainset)

Estimating biases using sgd...


In [49]:
predictions = baseline_approche.test(test)

In [50]:
baseline_approche.rmse(predictions)

RMSE: 0.5393


0.539280270704883

## SVD++

In [66]:
from svdpp import SVDpp

svdpp_approche = SVDpp(n_factors=20, n_epochs=20, init_mean=0, init_std_dev=.1,lr_all=.007, reg_all=.02)

In [65]:
svdpp_approche.fit(trainset)

Estimating biases using sgd...


In [61]:
predictions = svdpp_approche.estimate(test)

In [58]:
svdpp_approche.rmse(predictions)

RMSE: 0.3912


0.391245231240562