<a href="https://colab.research.google.com/github/Ang3lino/recomenderSys/blob/master/matrixFactorization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:

import numpy as np
import pandas as pd

import os
import random
import pickle

from sortedcontainers import SortedList
from collections import Counter, defaultdict
from tqdm import tqdm  # modulo cuya finalidad es dar un feedback del progreso de algun procedimiento

In [0]:
# !pip install tqdm --upgrade
# tqdm.pandas()

In [6]:
from google.colab import drive  
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
def load_object(fname: str, user_count: int, item_count: int) -> defaultdict:
    fdir = 'drive/My Drive/petroleo/movielens-20m-dataset'
    fname = f'{fname}_{user_count}_{item_count}.json'
    fpath = os.path.join(fdir, 'shrinked', fname)
    with open(fpath, 'rb') as fp:
        object_ = pickle.load(fp)
    return object_

def defaultdict_set(defdict):
    return {k: set(v) for k, v in defdict.items()}


user_count = 4096
item_count = 512
user2item = load_object('user2item', user_count, item_count)
item2user = load_object('item2user', user_count, item_count)
user_item2rating = load_object('user_item2rating', user_count, item_count)



In [0]:
user2item = {int(k): list(map(int, v)) for k, v in user2item.items()}
item2user = {int(k): list(map(int, v)) for k, v in item2user.items()}
user_item2rating = {(int(i), int(j)): v for (i, j), v in user_item2rating.items()}


# Factorizacion de matrices
Con el fin de reducir espacio de almacenamiento y aumentar la velocidad del algoritmo aplicaremos factorizacion de matrices. Aqui, se busca obtener dos matrices cuyo producto aproxime de mejor manera a $R$. Es decir

$$R \approx \hat R = WU^T$$

Asumamos que $R$ con $m$ usuarios y $n$ articulos, donde $W$ es de dimension $m\times k$ y $U$ es de dimension $n \times k$. Definamos tambien la funcion de perdida

$$ J = \sum_{i, j} (r_{ij} - \hat r_{ij})^2 = \sum_{i,j} (r_{ij} - w_i^T u_j)^2 $$ 

Como de costumbre, se busca minimizar la funcion $J$, derivando parcialmente e igualando a cero tenemos.

$$ w_{i} = (\sum_{j\in\psi_i}u_ju_j^T)^{-1} \sum_{j\in\psi_i}r_{ij}u_j $$ 
_
$$u_{j} = (\sum_{i\in\Omega_j}w_iw_i^T)^{-1} \sum_{i\in\Omega_j}r_{ij}w_i$$

Vemos que tanto $w_i$ como $u_j$ dependen mutuamente. Resolveremos este problema aplicando el algoritmo de los minimos cuadrados altenantes, inicializamos tanto $U$ como $W$ con valores aleatorios y aplicamos el algoritmos un numero determinado de epochs.

In [0]:
def loss_function(ratings: dict, u, w):
    ''' r[(i, j)] -> int '''
    return np.mean([(r - w[i].dot(u[j]))**2 for (i, j), r in ratings.items()])

def vector_from_matrix(matrix: np.array, index, axis):
    if axis == 0:
        return matrix[index]
    return matrix[:, index]

def solve_system(dst, src, R, index_relation, k):
    matrix = np.zeros((k,k))
    vector = np.zeros(k)
    I = index_relation.keys()
    for i in tqdm(I, total=len(I)):
        try:
            for j in index_relation[i]:
                v = src[j]
                matrix += np.outer(v, v)
                vector += np.dot(R[(i,j)], v)
            dst[i] = np.linalg.solve(matrix, vector) 
        except KeyError:
            pass

def compute_matrices(R, i2j, j2i, epochs, k):
    m = int(max(i2j.keys())) + 1
    n = int(max(j2i.keys())) + 1
    assert(k < n)
    W = np.random.randn(m, k)
    U = np.random.randn(n, k)
    for epoch in tqdm(range(epochs)):
        matrix = np.zeros((k,k))
        vector = np.zeros(k)
        print(f"\nComputing W for iter {epoch}")

        for i in i2j.keys():
            for j in i2j[i]:
                matrix += np.outer(U[j], U[j])
                vector += R[(i,j)] * U[j]
            W[i] = np.linalg.solve(matrix, vector)
        print(f"\nComputing U for iter {epoch}")
        for j in j2i.keys():
            matrix = np.zeros((k,k))
            vector = np.zeros(k)
            try:
                for i in j2i[j]:
                    matrix += np.outer(W[i], W[i])
                    vector += R[(i,j)] * W[i]
                U[j] = np.linalg.solve(matrix, vector)
            except KeyError:
                pass  
    return U, W



In [21]:
test_matrix = {}
m = 5
n = 5
for i in range(m):
    for j in range(n):
        test_matrix[(i, j)] = i*m + j
i2j = {x: [x for x in range(n)] for x in range(m)}
j2i = {x: [x for x in range(m)] for x in range(n)}
print(test_matrix)


{(0, 0): 0, (0, 1): 1, (0, 2): 2, (0, 3): 3, (0, 4): 4, (1, 0): 5, (1, 1): 6, (1, 2): 7, (1, 3): 8, (1, 4): 9, (2, 0): 10, (2, 1): 11, (2, 2): 12, (2, 3): 13, (2, 4): 14, (3, 0): 15, (3, 1): 16, (3, 2): 17, (3, 3): 18, (3, 4): 19, (4, 0): 20, (4, 1): 21, (4, 2): 22, (4, 3): 23, (4, 4): 24}


In [31]:
U, W = compute_matrices(test_matrix, i2j, j2i, 100, 3)
# U, W = compute_matrices(user_item2rating, user2item, item2user, 10, 10)
# U * W

100%|██████████| 10/10 [00:00<00:00, 1192.24it/s]


In [54]:
print(W.shape)
print(U.shape)
print(U.T.shape)
print(np.dot(W, U.T))
# W*U.T


(5, 3)
(5, 3)
(3, 5)
[[7.33057312e-09 1.00000001e+00 2.00000001e+00 3.00000001e+00
  4.00000001e+00]
 [4.99999999e+00 5.99999999e+00 6.99999999e+00 7.99999999e+00
  8.99999999e+00]
 [1.00000000e+01 1.10000000e+01 1.20000000e+01 1.30000000e+01
  1.40000000e+01]
 [1.50000000e+01 1.60000000e+01 1.70000000e+01 1.80000000e+01
  1.90000000e+01]
 [2.00000000e+01 2.10000000e+01 2.20000000e+01 2.30000000e+01
  2.40000000e+01]]


In [65]:
ans = loss_function(test_matrix, U, W)
print(ans)

3.7533987027499755e-17


In [73]:
U, W = compute_matrices(user_item2rating, user2item, item2user, 10, 10)


  0%|          | 0/10 [00:00<?, ?it/s][A


Computing W for iter 0

Computing U for iter 0



 10%|█         | 1/10 [00:22<03:21, 22.42s/it][A


Computing W for iter 1

Computing U for iter 1



 20%|██        | 2/10 [00:44<02:58, 22.35s/it][A


Computing W for iter 2

Computing U for iter 2



 30%|███       | 3/10 [01:07<02:36, 22.39s/it][A


Computing W for iter 3

Computing U for iter 3



 40%|████      | 4/10 [01:29<02:14, 22.39s/it][A


Computing W for iter 4

Computing U for iter 4



 50%|█████     | 5/10 [01:52<01:52, 22.46s/it][A


Computing W for iter 5

Computing U for iter 5



 60%|██████    | 6/10 [02:14<01:30, 22.50s/it][A


Computing W for iter 6

Computing U for iter 6



 70%|███████   | 7/10 [02:37<01:07, 22.45s/it][A


Computing W for iter 7

Computing U for iter 7



 80%|████████  | 8/10 [02:59<00:44, 22.46s/it][A


Computing W for iter 8

Computing U for iter 8



 90%|█████████ | 9/10 [03:21<00:22, 22.46s/it][A


Computing W for iter 9

Computing U for iter 9



100%|██████████| 10/10 [03:44<00:00, 22.38s/it][A
[A

In [74]:
ans = loss_function(user_item2rating, U, W)
print(ans)

0.8064877356694792
