<a href="https://colab.research.google.com/github/Ang3lino/learningML/blob/master/matrixFactorization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:

import numpy as np
import pandas as pd

import os
import random
import pickle

from sortedcontainers import SortedList
from collections import Counter, defaultdict
from tqdm import tqdm  # modulo cuya finalidad es dar un feedback del progreso de algun procedimiento

In [0]:
# !pip install tqdm --upgrade
# tqdm.pandas()

In [12]:
from google.colab import drive  
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
def load_object(fname: str, user_count: int, item_count: int) -> defaultdict:
    fdir = 'drive/My Drive/petroleo/movielens-20m-dataset'
    fname = f'{fname}_{user_count}_{item_count}.json'
    fpath = os.path.join(fdir, 'shrinked', fname)
    with open(fpath, 'rb') as fp:
        object_ = pickle.load(fp)
    return object_

def defaultdict_set(defdict):
    return {k: set(v) for k, v in defdict.items()}


user_count = 4096
item_count = 512
user2item = load_object('user2item', user_count, item_count)
item2user = load_object('item2user', user_count, item_count)
user_item2rating = load_object('user_item2rating', user_count, item_count)

user2item = {int(k): list(map(int, v)) for k, v in user2item.items()}
item2user = {int(k): list(map(int, v)) for k, v in item2user.items()}


# Factorizacion de matrices
Con el fin de reducir espacio de almacenamiento y aumentar la velocidad del algoritmo aplicaremos factorizacion de matrices. Aqui, se busca obtener dos matrices cuyo producto aproxime de mejor manera a $R$. Es decir

$$R \approx \hat R = WU^T$$

Asumamos que $R$ con $m$ usuarios y $n$ articulos, donde $W$ es de dimension $m\times k$ y $U$ es de dimension $n \times k$. Definamos tambien la funcion de perdida

$$ J = \sum_{i, j} (r_{ij} - \hat r_{ij})^2 = \sum_{i,j} (r_{ij} - w_i^T u_j)^2 $$ 

Como de costumbre, se busca minimizar la funcion $J$, derivando parcialmente e igualando a cero tenemos.

$$ w_{i} = (\sum_{j\in\psi_i}u_ju_j^T)^{-1} \sum_{j\in\psi_i}r_{ij}u_j $$ 
_
$$u_{j} = (\sum_{i\in\Omega_j}w_iw_i^T)^{-1} \sum_{i\in\Omega_j}r_{ij}w_i$$

.

In [0]:
def loss_function(ratings: dict, u, w):
    ''' r[(i, j)] -> int '''
    return np.mean((r - w[i].dot(u[j])) ** 2 for (i, j), r in ratings.items())

def vector_from_matrix(matrix: np.array, index, axis):
    if axis == 0:
        return matrix[index]
    return matrix[:, index]

def compute_matrices(R, i2j, j2i, epochs, k):
    m = int(max(i2j.keys())) + 1
    n = int(max(j2i.keys())) + 1
    W = np.random.randn(m, k)
    U = np.random.randn(n, k)
    for epoch in tqdm(range(epochs)):
        matrix = np.zeros((k,k))
        vector = np.zeros(k)
        for i in i2j.keys():
            for j in i2j[i]:
                matrix += np.outer(U[j], U[j])
                vector += R[(i,j)] * U[j]
            W[i] = np.linalg.solve(matrix, vector)
        # for j in j2i.keys():
        #     matrix = np.zeros((k,k))
        #     vector = np.zeros(k)
        #     for i in j2i.keys():
        #         matrix += np.outer(W[i], W[i])
        #         vector += R[(i,j)] * W[i]
        #     U[j] = np.linalg.solve(matrix, vector)
    return U, W

def solve_system(dst, src, R, index_relation, axis, k):
    matrix = np.zeros((k,k))
    vector = np.zeros(k)
    for i in index_relation.keys():
        for j in index_relation[i]:
            v = (src[:, j] if axis == 0 else src[j])
            print(src[j])
            print(src[:,j])
            print(matrix)
            matrix += np.outer(v, v)
            vector += np.dot(R[(i,j)], v)
        if axis == 0:
            dst[i] = np.linalg.solve(matrix, vector) 
        else:
            dst[:, i] = np.linalg.solve(matrix, vector) 

In [17]:
test_matrix = {}
m = 4
n = 4
for i in range(m):
    for j in range(n):
        test_matrix[(i, j)] = i*m + j
i2j = {0: [0,1,2], 1:[0,1,2], 2:[0,1,2]}
j2i = {0: [0,1,2], 1:[0,1,2], 2:[0,1,2]}
print(test_matrix)


{(0, 0): 0, (0, 1): 1, (0, 2): 2, (0, 3): 3, (1, 0): 4, (1, 1): 5, (1, 2): 6, (1, 3): 7, (2, 0): 8, (2, 1): 9, (2, 2): 10, (2, 3): 11, (3, 0): 12, (3, 1): 13, (3, 2): 14, (3, 3): 15}


In [18]:
U, W = compute_matrices(test_matrix, i2j, j2i, 10, 10)
# U, W = compute_matrices(user_item2rating, user2item, item2user, 10, 10)

100%|██████████| 10/10 [00:00<00:00, 1654.88it/s]


In [19]:
ans = loss_function(user_item2rating, U, W)

TypeError: ignored

In [0]:
a = np.array([[1,2],[3,4]])
np.outer(a,a)

In [0]:
b = np.array([1,2,3,4])
np.outer(b,b)

In [0]:
max(user2item.values())

In [0]:
np.zeros((2,2))

In [0]:
l = np.array([i for i in range(3)])
print(l)
l = np.outer(l,l)
print(l)

In [0]:
t = np.outer(l[:, 0], l[:, 1])
print(t)