## Factorizacion de Matrices

Primero creemos una matriz de diseño inicial con valores aleatorios entre 0.1 y 0.9. Tambien creemos la funcion del error cuadratico medio

In [1]:
import numpy as np

class matrix_factorization():
    
    def __init__(self,data,features):
        self.data = data
        self.features = features
        self.user_count = data.shape[0]
        self.item_count = data.shape[1]
        self.user_features = np.random.uniform(low=0.1,high = 0.9, size = (self.user_count,self.features))
        self.item_features = np.random.uniform(low=0.1,high = 0.9, size = (self.features,self.item_count))
        
    def MSE(self):
        """
        Mean Squared Error function comparing dot product of user-feature row and feature-item column to user-item cell
        """
        
        matrix_product = np.matmul(self.user_features,self.item_features)
        return np.sum((self.data - matrix_product)**2)


    def single_gradient(self,user_row,item_col,wrt_user_idx = None, wrt_item_idx = None):
        """
        Calcula el gradiente de un unica celda usuario-item a una unica celda usuario-feature o item-feature
        """
        
        if wrt_user_idx !=None and wrt_item_idx !=None:
            return "Too many elements"
        elif wrt_user_idx ==None and wrt_item_idx ==None:
            return "insufficient elements"
        else:
            u_row = self.user_features[user_row,:]
            i_col = self.item_features[:,item_col]
            ui_rating = float(self.data[user_row,item_col])
            prediction = float(np.dot(u_row,i_col))
            
            if wrt_user_idx != None:
                row_elem = float(i_col[wrt_user_idx])
                gradient = 2*(ui_rating-prediction)*row_elem
            else:
                col_elem = float(u_row[wrt_item_idx])
                gradient = 2*(ui_rating-prediction)*col_elem
            return gradient
        
    def user_feature_gradient(self,user_row,wrt_user_idx):
        """
        Averages the gradients of a single user-item row with respect to a single user-feature parameter
        """
        
        summation = 0
        for col in range(0,self.item_count):
            summation += self.single_gradient(user_row = user_row,item_col=col,wrt_user_idx=wrt_user_idx)
        return summation/self.item_count
    
    def item_feature_gradient(self,item_col,wrt_item_idx):
        """
        Averages the gradients of a single user-item column with respect to a single feature-item parameter
        """
        
        summation = 0
        for row in range(0,self.user_count):
            summation += self.single_gradient(user_row = row,item_col=item_col,wrt_item_idx=wrt_item_idx)
        return summation/self.user_count
    
    def update_user_feature(self,learning_rate):
        """
        Updates every user-feature parameter according to supplied learning rate
        """
        for i in range(0, self.user_count):
            for j in range(0,self.features):
                self.user_features[i,j] += learning_rate*self.user_feature_gradient(user_row=i,wrt_user_idx=j)
        
    def update_item_feature(self,learning_rate):
        """
        Updates every feature-item parameter according to supplied learning rate
        """
        for i in range(0, self.features):
            for j in range(0,self.item_count):
                self.item_features[i,j] += learning_rate*self.item_feature_gradient(item_col=j,wrt_item_idx=i)

    ### Metodo de entrenamiento
    
    def train_model(self,learning_rate=0.1,iterations = 1000):
        """
        Trains model, outputting MSE cost/loss every 50 iterations, using supplied learning and iterations
        """
        for i in range(iterations):
            self.update_user_feature(learning_rate=learning_rate)
            self.update_item_feature(learning_rate=learning_rate)
            if i %50 ==0:
                print(self.MSE())
            

Este es un programa de cero, veamos que pasa si creamos matrices

In [2]:
d = np.array([[5,3,1],[1,3,5],[3,5,1]])
print(d)
d2 = matrix_factorization(d,2)
d2.train_model(learning_rate = .1)

[[5 3 1]
 [1 3 5]
 [3 5 1]]
66.26235181326143
3.538655335526818
3.5386552492239662
3.538655249223968
3.5386552492239662
3.5386552492239662
3.5386552492239662
3.5386552492239662
3.5386552492239662
3.5386552492239662
3.5386552492239662
3.5386552492239662
3.5386552492239662
3.5386552492239662
3.5386552492239662
3.5386552492239662
3.5386552492239662
3.5386552492239662
3.5386552492239662
3.5386552492239662


In [3]:
np.dot(d2.user_features,d2.item_features)

array([[4.28078016, 3.86133275, 0.55412635],
       [0.8201476 , 3.21539   , 4.88850218],
       [3.84015545, 3.99383559, 1.52084655]])

Si consideramos un feature (caracteristica adicional), entonces nos mejora la descomposicion

In [4]:
d = np.array([[5,3,1],[1,3,5],[3,5,1]])
print(d)
d2 = matrix_factorization(d,3)
d2.train_model(learning_rate = .1)

[[5 3 1]
 [1 3 5]
 [3 5 1]]
47.142441246692094
2.783028493521302e-09
1.7644288402772858e-22
1.2818989709841442e-30
1.2818989709841442e-30
1.2818989709841442e-30
1.2818989709841442e-30
1.2818989709841442e-30
1.2818989709841442e-30
1.2818989709841442e-30
1.2818989709841442e-30
1.2818989709841442e-30
1.2818989709841442e-30
1.2818989709841442e-30
1.2818989709841442e-30
1.2818989709841442e-30
1.2818989709841442e-30
1.2818989709841442e-30
1.2818989709841442e-30
1.2818989709841442e-30


In [5]:
np.dot(d2.user_features,d2.item_features)

array([[5., 3., 1.],
       [1., 3., 5.],
       [3., 5., 1.]])

In [6]:
### Ahora en Keras

from __future__ import print_function, division
from builtins import range, input
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import shuffle

import tensorflow

from tensorflow import keras

In [7]:
from keras.models import Model
from keras.layers import Input, Embedding, Dot, Add, Flatten
from keras.regularizers import l2
from tensorflow.keras.optimizers import Adam, SGD

In [8]:
df = pd.read_csv('rating.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'rating.csv'

In [None]:
df.head()

In [None]:
user = df['userId'].value_counts().index
map = {k:i for i, k in enumerate(user)}
df['userId'] = df['userId'].map(map)

In [None]:
mov = df['movieId'].value_counts().index
map = {k:i for i, k in enumerate(mov)}
df['movieId'] = df['movieId'].map(map)

In [None]:
N = df['userId'].max()
M = df['movieId'].max()

df.drop('timestamp', axis = 1, inplace = True)

N,M



In [None]:
n = 1000
m = 800

from collections import Counter

In [None]:
ucount = Counter(df['userId'])
mcount = Counter(df['movieId'])

uid = [u for u, c in ucount.most_common(n)]
mid = [u for u, c in mcount.most_common(m)]

In [None]:
newdf = df[df['userId'].isin(uid) & df['movieId'].isin(mid)]
newdf.head()

In [None]:
N = newdf['userId'].max()
M = newdf['movieId'].max()

user = newdf['userId'].value_counts().index
map = {k:i for i, k in enumerate(user)}
newdf['userId'] = newdf['userId'].map(map)


In [None]:
mov = newdf['movieId'].value_counts().index
map = {k:i for i, k in enumerate(mov)}
newdf['movieId'] = newdf['movieId'].map(map)

In [None]:
newdf = shuffle(newdf)
cutoff = int(0.8*len(newdf))
cutoff

In [None]:
train = newdf.iloc[: cutoff]
test = newdf.iloc[cutoff:]

In [None]:
K = 10
mu = newdf['rating'].mean()
epochs = 25
reg = 0.
N = 1000
M = 800

# KERAS

In [None]:
u = Input(shape=(1,))
m = Input(shape=(1,))

u_embed = Embedding(N, K, embeddings_regularizer = l2(reg))(u) # size (N,1,K)
m_embed = Embedding(M, K, embeddings_regularizer = l2(reg))(m) # size (M,1,K)

In [None]:
u_bias = Embedding(N, 1, embeddings_regularizer = l2(reg))(u)
m_bias = Embedding(M, 1, embeddings_regularizer = l2(reg))(m)

x = Dot(axes = 2)([u_embed, m_embed])

x = Add()([x, u_bias, m_bias])
x = Flatten()(x) # N,1

model = Model(inputs = (u, m),
             outputs = x)

model.compile(loss = 'mse',
             optimizer = Adam(learning_rate = 0.01),
             metrics = ['mse'])

In [None]:
r = model.fit(x = [train['userId'].values, train['movieId'].values],
                 y = train['rating'].values - mu,
                 epochs = epochs,
                 batch_size = 256,
                 validation_data = ([test['userId'].values, test['movieId'].values],
                 test['rating'].values - mu)
                 )

In [None]:
# plot losses
plt.plot(r.history['loss'], label="train loss")
plt.plot(r.history['val_loss'], label="test loss")
plt.legend()
plt.show()

# plot mse
plt.plot(r.history['mse'], label="train mse")
plt.plot(r.history['val_mse'], label="test mse")
plt.legend()
plt.show()



## OTRO KERAS

In [None]:
from sklearn.datasets import dump_svmlight_file
import numpy as np
import pandas as pd
import os
import urllib
import zipfile
from sklearn.model_selection import train_test_split
import shutil
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
dataset = pd.read_csv('ml-100k/u.data',sep='\t',names="user_id,item_id,rating,timestamp".split(","))
dataset.head()

In [None]:
dataset.user_id = dataset.user_id.astype('category').cat.codes.values
dataset.item_id = dataset.item_id.astype('category').cat.codes.values

In [None]:
train, test = train_test_split(dataset, test_size=0.2)

In [None]:
#%tensorflow_version 2.x
import tensorflow as tf
from tensorflow import keras
from keras.optimizers import Adam

In [None]:
n_users, n_movies = len(dataset.user_id.unique()), len(dataset.item_id.unique())
n_latent_factors = 20

In [None]:
movie_input = keras.layers.Input(shape=[1],name='Item')
movie_embedding = keras.layers.Embedding(n_movies + 1, n_latent_factors, name='Movie-Embedding')(movie_input)
movie_vec = keras.layers.Flatten(name='FlattenMovies')(movie_embedding)
user_input = keras.layers.Input(shape=[1],name='User')
user_vec = keras.layers.Flatten(name='FlattenUsers')(keras.layers.Embedding(n_users + 1, n_latent_factors,name='User-Embedding')(user_input))
prod = keras.layers.dot([movie_vec, user_vec], axes=1,name='DotProduct')
model = keras.Model([user_input, movie_input], prod)

In [None]:
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae', 'mse'])

In [None]:
model.summary()

In [None]:
tf.keras.utils.plot_model(model, to_file='model.png')

In [None]:
history = model.fit([train.user_id, train.item_id], train.rating, epochs=100, verbose=0)

In [None]:
pd.Series(history.history['loss']).plot(logy=True)
plt.xlabel("Epoch")
plt.ylabel("Training Error")

In [None]:
results = model.evaluate((test.user_id, test.item_id), test.rating, batch_size=1)

In [None]:
movie_embedding_learnt = model.get_layer(name='Movie-Embedding').get_weights()[0]
pd.DataFrame(movie_embedding_learnt).describe()

In [None]:
user_embedding_learnt = model.get_layer(name='User-Embedding').get_weights()[0]

In [None]:
def recommend(user_id, number_of_movies=5):
    movies = user_embedding_learnt[user_id]@movie_embedding_learnt.T
    mids = np.argpartition(movies, -number_of_movies)[-number_of_movies:]
    return mids

In [None]:
recommend(user_id=1)