# Matrix Factorization Using a Neural Network (Evaluation)

## Import libraries

In [1]:
import pandas as pd
import numpy as np
import warnings
from keras.models import Model
from keras.layers import Input, Dot, Flatten
from keras.layers.embeddings import Embedding
from sklearn.utils import shuffle
from keras.callbacks import TensorBoard
from keras.regularizers import l2
from sklearn.metrics import mean_absolute_error
import warnings
import keras
warnings.filterwarnings('ignore')

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Load the rating data

In [2]:
# Load the raw ratings file into a dataframe.
# The column names are supplied explicitly through `names`.
ratingsHeader = ["UserID", "MovieID", "Rating", "Timestamp"]
# Fields are separated by the two-character delimiter '::', parsed with the
# Python engine.
ratingsDF = pd.read_table(
    'Raw Data/ratings.dat',
    sep = '::',
    names = ratingsHeader,
    engine = "python",
)
# Last expression of the cell: displays the loaded dataframe in the notebook.
ratingsDF

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


## Data Processing

In [3]:
# Generate contiguous, zero-based user and movie ids for the neural network.
# Each raw id column is converted to a pandas categorical and its integer
# category codes are stored in a new column.
for sourceColumn, codeColumn in (("UserID", "userId"), ("MovieID", "movieId")):
    ratingsDF[codeColumn] = ratingsDF[sourceColumn].astype('category').cat.codes.values
# Last expression of the cell: displays the augmented dataframe in the notebook.
ratingsDF

Unnamed: 0,UserID,MovieID,Rating,Timestamp,userId,movieId
0,1,1193,5,978300760,0,1104
1,1,661,3,978302109,0,639
2,1,914,3,978301968,0,853
3,1,3408,4,978300275,0,3177
4,1,2355,5,978824291,0,2162
...,...,...,...,...,...,...
1000204,6040,1091,1,956716541,6039,1019
1000205,6040,1094,5,956704887,6039,1022
1000206,6040,562,5,956704746,6039,548
1000207,6040,1096,4,956715648,6039,1024


## K - Fold Cross Validation

* shuffle the dataframe
* split the data into K folds
* for each of the K folds, use that fold as the testData and the remaining K-1 folds as the trainData
* return the trainDataList and the testDataList

In [4]:
#split the data
def splitData(df, folds = 5, shu = True, randomState = None):
    """Split a dataframe into K folds for cross validation.

    The rows are (optionally) shuffled, cut into ``folds`` consecutive slices
    of ``len(df) // folds`` rows each (the last slice also takes the remainder
    rows), and every slice is used once as the test set while the remaining
    slices, concatenated in their original order, form the training set.

    Parameters:
        df: pandas DataFrame to split.
        folds: number of folds K; must be at least 2 and at most ``len(df)``.
        shu: pass ``False`` to keep the current row order. Any other value
            shuffles. Only the literal ``False`` disables shuffling, so that a
            non-boolean value passed positionally in this slot cannot silently
            turn shuffling off.
        randomState: seed forwarded to ``shuffle`` as ``random_state``.

    Returns:
        (trainDFList, testDFList): two lists of length ``folds``;
        ``trainDFList[i]`` and ``testDFList[i]`` together contain every row of
        ``df`` exactly once.

    Raises:
        ValueError: if ``folds`` is smaller than 2 or larger than the number
            of rows.
    """
    nums = df.shape[0]
    # With fewer than two folds there is no training data left for the single
    # test fold; with more folds than rows some folds would be empty.
    if folds < 2:
        raise ValueError("folds must be at least 2, got {}".format(folds))
    if folds > nums:
        raise ValueError(
            "folds ({}) cannot exceed the number of rows ({})".format(folds, nums)
        )
    #shuffle the dataframe
    if shu is not False:
        df = shuffle(df, random_state = randomState)
    #split the data into K folds
    length = nums // folds
    dfList = [df.iloc[i * length: (i + 1) * length] for i in range(folds - 1)]
    # the last fold absorbs the rows left over by the integer division
    dfList.append(df.iloc[(folds - 1) * length:])
    #split the trainData and test Data
    trainDFList = []
    testDFList = []
    for i in range(folds):
        # every fold except the i-th one, concatenated in fold order
        trainDFList.append(pd.concat([dfList[j] for j in range(folds) if j != i]))
        testDFList.append(dfList[i])
    #return the trainDataList and the testDataList
    return trainDFList, testDFList

## Matrix Factorization Model Using Neural Network

### Structure of Neural Network

* UserInputLayer, MovieInputLayer
* UserEmbeddingLayer, MovieEmbeddingLayer
* UserFlattenLayer, MovieFlattenLayer
* OutputLayer(rating value)

### Detail of Model
* Optimizer : Adam
* Loss Function : Mean Absolute Error
* Epoch: 25

In [5]:
# Matrix Factorization Model Using Neural Network
def MatrixFactorizationModel(numUsers, numMovies, numFactors):
    """Build and compile the dot-product matrix factorization network.

    A user id and a movie id are each looked up in their own embedding table
    with ``numFactors`` columns; the predicted rating is the dot product of
    the two flattened embedding vectors.

    Parameters:
        numUsers: number of rows in the user embedding table.
        numMovies: number of rows in the movie embedding table.
        numFactors: size of each embedding vector (number of latent factors).

    Returns:
        A compiled Keras ``Model`` taking ``[userInput, movieInput]`` and
        producing the rating, using the Adam optimizer with its default
        settings and mean absolute error as the loss.
    """
    # user branch: id -> embedding -> flat vector
    userIdIn = Input(shape = [1], name = "userInput")
    userVec = Embedding(
        numUsers,
        numFactors,
        embeddings_initializer='he_normal',
        embeddings_regularizer=l2(1e-6),
        name = "userEmbedding",
    )(userIdIn)
    userVec = Flatten(name = "userFlatten")(userVec)

    # movie branch: id -> embedding -> flat vector
    movieIdIn = Input(shape= [1], name = "movieInput")
    movieVec = Embedding(
        numMovies,
        numFactors,
        embeddings_initializer='he_normal',
        embeddings_regularizer=l2(1e-6),
        name = "movieEmbedding",
    )(movieIdIn)
    movieVec = Flatten(name = "movieFlatten")(movieVec)

    # As in classic matrix factorization, the rating estimate is the inner
    # product of the user and movie vectors; training fits it to the actual
    # rating.
    predictedRating = Dot(name = "OutputLayer", axes=1)([userVec, movieVec])
    network = Model(inputs = [userIdIn, movieIdIn], outputs = predictedRating)

    # Adam optimizer, mean absolute error loss
    network.compile(optimizer = keras.optimizers.Adam(), loss = keras.losses.MAE)
    return network

## Evaluate the performance of the model

In [6]:
# evaluate the performance of the model
def MatrixFactorizationAlgorithm(ratingsDF, folds = 5, randomState = None,
                                 numFactorsList = None, epochs = 25, batchSize = 64):
    """Cross-validate the matrix factorization model and print the test MAE.

    For every fold and every number of latent factors a fresh model is built,
    trained on the fold's training data and scored on the fold's test data.
    One line ``<fold number> <numFactors> <MAE>`` is printed per model, after
    a ``dataset K MAE`` header line.

    Parameters:
        ratingsDF: dataframe with the columns ``UserID``, ``MovieID``,
            ``userId``, ``movieId`` and ``Rating``. ``userId``/``movieId`` are
            fed to the embedding layers, whose sizes are the numbers of
            distinct ``UserID``/``MovieID`` values, so they are assumed to be
            zero-based contiguous codes of those columns.
        folds: number of cross-validation folds.
        randomState: seed used when shuffling the data before splitting.
        numFactorsList: iterable of embedding sizes to evaluate; defaults to
            ``[3, 4, 5, 6, 7, 8, 9, 10]``.
        epochs: number of training epochs per model.
        batchSize: training batch size.
    """
    if numFactorsList is None:
        numFactorsList = [3, 4, 5, 6, 7, 8, 9, 10]
    numUsers = len(ratingsDF.UserID.unique())
    numMovies = len(ratingsDF.MovieID.unique())
    # randomState must be passed by keyword: the third positional parameter
    # of splitData is the shuffle flag, not the seed.
    trainRatingDFList, testRatingDFList = splitData(ratingsDF, folds, randomState = randomState)
    print("dataset K MAE")
    for i in range(folds):
        #load the train data
        trainRatingDF = trainRatingDFList[i]
        X_train = [trainRatingDF.userId.values, trainRatingDF.movieId.values]
        Y_train = trainRatingDF.Rating.values.astype(float)
        #load the test data
        testRatingDF = testRatingDFList[i]
        X_test = [testRatingDF.userId.values, testRatingDF.movieId.values]
        Y_test = testRatingDF.Rating.values.astype(float)
        #evaluate the model
        for numFactors in numFactorsList:
            MFModel = MatrixFactorizationModel(numUsers, numMovies, numFactors)
            # No validation_data: the per-epoch validation scores were never
            # read, and the test fold is scored once after training below.
            MFModel.fit(x=X_train, y=Y_train, batch_size = batchSize, epochs=epochs, verbose=0)
            # predictions come back with a trailing axis of size 1; flatten
            # them so both arguments have the same shape
            P_test = MFModel.predict(X_test).ravel()
            MAE = mean_absolute_error(Y_test, P_test)
            print(i + 1,numFactors, MAE)

In [8]:
# Run the evaluation on the full ratings dataframe with 5 folds, passing randomState = 1.
MatrixFactorizationAlgorithm(ratingsDF, folds = 5, randomState = 1)

dataset K MAE
1 3 0.6990561365817811
1 4 0.6976224092858676
1 5 0.6927712754157089
1 6 0.6905507860070423


KeyboardInterrupt: 