In [47]:
#######################################
#  Baseline 1 - Matrix Factorization  #
#######################################
#!/usr/bin/python
#
# Created by Albert Au Yeung (2010)
#
# An implementation of matrix factorization
#
try:
    import numpy
except:
    print "This implementation requires the numpy module."
    exit(0)

###############################################################################

"""
@INPUT:
    R     : a matrix to be factorized, dimension N x M
    P     : an initial matrix of dimension N x K
    Q     : an initial matrix of dimension M x K
    K     : the number of latent features
    steps : the maximum number of steps to perform the optimisation
    alpha : the learning rate
    beta  : the regularization parameter
@OUTPUT:
    the final matrices P and Q
"""
def matrix_factorization(R, P, Q, K, steps=5000, alpha=0.0002, beta=0.02):
    Q = Q.T
    for step in xrange(steps):
        for i in xrange(len(R)):
            for j in xrange(len(R[i])):
                if R[i][j] > 0:
                    eij = R[i][j] - numpy.dot(P[i,:],Q[:,j])
                    for k in xrange(K):
                        P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
                        Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])
        eR = numpy.dot(P,Q)
        e = 0
        for i in xrange(len(R)):
            for j in xrange(len(R[i])):
                if R[i][j] > 0:
                    e = e + pow(R[i][j] - numpy.dot(P[i,:],Q[:,j]), 2)
                    for k in xrange(K):
                        e = e + (beta/2) * ( pow(P[i][k],2) + pow(Q[k][j],2) )
        if e < 0.001:
            break
    return P, Q.T

###############################################################################
import csv
import pandas
from numpy import genfromtxt
if __name__ == "__main__":
#     with open('data/app_dataframe.csv') as file:
#         app = genfromtxt(file, delimiter=',')
    dfApp = pandas.read_csv('data/app_dataframe.csv')

    category = list(dfApp)
    category = category[2:]
    user = dfApp.user_code.tolist()
    user = user[:20]
    
    R = dfApp.values
    R = numpy.delete(R, 0, 1)
    R = numpy.delete(R, 0, 1)

    R = R[:20]
    
    print R

    N = len(R)
    M = len(R[0])
    K = 10

    P = numpy.random.rand(N,K)
    Q = numpy.random.rand(M,K)

    nP, nQ = matrix_factorization(R, P, Q, K)
    nR = numpy.dot(nP, nQ.T)

    print nR


[[ 0.  0.  0. ...,  0.  2.  0.]
 [ 0.  0.  0. ...,  1.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  3.  3.  0.]]
[[ 1.45579165  2.04591066  1.19046575 ...,  1.20283192  2.01208922
   0.62023619]
 [ 1.129354    1.48756443  1.00217703 ...,  1.08061101  2.28078292
   0.69935965]
 [ 1.01042825  1.13806254  1.39116756 ...,  0.05481503  0.64167843
   0.54777903]
 ..., 
 [ 1.6838828   1.79167984  1.43707114 ...,  1.14100613  1.00570606
   0.90625062]
 [ 2.13723153  2.36019968  1.56674568 ...,  2.01327502  2.41266559
   1.00074594]
 [ 4.17371699  4.04809749  4.68610483 ...,  2.97230168  2.97283938
   2.6169002 ]]


In [53]:
topR = []
for i in range(nR.shape[0]):
    lineR = []
    subRank = nR[i].argsort()[-5:][::-1]
    print 'user ',user[i]
#     print subRank
    for j in range(5):
#         tmp = nR[i][subRank[j]]
        tmp = category[subRank[j]]
        lineR.append(tmp)
#     topR.append(lineR)
    print lineR
    
# print topR

user  3035
['TOOLS', 'PRODUCTIVITY', 'GAME_BOARD', 'ENTERTAINMENT', 'AUTO_AND_VEHICLES']
user  6379
['TOOLS', 'VIDEO_PLAYERS', 'PHOTOGRAPHY', 'MUSIC_AND_AUDIO', 'TRANSPORTATION']
user  2713
['ENTERTAINMENT', 'TOOLS', 'SOCIAL', 'GAME_CARD', 'GAME_WORD']
user  4433
['TOOLS', 'COMMUNICATION', 'LIFESTYLE', 'ENTERTAINMENT', 'SOCIAL']
user  2845
['TOOLS', 'VIDEO_PLAYERS', 'ENTERTAINMENT', 'AUTO_AND_VEHICLES', 'PRODUCTIVITY']
user  5543
['GAME_CARD', 'TOOLS', 'COMMUNICATION', 'GAME_CASINO', 'GAME_ARCADE']
user  2047
['COMMUNICATION', 'TOOLS', 'GAME_CARD', 'PRODUCTIVITY', 'TRAVEL_AND_LOCAL']
user  7008
['TOOLS', 'BOOKS_AND_REFERENCE', 'ART_AND_DESIGN', 'GAME_BOARD', 'PARENTING']
user  386
['TOOLS', 'COMMUNICATION', 'BOOKS_AND_REFERENCE', 'MEDIA_AND_VIDEO', 'GAME_MUSIC']
user  1274
['PRODUCTIVITY', 'TOOLS', 'SOCIAL', 'MUSIC_AND_AUDIO', 'GAME_ARCADE']
user  2333
['TOOLS', 'PRODUCTIVITY', 'GAME_TRIVIA', 'GAME_WORD', 'ENTERTAINMENT']
user  3881
['TOOLS', 'GAME_BOARD', 'ENTERTAINMENT', 'GAME_WORD',