In [1]:
"""
Collaborativefilter on the movielens web site.

Description:
    1) use the data set from movielens to realize the collaborative filter
Author:alex
Time:27/11/2017
"""

'\nCollaborativefilter on the movielens web site.\n\nDescription:\n    1) use the data set from movielens to realize the collaborative filter\nAuthor:alex\nTime:27/11/2017\n'

In [2]:
import os
import numpy as np
import pandas as pd
import datetime
import scipy.sparse as sp

In [3]:
def splitdata(sampledata, M, K, seed):
    """
    Pick the validation fold of an M-fold cross validation split.

    Description:
        the global numpy random generator is re-seeded with seed, then one
        integer is drawn from [0, M) for every record, in iteration order
        records whose draw equals K form the validation fold
        only the validation records are returned; the remaining records
        (the training part) are not returned by this function
    Parameter:
        sampledata:
            iterable of records
            structure:[userid, viewlist, likelist]
        M:
            exclusive upper bound of the random draw (number of folds)
        K:
            index of the fold to select
        seed:
            value handed to np.random.seed
    Return:
        list of the selected records, in their original order
    """
    np.random.seed(seed)
    # exactly one draw per record, so the same seed always yields the same folds
    return [record for record in sampledata if np.random.randint(0, M) == K]

In [4]:
def getrecommendation(user, W, N):
    """
    Placeholder for the core recommendation step (not implemented yet).

    Description:
        meant to return the recommendation item list of the user
        the body is still empty, so every call currently returns None
    Parameters:
        user:
            the user to build the recommendation for
        W:
            presumably the user similarity matrix - TODO confirm
        N:
            presumably the number of items to recommend - TODO confirm
    """
    return None

In [5]:
def recallrate(train_set, test_set, N, W=None):
    """
    Calculate the recall rate of the module in collaborative filter.

    Description:
        sum(|R(u)&T(u)|)/sum(|T(u)|)
        R(u) is the recommendation list of user u and T(u) the items
        recorded for that user in test_set
    Parameters:
        train_set:
            mapping keyed by user; only its keys are read here
        test_set:
            mapping user -> container of the items the user really likes
            users that are missing from it are skipped
        N:
            forwarded to getrecommendation as its third argument
        W:
            forwarded to getrecommendation as its second argument
            optional, so existing three-argument calls keep working
    Return:
        number of hits divided by the number of really liked items,
        0.0 when no liked item was seen at all
    """
    hitnum = 0
    allnum = 0
    for user in train_set.keys():
        # a user without test data adds nothing to either sum
        if user not in test_set:
            continue
        real_like = test_set[user]
        rank = getrecommendation(user, W, N)
        # rank is a sorted list contains the item and the probability
        for item, pui in rank:
            if item in real_like:
                hitnum += 1
        # add the real like items
        allnum += len(real_like)
    if allnum == 0:
        return 0.0
    return hitnum / allnum

In [6]:
def precisionrate(train_set, test_set, N, W=None):
    """
    Calculate the precision rate of the module in collaborative filter.

    Description:
        sum(|R(u)&T(u)|)/sum(|R(u)|)
        general precision definition; |R(u)| is counted as N for every
        evaluated user
    Parameters:
        train_set:
            mapping keyed by user; only its keys are read here
        test_set:
            mapping user -> container of the items the user really likes
            users that are missing from it are skipped
        N:
            forwarded to getrecommendation as its third argument and
            added to the denominator once per evaluated user
        W:
            forwarded to getrecommendation as its second argument
            optional, so existing three-argument calls keep working
    Return:
        number of hits divided by the accumulated N,
        0.0 when the denominator is zero
    """
    hitnum = 0
    allnum = 0
    for user in train_set.keys():
        # a user without test data cannot produce a hit, leave it out
        if user not in test_set:
            continue
        real_like = test_set[user]
        rank = getrecommendation(user, W, N)
        allnum += N
        for item, pui in rank:
            if item in real_like:
                hitnum += 1
    if allnum == 0:
        return 0.0
    return hitnum / allnum

In [7]:
def coveragerate(train_set, test_set, N, W=None):
    """
    Calculate the coverage rate.

    Description:
        |union of all recommended items| / |all items in train_set|
    Parameters:
        train_set:
            mapping user -> mapping keyed by item
        test_set:
            not used by this metric, kept for a uniform signature
        N:
            forwarded to getrecommendation as its third argument
        W:
            forwarded to getrecommendation as its second argument
            optional, so existing three-argument calls keep working
    Return:
        share of the training items that were recommended at least once,
        0.0 when train_set holds no item
    """
    recomitem_set = set()
    allitem_set = set()
    for user in train_set.keys():
        allitem_set.update(train_set[user].keys())
        rank = getrecommendation(user, W, N)
        for item, pui in rank:
            recomitem_set.add(item)
    if not allitem_set:
        return 0.0
    # the ratio is taken between the set sizes, sets themselves cannot be divided
    return len(recomitem_set) / len(allitem_set)


In [8]:
def popularity(train_set, test_set, N, W=None):
    """
    Calculate the popularity.

    Description:
        sum(log(1+popularity(item)))/n focus on the recommendation items
        popularity(item) is the number of users in train_set that have the
        item, n is the number of recommended items over all users
    Parameters:
        train_set:
            mapping user -> mapping keyed by item
        test_set:
            not used by this metric, kept for a uniform signature
        N:
            forwarded to getrecommendation as its third argument
        W:
            forwarded to getrecommendation as its second argument
            optional, so existing three-argument calls keep working
    Return:
        mean log-popularity of the recommended items,
        0.0 when nothing was recommended
    """
    # the popularity table has to cover every user before any
    # recommendation is scored, so it is built in its own pass
    item_popularity = dict()
    for items in train_set.values():
        for item in items.keys():
            item_popularity[item] = item_popularity.get(item, 0) + 1
    n = 0  # calculate all the recommendation items
    po_sum = 0
    for user in train_set.keys():
        rank = getrecommendation(user, W, N)
        for item, pui in rank:
            # an item that never occurs in train_set counts as popularity 0
            po_sum += np.log(1 + item_popularity.get(item, 0))
            n += 1
    if n == 0:
        return 0.0
    return po_sum / n

In [9]:
def usersimilarity(userid, viewlist):
    """
    Calculate the similarity matrix of the user set.

    Description:
        each value in the matrix means the similarity between the two persons
        the cosine similarity equation is used:
            Wuv = |N(u)&N(v)|/sqrt(|N(u)||N(v)|)
        N(u) is the set of items in viewlist for user u, so an item that is
        listed twice for one user is counted once
        the jaccard equation would be an alternative:
            Wuv = |N(u)&N(v)|/|N(u)+N(v)|
    Parameters:
        userid:
            sequence of users; only its length is used, row/column i of the
            result belongs to userid[i]
        viewlist:
            viewlist[i] is the iterable of (hashable) items of userid[i]
    Return:
        scipy csr_matrix of shape (len(userid), len(userid)) and dtype
        float64; symmetric, the diagonal and the pairs without a common
        item are not stored (read as 0)
    appendence:
        the matrix is too huge to keep dense, so only the co-occurring pairs
        are collected and the sparse matrix is built from them in one step
        (writing single cells into a csr_matrix is very slow)
    """
    user_len = len(userid)
    start_time = datetime.datetime.now()
    print('start to generate the item_user table %s ' % start_time)
    item_sets = [set(viewlist[i]) for i in range(user_len)]
    # inverted table: item -> row indices of the users that have the item
    item_user = {}
    for i, items in enumerate(item_sets):
        for item in items:
            item_user.setdefault(item, []).append(i)
    end_time = datetime.datetime.now()
    print('finish to generate the item_user table and cost time is %s' % (end_time-start_time))
    # count the common items of every user pair
    start_time = datetime.datetime.now()
    print('begin to generate the sparse matrix %s ' % start_time)
    pair_count = {}
    for idlist in item_user.values():
        # idlist is ascending (users were appended in row order), so each
        # unordered pair is visited once as (i, j) with i < j
        for pos, i in enumerate(idlist):
            for j in idlist[pos+1:]:
                pair_count[(i, j)] = pair_count.get((i, j), 0) + 1
    end_time = datetime.datetime.now()
    print('finish generate the sparse matrix and cost time is %s ' % (end_time-start_time))
    # finally calculate the similarity between each user
    start_time = datetime.datetime.now()
    print('final calculate the similarity sparse matrix %s ' % start_time)
    if pair_count:
        pairs = np.array(list(pair_count.keys()), dtype=np.int64)
        counts = np.array(list(pair_count.values()), dtype=np.float64)
        sizes = np.array([len(items) for items in item_sets], dtype=np.float64)
        upper_row = pairs[:, 0]
        upper_col = pairs[:, 1]
        # both users of a stored pair share an item, so no size is zero here
        weight = counts/np.sqrt(sizes[upper_row]*sizes[upper_col])
        # mirror the upper triangle to get the symmetric matrix
        rows = np.concatenate((upper_row, upper_col))
        cols = np.concatenate((upper_col, upper_row))
        data = np.concatenate((weight, weight))
        sparse_mat = sp.csr_matrix((data, (rows, cols)),
                                   shape=(user_len, user_len),
                                   dtype=np.float64)
    else:
        sparse_mat = sp.csr_matrix((user_len, user_len), dtype=np.float64)
    end_time = datetime.datetime.now()
    print('finally finish the similarity matrix %s ' % (end_time-start_time))
    return sparse_mat

In [10]:
# tool functions

In [11]:
def strlisttolist(item):
    """
    Change the strlist to list.

    Description:
        the first and the last character (the brackets) are dropped and the
        rest is split on ','; every non-blank piece is converted with int
        an empty list string such as '[]' gives an empty list
    Parameters:
        item:
            string such as '[1, 2, 3]'
    Return:
        list of int
    """
    inner = item[1:-1]
    # blank pieces (empty list, trailing comma) are skipped instead of
    # being passed to int(), which would raise on them
    return [int(piece) for piece in inner.split(',') if piece.strip()]

In [12]:
# run the code

In [13]:
# cross validation settings: SEED is the seed given to splitdata,
# M the number of folds
SEED, M = 10, 8
# folder and file name of the input csv
datafolder, filename = './cleandata', 'user_movie.csv'
# column names of the input table
columns = ['userId', 'viewlist', 'likelist']

In [14]:
# read the '|' separated table from datafolder/filename into a DataFrame
inputdata = pd.read_csv(
    os.path.join(datafolder, filename),
    sep='|',
)

In [15]:
# load current data: one entry per row of inputdata
userId = list(inputdata['userId'].values)
# the list columns are stored as strings, parse them into lists of int
viewlist = list(map(strlisttolist, inputdata['viewlist'].values))
likelist = list(map(strlisttolist, inputdata['likelist'].values))
# records with the structure (userid, viewlist, likelist)
srcdata = list(zip(userId, viewlist, likelist))
print('current data information', len(srcdata), sep='\n')

current data information
138493


In [None]:
# begin to train the model
for i in range(M):
    # validation records of fold i (not used further in this cell yet)
    currval_data = splitdata(sampledata=srcdata, M=M, K=i, seed=SEED)
    # generate the similarity matrix(sparse matrix)
    W = usersimilarity(userid=userId, viewlist=viewlist)
    # calculate the standard
    # only the first fold is processed for now
    break

start to generate the item_user table 2017-12-04 15:38:14.369469 
finish to generate the item_user table and cost time is 0:00:06.542374
begin to generate the sparse matrix 2017-12-04 15:38:20.911843 
