In [1]:
%autosave 0

Autosave disabled


In [4]:
import sys
import math
from operator import itemgetter

import numpy as np
import pandas as pd
from scipy.sparse.linalg import svds
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import pairwise_distances

In [None]:
def splitData(dataFile, test_size, random_state=None):
    """Load a tab-separated ratings file and split it into train/test sets.

    Parameters
    ----------
    dataFile : str or path-like
        Path to a headerless, tab-separated file. Its four columns are read
        as ``user_id``, ``item_id``, ``rating`` and ``timestamp``.
    test_size : float or int
        Forwarded unchanged to ``train_test_split``.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``train_test_split``. The default ``None`` keeps the
        previous behaviour (a different split on every call); pass an int to
        make the split, and every metric derived from it, reproducible.

    Returns
    -------
    tuple
        ``(df, n_users, n_items, train_data, test_data)`` where ``df`` is the
        full ratings frame, ``n_users`` / ``n_items`` are the numbers of
        distinct user and item ids, and ``train_data`` / ``test_data`` are
        the two DataFrame partitions.
    """
    names = ['user_id', 'item_id', 'rating', 'timestamp']
    df = pd.read_csv(dataFile, sep='\t', names=names)

    # NOTE: these are counts of *distinct* ids, not the largest id. Code that
    # uses ``id - 1`` as a matrix index (as calc_similarity does) is only safe
    # when the ids run contiguously from 1.
    n_users = df.user_id.unique().shape[0]
    n_items = df.item_id.unique().shape[0]
    print('Number of users = %d \n Number of movies = %d' % (n_users, n_items))

    train_data, test_data = train_test_split(df,
                                             test_size=test_size,
                                             random_state=random_state)
    print('数据量：', len(train_data), len(test_data))
    return df, n_users, n_items, train_data, test_data

def _ratings_to_matrix(ratings, n_users, n_items):
    """Build a dense (n_users, n_items) rating matrix; 0 means 'no rating'."""
    matrix = np.zeros((n_users, n_items))
    for row in ratings.itertuples():
        # row[0] is the DataFrame index. Positions 1-3 are taken as user id,
        # item id and rating (the column order splitData produces); ids are
        # shifted by one to become zero-based matrix indices.
        matrix[row[1] - 1, row[2] - 1] = row[3]
    return matrix


def calc_similarity(n_users, n_items, train_data, test_data):
    """Build train/test rating matrices and cosine similarities.

    Parameters
    ----------
    n_users, n_items : int
        Dimensions of the rating matrices.
    train_data, test_data : pandas.DataFrame
        Rating rows; see ``_ratings_to_matrix`` for the expected column order.

    Returns
    -------
    tuple
        ``(train_data_matrix, test_data_matrix, user_similarity,
        item_similarity, item_popular)``. The similarity matrices are
        ``(n_users, n_users)`` and ``(n_items, n_items)``; ``item_popular``
        maps a zero-based item index to the number of non-zero training
        ratings it received and only contains items whose training ratings
        do not sum to zero.
    """
    train_data_matrix = _ratings_to_matrix(train_data, n_users, n_items)
    test_data_matrix = _ratings_to_matrix(test_data, n_users, n_items)

    print('1:', np.shape(train_data_matrix))
    print('2:', np.shape(test_data_matrix.T))

    # pairwise_distances(metric='cosine') yields cosine *distance*
    # (1 - cosine similarity). Using it directly as a weight would give the
    # least similar neighbours the largest influence, so convert it back to a
    # similarity before returning it.
    user_similarity = 1.0 - pairwise_distances(train_data_matrix, metric='cosine')
    item_similarity = 1.0 - pairwise_distances(train_data_matrix.T, metric='cosine')

    print('开始统计流行item的数量 ...')
    # Per-item totals computed once for all columns instead of twice per item.
    rating_sums = train_data_matrix.sum(axis=0)
    rating_counts = (train_data_matrix != 0).sum(axis=0)
    item_popular = {
        i_index: rating_counts[i_index]
        for i_index in range(n_items)
        if rating_sums[i_index] != 0
    }
    item_count = len(item_popular)
    print('总共流行item数量 = %d' % item_count)

    return train_data_matrix, test_data_matrix, user_similarity, item_similarity, item_popular

def predict(rating, similarity, type='user'):
    """Predict a full rating matrix with memory-based collaborative filtering.

    Parameters
    ----------
    rating : numpy.ndarray
        Rating matrix with one row per user and one column per item.
    similarity : numpy.ndarray
        Square weight matrix: user-by-user when ``type='user'``,
        item-by-item when ``type='item'``.
    type : {'user', 'item'}
        Which flavour of collaborative filtering to apply.

    Returns
    -------
    numpy.ndarray
        Predicted ratings, same shape as ``rating``.

    Raises
    ------
    ValueError
        If ``type`` is neither ``'user'`` nor ``'item'``. Previously this
        surfaced as an ``UnboundLocalError`` at the ``return`` statement.
    """
    if type not in ('user', 'item'):
        raise ValueError("type must be 'user' or 'item', got %r" % (type,))

    print(type)
    print('rating = ', np.shape(rating))
    print('similarity = ', np.shape(similarity))

    # Sum of absolute weights per row of the similarity matrix.
    weight_totals = np.abs(similarity).sum(axis=1)

    if type == 'user':
        # The mean is taken over the whole row, so zero entries count too.
        mean_user_rating = rating.mean(axis=1)
        rating_diff = rating - mean_user_rating[:, np.newaxis]
        # Weighted average of the neighbours' deviations from their own mean,
        # added back onto each user's own mean.
        pred = (mean_user_rating[:, np.newaxis]
                + similarity.dot(rating_diff) / weight_totals[:, np.newaxis])
    else:
        # Row sums are broadcast across columns as the per-item normaliser;
        # they equal the column sums only for a symmetric similarity matrix.
        pred = rating.dot(similarity) / weight_totals[np.newaxis, :]
    return pred

def rmse(prediction, ground_truth):
    """Root-mean-square error over the non-zero entries of ``ground_truth``.

    Only positions where ``ground_truth`` is non-zero are compared, so zero
    entries in ``ground_truth`` are ignored rather than scored as errors.

    Parameters
    ----------
    prediction : numpy.ndarray
        Predicted values, indexable with the positions found in
        ``ground_truth``.
    ground_truth : numpy.ndarray
        Reference values.

    Returns
    -------
    float
        Square root of the mean squared error over the selected entries.
    """
    # Locate the scored positions once and reuse them for both arrays.
    observed = ground_truth.nonzero()
    predicted_values = prediction[observed].flatten()
    actual_values = ground_truth[observed].flatten()
    # sklearn's signature is (y_true, y_pred); the squared error is symmetric
    # in its two arguments, so the order used here does not affect the result.
    squared_error = mean_squared_error(predicted_values, actual_values)
    return math.sqrt(squared_error)

def evaluate(prediction, item_popular, name, top_n=20, train_matrix=None, test_matrix=None):
    """Print top-N precision, recall, coverage and popularity for a model.

    For every user, the ``top_n`` highest-scored items that have a zero entry
    in the training matrix are taken as the recommendation list and compared
    with that user's non-zero entries in the test matrix.

    Parameters
    ----------
    prediction : numpy.ndarray
        Predicted score matrix (users x items).
    item_popular : dict
        Maps item index to its number of training ratings.
    name : str
        Label printed in front of the metrics.
    top_n : int, optional
        Length of each user's recommendation list (default 20).
    train_matrix, test_matrix : numpy.ndarray, optional
        Rating matrices to evaluate against. When omitted, the module-level
        ``train_data_matrix`` / ``test_data_matrix`` are used, as before.

    Returns
    -------
    None
        The metrics are printed. A metric whose denominator is zero is
        reported as 0 instead of raising ``ZeroDivisionError``.
    """
    if train_matrix is None:
        train_matrix = train_data_matrix
    if test_matrix is None:
        test_matrix = test_data_matrix

    hit = 0
    rec_count = 0
    test_count = 0
    popular_sum = 0
    all_rec_items = set()

    # One row per user in the training matrix.
    for u_index in range(train_matrix.shape[0]):
        # Only items without a training rating are candidates.
        candidates = np.where(train_matrix[u_index, :] == 0)[0]
        scores = prediction[u_index, candidates]
        pre_items = sorted(zip(candidates, scores),
                           key=itemgetter(1),
                           reverse=True)[:top_n]
        # A set gives O(1) membership tests in the loop below.
        test_items = set(np.where(test_matrix[u_index, :] != 0)[0].tolist())

        for item, _ in pre_items:
            item = int(item)
            if item in test_items:
                hit += 1
            all_rec_items.add(item)

            if item in item_popular:
                popular_sum += math.log(1 + item_popular[item])

        rec_count += len(pre_items)
        test_count += len(test_items)

    precision = hit / (1.0 * rec_count) if rec_count else 0.0
    recall = hit / (1.0 * test_count) if test_count else 0.0
    coverage = len(all_rec_items) / (1.0 * len(item_popular)) if item_popular else 0.0
    popularity = popular_sum / (1.0 * rec_count) if rec_count else 0.0
    print('%s: precision = %.4f \t recall = %.4f \t coverage = %.4f \t popularity = %.4f' % (name, precision, recall, coverage, popularity))
    
def recommend(u_index, prediction, top_n=10, train_matrix=None, test_matrix=None):
    """Print one user's held-out items next to the model's top-N suggestions.

    Parameters
    ----------
    u_index : int
        Zero-based row index of the user.
    prediction : numpy.ndarray
        Predicted score matrix (users x items).
    top_n : int, optional
        Number of items to recommend (default 10).
    train_matrix, test_matrix : numpy.ndarray, optional
        Rating matrices to use. When omitted, the module-level
        ``train_data_matrix`` / ``test_data_matrix`` are used, as before.

    Returns
    -------
    None
        Both item lists are printed.
    """
    if train_matrix is None:
        train_matrix = train_data_matrix
    if test_matrix is None:
        test_matrix = test_data_matrix

    # Only items without a training rating for this user are candidates.
    items = np.where(train_matrix[u_index, :] == 0)[0]
    ranked = sorted(zip(items, prediction[u_index, items]),
                    key=itemgetter(1),
                    reverse=True)
    pre_items = ranked[:top_n]
    test_items = np.where(test_matrix[u_index, :] != 0)[0]

    print('原始结果：', test_items)
    print('推荐结果：', [key for key, value in pre_items])

In [None]:
# Seed numpy's global RNG so the random train/test split below (and therefore
# every metric printed in this cell) is the same on each Restart & Run All.
np.random.seed(42)

dataFile = 'u.data'
df, n_users, n_items, train_data, test_data = splitData(dataFile,
                                                        test_size=0.25)

(train_data_matrix,
 test_data_matrix,
 user_similarity,
 item_similarity,
 item_popular) = calc_similarity(n_users, n_items, train_data, test_data)

# Memory-based collaborative filtering, item-item and user-user.
item_prediction = predict(train_data_matrix,
                          item_similarity,
                          type='item')
user_prediction = predict(train_data_matrix,
                          user_similarity,
                          type='user')

print('Item Based CF RMSE: ', rmse(item_prediction, test_data_matrix))
print('User Based CF RMSE: ', rmse(user_prediction, test_data_matrix))

# Sparsity is the share of user/item cells with no rating, i.e. one minus
# the density. The earlier `1.0 * len(df) / ...` reported the density.
sparsity = round(1.0 - len(df) / float(n_users * n_items), 3)
print('The sparsity level of MovieLen100K is ' + str(sparsity * 100) + '%')

# Model-based collaborative filtering: rank-15 truncated SVD of the training
# matrix, multiplied back out into a dense prediction matrix.
u, s, vt = svds(train_data_matrix, k=15)
s_diag_matrix = np.diag(s)
svd_prediction = np.dot(np.dot(u, s_diag_matrix), vt)
print('svd-shape: ', np.shape(svd_prediction))
print('Model based CF RMSE: ', rmse(svd_prediction, test_data_matrix))

# Top-N ranking metrics for each of the three models.
evaluate(item_prediction, item_popular, 'item')
evaluate(user_prediction, item_popular, 'user')
evaluate(svd_prediction, item_popular, 'svd')

# Example recommendation list for the user stored in matrix row 1.
recommend(1, svd_prediction)