In [1]:
import os
import sys

import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split

def read_ratings(file_path, sep='::'):
    """
    Reads the ratings file into a user x item DataFrame. Ratings are stored in 'database' form,
    where each line is in the form: <user_id><sep><item_id><sep><rating><sep><timestamp>
    Unknown values are 0 and ratings are on a 1-5 scale
    :param file_path: The ratings file path
    :param sep: The separator between items
    :return: The user x item ratings DataFrame
    """
    ratings_file = os.path.abspath(file_path)
    column_names = ['userId', 'movieId', 'rating', 'timestamp']
    # The default '::' is a multi-character separator, which pandas only parses with the python engine.
    ratings = pd.read_csv(ratings_file, names=column_names, sep=sep, engine='python')
    ratings = ratings.drop('timestamp', axis=1)
    ratings[['userId', 'movieId']] = ratings[['userId', 'movieId']].astype('int32')
    ratings[['rating']] = ratings[['rating']].astype('int8')
    # Keyword arguments work on every pandas version; positional ones are rejected by pandas >= 2.0.
    ratings = ratings.pivot(index='userId', columns='movieId', values='rating')
    # (user, item) pairs that never occur in the file come out of the pivot as NaN; store them as 0.
    return ratings.fillna(value=0)

def split_train_test(ratings, test_ratio=0.2, random_state=None):
    """
    Split the ratings matrix into test and train matrices.
    Each user's non-zero ratings are split independently; every entry that is not
    assigned to a matrix stays 0 in that matrix.
    :param ratings: The original user x item ratings DataFrame
    :type ratings: DataFrame
    :param test_ratio: The ratio of ratings to take for the test dataset
    :type test_ratio: float
    :param random_state: Optional seed (int) or numpy RandomState for a reproducible split.
        None keeps the previous behaviour of using the global numpy random state.
    :return: The train and test ratings dataFrames
    """
    # An integer seed is turned into ONE generator shared by all users, so the
    # per-user splits are reproducible without every user getting the same shuffle.
    if isinstance(random_state, (int, np.integer)):
        random_state = np.random.RandomState(random_state)

    # Work on plain arrays: positional indexing is all that is needed here and it
    # avoids a pandas setitem (with label alignment) per user.
    rating_values = ratings.values
    train_values = np.zeros(rating_values.shape)
    test_values = np.zeros(rating_values.shape)

    for user in range(rating_values.shape[0]):
        user_row = rating_values[user]
        user_ratings_indexes = np.flatnonzero(user_row)

        if len(user_ratings_indexes) < 2:
            # Nothing to split: keep the single rating (if there is one) for training.
            train_values[user, user_ratings_indexes] = user_row[user_ratings_indexes]
            continue

        train_indexes, test_indexes = train_test_split(
            user_ratings_indexes, test_size=test_ratio, random_state=random_state)
        train_values[user, train_indexes] = user_row[train_indexes]
        test_values[user, test_indexes] = user_row[test_indexes]

    train = pd.DataFrame(train_values, index=ratings.index, columns=ratings.columns)
    test = pd.DataFrame(test_values, index=ratings.index, columns=ratings.columns)
    return train, test

def get_sim(userId, movieId, gamma, pzw, movies, topics):
    """
    Cosine similarity between a user's column of `gamma` and a movie's column of `pzw`.
    :param userId: Column label of the user in `gamma`
    :param movieId: Movie id; mapped to a column of `pzw` through `movies.get_loc`
    :param gamma: DataFrame with one column per user
    :param pzw: DataFrame with one column per movie position
    :param movies: Index used to translate a movie id into its position
    :param topics: Number of entries (labels 0..topics-1) that enter the two norms
    :return: sum(user * movie) / (|user| * |movie|), or 0.0 when either norm is zero
    """
    movie1 = pzw[movies.get_loc(movieId)]
    user1 = gamma[userId]

    # The norms only use the entries labelled 0..topics-1, exactly like the
    # original element-by-element loop did.
    topic_labels = list(range(topics))
    user_vec = user1.loc[topic_labels].values
    movie_vec = movie1.loc[topic_labels].values
    norm_product = np.sqrt(np.dot(user_vec, user_vec)) * np.sqrt(np.dot(movie_vec, movie_vec))

    if norm_product == 0:
        # The similarity is undefined for an all-zero vector; return 0 rather
        # than a NaN that would propagate into every downstream total.
        return np.float64(0.0)

    # The numerator is the label-aligned element-wise product, summed over all entries.
    return user1.multiply(movie1).sum() / norm_product

In [6]:
# Ratings file, located relative to the notebook's working directory.
path = os.path.join(os.getcwd(), "datasets_rating", "ml1m", "ratings.dat")

# Full user x movie matrix, then the per-user train/test split (default test ratio).
full = read_ratings(path)
ratings, test = split_train_test(full)

# Row labels (user ids) and the distinct column labels (movie ids) of the full matrix.
movies = pd.Index(full.columns.unique())
users = full.index

In [73]:
# Turn the 0 placeholders into NaN so that they are skipped by the means below.
ratings = ratings.replace(0, np.nan)

# Mean rating per user (rows) and per movie (columns); a mean that is NaN becomes 0.
user_means = pd.DataFrame(ratings.mean(axis=1), index=ratings.index, columns=['mean']).fillna(value=0)
item_means = pd.DataFrame(ratings.mean(axis=0), index=ratings.columns, columns=['mean']).fillna(value=0)

# Each rating minus the mean of its movie; computed once and reused below.
item_deviation = ratings.subtract(item_means['mean'], axis=1)

# A user's offset is the (signed) average of those deviations.
# An absolute-value variant, item_deviation.abs().mean(axis=1), was tried and left disabled.
user_offsets = item_deviation.mean(axis=1)

# Deviations divided by the user's offset, flattened to a (user, movie) -> value
# Series that only keeps the rated entries.
new = item_deviation.divide(user_offsets, axis=0).transpose().unstack().dropna()

In [12]:
def _rating_to_score(rating):
    """Map a normalised rating onto the integer value written next to the movie position."""
    if rating < 0:
        return 1
    if rating < 1:
        return 3
    # The larger threshold has to be tested first: in the original chain the
    # "> 2" branch sat behind "> 1" and could never run, and a rating of
    # exactly 1 matched no branch at all and was written as 0.
    if rating > 2:
        return 5
    return 4


# Every line has the form "<number of entries> <position>:<value> <position>:<value> ... \n".
# `with` guarantees the file is closed even if a lookup below raises.
with open("rating.dat", "w") as fo:
    # First line: every movie position once, each with value 1.
    header_tokens = ["%d:1" % position for position in range(len(movies))]
    fo.write(" ".join([str(len(movies))] + header_tokens) + " \n")

    # One line per user, listing the movies that user has a normalised rating for.
    for user in users:
        user_ratings = new[user]
        tokens = [
            "%d:%d" % (movies.get_loc(movieId), _rating_to_score(rating))
            for movieId, rating in zip(user_ratings.index, user_ratings.values)
        ]
        fo.write(" ".join([str(len(tokens))] + tokens) + " \n")

In [67]:
topics = 50

# Folder with the parameter files for this number of topics.
params_dir = os.path.join(os.getcwd(), "cpp", "params", "ratings", str(topics))

# load gamma: the first line of the file is skipped and the table is transposed,
# so that every row of the file becomes a column.
path = os.path.join(params_dir, "gamma.dat")
gamma = pd.read_table(path, sep=" ", skiprows=1, header=None).transpose()

# load beta (same file layout, not transposed)
path = os.path.join(params_dir, "beta.dat")
beta = pd.read_table(path, sep=" ", skiprows=1, header=None)

# Column totals and row totals of beta, each normalised to sum to 1.
column_totals = beta.sum(axis=0)
row_totals = beta.sum(axis=1)
pw = column_totals / column_totals.sum()
pz = row_totals / row_totals.sum()

# get p(z|w): scale every row of beta by its pz entry, then divide every column by its pw entry.
pzw = beta.multiply(pz, axis=0).divide(pw)




In [76]:
# get rmse
# Root-mean-squared error over the held-out ratings, for the similarity-adjusted
# prediction and for the plain "movie mean + user offset" baseline.
total = 0
count = 0
bl_total = 0
# (user, movie) -> rating Series that only contains the known test ratings.
new_test = test.replace(0, np.nan).transpose().unstack().dropna()

for userId in users:
    if userId % 500 == 0:
        # Progress marker, written without a newline so all markers share one line.
        sys.stdout.write("%s " % userId)

    offset = user_offsets[userId]
    user_test = new_test[userId]

    for movieId, rating in zip(user_test.index, user_test.values):
        movie_mean = item_means["mean"][movieId]
        sim = get_sim(userId, movieId, gamma, pzw, movies, topics)

        # The similarity scales the user's offset around the movie mean; the
        # result is clipped to the 1-5 range and rounded to a whole rating.
        if offset < 0:
            pred = (movie_mean + offset - 2*sim*offset).clip(1,5).round(0)
        else:
            pred = (movie_mean - offset + 2*sim*offset).clip(1,5).round(0)

        error = rating - pred
        # NOTE(review): the baseline is neither clipped nor rounded, unlike `pred`,
        # so the two errors are not measured on equal terms -- confirm this is intended.
        bl_error = rating - (movie_mean+offset)
        bl_total += bl_error * bl_error
        total += error * error
        count += 1

# Finish the progress line.
sys.stdout.write("\n")
if count:
    avg_error = total/count
    avg_bl_error = bl_total/count
    print(np.sqrt(avg_error))
    print(np.sqrt(avg_bl_error))
else:
    # Without any test rating the averages would divide by zero.
    print("no test ratings to evaluate")

500 1000 1500 2000 2500 3000 3500 4000 4500 5000 5500 6000
1.07850146404
0.910019050271
