In [1]:
import os
import numpy as np
import pandas as pd

from sklearn.cross_validation import train_test_split

def read_ratings(file_path, sep='::'):
    """
    Reads the ratings file into a dense user x item DataFrame.

    Ratings are stored on disk in 'database' (long) form, one rating per line:
    <user_id><sep><item_id><sep><rating><sep><timestamp>
    The timestamp column is discarded. Unknown (missing) user/item pairs are
    filled with 0; real ratings are expected to be on a 1-5 scale, so 0 can be
    used as the "not rated" marker.

    :param file_path: The ratings file path (made absolute before reading)
    :param sep: The separator between fields on a line
    :return: The user x item ratings DataFrame (index: userId, columns: movieId)
    """
    ratings_file = os.path.abspath(file_path)
    column_names = ['userId', 'movieId', 'rating', 'timestamp']
    # engine='python' is required for a multi-character separator such as '::'.
    ratings = pd.read_csv(ratings_file, names=column_names, sep=sep, engine='python')
    ratings = ratings.drop('timestamp', axis=1)
    # Downcast to keep the long-form frame small before pivoting.
    ratings[['userId', 'movieId']] = ratings[['userId', 'movieId']].astype('int32')
    ratings[['rating']] = ratings[['rating']].astype('int8')
    # Keyword arguments: pandas >= 2.0 no longer accepts index/columns/values
    # positionally, while older releases accept the keywords as well.
    ratings = ratings.pivot(index='userId', columns='movieId', values='rating').fillna(value=0)
    return ratings

def split_train_test(ratings, test_ratio=0.2, random_state=None):
    """
    Split the ratings matrix into test and train matrices.

    The split is done per user: each user's rated items (non-zero cells) are
    randomly partitioned with ``train_test_split`` so that roughly
    ``test_ratio`` of them land in the test matrix. Both returned frames have
    the same shape, index and columns as ``ratings``; cells that do not belong
    to a frame are 0.

    Users with fewer than two ratings cannot be split; their ratings (if any)
    are kept entirely in the train matrix.

    :param ratings: The original user x item ratings DataFrame (0 = unrated)
    :type ratings: DataFrame
    :param test_ratio: The ratio of ratings to take for the test dataset
    :type test_ratio: float
    :param random_state: Optional seed (int) or ``numpy.random.RandomState`` for a
                         reproducible split. ``None`` (default) keeps the original
                         behaviour of drawing from the global NumPy random state.
    :return: The train and test ratings dataFrames
    """
    test = pd.DataFrame(np.zeros(ratings.shape), index=ratings.index, columns=ratings.columns)
    train = pd.DataFrame(np.zeros(ratings.shape), index=ratings.index, columns=ratings.columns)

    # One shared generator for all users: re-using the same integer seed for
    # every call would give every user with the same number of ratings the
    # same permutation.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    for user in range(ratings.shape[0]):
        user_row = ratings.iloc[user, :].values
        # Positional column indexes of the items this user has rated.
        user_ratings_indexes = np.flatnonzero(user_row)

        if len(user_ratings_indexes) < 2:
            # Nothing to split: keep the single rating (if any) for training.
            if len(user_ratings_indexes) == 1:
                train.iloc[user, user_ratings_indexes] = user_row[user_ratings_indexes]
            continue

        train_indexes, test_indexes = train_test_split(
            user_ratings_indexes, test_size=test_ratio, random_state=rng)
        train.iloc[user, train_indexes] = user_row[train_indexes]
        test.iloc[user, test_indexes] = user_row[test_indexes]

    return train, test

def get_sim(userId, movieId, gamma, pzw, movies, topics):
    """
    Cosine similarity between a user's topic vector and a movie's topic vector.

    :param userId: Column label of the user's vector in ``gamma``
    :param movieId: Movie id; translated to a column of ``pzw`` via ``movies.get_loc``
    :param gamma: DataFrame whose column ``userId`` holds the user's per-topic weights
    :param pzw: DataFrame whose column at the movie's position holds the movie's
                per-topic weights
    :param movies: pandas Index mapping a movie id to its position
    :param topics: Number of topics; the norms are taken over labels 0..topics-1
    :return: dot(user, movie) / (||user|| * ||movie||), or 0.0 when either
             vector has zero norm (the similarity is undefined there and would
             otherwise be NaN and poison any running total built from it)
    """
    movie1 = pzw[movies.get_loc(movieId)]
    user1 = gamma[userId]

    # Numerator: element-wise product aligned on the topic labels.
    top = user1.multiply(movie1)

    # Squared norms, accumulated in the same order as a plain loop would.
    user_tot = sum(user1[i] * user1[i] for i in range(topics))
    movie_tot = sum(movie1[i] * movie1[i] for i in range(topics))

    denominator = np.sqrt(user_tot) * np.sqrt(movie_tot)
    if denominator == 0:
        return 0.0
    return top.sum() / denominator

In [34]:
# Seed the global NumPy random state before the stochastic train/test split.
# split_train_test calls train_test_split without a random_state, so without
# this every kernel restart produces a different split and different results.
SEED = 42
np.random.seed(SEED)

# Load the full user x movie ratings matrix (0 = unrated).
path = os.path.join(os.getcwd(), "datasets_rating", "ml1m", "ratings.dat")
full = read_ratings(path)

# Row labels are user ids; `movies` maps a movie id to its column position.
users = full.index
movies = pd.Index(full.columns.unique())

# `ratings` is the training matrix from here on; `test` holds the held-out ratings.
ratings, test = split_train_test(full)

In [3]:
# Treat 0 ("unrated") as missing so the means below only see real ratings.
ratings = ratings.replace(0, np.nan)

# Mean rating per user (row) and per movie (column); an all-missing row or
# column has no mean and falls back to 0.
user_means = ratings.mean(axis=1).to_frame('mean').fillna(value=0)
item_means = ratings.mean(axis=0).to_frame('mean').fillna(value=0)

# Each rating expressed as its deviation from the movie's mean rating.
item_centered = ratings.subtract(item_means['mean'], axis=1)

# A user's offset is the (signed) average deviation from the movie means.
# Alternative kept for reference: item_centered.abs().mean(axis=1)
user_offsets = item_centered.mean(axis=1)

# Deviations scaled by the user's offset, flattened to a Series keyed by
# (userId, movieId) with the unrated cells dropped.
new = item_centered.divide(user_offsets, axis=0).T.unstack().dropna()

In [118]:
# Write "rating.dat": one document per line in the form
#   <number of entries> <movie position>:<weight> <movie position>:<weight> ...
# NOTE(review): presumably the sparse bag-of-words input expected by the
# topic-model code whose parameters are loaded from cpp/params — confirm.
#
# Layout of the file:
#   line 1          : every movie once, with weight 1
#   next len(users) : per user, the movies with a non-negative normalised rating
#   last len(users) : per user, the movies with a negative normalised rating
#
# All lines are assembled in memory first, so a failure part-way through
# (e.g. a user missing from `new`) does not leave a truncated file behind.

vocabulary_line = (
    str(len(movies)) + " "
    + "".join(str(movies.get_loc(movie)) + ":1 " for movie in movies)
    + "\n"
)

pos_lines = []
neg_lines = []

for user in users:
    pos_entries = []
    neg_entries = []

    for movie_id, rating in new[user].items():
        # Weight 1 for a deviation strictly inside (-1, 1), weight 2 otherwise.
        score = 1 if -1 < rating < 1 else 2
        entry = str(movies.get_loc(movie_id)) + ":" + str(score) + " "

        if rating < 0:
            neg_entries.append(entry)
        elif rating >= 0:
            pos_entries.append(entry)

    pos_lines.append(str(len(pos_entries)) + " " + "".join(pos_entries) + "\n")
    neg_lines.append(str(len(neg_entries)) + " " + "".join(neg_entries) + "\n")

# `with` guarantees the handle is closed even if a write fails.
with open("rating.dat", "w") as fo:
    fo.write(vocabulary_line)
    fo.writelines(pos_lines)
    fo.writelines(neg_lines)

In [23]:
topics = 50
params_dir = os.path.join(os.getcwd(), "cpp", "params", "ratings", str(topics))

# Load gamma (first line of the file is skipped) and transpose it so that a
# document's topic weights can be read as a column: gamma[<document position>].
path = os.path.join(params_dir, "gamma.dat")
gamma = pd.read_table(path, sep=" ", skiprows=1, header=None).T

# Load beta (first line of the file is skipped).
# NOTE(review): assumes one row per topic and one column per movie position,
# with values that are probabilities/counts rather than logs — confirm.
path = os.path.join(params_dir, "beta.dat")
beta = pd.read_table(path, sep=" ", skiprows=1, header=None)

# Marginals from the column sums (per word) and row sums (per topic),
# each normalised to sum to 1.
pw = beta.sum(axis=0)
pz = beta.sum(axis=1)
pw = pw.divide(pw.sum())
pz = pz.divide(pz.sum())

# p(z|w): scale every row of beta by p(z), then every column by 1 / p(w).
pzw = beta.multiply(pz, axis=0).divide(pw)

In [35]:
# Held-out ratings flattened to a Series keyed by (userId, movieId);
# cells that hold no test rating (0) are dropped.
new_test = test.replace(0, np.nan).T.unstack().dropna()

In [88]:
def sim(movie, user, k):
    """
    Cosine similarity between a movie vector and a user vector.

    :param movie: pandas Series of the movie's weights
    :param user: pandas Series of the user's weights (aligned with ``movie`` on index)
    :param k: Unused; kept so existing calls keep working
    :return: dot(user, movie) / (||user|| * ||movie||)
    """
    dot_product = (user * movie).sum()
    user_norm = np.sqrt((user * user).sum())
    movie_norm = np.sqrt((movie * movie).sum())
    return dot_product / (user_norm * movie_norm)

In [117]:
# Pearson correlation between movie 745's topic vector and the first column of gamma.
movie_topics = pzw[movies.get_loc(745)]
user_topics = gamma[0]
np.corrcoef(movie_topics, user_topics)

array([[ 1.      ,  0.944831],
       [ 0.944831,  1.      ]])

In [116]:
# For each of user 1's held-out ratings (lowest rating first), print the movie id,
# a similarity score and the actual rating: first the cosine similarity, then
# the Pearson correlation matrix.
# NOTE(review): user 1 is paired with gamma[0] here, while get_sim indexes
# gamma by the raw user id — confirm which mapping matches gamma.dat.
# Single-argument print(...) calls produce the same text on Python 2 and 3.
for movie_id, rating in new_test[1].sort_values().items():
    movie_topics = pzw[movies.get_loc(movie_id)]
    print("%s %s %s" % (movie_id, sim(movie_topics, gamma[0], 20).round(2), rating))
    print("%s %s %s" % (movie_id, np.corrcoef(movie_topics.values, gamma[0].values), rating))
    print("")


745 0.92 3.0
745 [[ 1.        0.944831]
 [ 0.944831  1.      ]] 3.0

531 0.59 4.0
531 [[ 1.          0.51708081]
 [ 0.51708081  1.        ]] 4.0

594 0.9 4.0
594 [[ 1.          0.96075886]
 [ 0.96075886  1.        ]] 4.0

608 0.15 4.0
608 [[ 1.          0.01637929]
 [ 0.01637929  1.        ]] 4.0

783 0.9 4.0
783 [[ 1.         0.9602857]
 [ 0.9602857  1.       ]] 4.0

1207 0.12 4.0
1207 [[ 1.         -0.05852308]
 [-0.05852308  1.        ]] 4.0

2398 0.9 4.0
2398 [[ 1.          0.94978414]
 [ 0.94978414  1.        ]] 4.0

3114 0.51 4.0
3114 [[ 1.          0.41492295]
 [ 0.41492295  1.        ]] 4.0

1193 0.2 5.0
1193 [[ 1.         -0.00291028]
 [-0.00291028  1.        ]] 5.0

1270 0.35 5.0
1270 [[ 1.          0.15226795]
 [ 0.15226795  1.        ]] 5.0

2028 0.28 5.0
2028 [[ 1.          0.17731344]
 [ 0.17731344  1.        ]] 5.0



0     0.1
1     0.2
2     0.1
3     0.1
4     0.1
5     0.1
6     0.1
7     0.1
8     1.0
9     0.3
10    0.1
11    0.1
12    0.1
13    0.1
14    0.1
15    0.1
16    0.1
17    0.1
18    0.1
19    0.2
Name: 0, dtype: float64

In [105]:
# Topic weights of movie 745, rounded to one decimal for display.
pzw[movies.get_loc(745)].round(decimals=1)

0     0.0
1     0.0
2     0.0
3     0.0
4     0.0
5     0.1
6     0.0
7     0.0
8     0.7
9     0.1
10    0.0
11    0.0
12    0.1
13    0.0
14    0.0
15    0.1
16    0.0
17    0.0
18    0.0
19    0.0
Name: 708, dtype: float64

In [18]:
# get rmse
# Compares two predictors on the held-out ratings:
#   * model    : movie mean shifted by the user's offset, with the direction and
#                size of the shift driven by the user/movie topic similarity
#   * baseline : movie mean + user offset
total = 0
count = 0
bl_total = 0
new_test = test.replace(0, np.nan).transpose().unstack().dropna()

# Users that actually have held-out ratings. Indexing new_test with any other
# user id raises KeyError.
# NOTE(review): the recorded run of this cell ended with `KeyError: 51L`;
# this guard covers a user missing from new_test. If the error came from
# gamma lacking that column instead, it still needs fixing at the source.
test_users = set(new_test.index.get_level_values(0))

for user in users:
    userId = user
    if userId % 500 == 0:
        # Progress marker.
        print(userId)

    if userId not in test_users:
        continue

    offset = user_offsets[userId]

    for movieId, rating in new_test[userId].items():
        movie_mean = item_means["mean"][movieId]
        # Named `similarity` so that the `sim` function defined in an earlier
        # cell is not overwritten by a float.
        similarity = get_sim(userId, movieId, gamma, pzw, movies, topics)

        # Both branches equal movie_mean + |offset| * (2 * similarity - 1):
        # similarity 1 predicts |offset| above the movie mean, similarity 0
        # predicts |offset| below it.
        if offset < 0:
            raw_pred = movie_mean + offset - 2*similarity*offset
        else:
            raw_pred = movie_mean - offset + 2*similarity*offset
        # Predictions are snapped onto the 1-5 rating scale.
        pred = np.round(np.clip(raw_pred, 1, 5), 0)

        error = rating - pred
        bl_error = rating - (movie_mean+offset)
        bl_total += bl_error * bl_error
        total += error * error
        count += 1

print("")
if count == 0:
    # Avoid a division by zero when there is nothing to evaluate.
    print("no test ratings to evaluate")
else:
    avg_error = total/count
    avg_bl_error = bl_total/count
    print(np.sqrt(avg_error))
    print(np.sqrt(avg_bl_error))

KeyError: 51L