In [212]:
import os
import numpy as np
import pandas as pd

from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error

def read_ratings(file_path, sep='::'):
    ratings_file = os.path.abspath(file_path)
    column_names = ['userId', 'movieId', 'rating', 'timestamp']
    ratings = pd.read_csv(ratings_file, names=column_names, sep=sep, engine='python')
    ratings = ratings.drop('timestamp', axis=1)
    ratings[['userId', 'movieId']] = ratings[['userId', 'movieId']].astype('int32')
    ratings[['rating']] = ratings[['rating']].astype('int8')
    ratings = ratings.pivot('userId', 'movieId', 'rating').fillna(value=0)
    return ratings

def split_train_test(ratings, test_ratio=0.2):
    test = pd.DataFrame(np.zeros(ratings.shape), index=ratings.index, columns=ratings.columns)
    train = pd.DataFrame(np.zeros(ratings.shape), index=ratings.index, columns=ratings.columns)

    for user in xrange(ratings.shape[0]):
        user_ratings_indexes = ratings.iloc[user, :].nonzero()[0]
        train_indexes, test_indexes = train_test_split(user_ratings_indexes, test_size=test_ratio)
        train.iloc[user, train_indexes] = ratings.iloc[user, train_indexes]
        test.iloc[user, test_indexes] = ratings.iloc[user, test_indexes]

    return train, test

def get_sim(userId, movieId, gamma, pzw, movies, topics):
    movie1 = pzw[movies.get_loc(movieId)]
    user1 = gamma[userId]

    top = user1.multiply(movie1)
    user_tot = 0
    movie_tot = 0
    for i in xrange(topics):
        user_tot += user1[i] * user1[i]
        movie_tot += movie1[i] * movie1[i]
    user_tot = np.sqrt(user_tot)
    movie_tot = np.sqrt(movie_tot)
    sim = top.sum() / (user_tot * movie_tot)
    return sim

def get_similiarity_matrix(ratings):
    user_similarity = ratings.dot(ratings.transpose())
    normalisation_terms = pd.DataFrame(np.diagonal(user_similarity.values), index=ratings.index).apply(np.sqrt)
    normalisation_terms = normalisation_terms.dot(normalisation_terms.transpose())
    user_similarity = user_similarity.div(normalisation_terms)
    return user_similarity

def get_rmse(predicted, actual):
    return np.sqrt(mean_squared_error(actual.values[actual.values.nonzero()].flatten(),
                                      predicted.values[actual.values.nonzero()].flatten()))

def adjust_user_similarity_knn(user_similarity, k):
        adjusted_similarity = pd.DataFrame(np.zeros(user_similarity.shape),
                                           index=user_similarity.index, columns=user_similarity.columns)

        for user in user_similarity.iterrows():
            top_k_indexes = user[1].sort_values(ascending=False).iloc[0:k+1].index.values
            adjusted_similarity.loc[user[0], top_k_indexes] = user_similarity.loc[user[0], top_k_indexes]

        return adjusted_similarity

In [2]:
path = os.path.join(os.getcwd(), "datasets_rating", "ml1m", "ratings.dat")
full = read_ratings(path)

users= full.index
movies = pd.Index(full.columns.unique())

ratings, test = split_train_test(full)

In [93]:
ratings = ratings.replace(0, np.nan)
user_means = pd.DataFrame(ratings.mean(axis=1), index=ratings.index, columns=['mean']).fillna(value=0)
item_means = pd.DataFrame(ratings.mean(axis=0), index=ratings.columns, columns=['mean']).fillna(value=0)
# user_offsets = ratings.subtract(item_means['mean'], 1).abs().mean(axis=1)
user_offsets = ratings.subtract(user_means['mean'], 0).abs().mean(axis=1)
new = ratings.subtract(user_means['mean'], 0).divide(user_offsets, 0).transpose().unstack().dropna()

In [4]:
fo = open("rating.dat", "w")

line = str(len(movies)) + " "
for movie in movies:
    line += str(movies.get_loc(movie)) + ":1 "
line += "\n"
fo.write(line)

neg_docs = ""

for user in users:
    pos_doc = ""
    neg_doc = ""
    
    count = 0
    neg_count = 0
    
    for item in new[user].iteritems():
        rating = item[1]
        if rating < 0:
            score = 0
            neg_count += 1
            if rating > -1:
                score = 1
            else:
                score = 2
                
            neg_doc += str(movies.get_loc(item[0])) + ":" + str(score) + " "
                
        elif rating >= 0:
            count += 1
            score = 0
            if rating < 1:
                score = 1
            else:
                score = 2
            
            pos_doc += str(movies.get_loc(item[0])) + ":" + str(score) + " "
            
    line = str(count) + " " + pos_doc + "\n"
    neg_docs += str(neg_count) + " " + neg_doc + "\n"
    fo.write(line)
    
fo.write(neg_docs)
fo.close()

In [235]:
topics = 100
# load gamma
path = os.path.join(os.getcwd(), "cpp", "params", "ratings", str(topics), "gamma.dat")
gamma = pd.read_table(path, sep=" ", skiprows=1, header = None).transpose()

#load beta and get p(z|w)
path = os.path.join(os.getcwd(), "cpp", "params", "ratings", str(topics), "beta.dat")
beta = pd.read_table(path, sep=" ", skiprows=1, header = None)
pw = beta.sum(0)
pz = beta.sum(1)
pw = pw / pw.sum()
pz = pz / pz.sum()
pzw = beta.multiply(pz, 0).divide(pw)

In [41]:
new_test = test.replace(0, np.nan).transpose().unstack().dropna()

In [236]:
adj_ratings = ratings.replace(0, np.nan).subtract(user_means['mean'], 0).divide(user_offsets, 0).fillna(value=0)
user_similarity = get_similiarity_matrix(adj_ratings).fillna(value=0)
docs = pd.DataFrame(gamma.T[:6040].values, index = ratings.index)
doc_similarity  = get_similiarity_matrix(docs)

In [237]:
predictions = user_similarity.dot(adj_ratings)
denom = user_similarity.abs().sum().transpose()
predictions = predictions.div(denom, axis='index').multiply(user_offsets, 0).add(user_means['mean'], 0).fillna(value=1)
get_rmse(predictions, test)

1.010953331644598

In [238]:
predictions = doc_similarity.dot(adj_ratings)
denom = doc_similarity.abs().sum().transpose()
predictions = predictions.div(denom, axis='index').multiply(user_offsets, 0).add(user_means['mean'], 0)
get_rmse(predictions, test)

1.0070253685672468

In [232]:
for k in [5,10,20,30,50, 100, 150]:
    simil = adjust_user_similarity_knn(user_similarity, k)
    predictions = simil.dot(adj_ratings)
    denom = simil.abs().sum().transpose()
    predictions = predictions.div(denom, axis='index').multiply(user_offsets, 0).add(user_means['mean'], 0).fillna(value=1)
    predictions = predictions.fillna(value=3).replace(np.inf, 3).replace(-np.inf, 3)
    print k, ":", get_rmse(predictions, test)

5 : 0.996940184171
10 : 0.985678604479
20 : 0.978858026378
30 : 1.0384348066
50 : 0.977512032462
100 : 0.979113652489
150 : 0.981677413447


In [239]:
for k in [5,10,20,30,50, 100, 150]:
    simil = adjust_user_similarity_knn(doc_similarity, k)
    predictions = simil.dot(adj_ratings)
    denom = simil.abs().sum().transpose()
    predictions = predictions.div(denom, axis='index').multiply(user_offsets, 0).add(user_means['mean'], 0).fillna(value=1)
    predictions = predictions.fillna(value=3).replace(np.inf, 3).replace(-np.inf, 3)
    print k, ":", get_rmse(predictions, test)

5 : 1.2894545312
10 : 1.22384671872
20 : 1.1535726669
30 : 1.10039275379
50 : 1.03333114785
100 : 1.00154170175
150 : 1.00241251734


In [240]:
for k in [5,10,20,30,50, 100, 150]:
    simil = adjust_user_similarity_knn(doc_similarity, k)
    predictions = simil.dot(adj_ratings)
    denom = simil.abs().sum().transpose()
    predictions = predictions.div(denom, axis='index').multiply(user_offsets, 0).add(user_means['mean'], 0).fillna(value=1)
    predictions = predictions.fillna(value=1).replace(np.inf, 1).replace(-np.inf, 1)
    print k, ":", get_rmse(predictions, test)

5 : 1.39630091786
10 : 1.34089590469
20 : 1.25737148269
30 : 1.18389470134
50 : 1.08058735804
100 : 1.01551969801
150 : 1.00507971672


In [None]:
def sim(movie, user, k):
    enum = user.multiply(movie).sum()
    denom = np.sqrt(user.multiply(user).sum()) * np.sqrt(movie.multiply(movie).sum())
    return enum / denom

In [None]:
np.corrcoef(pzw[movies.get_loc(745)], gamma[0])

In [None]:
for id, rating in new_test[1].sort_values().iteritems():
    print id, sim(pzw[movies.get_loc(id)], gamma[0], 20).round(2), rating
    print id, np.corrcoef(pzw[movies.get_loc(id)].values, gamma[0].values), rating
    print 

In [None]:
pzw[movies.get_loc(745)].round(1)

In [None]:
# get rmse
total = 0
count = 0
bl_total = 0
new_test = test.replace(0, np.nan).transpose().unstack().dropna()

for user in users:
    userId = user
    if userId % 500 == 0:
        print userId, 
        
    mean = user_means["mean"][userId]
    offset = user_offsets[userId]
    
    for movie in new_test[userId].iteritems():
        movieId, rating = movie
        movie_mean = item_means["mean"][movieId]
        sim = get_sim(userId, movieId, gamma, pzw, movies, topics)

        if offset < 0:
            pred = (movie_mean + offset - 2*sim*offset).clip(1,5).round(0)
        else:
            pred = (movie_mean - offset + 2*sim*offset).clip(1,5).round(0)
        
        error = rating - pred
        bl_error = rating - (movie_mean+offset)
        bl_total += bl_error * bl_error
        total += error * error
        count += 1
        
avg_error = total/count
avg_bl_error = bl_total/count
print 
print np.sqrt(avg_error)
print np.sqrt(avg_bl_error)