In [77]:
import os
import numpy as np
import pandas as pd

from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error

def read_ratings(file_path, sep='::'):
    ratings_file = os.path.abspath(file_path)
    column_names = ['userId', 'movieId', 'rating', 'timestamp']
    ratings = pd.read_csv(ratings_file, names=column_names, sep=sep, engine='python')
    ratings = ratings.drop('timestamp', axis=1)
    ratings[['userId', 'movieId']] = ratings[['userId', 'movieId']].astype('int32')
    ratings[['rating']] = ratings[['rating']].astype('int8')
    ratings = ratings.pivot('userId', 'movieId', 'rating').fillna(value=0)
    return ratings

def split_train_test(ratings, test_ratio=0.2):
    test = pd.DataFrame(np.zeros(ratings.shape), index=ratings.index, columns=ratings.columns)
    train = pd.DataFrame(np.zeros(ratings.shape), index=ratings.index, columns=ratings.columns)

    for user in xrange(ratings.shape[0]):
        user_ratings_indexes = ratings.iloc[user, :].nonzero()[0]
        train_indexes, test_indexes = train_test_split(user_ratings_indexes, test_size=test_ratio)
        train.iloc[user, train_indexes] = ratings.iloc[user, train_indexes]
        test.iloc[user, test_indexes] = ratings.iloc[user, test_indexes]

    return train, test

def get_similiarity_matrix(ratings):
    user_similarity = ratings.dot(ratings.transpose())
    normalisation_terms = pd.DataFrame(np.diagonal(user_similarity.values), index=ratings.index).apply(np.sqrt)
    normalisation_terms = normalisation_terms.dot(normalisation_terms.transpose())
    user_similarity = user_similarity.div(normalisation_terms)
    return user_similarity

def get_rmse(predicted, actual):
    return np.sqrt(mean_squared_error(actual.values[actual.values.nonzero()].flatten(),
                                      predicted.values[actual.values.nonzero()].flatten()))

def adjust_user_similarity_knn(user_similarity, k):
        adjusted_similarity = pd.DataFrame(np.zeros(user_similarity.shape),
                                           index=user_similarity.index, columns=user_similarity.columns)

        for user in user_similarity.iterrows():
            top_k_indexes = user[1].sort_values(ascending=False).iloc[0:k+1].index.values
            adjusted_similarity.loc[user[0], top_k_indexes] = user_similarity.loc[user[0], top_k_indexes]

        return adjusted_similarity
        
def predict(ratings, user_similarity, user_means, user_std_devs):
    predictions = user_similarity.dot(ratings)
    denom = user_similarity.abs().sum().transpose()
    predictions = predictions.div(denom, axis='index')
    
    adjusted_predictions_values = predicted.values * self.user_std_devs.values
    adjusted_predicted_values = adjusted_predictions_values + self.user_means.values
    predictions = pd.DataFrame(adjusted_predicted_values).fillna(value=0)
    return predictions

In [183]:
path = os.path.join(os.getcwd(),"..","datasets", "ml1m", "ratings.dat")
full = read_ratings(path)

users= full.index
movies = pd.Index(full.columns.unique())

ratings, test = split_train_test(full)

In [184]:
ratings = ratings.replace(0, np.nan)
user_means = pd.DataFrame(ratings.mean(axis=1), index=ratings.index, columns=['mean']).fillna(value=0)
item_means = pd.DataFrame(ratings.mean(axis=0), index=ratings.columns, columns=['mean']).fillna(value=0)
user_offsets = ratings.subtract(user_means['mean'], 0).abs().mean(axis=1)
new = ratings.subtract(user_means['mean'], 0).divide(user_offsets, 0).transpose().unstack().dropna()

In [185]:
fo = open("rating.dat", "w")

line = str(len(movies)) + " "
for movie in movies:
    line += str(movies.get_loc(movie)) + ":1 "
line += "\n"
fo.write(line)

neg_docs = ""

for user in users:
    pos_doc = ""
    neg_doc = ""
    
    count = 0
    neg_count = 0
    
    for item in new.loc[user].iteritems():
        rating = item[1]
        if rating < 0:
            score = 0
#             count += 1
#             neg_count += 1
            if rating > -1:
                score = 1
                count += 1
                
                pos_doc += str(movies.get_loc(item[0])) + ":" + str(score) + " "
                
        elif rating >= 0:
            count += 1
            score = 0
            if rating < 1:
                score = 3
            else:
                score = 4
            
            pos_doc += str(movies.get_loc(item[0])) + ":" + str(score) + " "
            
    line = str(count) + " " + pos_doc + "\n"
#     neg_docs += str(neg_count) + " " + neg_doc + "\n"
    fo.write(line)
    
# fo.write(neg_docs)
fo.close()

In [195]:
path = os.path.join(os.getcwd(), "..", "param", "rating", "theta.dat")
gamma = pd.read_table(path, sep=" ", header = None)

path = os.path.join(os.getcwd(), "..", "param", "rating", "phi.dat")
beta = pd.read_table(path, sep=" ", header = None)

path = os.path.join(os.getcwd(), "..", "param", "rating", "alpha.dat")
alpha = pd.read_table(path, sep=" ", header = None)

In [196]:
gamma = pd.DataFrame(gamma.replace(gamma[0][0], 0).values, index = ratings.index)

In [197]:
gamma

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,140,141,142,143,144,145,146,147,148,149
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.000000,0,0.000000,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.000000,0,0.000000,0,0.000000,0.000000,0.000057,0.000000,0.000000,0.000000,...,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.000000,0,0.000000,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000017,0.000000,0.000000,0.000000
4,0.000000,0,0.000000,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5,0.000000,0,0.000037,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
6,0.000000,0,0.000000,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
7,0.000000,0,0.000000,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
8,0.000000,0,0.000000,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000050,0.000000,0.000000
9,0.000000,0,0.000000,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
10,0.000000,0,0.000000,0,0.000039,0.000000,0.000000,0.000000,0.000000,0.000000,...,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [198]:
adj_ratings = ratings.subtract(user_means['mean'], 0).divide(user_offsets, 0).fillna(value=0)

In [199]:
sim = pd.DataFrame(get_similiarity_matrix(gamma), index=adj_ratings.index, columns=adj_ratings.index).fillna(value=0)

In [200]:
predictions = sim.dot(adj_ratings).div(sim.abs().sum(), 0)
predictions = predictions.multiply(user_offsets, 0).add(user_means['mean'], 0).fillna(value=0)
get_rmse(predictions, test)

1.0012240649848365

In [132]:
k=200
sim_topk = pd.DataFrame(np.zeros(sim.shape), index=sim.index, columns=sim.columns)

for user in sim.iterrows():
    top_k_indexes = user[1].sort_values(ascending=False).iloc[0:k+1].index.values
    sim_topk.loc[user[0], top_k_indexes] = sim.loc[user[0], top_k_indexes]

In [133]:
predictions = sim_topk.dot(adj_ratings).div(sim.abs().sum(), 0)
predictions = predictions.multiply(user_offsets, 0).add(user_means['mean'], 0).fillna(value=0)
get_rmse(predictions, test)

1.0274499889433504

In [201]:
path = os.path.join(os.getcwd(), "..","datasets", "ml1m", "movies.dat")
movies = pd.read_table(path, sep="::", header = None, engine='python')

In [202]:
movies = movies.drop(0,1).drop(2,1)

In [203]:
sorted_topics = alpha.transpose().index
for topic in sorted_topics:
    top_words = beta.loc[topic].sort_values(ascending=False).index
    
    if topic < 10:
        print topic, " :" ,
    else:
        print topic , ":" , 
        
    for i in xrange(30):
        print movies.loc[top_words[i]][1][:-7], ' |', 
    print
    print

0  : Princess Caraboo  | G.I. Jane  | Amazing Panda Adventure, The  | Princess Bride, The  | Canadian Bacon  | I'm Not Rappaport  | Money Talks  | Truth About Cats & Dogs, The  | Romy and Michele's High School Reunion  | Pleasure Garden, The  | American President, The  | Brother's Kiss, A  | Stupids, The  | Breathing Room  | Soft Fruit  | Mr. Jones  | Clan of the Cave Bear, The  | Baby-Sitters Club, The  | Malcolm X  | I Dreamed of Africa  | Beefcake  | House Party 3  | N�nette et Boni  | Princess Mononoke, The (Mononoke Hime)  | Shop Around the Corner, The  | Marlene Dietrich: Shadow and Light  | Eye for an Eye  | Stuart Saves His Family  | Hard Target  | Richie Rich  |

1  : Abyss, The  | Madonna: Truth or Dare  | Princess Caraboo  | Outlaw Josey Wales, The  | In Too Deep  | Tomb of Ligeia, The  | Right Stuff, The  | Blood Feast  | Pit and the Pendulum  | Splendor in the Grass  | Razor's Edge, The  | Puppet Master  | Somewhere in the City  | New Rose Hotel  | Walk in the Sun, A  | Go