In [18]:
import os
import numpy as np
import pandas as pd

from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error

def read_ratings(file_path, sep='::'):
    ratings_file = os.path.abspath(file_path)
    column_names = ['userId', 'movieId', 'rating', 'timestamp']
    ratings = pd.read_csv(ratings_file, names=column_names, sep=sep, engine='python')
    ratings = ratings.drop('timestamp', axis=1)
    ratings[['userId', 'movieId']] = ratings[['userId', 'movieId']].astype('int32')
    ratings[['rating']] = ratings[['rating']].astype('int8')
    ratings = ratings.pivot('userId', 'movieId', 'rating').fillna(value=0)
    return ratings

def split_train_test(ratings, test_ratio=0.2):
    test = pd.DataFrame(np.zeros(ratings.shape), index=ratings.index, columns=ratings.columns)
    train = pd.DataFrame(np.zeros(ratings.shape), index=ratings.index, columns=ratings.columns)

    for user in xrange(ratings.shape[0]):
        user_ratings_indexes = ratings.iloc[user, :].nonzero()[0]
        train_indexes, test_indexes = train_test_split(user_ratings_indexes, test_size=test_ratio)
        train.iloc[user, train_indexes] = ratings.iloc[user, train_indexes]
        test.iloc[user, test_indexes] = ratings.iloc[user, test_indexes]

    return train, test

def get_similiarity_matrix(ratings):
    user_similarity = ratings.dot(ratings.transpose())
    normalisation_terms = pd.DataFrame(np.diagonal(user_similarity.values), index=ratings.index).apply(np.sqrt)
    normalisation_terms = normalisation_terms.dot(normalisation_terms.transpose())
    user_similarity = user_similarity.div(normalisation_terms)
    return user_similarity

def get_rmse(predicted, actual):
    return np.sqrt(mean_squared_error(actual.values[actual.values.nonzero()].flatten(),
                                      predicted.values[actual.values.nonzero()].flatten()))

def adjust_user_similarity_knn(user_similarity, k):
        adjusted_similarity = pd.DataFrame(np.zeros(user_similarity.shape),
                                           index=user_similarity.index, columns=user_similarity.columns)

        for user in user_similarity.iterrows():
            top_k_indexes = user[1].sort_values(ascending=False).iloc[0:k+1].index.values
            adjusted_similarity.loc[user[0], top_k_indexes] = user_similarity.loc[user[0], top_k_indexes]

        return adjusted_similarity
        
def predict(ratings, user_similarity, user_means, user_std_devs):
    predictions = user_similarity.dot(ratings)
    denom = user_similarity.abs().sum().transpose()
    predictions = predictions.div(denom, axis='index')
    
    adjusted_predictions_values = predicted.values * self.user_std_devs.values
    adjusted_predicted_values = adjusted_predictions_values + self.user_means.values
    predictions = pd.DataFrame(adjusted_predicted_values).fillna(value=0)
    return predictions

In [2]:
path = os.path.join(os.getcwd(),"..","datasets", "ml1m", "ratings.dat")
full = read_ratings(path)

users= full.index
movies = pd.Index(full.columns.unique())

ratings, test = split_train_test(full)

In [3]:
ratings = ratings.replace(0, np.nan)
user_means = pd.DataFrame(ratings.mean(axis=1), index=ratings.index, columns=['mean']).fillna(value=0)
item_means = pd.DataFrame(ratings.mean(axis=0), index=ratings.columns, columns=['mean']).fillna(value=0)
user_offsets = ratings.subtract(user_means['mean'], 0).abs().mean(axis=1)
new = ratings.subtract(user_means['mean'], 0).divide(user_offsets, 0).transpose().unstack().dropna()

In [4]:
fo = open("rating.dat", "w")

line = str(len(movies)) + " "
for movie in movies:
    line += str(movies.get_loc(movie)) + ":1 "
line += "\n"
fo.write(line)

neg_docs = ""

for user in users:
    pos_doc = ""
    neg_doc = ""
    
    count = 0
    neg_count = 0
    
    for item in new.loc[user].iteritems():
        rating = item[1]
        if rating < 0:
            score = 0
            neg_count += 1
            if rating > -1:
                score = 1
            else:
                score = 2
                
            neg_doc += str(movies.get_loc(item[0])) + ":" + str(score) + " "
                
        elif rating >= 0:
            count += 1
            score = 0
            if rating < 1:
                score = 1
            else:
                score = 2
            
            pos_doc += str(movies.get_loc(item[0])) + ":" + str(score) + " "
            
    line = str(count) + " " + pos_doc + "\n"
    neg_docs += str(neg_count) + " " + neg_doc + "\n"
    fo.write(line)
    
fo.write(neg_docs)
fo.close()

In [137]:
path = os.path.join(os.getcwd(), "..", "param", "rating", "gamma.dat")
gamma = pd.read_table(path, sep=" ", skiprows=1, header = None)

path = os.path.join(os.getcwd(), "..", "param", "rating", "beta.dat")
beta = pd.read_table(path, sep=" ", header = None)

path = os.path.join(os.getcwd(), "..", "param", "rating", "alpha.dat")
alpha = pd.read_table(path, sep=" ", header = None)

In [142]:
alpha

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,...,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01


In [138]:
adj_ratings = ratings.subtract(user_means['mean'], 0).divide(user_offsets, 0).fillna(value=0)
gamma = gamma.replace(0.01000, 0).loc[adj_ratings.index]

In [139]:
sim = pd.DataFrame(get_similiarity_matrix(gamma), index=adj_ratings.index, columns=adj_ratings.index).fillna(value=0)

In [140]:
predictions = sim.dot(adj_ratings).div(sim.abs().sum(), 0)
predictions = predictions.multiply(user_offsets, 0).add(user_means['mean'], 0).fillna(value=0)
get_rmse(predictions, test)

1.0163537287527014

In [141]:
k=10
sim_topk = pd.DataFrame(np.zeros(sim.shape), index=sim.index, columns=sim.columns)

for user in sim.iterrows():
    top_k_indexes = user[1].sort_values(ascending=False).iloc[0:k+1].index.values
    sim_topk.loc[user[0], top_k_indexes] = sim.loc[user[0], top_k_indexes]


In [136]:
predictions = sim_topk.dot(adj_ratings).div(sim.abs().sum(), 0)
predictions = predictions.multiply(user_offsets, 0).add(user_means['mean'], 0).fillna(value=0)
get_rmse(predictions, test)

1.0346097562242511

In [13]:
path = os.path.join(os.getcwd(), "datasets_rating", "ml1m", "movies.dat")
movies = pd.read_table(path, sep="::", header = None, engine='python')

In [14]:
movies = movies.drop(0,1).drop(2,1)

In [28]:
sorted_topics = alpha.transpose().index
for topic in sorted_topics:
    top_words = beta.loc[topic].sort_values(ascending=False).index
    
    if topic < 10:
        print topic, " :" ,
    else:
        print topic , ":" , 
        
    for i in xrange(30):
        print movies.loc[top_words[i]][1][:-7], ' |', 
    print
    print

0  : Permanent Midnight  | Farinelli: il castrato  | Die Hard  | Victor/Victoria  | Peggy Sue Got Married  | Outrageous Fortune  | Midnight Cowboy  | Turbo: A Power Rangers Movie  | U.S. Marshalls  | Casper  | Splendor  | Return to Paradise  | American Dream  | Second Jungle Book: Mowgli & Baloo, The  | Pretty Woman  | Bride of Re-Animator  | Mortal Thoughts  | Mondo  | Saint of Fort Washington, The  | Everything Relative  | Pete's Dragon  | Madonna: Truth or Dare  | Preacher's Wife, The  | Good Will Hunting  | Dirty Work  | In & Out  | Alice and Martin (Alice et Martin)  | Dumb & Dumber  | Three Musketeers, The  | Hard Day's Night, A  |

1  : Two Crimes  | Them!  | Fiendish Plot of Dr. Fu Manchu, The  | Secret Agent  | Dumbo  | King and I, The  | What Dreams May Come  | Bushwhacked  | Harvey  | Thing, The  | Blue Sky  | Road Trip  | Dune  | Working Girl  | Funny Bones  | Shampoo  | Living in Oblivion  | Overnight Delivery  | Seventh Sign, The  | Stupids, The  | Bull Durham  | Dave  | 