In [1]:
import os
import numpy as np
import pandas as pd

from sklearn.cross_validation import train_test_split

def read_ratings(file_path, sep='::'):
    """
    Reads the ratings file into a user x item DataFrame. Ratings are stored in 'database' form,
    where each line is in the form: <user_id><sep><item_id><sep><rating><sep><timestamp>
    Unknown values are 0 and ratings are on a 1-5 scale.
    The rating column is cast to int8, so ratings are expected to be whole numbers.
    :param file_path: The ratings file path
    :param sep: The separator between items
    :return: The user x item ratings DataFrame (rows: userId, columns: movieId)
    """
    ratings_file = os.path.abspath(file_path)
    column_names = ['userId', 'movieId', 'rating', 'timestamp']
    # The python engine is requested explicitly because the default separator is multi-character.
    ratings = pd.read_csv(ratings_file, names=column_names, sep=sep, engine='python')
    ratings = ratings.drop('timestamp', axis=1)
    ratings[['userId', 'movieId']] = ratings[['userId', 'movieId']].astype('int32')
    ratings[['rating']] = ratings[['rating']].astype('int8')
    # Keyword arguments: recent pandas releases no longer accept index/columns/values positionally.
    ratings = ratings.pivot(index='userId', columns='movieId', values='rating')
    # (user, item) pairs that are absent from the file come out of the pivot as NaN; store them as 0.
    return ratings.fillna(value=0)

def split_train_test(ratings, test_ratio=0.2, random_state=None):
    """
    Split the ratings matrix into test and train matrices.
    Each user's non-zero ratings are split independently, so every user with at least two
    ratings appears in both matrices. A user with fewer than two ratings cannot be split;
    whatever rating exists is kept in the train matrix and the test row stays all zero.
    :param ratings: The original user x item ratings DataFrame
    :type ratings: DataFrame
    :param test_ratio: The ratio of ratings to take for the test dataset
    :type test_ratio: float
    :param random_state: Optional seed (or RandomState) forwarded to train_test_split for
                         every user; leave as None to keep the unseeded behaviour
    :return: The train and test ratings dataFrames
    """
    # Work on the raw array: positional fancy indexing is simpler and faster than per-row .iloc.
    values = ratings.values
    train_values = np.zeros(values.shape)
    test_values = np.zeros(values.shape)

    for user in range(values.shape[0]):
        # Column positions this user has rated (0 means "unknown").
        rated_indexes = np.flatnonzero(values[user])
        if len(rated_indexes) < 2:
            # Nothing to split: keep the (at most one) rating for training.
            train_values[user, rated_indexes] = values[user, rated_indexes]
            continue
        train_indexes, test_indexes = train_test_split(
            rated_indexes, test_size=test_ratio, random_state=random_state)
        train_values[user, train_indexes] = values[user, train_indexes]
        test_values[user, test_indexes] = values[user, test_indexes]

    train = pd.DataFrame(train_values, index=ratings.index, columns=ratings.columns)
    test = pd.DataFrame(test_values, index=ratings.index, columns=ratings.columns)
    return train, test

def get_ratings_sparsity(ratings):
    """
    Measures how much of the ratings matrix is filled in.
    Note that the value grows with the number of known ratings: it is the share of
    cells holding a non-zero value, expressed as a percentage.
    :param ratings: The user x item ratings DataFrame
    :type ratings: DataFrame
    :return: The percentage (0-100) of cells in the DataFrame that are non-zero
    """
    rated_rows = ratings.values.nonzero()[0]
    total_cells = ratings.shape[0] * ratings.shape[1]
    filled_fraction = float(len(rated_rows)) / total_cells
    return filled_fraction * 100

In [2]:
# Seed NumPy's global random state so the random train/test split below is the same on
# every Restart & Run All (the split is drawn without an explicit random_state).
SPLIT_SEED = 42
np.random.seed(SPLIT_SEED)

path = os.path.join(os.getcwd(), "datasets_rating", "ml1m", "ratings.dat")
# NOTE: from here on `ratings` holds only the *training* split; the held-out ratings are in `test`.
ratings, test = split_train_test(read_ratings(path))
ratings = ratings.fillna(value=0)

In [3]:
# Treat 0 as "not rated" so the means below only average over actual ratings.
# `ratings` deliberately keeps its NaNs afterwards (it is not filled back with zeros).
ratings = ratings.replace(0, np.nan)

# Mean rating per user (row) and per item (column), each as a one-column frame named 'mean';
# a row/column without any rating gets a mean of 0 instead of NaN.
user_means = ratings.mean(axis=1).to_frame(name='mean').fillna(value=0)
item_means = ratings.mean(axis=0).to_frame(name='mean').fillna(value=0)

In [4]:
# Per-user scale: the mean absolute difference between the user's ratings and the
# corresponding item means (item means are aligned on the movieId columns).
user_offsets = ratings.sub(item_means['mean'], axis=1).abs().mean(axis=1)

In [5]:
# Normalised ratings: subtract the item mean from every column, then divide every row by
# that user's offset. Cells without a rating are NaN and stay NaN.
new = ratings.sub(item_means['mean'], axis=1).div(user_offsets, axis=0)
new.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [6]:
# Flatten the matrix into a Series keyed by (userId, movieId) and drop the unrated pairs.
# This re-binds `new` from a DataFrame to a Series, so the cell must not be run twice in a row.
new = new.T.unstack().dropna()

In [36]:
# Row labels (userId) and column labels (movieId) of the ratings matrix.
# `movies` is used by later cells to translate a movieId into its 0-based position via get_loc.
users= ratings.index
movies = pd.Index(ratings.columns.unique())
# Displayed as the cell output: the number of distinct movie columns.
len(movies)

3706

In [37]:
def rating_to_score(rating):
    """
    Buckets a normalised rating into a discrete score.
    :param rating: The normalised rating of one (user, movie) pair
    :type rating: float
    :return: 1 if rating < 0, 2 if 0 <= rating < 1, 3 if 1 <= rating <= 2, 4 if rating > 2
             (0 only for a value that compares false everywhere, i.e. NaN)
    """
    if rating < 0:
        return 1
    if rating < 1:
        return 2
    if rating <= 2:
        return 3
    if rating > 2:
        return 4
    return 0


# `with` guarantees the file is closed even if a lookup below raises.
with open("rating.dat", "w") as fo:
    # First line: the number of movies, then "<position>:1 " for every movie position.
    header = "".join(str(movies.get_loc(movie)) + ":1 " for movie in movies)
    fo.write(str(len(movies)) + " " + header + "\n")

    # One line per user: the number of rated movies, then "<position>:<score> " per rating.
    for user in users:
        user_ratings = new[user]
        entries = [
            str(movies.get_loc(movie)) + ":" + str(rating_to_score(rating)) + " "
            for movie, rating in zip(user_ratings.index, user_ratings.values)
        ]
        fo.write(str(len(entries)) + " " + "".join(entries) + "\n")

In [81]:
path = os.path.join(os.getcwd(), "cpp", "params", "ratings", "beta.dat")
# The first line of the file is skipped and the remaining lines are read as space-separated
# numbers without a header row. Column 3706 is then dropped.
# NOTE(review): presumably that column is an empty one produced by a trailing separator
# (3706 equals len(movies) shown above) - confirm against the file.
# `axis` is passed by keyword because recent pandas no longer accepts it positionally in drop().
beta = pd.read_table(path, sep=" ", skiprows=1, header=None).drop(3706, axis=1)
beta.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3696,3697,3698,3699,3700,3701,3702,3703,3704,3705
0,0.002699,0.000412,0.000308,0.00031,0.000449,0.000799,0.000111,6e-05,0.000133,0.000356,...,8.1e-05,4e-06,4.6e-05,1e-05,8e-06,8.4e-05,0.000114,5e-06,1.2e-05,0.000503
1,0.002457,0.000122,0.000317,8.7e-05,0.000261,1.8e-05,0.000359,3.3e-05,6.1e-05,0.001269,...,0.000105,2.1e-05,5.8e-05,0.000214,3e-05,0.000945,0.000612,2.7e-05,7e-05,0.000308
2,0.001448,0.000567,0.00069,3.2e-05,0.000522,0.003627,0.000135,7.1e-05,0.000187,0.00048,...,9.8e-05,1.5e-05,3.7e-05,8.7e-05,5.4e-05,0.00106,0.000133,6.5e-05,6.8e-05,0.000584
3,0.002529,0.000857,0.000247,8.6e-05,0.000233,0.000842,0.000138,5.9e-05,3.6e-05,0.001064,...,0.000133,1.3e-05,3e-06,0.000154,2.9e-05,0.001501,0.000469,8.7e-05,8.3e-05,0.000888
4,0.003321,0.000833,0.000831,0.000191,0.000327,0.000606,0.0007,5.9e-05,0.00021,0.001166,...,7.7e-05,1.8e-05,2.5e-05,4.6e-05,6.3e-05,0.000608,0.000489,4.2e-05,9e-06,0.000155


In [83]:
path = os.path.join(os.getcwd(), "cpp", "params", "ratings", "gamma.dat")
# Same layout as beta.dat: skip the first line, read space-separated numbers, drop column 10
# (NOTE(review): presumably an empty trailing column - confirm), then transpose so that each
# line of the file becomes one column of `gamma`.
# `axis` is passed by keyword because recent pandas no longer accepts it positionally in drop().
gamma = pd.read_table(path, sep=" ", skiprows=1, header=None).drop(10, axis=1).transpose()
gamma.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
0,0.097806,0.098043,0.096288,0.103065,0.128455,0.107191,0.101128,0.127123,0.108132,0.104568,...,0.109102,0.09416,0.112048,0.089097,0.106044,0.099848,0.099524,0.092335,0.095212,0.101268
1,0.102412,0.089134,0.095662,0.105168,0.086536,0.09912,0.098195,0.10556,0.098327,0.097383,...,0.130568,0.090798,0.10287,0.116948,0.101571,0.100323,0.098102,0.096703,0.098278,0.094178
2,0.099399,0.098987,0.101127,0.111232,0.081743,0.095846,0.088935,0.0808,0.097611,0.099125,...,0.093822,0.095075,0.085246,0.082014,0.101575,0.098546,0.095517,0.089792,0.098514,0.093543
3,0.09978,0.113442,0.107657,0.099681,0.104479,0.094162,0.091348,0.10738,0.096253,0.088443,...,0.092187,0.097934,0.099479,0.092792,0.094983,0.098436,0.103496,0.098384,0.105506,0.101114
4,0.100518,0.099308,0.09871,0.099734,0.106499,0.096151,0.111925,0.088096,0.097441,0.099265,...,0.103188,0.114837,0.097389,0.111541,0.102123,0.101518,0.094747,0.08656,0.117643,0.098914


In [107]:
# Held-out ratings of a single user: zeros (unrated) become NaN, the matrix is flattened into a
# Series keyed by (userId, movieId), NaNs are dropped and the entries for userId 2 are selected.
# NOTE(review): the variable is named user1_test but the label selected is 2 - confirm intent.
user1_test = test.replace(0, np.nan).transpose().unstack().dropna()[2]
user1_test

movieId
265     4
434     2
457     4
480     5
590     5
648     4
920     5
1207    4
1244    3
1408    3
1442    4
1537    4
1552    3
1610    5
1687    3
1690    3
1784    5
1945    5
1962    5
1968    2
2002    5
2028    4
2268    5
2728    3
3095    4
3108    3
dtype: float64

In [108]:
# Display column 1 of gamma (ten values, see the output below).
# NOTE(review): gamma's columns are labelled 0..6040 by position in gamma.dat; confirm that
# column 1 is the same user as the userId 2 selected for user1_test above.
gamma[1]

0    0.098043
1    0.089134
2    0.098987
3    0.113442
4    0.099308
5    0.107866
6    0.095517
7    0.107287
8    0.095383
9    0.095033
Name: 1, dtype: float64

In [112]:
movie_index = 3095
# Column of beta belonging to this movie, rescaled so that its entries sum to 1.
movie1 = beta[movies.get_loc(movie_index)].divide(beta[movies.get_loc(movie_index)].sum())
user1 = gamma[1]

# Cosine similarity between the user vector and the movie vector:
# dot(user1, movie1) / (||user1|| * ||movie1||).
# The norms are taken over the whole vectors, so they always cover the same entries as the
# dot product instead of a hard-coded number of components.
top = user1.multiply(movie1)
user_tot = np.sqrt(user1.multiply(user1).sum())
movie_tot = np.sqrt(movie1.multiply(movie1).sum())
sim = top.sum() / (user_tot * movie_tot)
sim

0.85760878288708375

0.89535559596028635