In [1]:
import os
import numpy as np
import pandas as pd

from sklearn.cross_validation import train_test_split

def read_ratings(file_path, sep='::'):
    """
    Reads the ratings file into a user x item DataFrame. Ratings are stored in 'database' form,
    where each line is in the form: <user_id><sep><item_id><sep><rating><sep><timestamp>
    Unknown values are 0 and ratings are on a 1-5 scale.
    :param file_path: The ratings file path (relative paths are resolved against the working directory)
    :param sep: The separator between items
    :return: The user x item ratings DataFrame (index: userId, columns: movieId, values: rating)
    :raises ValueError: If the file contains the same (user, item) pair more than once (raised by pivot)
    """
    ratings_file = os.path.abspath(file_path)
    column_names = ['userId', 'movieId', 'rating', 'timestamp']
    # A separator longer than one character is interpreted by pandas as a regular expression,
    # which is only supported by the (slower) 'python' parser engine.
    ratings = pd.read_csv(ratings_file, names=column_names, sep=sep, engine='python')
    ratings = ratings.drop('timestamp', axis=1)
    # Down-cast before pivoting to keep the long-form table small.
    ratings[['userId', 'movieId']] = ratings[['userId', 'movieId']].astype('int32')
    ratings[['rating']] = ratings[['rating']].astype('int8')
    # Keyword arguments: positional index/columns/values are rejected by pandas >= 2.0.
    ratings = ratings.pivot(index='userId', columns='movieId', values='rating')
    # Pairs that never appear in the file come out of the pivot as NaN; encode them as 0.
    return ratings.fillna(value=0)

def split_train_test(ratings, test_ratio=0.2, random_state=None):
    """
    Split the ratings matrix into test and train matrices.
    The split is done per user: for every row, the non-zero entries are shuffled and
    ceil(test_ratio * n) of them are moved to the test matrix, the rest to the train matrix.
    At least one rating of every user that has any always stays in the train matrix, and users
    without ratings are left as all-zero rows in both outputs.
    :param ratings: The original user x item ratings DataFrame (0 means "not rated")
    :type ratings: DataFrame
    :param test_ratio: The ratio of ratings to take for the test dataset, between 0 and 1
    :type test_ratio: float
    :param random_state: Seed (or numpy RandomState) for a reproducible split; None draws a fresh one
    :type random_state: int, numpy.random.RandomState or None
    :return: The train and test ratings dataFrames, same shape and labels as the input
    :raises ValueError: If test_ratio is outside [0, 1]
    """
    if not 0.0 <= test_ratio <= 1.0:
        raise ValueError("test_ratio must be between 0 and 1, got %r" % (test_ratio,))

    if isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Work on plain arrays: per-row writes into a DataFrame through iloc are far slower.
    values = ratings.values
    train_values = np.zeros(values.shape)
    test_values = np.zeros(values.shape)

    for user in range(values.shape[0]):
        rated_indexes = np.flatnonzero(values[user])
        if rated_indexes.size == 0:
            # Nothing to split for a user without ratings.
            continue
        # Never move every rating of a user to the test set.
        test_count = min(int(np.ceil(test_ratio * rated_indexes.size)), rated_indexes.size - 1)
        shuffled = rng.permutation(rated_indexes)
        test_indexes = shuffled[:test_count]
        train_indexes = shuffled[test_count:]
        train_values[user, train_indexes] = values[user, train_indexes]
        test_values[user, test_indexes] = values[user, test_indexes]

    train = pd.DataFrame(train_values, index=ratings.index, columns=ratings.columns)
    test = pd.DataFrame(test_values, index=ratings.index, columns=ratings.columns)
    return train, test

def get_ratings_sparsity(ratings):
    """
    Calculates how densely the ratings matrix is filled.
    Note that, despite the function name, the figure returned is the percentage of entries that
    are non-zero (i.e. rated), so a LOWER value means a sparser matrix.
    :param ratings: The user x item ratings DataFrame (0 means "not rated")
    :type ratings: DataFrame
    :return: The percentage (0-100) of non-zero entries; 0.0 for a DataFrame without any cells
    """
    total_cells = ratings.shape[0] * ratings.shape[1]
    if total_cells == 0:
        # Avoid a ZeroDivisionError on an empty matrix.
        return 0.0
    filled_cells = np.count_nonzero(ratings.values)
    # float() keeps the division exact under Python 2 as well.
    return float(filled_cells) / total_cells * 100

In [2]:
path = os.path.join(os.getcwd(), "datasets_rating", "ml1m", "ratings.dat")
# NOTE: from here on `ratings` is the TRAINING split only; the held-out ratings are in `test`.
# Both frames are NaN-free at this point (read_ratings encodes gaps as 0 and the split copies
# values into zero-initialised frames), so no fillna is needed.
ratings, test = split_train_test(read_ratings(path))
# Percentage of filled cells in the training matrix, shown as the cell output.
get_ratings_sparsity(ratings)

3.56392712015


In [3]:
# Mark unrated cells (stored as 0) as NaN so that mean() skips them. The NaNs are kept on
# purpose: the following cells rely on them (NaN-skipping means, dropna()).
ratings = ratings.replace(0, np.nan)
# One-column ('mean') frames holding the average rating per user (row) and per item (column);
# a row/column without any rating has an undefined mean, which is stored as 0.
user_means = ratings.mean(axis='columns').to_frame('mean').fillna(value=0)
item_means = ratings.mean(axis='index').to_frame('mean').fillna(value=0)

In [4]:
# Per user: mean absolute difference between the user's ratings and the corresponding item
# means (unrated cells are NaN and therefore ignored by mean()).
user_offsets = (
    ratings.sub(item_means['mean'], axis='columns')
    .abs()
    .mean(axis='columns')
)
user_offsets.head()

userId
1    0.665123
2    0.717332
3    0.739184
4    0.742620
5    0.944570
dtype: float64

In [8]:
# Normalised ratings: subtract each item's mean (aligned on the columns), then divide every
# row by that user's offset computed above (aligned on the index).
new = (
    ratings.sub(item_means['mean'], axis='columns')
    .div(user_offsets, axis='index')
)
new.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.313168,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [34]:
# Flatten the normalised matrix into a (userId, movieId)-indexed Series, drop the unrated
# pairs and show what is left for user 1.
new.T.unstack().dropna().loc[1]

movieId
1       1.313168
48      2.981652
260    -0.697595
527     0.747717
531     0.219610
588     0.330623
595     1.696964
608    -0.394692
720    -2.166649
745    -2.256688
783     1.149158
914    -1.731372
919    -0.356462
938     0.511388
1022    1.905303
1028    1.674415
1029    1.950985
1097    0.046042
1193    0.928882
1197   -1.948564
1207   -0.627628
1246   -0.024003
1287    1.352309
1545   -0.092050
1566    1.062200
1836    2.501247
1907    0.336717
1961    1.430577
2018    0.352476
2028    0.990167
2294    0.836899
2355    1.724451
2687   -1.002321
2692   -0.338590
2762   -0.644023
2791    0.044864
2797    0.233624
2918   -0.168732
3105    1.870874
3114   -0.336321
3186    0.820081
3408    0.201917
dtype: float64

In [36]:
# Same view of user 1 as above, with every normalised rating multiplied by 5 and rounded to
# the nearest whole number.
new.T.unstack().dropna().mul(5).round().loc[1]

movieId
1        7
48      15
260     -3
527      4
531      1
588      2
595      8
608     -2
720    -11
745    -11
783      6
914     -9
919     -2
938      3
1022    10
1028     8
1029    10
1097     0
1193     5
1197   -10
1207    -3
1246    -0
1287     7
1545    -0
1566     5
1836    13
1907     2
1961     7
2018     2
2028     5
2294     4
2355     9
2687    -5
2692    -2
2762    -3
2791     0
2797     1
2918    -1
3105     9
3114    -2
3186     4
3408     1
dtype: float64