In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/movie-lens-dataset/movies.csv
/kaggle/input/movie-lens-dataset/ratings.csv
/kaggle/input/movie-lens-dataset/tags.csv
/kaggle/input/movie-lens-dataset/links.csv


In [2]:
data = pd.read_csv('/kaggle/input/movie-lens-dataset/ratings.csv')
data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
users = data['userId'].unique()
movies = data['movieId'].unique()

print("Number of users", len(users))
print("Number of movies", len(movies))



Number of users 610
Number of movies 9724


In [4]:
test = pd.DataFrame(columns=data.columns)
train = pd.DataFrame(columns=data.columns)

test_ratio = 0.2

for u in users:
    temp = data[data['userId']==u]
    n = len(temp)
    test_size = int(test_ratio * n)
    
temp = temp.sort_values('timestamp').reset_index()
temp.drop('index', axis=1, inplace=True)

dummy_test = temp.loc[n-1-test_size:]
dummy_train = temp.loc[:n-2-test_size]

test = pd.concat([test, dummy_test])
train = pd.concat([train, dummy_train])

In [5]:
test

Unnamed: 0,userId,movieId,rating,timestamp
1041,610,71732,3.5,1493848688
1042,610,113159,3.5,1493848692
1043,610,94867,3.5,1493848703
1044,610,102070,3.0,1493848706
1045,610,59915,3.5,1493848708
...,...,...,...,...
1297,610,101739,3.5,1495959269
1298,610,70,4.0,1495959282
1299,610,328,3.5,1495959299
1300,610,2459,3.5,1495959405


The train data is the older ratings of the users while the test data is newer ratings that we intend on predicting.


In [6]:
train[train['userId']==610].sort_values('timestamp').head()

Unnamed: 0,userId,movieId,rating,timestamp
0,610,318,3.0,1479541963
1,610,2959,5.0,1479541966
2,610,1573,3.5,1479541990
3,610,7163,1.5,1479541995
4,610,3623,3.0,1479542001


In [7]:
test[test['userId']==610].sort_values('timestamp').head()

Unnamed: 0,userId,movieId,rating,timestamp
1041,610,71732,3.5,1493848688
1042,610,113159,3.5,1493848692
1043,610,94867,3.5,1493848703
1044,610,102070,3.0,1493848706
1045,610,59915,3.5,1493848708


In [8]:
1479541963-1493848688

-14306725

In [9]:
def create_utility_matrix(data):
    
    movieField = data['movieId'].tolist()
    userField = data['userId'].tolist()
    ratingField = data['rating'].tolist()
    
    users = list(set(userField))
    movies = list(set(movieField))
    
    users_index = {users[i]: i for i in range(len(users))}
    
    pd_dict = {movie: [np.nan for i in range(len(users))] for movie in movies}
    
    for i in range(0, len(data)):
        movie = movieField[i]
        user = userField[i]
        rating = ratingField[i]
        pd_dict[movie][users_index[user]] = rating
    
    X = pd.DataFrame(pd_dict)
    X.index = users
    
    movies_cols = list(X.columns)
    movies_index = {movies_cols[i]: i for i in range(len(movies_cols))}
    
    return X, users_index, movies_index

In [10]:
X, users_index, movies_index = create_utility_matrix(data)

In [11]:
X

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,98239,98243,131013,131023,32728,163809,98279,32743,65514,98296
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,...,,3.0,,,4.0,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,


In [12]:
data[(data['userId']==609) & (data['movieId']==1)]

Unnamed: 0,userId,movieId,rating,timestamp
99497,609,1,3.0,847221025


In [13]:
# Testing working of masked array
x = np.array([1, 2, 3, -1, 5])
masked_x = np.ma.masked_array(x, np.array([0, 0, 0, 1, 0]))
masked_x.mean() # This will ignore the -1 value and calculate the mean using only the 4 values


2.75

In [14]:
from scipy.linalg import sqrtm

def svd(train, k):
    
    util_mat = np.array(train)
    # the nan or unavailable entries are masked
    mask = np.isnan(util_mat)
    masked_arr = np.ma.masked_array(util_mat, mask)
    item_means = np.mean(masked_arr, axis=0)
    
    # Before: [nan, 1.5, nan, 2.5, 3]
    # After: [2.34, 1.5, 2.34, 2.5, 3]
    # nan entries will replaced by the average rating for each item
    util_mat = masked_arr.filled(item_means)
    x = np.tile(item_means, (util_mat.shape[0],1))
    
    # Before: [2.34, 1.5, 2.34, 2.5, 3]
    # After: [0, -0.84, 0, 0.16, 0.66]
    # we remove the per item average from all entries.
    # the above mentioned nan entries will be essentially zero now
    util_mat = util_mat - x
    
    # Using numpy SVD
    # U is user features
    # V is movie features
    U, s, V=np.linalg.svd(util_mat, full_matrices=False)
    s=np.diag(s)
    
    # we take only the k most significant features
    s=s[0:k,0:k]
    U=U[:,0:k]
    V=V[0:k,:]
    
    s_root=sqrtm(s)
    
    Usk=np.dot(U,s_root)
    skV=np.dot(s_root,V)
    UsV = np.dot(Usk, skV)
    
    UsV = UsV + x
    
    print("svd done")
    return UsV

In [15]:
def rmse(true, pred):
    x = true - pred
    return sum([xi * xi for xi in x])/len(x)

In [16]:
no_of_features = [8, 10, 12, 14, 17]

util_mat, users_index, movies_index = create_utility_matrix(train)

for num_feat in no_of_features:
    pred_svd = svd(util_mat, num_feat)
    preds = []
    
    for _,row in test.iterrows():
        user = row['userId']
        movie = row['movieId']
        
        u_index = users_index[user]
        if movie in movies_index:
            movie_index = movies_index[movie]
            pred_rating = pred_svd[u_index, movie_index]
        else:
            pred_rating = np.mean(pred_svd[u_index, :])
        preds.append(pred_rating)

print(rmse(test['rating'], preds))

svd done
svd done
svd done
svd done
svd done
0.8323800663144226
