https://towardsdatascience.com/beginners-guide-to-creating-an-svd-recommender-system-1fd7326d1f65

In [9]:
import pandas as pd
import numpy as np
import scipy
from scipy.linalg import sqrtm

In [10]:
data = pd.read_csv('ratings.csv')
data['userId'] = data['userId'].astype('str')
data['movieId'] = data['movieId'].astype('str')
users = data['userId'].unique() #list of all users
movies = data['movieId'].unique() #list of all movies
test = pd.DataFrame(columns=data.columns)
train = pd.DataFrame(columns=data.columns)
test_ratio = 0.2 #fraction of data to be used as test set.

print("Number of users", len(users))
print("Number of movies", len(movies))
print(data.head())

Number of users 671
Number of movies 9066
  userId movieId  rating   timestamp
0      1      31     2.5  1260759144
1      1    1029     3.0  1260759179
2      1    1061     3.0  1260759182
3      1    1129     2.0  1260759185
4      1    1172     4.0  1260759205


In [11]:
for u in users:
    temp = data[data['userId'] == u]
    n = len(temp)
    test_size = int(test_ratio*n)
    
temp = temp.sort_values('timestamp').reset_index()
temp.drop('index', axis=1, inplace=True)

In [12]:
dummy_test = temp.iloc[n-1-test_size :]
dummy_train = temp.iloc[: n-2-test_size]
    
test = pd.concat([test, dummy_test])
train = pd.concat([train, dummy_train])

In [13]:
test

Unnamed: 0,userId,movieId,rating,timestamp
91,671,5299,3.0,1065112004
92,671,745,4.0,1065149085
93,671,2918,4.0,1065149106
94,671,1225,4.0,1065149143
95,671,457,4.0,1065149159
96,671,6269,4.0,1065149201
97,671,3671,3.0,1065149267
98,671,1266,4.0,1065149270
99,671,2401,4.0,1065149286
100,671,590,4.0,1065149296


In [14]:
train

Unnamed: 0,userId,movieId,rating,timestamp
0,671,2683,4.0,1063500751
1,671,2355,4.0,1063500762
2,671,1206,3.0,1063500775
3,671,1247,4.0,1063500804
4,671,2797,4.0,1063500821
...,...,...,...,...
85,671,4019,3.5,1065111959
86,671,5816,4.0,1065111963
87,671,4880,4.0,1065111973
88,671,4308,3.5,1065111985


In [15]:
def create_utility_matrix(data, formatizer = {'user':0, 'item': 1, 'value': 2}):
    """
        :param data:      Array-like, 2D, nx3
        :param formatizer:pass the formatizer
        :return:          utility matrix (n x m), n=users, m=items
    """
        
    itemField = formatizer['item']
    userField = formatizer['user']
    valueField = formatizer['value']

    userList = data.iloc[:,userField].tolist()
    itemList = data.iloc[:,itemField].tolist()
    valueList = data.iloc[:,valueField].tolist()
    
    users = list(set(data.iloc[:,userField]))
    items = list(set(data.iloc[:,itemField]))
    users_index = {users[i]: i for i in range(len(users))}
    pd_dict = {item: [np.nan for i in range(len(users))] for item in items}
    
    for i in range(0,len(data)):
        item = itemList[i]
        user = userList[i]
        value = valueList[i]
    
    pd_dict[item][users_index[user]] = value
    X = pd.DataFrame(pd_dict)
    X.index = users
        
    itemcols = list(X.columns)
    items_index = {itemcols[i]: i for i in range(len(itemcols))}
    # users_index gives us a mapping of user_id to index of user
    # items_index provides the same for items
    return X, users_index, items_index

In [16]:
def svd(train, k):
    utilMat = np.array(train)

    # the nan or unavailable entries are masked
    mask = np.isnan(utilMat)
    masked_arr = np.ma.masked_array(utilMat, mask)
    item_means = np.mean(masked_arr, axis=0)

    # nan entries will replaced by the average rating for each item
    utilMat = masked_arr.filled(item_means)
    x = np.tile(item_means, (utilMat.shape[0],1))

    # we remove the per item average from all entries.
    # the above mentioned nan entries will be essentially zero now
    utilMat = utilMat - x

    # The magic happens here. U and V are user and item features
    U, s, V=np.linalg.svd(utilMat, full_matrices=False)
    s=np.diag(s)
    
    # we take only the k most significant features
    s=s[0:k,0:k]
    U=U[:,0:k]
    V=V[0:k,:]
    s_root=sqrtm(s)
    Usk=np.dot(U,s_root)
    skV=np.dot(s_root,V)
    UsV = np.dot(Usk, skV)
    UsV = UsV + x
    print("svd done")
    return UsV

In [17]:
def rmse(true, pred):
    # this will be used towards the end
    x = true - pred
    return sum([xi*xi for xi in x])/len(x)
# to test the performance over a different number of features
no_of_features = [8,10,12,14,17]
utilMat, users_index, items_index = create_utility_matrix(train)
for f in no_of_features: 
    svdout = svd(utilMat, k=f)
    pred = [] #to store the predicted ratings
    for _,row in test.iterrows():
        user = row['userId']
        item = row['movieId']
        u_index = users_index[user]
        if item in items_index:
            i_index = items_index[item]
            pred_rating = svdout[u_index, i_index]
        else:
            pred_rating = np.mean(svdout[u_index, :])
        pred.append(pred_rating)
print(rmse(test['rating'], pred))

svd done
svd done
svd done
svd done
svd done
3.25
