In [135]:
# Imports
import pandas as pd
import numpy as np
from scipy.linalg import svd, diagsvd # SVD
from copy import deepcopy # Deep copy

In [136]:
df = pd.read_csv('./archive/ratings_small.csv')
df = df[['userId', 'movieId', 'rating']]
df = df.pivot(index='userId', columns='movieId', values='rating')
df = df.fillna(0)
matrix_a = df.to_numpy()
df.head(5)

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [137]:
# Since the original df has many movies that have not been rated
# by any user, we need to drop those columns from the df, and 
# translate their indices in matrix A to the Ids in the original df
df_cleared = df.loc[:, (df != 0).any(axis=0)]
df_index_list = df_cleared.index.tolist()
df_column_list = df_cleared.columns.tolist()

# We've defined these global variables above to save time and memory
def get_ids_from_matrix(df_index_list, df_column_list, user_id, movie_id):
    return df_index_list[user_id], df_column_list[movie_id]

In [138]:
userRow = np.random.randint(matrix_a.shape[0])

# Gets a random value for the column where the value isn't 0
movieColumn = np.random.choice(np.where(matrix_a[userRow] != 0)[0])

userId, movieId= get_ids_from_matrix(df_index_list, df_column_list, userRow, movieColumn)
print("Matrix Row -> " + str(userRow))
print("Original UserId -> " + str(userId) + "\n")
print("Matrix Column -> " + str(movieColumn))
print("Original MovieId -> " + str(movieId))

Matrix Row -> 552
Original UserId -> 553

Matrix Column -> 1352
Original MovieId -> 1704


In [139]:
matrix_b = deepcopy(matrix_a)
matrix_b[(userRow, movieColumn)] = np.random.randint(1, 10) / 2

print("Changed Cell -> " + str((userRow, movieColumn)))
print("Old Value -> " + str(matrix_a[(userRow, movieColumn)]))
print("New value -> " + str(matrix_b[(userRow, movieColumn)]))

Changed Cell -> (552, 1352)
Old Value -> 4.0
New value -> 2.0


In [141]:
# SVD - matrix B
U, S, V = svd(matrix_b)
S[-600] *= 0
sigma = diagsvd(S, matrix_b.shape[0], matrix_b.shape[1])

# Reconstruct matrix b
matrix_b_reconstructed = U @ sigma @ V
matrix_b_reconstructed

array([[ 9.63623955e-03, -7.70602369e-03, -4.01744442e-04, ...,
         6.33558173e-05,  3.80134904e-05,  3.70792908e-04],
       [ 5.52174415e-03, -4.41569463e-03, -2.30207023e-04, ...,
         3.63040594e-05,  2.17824357e-05,  2.12471220e-04],
       [-7.19749884e-03,  5.75578225e-03,  3.00070909e-04, ...,
        -4.73217191e-05, -2.83930315e-05, -2.76952593e-04],
       ...,
       [ 3.45217498e-02, -2.76067671e-02, -1.43924619e-03, ...,
         2.26971700e-04,  1.36183020e-04,  1.32836258e-03],
       [ 3.97522356e+00,  1.98135218e-02,  1.03295455e-03, ...,
        -1.62898781e-04, -9.77392686e-05, -9.53372798e-04],
       [ 5.01134815e+00, -9.07502812e-03, -4.73115871e-04, ...,
         7.46112193e-05,  4.47667316e-05,  4.36665680e-04]])