In [103]:
# Imports
import pandas as pd
import numpy as np
from scipy.linalg import svd, diagsvd # SVD
from copy import deepcopy # Deep copy

In [104]:
# Load the ratings and reshape them into a user x movie table:
# one row per userId, one column per movieId, cells hold the rating.
# (userId, movieId) pairs with no rating become NaN in the pivot and are
# then replaced by 0.
df = (
    pd.read_csv('./archive/ratings_small.csv')
    .loc[:, ['userId', 'movieId', 'rating']]
    .pivot(index='userId', columns='movieId', values='rating')
    .fillna(0)
)

# Plain NumPy view/copy of the table, addressed by position instead of by id
matrix_b = df.to_numpy()

df.head(5)

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [105]:
# Keep only the movie columns that hold at least one non-zero rating, then
# record the remaining row labels (userIds) and column labels (movieIds) as
# plain lists so that a matrix position can be translated back to the ids
# used in the original df.
# NOTE(review): matrix_b was built from `df`, not from `df_cleared`; the
# column positions of the two only agree if no column is dropped here.
# Confirm which matrix these lists are meant to index.
df_cleared = df.loc[:, df.ne(0).any(axis=0)]
df_index_list, df_column_list = (
    df_cleared.index.tolist(),
    df_cleared.columns.tolist(),
)

# We've defined these global variables above to save time and memory
def get_ids_from_matrix(df_index_list, df_column_list, user_id, movie_id):
    """Translate positional matrix coordinates into their stored labels.

    Args:
        df_index_list: Sequence of row labels, addressed by row position.
        df_column_list: Sequence of column labels, addressed by column position.
        user_id: Row position to look up in ``df_index_list``.
        movie_id: Column position to look up in ``df_column_list``.

    Returns:
        A tuple ``(row_label, column_label)`` for the given positions.

    Raises:
        IndexError: If either position is out of range for its list.
    """
    row_label = df_index_list[user_id]
    column_label = df_column_list[movie_id]
    return row_label, column_label

In [106]:
# Draw a random row position (one user) from the ratings matrix
userRow = np.random.randint(matrix_b.shape[0])

# Among the non-zero entries of that row, draw one column position at random
rated_columns = np.flatnonzero(matrix_b[userRow])
movieColumn = np.random.choice(rated_columns)

# Map the matrix positions back to the labels stored in the id lists
userId, movieId = get_ids_from_matrix(df_index_list, df_column_list, userRow, movieColumn)

print("Matrix Row -> ", userRow, sep="")
print("Original UserId -> ", userId, "\n", sep="")
print("Matrix Column -> ", movieColumn, sep="")
print("Original MovieId -> ", movieId, sep="")

Matrix Row -> 44
Original UserId -> 45

Matrix Column -> 2541
Original MovieId -> 3160


In [107]:
# Rebind matrix_b to an independent copy before editing it, so the write
# below does not go through to whatever array matrix_b previously referred to.
matrix_b = deepcopy(matrix_b)

# Remember the rating BEFORE overwriting it; reading the cell afterwards
# (as the previous version did) always reports the new value as the old one.
old_value = matrix_b[(userRow, movieColumn)]

# NOTE(review): np.random.randint excludes the upper bound, so this draws
# from {1, 2, 3, 4}, and the draw may coincide with the old value.
# Confirm whether 5 should be reachable / equal draws should be rejected.
matrix_b[(userRow, movieColumn)] = np.random.randint(1, 5)

print("Changed Cell -> " + str((userRow, movieColumn)))
print("Old Value -> " + str(old_value))
print("New value -> " + str(matrix_b[(userRow, movieColumn)]))

Changed Cell -> (44, 2541)
Old Value -> 2.0
New value -> 2.0


In [108]:
# SVD - matrix B
# scipy.linalg.svd returns the right singular vectors already transposed
# (Vh), so `V` is multiplied as-is below, without a further transpose.
U, S, V = svd(matrix_b)

# S is a 1-D array of singular values; expand it into a rectangular
# diagonal matrix with the same shape as matrix_b so the product is defined.
sigma = diagsvd(S, matrix_b.shape[0], matrix_b.shape[1])

# Reconstruct matrix b
matrix_b_reconstructed = U @ sigma @ V

# Compare once (within np.allclose's default tolerances) and report it;
# the earlier bare np.allclose(...) call discarded its result.
print("Matrix B and its reconstruction are the same: " + str(np.allclose(matrix_b, matrix_b_reconstructed)))

Matrix B and its reconstruction are the same: True
