In [1]:
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns
import gc

from scipy.sparse import csc_matrix
from scipy.sparse.linalg import svds
from scipy.linalg import svd


import numpy as np

import time


In [2]:
# 데이터 불러오기
df = pd.read_csv("Filtered_Ratings_NoZero.csv")

print(df.head())

   User-ID        ISBN  Book-Rating
0   276726  0155061224            5
1   276729  052165615X            3
2   276729  0521795028            6
3   276744  038550120X            7
4   276747  0060517794            9


In [3]:
# User-ID별로 등장 횟수를 세어 Series로 저장
user_counts = df['User-ID'].value_counts()

# User-ID가 100번 이상 등장한 사용자만 남기기 = 100권 이상 책 평가한 유저만 남기겠다
filtered_df = df[df['User-ID'].isin(user_counts[user_counts >= 100].index)]

# 결과 출력
total_rows = len(filtered_df)
print(filtered_df.head())
print("행의 개수 : ", total_rows) # 103271 개

     User-ID        ISBN  Book-Rating
529   277427  002542730X           10
530   277427  003008685X            8
531   277427  0060006641           10
532   277427  0060542128            7
533   277427  0061009059            9
행의 개수 :  103271


In [4]:
# matrix로 변환하기
R_matrix = filtered_df.pivot(index="ISBN", columns="User-ID", values="Book-Rating").fillna(0).astype("float32")
print(R_matrix.head())

User-ID     2033    2110    2276    4017    4385    5582    6242    6251    \
ISBN                                                                         
0000913154     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
0001046438     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
000104687X     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
0001047213     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
0001047973     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   

User-ID     6543    6575    ...  269566  270713  271448  271705  273113  \
ISBN                        ...                                           
0000913154     0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0   
0001046438     0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0   
000104687X     0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0   
0001047213     0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0   
000

In [5]:
num_rows = R_matrix.shape[0]  # 행의 개수
num_cols = R_matrix.shape[1]  # 열의 개수

print(f"행의 개수: {num_rows}")  # 66574개
print(f"열의 개수: {num_cols}")  # 449개

# R_matrix에서 0이 아닌 값의 개수
non_zero_cnt = np.count_nonzero(R_matrix)
print("0이 아닌 값의 갯수 : ", non_zero_cnt)
print("0이 아닌 값의 비율 :", non_zero_cnt * 100 / (num_rows*num_cols),"%")

행의 개수: 66574
열의 개수: 449
0이 아닌 값의 갯수 :  103271
0이 아닌 값의 비율 : 0.34548356290968274 %


In [13]:
# Rmatrix에서 0이 아닌 모든 값에 대해서 하나 제외해보고 svd 진행 후 error 계산하는 함수
# svd를 103271번 진행
def calculate_svd_error(R_matrix_np, num_values):
    error_sum = 0
    non_zero_indices = np.argwhere(R_matrix_np != 0)
    
    for idx in range(min(num_values, len(non_zero_indices))):
        i, j = non_zero_indices[idx]
        
        # 원래 값 저장
        original_value = R_matrix_np[i,j]
        # 해당 값 제외
        R_matrix_np[i,j] = 0
        
        # SVD 수행
        sparse_R_matrix = csc_matrix(R_matrix_np)
        u, s, vt = svds(sparse_R_matrix, k = 100)
        reconstructed = np.dot(np.dot(u, np.diag(s)), vt)
        
        # 오차 계산
        predicted_value = reconstructed[i,j]
        error_sum += abs(original_value - predicted_value)
        
        # 원래 값 복원
        R_matrix_np[i,j] = original_value
        
    return error_sum
        

In [20]:
# Rmatrix로 svd 진행 후 0이 아닌 값들 평균 에러 구하기
# svd 1번 진행
def calculate_svd_error2(R_matrix_np, num_values):
    
    # SVD 수행
    sparse_R_matrix = csc_matrix(R_matrix_np)
    u, s, vt = svds(sparse_R_matrix, k=70)
    reconstructed = np.dot(np.dot(u, np.diag(s)), vt)
    
    error_sum = 0
    non_zero_indices = np.argwhere(R_matrix_np != 0)

    for idx in range(min(num_values, len(non_zero_indices))):
        i, j = non_zero_indices[idx]

        # 원래 값과 재구성된 값 사이의 오차 계산
        original_value = R_matrix_np[i, j]
        predicted_value = reconstructed[i, j]
        error_sum += abs(original_value - predicted_value)

    return error_sum

In [14]:
R_matrix_np = R_matrix.to_numpy()
start_time = time.time()
error = calculate_svd_error(R_matrix_np, num_values=1000)
end_time = time.time()
duration = end_time - start_time  # 실행 시간 계산
print("평균 error : ", error/103271)
print(f"Function executed in {duration} seconds")


평균 error :  0.07611983692509469
Function executed in 764.9960215091705 seconds


In [12]:
# k = 100
R_matrix_np = R_matrix.to_numpy()
start_time = time.time()
error = calculate_svd_error2(R_matrix_np, num_values=103271)
end_time = time.time()
duration = end_time - start_time  # 실행 시간 계산
print("평균 error : ", error/103271)
print(f"Function executed in {duration} seconds")

평균 error :  3.6324669694859186
Function executed in 1.133404016494751 seconds


In [14]:
# k = 150
R_matrix_np = R_matrix.to_numpy()
start_time = time.time()
error = calculate_svd_error2(R_matrix_np, num_values=103271)
end_time = time.time()
duration = end_time - start_time  # 실행 시간 계산
print("평균 error : ", error/103271)
print(f"Function executed in {duration} seconds")

평균 error :  2.8802643652364956
Function executed in 1.273538589477539 seconds


In [17]:
# k = 170
R_matrix_np = R_matrix.to_numpy()
start_time = time.time()
error = calculate_svd_error2(R_matrix_np, num_values=103271)
end_time = time.time()
duration = end_time - start_time  # 실행 시간 계산
print("평균 error : ", error/103271)
print(f"Function executed in {duration} seconds")

평균 error :  2.6173566662896186
Function executed in 1.3252308368682861 seconds


In [19]:
# k = 180
R_matrix_np = R_matrix.to_numpy()
start_time = time.time()
error = calculate_svd_error2(R_matrix_np, num_values=103271)
end_time = time.time()
duration = end_time - start_time  # 실행 시간 계산
print("평균 error : ", error/103271)
print(f"Function executed in {duration} seconds")

평균 error :  2.4899180446381135
Function executed in 1.3592166900634766 seconds


In [21]:
# k = 70
R_matrix_np = R_matrix.to_numpy()
start_time = time.time()
error = calculate_svd_error2(R_matrix_np, num_values=103271)
end_time = time.time()
duration = end_time - start_time  # 실행 시간 계산
print("평균 error : ", error/103271)
print(f"Function executed in {duration} seconds")

평균 error :  4.180068734832744
Function executed in 0.9694747924804688 seconds


In [None]:
# 유저의 
test_matrix = R_matrix.iloc[-19972:, -134:]
print(test_matrix.head())

In [6]:
# test_matrix 부분 제외하기
#remaining_matrix = R_matrix.drop(R_matrix.tail(19972).index).drop(R_matrix.columns[-134:], axis=1)

# 1. 행렬 -> 희소행렬로 변환
sparse_R_matrix = csc_matrix(R_matrix.values)

# k = 100으로 svd 진행
U, S, Vt = svds(sparse_R_matrix, k=100)

In [10]:
# 1번 실행될 때 몇 초 걸리는지 
R_matrix_np = R_matrix.to_numpy()

i = 100
j = 100
error_sum = 0

start_time = time.time()

original_value = R_matrix_np[i,j]
# 해당 값 제외
R_matrix_np[i,j] = 0

# SVD 수행
sparse_R_matrix = csc_matrix(R_matrix_np)
u, s, vt = svds(sparse_R_matrix, k = 100)
reconstructed = np.dot(np.dot(u, np.diag(s)), vt)

# 오차 계산
predicted_value = reconstructed[i,j]
error_sum += abs(original_value - predicted_value)

# 원래 값 복원
R_matrix_np[i,j] = original_value

end_time = time.time()
duration = end_time - start_time  # 실행 시간 계산

print(error_sum)
print(f"Function executed in {duration} seconds")


0.03187514841556549
Function executed in 0.7364842891693115 seconds


In [7]:
# 데이터 복원하기
reconstructed_data = np.dot(np.dot(U, np.diag(S)), Vt)
num_re_rows = reconstructed_data.shape[0]
num_re_columns = reconstructed_data.shape[1]
print(f"행의 개수: {num_re_rows}")  # 66574개
print(f"열의 개수: {num_re_columns}")  # 449개

행의 개수: 66574
열의 개수: 449


In [8]:
# Reconstructing the matrix from SVD components
Sigma = np.zeros((U.shape[0], Vt.shape[0]))
Sigma[:S.shape[0], :S.shape[0]] = np.diag(S)
reconstructed_matrix = np.dot(U, np.dot(Sigma, Vt))
num_re_rows = reconstructed_matrix.shape[0]
num_re_columns = reconstructed_matrix.shape[1]
print(f"행의 개수: {num_re_rows}")  # 66574개
print(f"열의 개수: {num_re_columns}")  # 449개

ValueError: shapes (66574,100) and (66574,449) not aligned: 100 (dim 1) != 66574 (dim 0)

In [9]:
# 2. 그냥 svd 진행하기
# Converting the DataFrame to a NumPy array for SVD
remaining_matrix_np = R_matrix.to_numpy()

# Perform SVD
U, s, Vt = svd(remaining_matrix_np)

MemoryError: Unable to allocate 16.5 GiB for an array with shape (66574, 66574) and data type float32

In [None]:
# Reconstructing the matrix from SVD components
Sigma = np.zeros((U.shape[0], Vt.shape[0]))
Sigma[:s.shape[0], :s.shape[0]] = np.diag(s)
reconstructed_matrix = np.dot(U, np.dot(Sigma, Vt))