In [1]:
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns
import gc

from scipy.sparse import csc_matrix
from scipy.sparse.linalg import svds
from scipy.linalg import svd


import numpy as np

import time


In [2]:
# 데이터 불러오기
df = pd.read_csv("Filtered_Ratings_NoZero.csv")

print(df.head())

# User-ID별로 등장 횟수를 세어 Series로 저장
user_counts = df['User-ID'].value_counts()

# User-ID가 100번 이상 등장한 사용자만 남기기 = 100권 이상 책 평가한 유저만 남기겠다
filtered_df = df[df['User-ID'].isin(user_counts[user_counts >= 100].index)]

# 결과 출력
total_rows = len(filtered_df)
print(filtered_df.head())
print("행의 개수 : ", total_rows) # 103271 개

   User-ID        ISBN  Book-Rating
0   276726  0155061224            5
1   276729  052165615X            3
2   276729  0521795028            6
3   276744  038550120X            7
4   276747  0060517794            9
     User-ID        ISBN  Book-Rating
529   277427  002542730X           10
530   277427  003008685X            8
531   277427  0060006641           10
532   277427  0060542128            7
533   277427  0061009059            9
행의 개수 :  103271


In [3]:
# matrix로 변환하기
R_matrix = filtered_df.pivot(index="ISBN", columns="User-ID", values="Book-Rating").fillna(0).astype("float32")
print(R_matrix.head())

num_rows = R_matrix.shape[0]  # 행의 개수
num_cols = R_matrix.shape[1]  # 열의 개수

print(f"행의 개수: {num_rows}")  # 66574개
print(f"열의 개수: {num_cols}")  # 449개

# R_matrix에서 0이 아닌 값의 개수
non_zero_cnt = np.count_nonzero(R_matrix)
print("0이 아닌 값의 갯수 : ", non_zero_cnt)
print("0이 아닌 값의 비율 :", non_zero_cnt * 100 / (num_rows*num_cols),"%")

User-ID     2033    2110    2276    4017    4385    5582    6242    6251    \
ISBN                                                                         
0000913154     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
0001046438     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
000104687X     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
0001047213     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
0001047973     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   

User-ID     6543    6575    ...  269566  270713  271448  271705  273113  \
ISBN                        ...                                           
0000913154     0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0   
0001046438     0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0   
000104687X     0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0   
0001047213     0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0   
000

1. Rmatrix를 baseline predictor를 추가한 matrix로 변환 + overfitting 방지를 위해 regularization term도 추가
2. baseline matrix 를 svd 수행
3. 기존 값(Rmatrix에서 0이 아닌 값들)을 얼마나 예측 잘했는지 확인
4. 다른 값 예측해보기

In [4]:
# baseline 준비
R_matrix_np = R_matrix.to_numpy()
global_mean = np.mean(R_matrix_np[R_matrix_np != 0])
user_bias = np.mean(R_matrix_np - global_mean, axis=1)
item_bias = np.mean(R_matrix_np - global_mean, axis=0)
print("global mean = ", global_mean)
print("user bias = ", user_bias)
print("item bias = ", item_bias)

global mean =  7.8254204
user bias =  [-7.807618  -7.8053913 -7.8120728 ... -7.8120728 -7.803164  -7.8053913]
item bias =  [-7.8085246 -7.81261   -7.8008037 -7.804694  -7.7944202 -7.8090653
 -7.813226  -7.797259  -7.8055954 -7.800278  -7.8120246 -7.7888174
 -7.8078938 -7.8119946 -7.807473  -7.809426  -7.8128057 -7.0583205
 -7.811168  -7.807954  -7.8112884 -7.802216  -7.7999177 -7.807984
 -7.812986  -7.8100266 -7.8129406 -7.788111  -7.723026  -7.81246
 -7.8095913 -7.8112435 -7.804499  -7.8004885 -7.771573  -7.6909113
 -7.7971387 -7.809531  -7.7983856 -7.8106127 -7.8127155 -7.810132
 -7.8048296 -7.8069773 -7.810868  -7.791551  -7.7998123 -7.810207
 -7.800308  -7.8092756 -7.7833195 -7.79409   -7.7666016 -7.809516
 -7.813887  -7.807443  -7.818333  -7.810177  -7.7370853 -7.814548
 -7.8113785 -7.803823  -7.814142  -7.808029  -7.8081493 -7.8113785
 -7.802757  -7.7985506 -7.7997823 -7.790785  -7.813902  -7.8055353
 -7.8062563 -7.809441  -7.808134  -7.7962375 -7.8092155 -7.791611
 -7.812265  -7

In [9]:
u, s, vt = svds(csc_matrix(R_matrix_np), k=100)
print(u.shape[0])
print(np.dot(s, vt).shape[0])
reconstructed = np.dot(np.dot(u, np.diag(s)), vt)
num_rows = reconstructed.shape[0]  # 행의 개수
num_cols = reconstructed.shape[1]  # 열의 개수

print(f"행의 개수: {num_rows}")  # 66574개
print(f"열의 개수: {num_cols}")  # 449개

66574
449
행의 개수: 66574
열의 개수: 449


In [22]:
# R_matrix -> baseline matrix로 변환
def transfer_baseline_matrix(R_matrix):
    R_matrix_np = R_matrix.to_numpy()
    global_mean = np.mean(R_matrix_np[R_matrix_np != 0])
    user_bias = np.mean(R_matrix_np - global_mean, axis=1)
    item_bias = np.mean(R_matrix_np - global_mean, axis=0)
    return R_matrix_np - (user_bias[:, np.newaxis] + item_bias) + global_mean, global_mean, user_bias, item_bias;

In [11]:
print(transfer_baseline_matrix(R_matrix))

[[23.441563 23.445648 23.433842 ... 23.440422 23.439941 23.448833]
 [23.439337 23.443422 23.431616 ... 23.438194 23.437714 23.446606]
 [23.446018 23.450104 23.438297 ... 23.444876 23.444395 23.453289]
 ...
 [23.446018 23.450104 23.438297 ... 23.444876 23.444395 23.453289]
 [23.437109 23.441195 23.429388 ... 23.435966 23.435486 23.44438 ]
 [23.439337 23.443422 23.431616 ... 23.438194 23.437714 23.446606]]


In [14]:
non_zero_indices = np.argwhere(R_matrix.to_numpy() != 0)
print(non_zero_indices)

[[    0   279]
 [    1    35]
 [    2    35]
 ...
 [66571   266]
 [66572   239]
 [66573   163]]


In [30]:
def calcul_avg_svd_error_baseline(R_matrix_np, baseline_matrix, mean, user_bias, item_bias, non_zero_indices, num_values):
    
    # SVD 진행
    u, s, vt = svds(baseline_matrix, k=100)
    reconstructed = np.dot(np.dot(u, np.diag(s)), vt)
    
    #오차 계산
    error_sum = 0

    for i, j in non_zero_indices:
        predicted_rating = reconstructed[i, j] + user_bias[i] + item_bias[j] - mean 
        error_sum += abs(R_matrix_np[i, j] - predicted_rating)

    return error_sum/num_values

In [27]:
#  svd 사용
baseline_matrix, global_mean, user_bias, item_bias = transfer_baseline_matrix(R_matrix)
R_matrix_np = R_matrix.to_numpy()

start_time = time.time()
avg_error = calcul_avg_svd_error_baseline(R_matrix_np, baseline_matrix, global_mean, user_bias, item_bias, non_zero_indices, 103271)
end_time = time.time()

duration = end_time - start_time  # 실행 시간 계산
formatted_avg_error = "{:.4f}".format(avg_error)
formatted_duration = "{:.4f}".format(duration)


print("avg error = ", formatted_avg_error)
print(f"Function executed in {duration} seconds\n")

avg error =  0.0001
Function executed in 2.7466671466827393 seconds



In [31]:
# svds 사용 
baseline_matrix, global_mean, user_bias, item_bias = transfer_baseline_matrix(R_matrix)
R_matrix_np = R_matrix.to_numpy()

start_time = time.time()
avg_error = calcul_avg_svd_error_baseline(R_matrix_np, baseline_matrix, global_mean, user_bias, item_bias, non_zero_indices, 103271)
end_time = time.time()

duration = end_time - start_time  # 실행 시간 계산
formatted_avg_error = "{:.4f}".format(avg_error)
formatted_duration = "{:.4f}".format(duration)


print("avg error = ", formatted_avg_error)
print(f"Function executed in {duration} seconds\n")

avg error =  7.6576
Function executed in 12.337036848068237 seconds



In [9]:
def calculate_svd_error_with_baseline(R_matrix_np, num_values=103271, k=100):
    global_mean = np.mean(R_matrix_np[R_matrix_np != 0])
    user_bias = np.mean(R_matrix_np - global_mean, axis=1)
    item_bias = np.mean(R_matrix_np - global_mean, axis=0)

    # 베이스라인 보정된 평점 행렬 생성
    baseline_corrected = R_matrix_np - (user_bias[:, np.newaxis] + item_bias) + global_mean

    # SVD 수행
    u, s, vt = svds(csc_matrix(baseline_corrected), k=k)
    reconstructed = np.dot(np.dot(u, np.diag(s)), vt)

    # 오차 계산
    error_sum = 0
    non_zero_indices = np.argwhere(R_matrix_np != 0)

    for i, j in non_zero_indices[:num_values]:
        predicted_rating = reconstructed[i, j] + user_bias[i] + item_bias[j] - global_mean
        error_sum += abs(R_matrix_np[i, j] - predicted_rating)

    return error_sum

# 함수 실행
R_matrix_np = R_matrix.to_numpy()
error = calculate_svd_error_with_baseline(R_matrix_np)
print("avg error = ", error/103271)

avg error =  7.659766560626064
