In [20]:
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns
import gc

from scipy.sparse import csc_matrix
from scipy.sparse.linalg import svds
from scipy.linalg import svd
from sklearn.metrics import mean_squared_error, mean_absolute_error


import numpy as np

import time


In [2]:
# 데이터 불러오기
df = pd.read_csv("Filtered_Ratings_NoZero.csv")

print(df.head())

# User-ID별로 등장 횟수를 세어 Series로 저장
user_counts = df['User-ID'].value_counts()

# User-ID가 100번 이상 등장한 사용자만 남기기 = 100권 이상 책 평가한 유저만 남기겠다
filtered_df = df[df['User-ID'].isin(user_counts[user_counts >= 100].index)]

# 결과 출력
total_rows = len(filtered_df)
print(filtered_df.head())
print("행의 개수 : ", total_rows) # 103271 개

   User-ID        ISBN  Book-Rating
0   276726  0155061224            5
1   276729  052165615X            3
2   276729  0521795028            6
3   276744  038550120X            7
4   276747  0060517794            9
     User-ID        ISBN  Book-Rating
529   277427  002542730X           10
530   277427  003008685X            8
531   277427  0060006641           10
532   277427  0060542128            7
533   277427  0061009059            9
행의 개수 :  103271


In [3]:
# matrix로 변환하기
R_matrix = filtered_df.pivot(index="ISBN", columns="User-ID", values="Book-Rating").fillna(0).astype("float32")
print(R_matrix.head())

num_rows = R_matrix.shape[0]  # 행의 개수
num_cols = R_matrix.shape[1]  # 열의 개수

print(f"행의 개수: {num_rows}")  # 66574개
print(f"열의 개수: {num_cols}")  # 449개

# R_matrix에서 0이 아닌 값의 개수
non_zero_cnt = np.count_nonzero(R_matrix)
print("0이 아닌 값의 갯수 : ", non_zero_cnt)
print("0이 아닌 값의 비율 :", non_zero_cnt * 100 / (num_rows*num_cols),"%")

User-ID     2033    2110    2276    4017    4385    5582    6242    6251    \
ISBN                                                                         
0000913154     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
0001046438     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
000104687X     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
0001047213     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
0001047973     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   

User-ID     6543    6575    ...  269566  270713  271448  271705  273113  \
ISBN                        ...                                           
0000913154     0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0   
0001046438     0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0   
000104687X     0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0   
0001047213     0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0   
000

In [17]:
# baseline 구하는 함수
def cal_baseline(R_matrix_np):
    global_mean = np.mean(R_matrix_np[R_matrix_np != 0])
    user_bias = np.mean(R_matrix_np - global_mean, axis=1)
    item_bias = np.mean(R_matrix_np - global_mean, axis=0)
    return global_mean, user_bias, item_bias

In [18]:
def svd_baseline(R_matrix_np, k):
    global_mean, user_bias, item_bias = cal_baseline(R_matrix_np)
    baseline_corrected_matrix = R_matrix_np - (user_bias[:, np.newaxis] + item_bias) + global_mean

    sparse_R_matrix = csc_matrix(baseline_corrected_matrix)
    
    u, s, vt = svds(sparse_R_matrix, k = k)
    
    reconstructed = np.dot(u, np.dot(np.diag(s), vt))
    
    return reconstructed + (user_bias[:, np.newaxis] + item_bias) + global_mean

In [22]:
# mae와 rmse 로 성능 평가
def calculate_mae_rmse(true_ratings, predicted_ratings):
    mask = true_ratings != 0  # 평가되지 않은 항목은 제외
    mae = mean_absolute_error(true_ratings[mask], predicted_ratings[mask])
    rmse = np.sqrt(mean_squared_error(true_ratings[mask], predicted_ratings[mask]))
    return mae, rmse

In [21]:
R_matrix_np = R_matrix.to_numpy()
svd_baseline_matrix = svd_baseline(R_matrix_np, 100)


NameError: name 'calculate_mae_rmse' is not defined

In [24]:
mae, rmse = calculate_mae_rmse(R_matrix_np, svd_baseline_matrix)
print("MAE =", mae)
print("RMSE =", rmse)

MAE = 7.9914255
RMSE = 8.19535


In [26]:
a = csc_matrix(R_matrix)
U, S, Vt = svds(a, 100)

recon = np.dot(U, np.dot(np.diag(S), Vt))

mae, rmse = calculate_mae_rmse(R_matrix_np, recon)
print("MAE =", mae)
print("RMSE =", rmse)

MAE = 3.6324675
RMSE = 5.176186
