그냥 MF말고 보통 다음 정도는 해

- bias 추가
- L2 norm regularization (정규화)

In [18]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt # https://matplotlib.org/stable/api/pyplot_summary.html#module-matplotlib.pyplot

%matplotlib inline

# from IPython.display import set_matplotlib_formats
# set_matplotlib_formats("retina") 
%config InlineBackend.figure_format='retina'

import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings from numpy/pandas — confirm that is intended.
warnings.filterwarnings('ignore')

# Print the library versions in use.
print(f"{np.__version__}")
print(f"{pd.__version__}")
print(f"{mpl.__version__}")

1.23.5
1.5.3
3.7.1


In [19]:
import scipy as sp
import sympy
import sklearn

sympy.init_printing(use_latex='mathjax') # Needed to render math expressions as LaTeX in the Jupyter notebook

# Print the library versions in use.
print(f"{sklearn.__version__}")
print(f"{sympy.__version__}")
print(f"{sp.__version__}")

1.1.3
1.11.1
1.10.0


In [20]:
import os

base_src = "./data/"


####
# We use MF, so it is the user-item matrix that gets factorized.
# R = P * Q.T $\approx$ \hat{R}; training moves in the direction that reduces the difference between \hat{R} and R.
# Therefore the user and item attributes are not needed.
####

# ratings
u_data_src = os.path.join(base_src, "u.data")
u_cols = ["user_id", "movie_id", "rating", "timestamp"]
# The file is read as tab-separated, with the column names supplied explicitly via names=.
ratings = pd.read_csv(u_data_src, sep="\t", names=u_cols, encoding="latin-1")

# Drop the timestamp column and cast the remaining three columns to int.
ratings = ratings[["user_id", "movie_id", "rating"]].astype(int)

print(ratings.shape, ratings.columns)
ratings.info()

(100000, 3) Index(['user_id', 'movie_id', 'rating'], dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype
---  ------    --------------   -----
 0   user_id   100000 non-null  int64
 1   movie_id  100000 non-null  int64
 2   rating    100000 non-null  int64
dtypes: int64(3)
memory usage: 2.3 MB


In [46]:
class MF:
    """Biased matrix factorization trained with SGD and L2 regularization.

    The rating matrix R (users x items) is approximated as

        R_hat[i, j] = b + b_u[i] + b_d[j] + P[i] . Q[j]

    where ``b`` is the mean of the observed ratings, ``b_u`` / ``b_d`` are
    per-user / per-item biases and ``P`` (num_users x K), ``Q`` (num_items x K)
    are the latent factor matrices. Entries of R equal to 0 are treated as
    "not rated" and are ignored by both training and evaluation.

    Parameters
    ----------
    ratings : pd.DataFrame
        Full user x item matrix (index = user ids, columns = item ids) with 0
        in the cells that have no rating.
    hyper_params : mapping
        Must contain ``"K"`` (number of latent factors), ``"alpha"`` (learning
        rate), ``"beta"`` (regularization strength), ``"iterations"`` (number
        of epochs) and ``"verbose"`` (print the RMSE every 10 epochs).
    """

    def __init__(self,
                 ratings: pd.DataFrame,
                 hyper_params):
        self.R = np.array(ratings)
        # R = P * Q.T, so (num_users, num_items) = (num_users, K) * (K, num_items).
        self.num_users, self.num_items = np.shape(self.R)

        self.K = hyper_params["K"]  # number of latent factors
        self.alpha = hyper_params["alpha"]  # learning rate
        self.beta = hyper_params["beta"]  # regularization parameter
        self.iterations = hyper_params["iterations"]  # epochs
        self.verbose = hyper_params["verbose"]

        # Two-way lookups between external ids and row/column positions in R.
        # Columns carry the item ids and the index carries the user ids, so
        # there is no need to transpose the frame just to read its labels.
        self.item_id_index = {one_id: i for i, one_id in enumerate(ratings.columns)}
        self.index_item_id = {i: one_id for i, one_id in enumerate(ratings.columns)}
        self.user_id_index = {one_id: i for i, one_id in enumerate(ratings.index)}
        self.index_user_id = {i: one_id for i, one_id in enumerate(ratings.index)}

    def rmse(self):
        """Return the RMSE over the observed (non-zero) cells of R.

        Also stores the per-cell predictions and errors in
        ``self.predictions`` and ``self.errors`` (1-D arrays ordered like
        ``self.R.nonzero()``). Requires ``train()`` to have initialised the
        model parameters.
        """
        xs, ys = self.R.nonzero()  # positions of the observed ratings

        # Same formula as get_prediction(), evaluated for all cells at once.
        self.predictions = (
            self.b
            + self.b_u[xs]
            + self.b_d[ys]
            + np.sum(self.P[xs] * self.Q[ys], axis=1)
        )
        self.errors = self.R[xs, ys] - self.predictions

        return np.sqrt(np.mean(self.errors ** 2))

    def train(self):
        """Initialise the parameters and run SGD for ``self.iterations`` epochs.

        Returns
        -------
        list of (int, float)
            ``(epoch, training RMSE)`` for every epoch, epochs starting at 1.
        """
        # Shapes follow R = P * Q.T:
        # (num_users, num_items) = (num_users, K) * (K, num_items).
        # The 1/K standard deviation is simply the conventional MF choice.
        self.P = np.random.normal(scale=1. / self.K,  # std
                                  size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1. / self.K,
                                  size=(self.num_items, self.K))
        # biases
        self.b_u = np.zeros(self.num_users)
        self.b_d = np.zeros(self.num_items)
        self.b = np.mean(self.R[self.R.nonzero()])  # global bias

        rows, columns = self.R.nonzero()
        # samples: [(row, column, value), (row, column, value), ...]
        self.samples = [(i, j, self.R[i, j]) for i, j in zip(rows, columns)]

        training_process = []

        for i in range(self.iterations):
            np.random.shuffle(self.samples)  # visit the ratings in a new order each epoch
            self.sgd()
            rmse = self.rmse()
            training_process.append((i + 1, rmse))
            if self.verbose and (i + 1) % 10 == 0:
                print(f"iterations: {i + 1}, RMSE: {rmse}")

        return training_process

    def get_prediction(self, i, j):
        """Return the predicted rating for user row ``i`` and item column ``j``."""
        return self.b + self.b_u[i] + self.b_d[j] + self.P[i, :].dot(self.Q[j, :])

    def sgd(self):
        """Run one SGD pass over ``self.samples``, updating biases and factors in place."""
        for i, j, r in self.samples:
            e = r - self.get_prediction(i, j)

            # bias update
            self.b_u[i] += self.alpha * (e - self.beta * self.b_u[i])
            self.b_d[j] += self.alpha * (e - self.beta * self.b_d[j])

            # P, Q latent matrix update. Both gradients must be evaluated at
            # the pre-step parameters, so keep a copy of the user vector:
            # otherwise Q[j] would be updated with the already-moved P[i].
            p_old = self.P[i, :].copy()
            self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * p_old)
            self.Q[j, :] += self.alpha * (e * p_old - self.beta * self.Q[j, :])

In [42]:
# Dense user x movie rating matrix; cells without a rating are filled with 0,
# which the MF class skips because it only looks at the non-zero entries.
R_temp = ratings.pivot(
    index="user_id", 
    columns="movie_id", 
    values="rating").fillna(0)

hyper_params = {
    "K": 30,  # number of latent factors
    "alpha": 0.001,  # learning rate
    "beta": 0.02,  # regularization parameter
    "iterations": 100,  # epochs
    "verbose": True  # print the RMSE every 10 iterations
}

In [43]:
# Fit the biased MF model on the full rating matrix.
mf = MF(R_temp, hyper_params)

# train() returns the per-iteration (iteration, RMSE) history; the RMSE is
# measured on the same non-zero cells the model was trained on.
train_process = mf.train()

iterations: 10, RMSE: 0.9585352347291066
iterations: 20, RMSE: 0.9373809481763696
iterations: 30, RMSE: 0.9280866411019143
iterations: 40, RMSE: 0.9225474103342166
iterations: 50, RMSE: 0.9184375970511475
iterations: 60, RMSE: 0.9145995473017484
iterations: 70, RMSE: 0.9101338485610159
iterations: 80, RMSE: 0.9040675341754495
iterations: 90, RMSE: 0.895422699547366
iterations: 100, RMSE: 0.8837878300530232


In [11]:
def RMSE(y_true, y_pred):
    """Return the root-mean-square error between two equally shaped sequences."""
    residuals = np.asarray(y_true) - np.asarray(y_pred)
    mean_squared = np.mean(np.square(residuals))
    return np.sqrt(mean_squared)

def score(model, neighbor_size=0):
    """Return the RMSE of ``model`` over the module-level ``x_test`` frame.

    ``model`` is called as ``model(user_id, movie_id, neighbor_size)`` for
    every test row. The neighbor count is injected here because the models
    in this section need it.
    """
    predicted = np.array([
        model(uid, mid, neighbor_size)
        for uid, mid in zip(x_test["user_id"], x_test["movie_id"])
    ])
    actual = np.array(x_test["rating"])
    return RMSE(actual, predicted)

## 사용자의 평가 경향을 고려한 CF 

1. 각 사용자의 평균 점수 계산
2. 편차 (평점 - 해당 사용자의 평균 점수) 사용
3. 편차 예측값 = 다른 사용자들의 편차를 유사도로 가중 평균한 값
4. 예측값 = 편차 예측값 + 해당 사용자의 평균 점수

In [12]:
from sklearn.model_selection import train_test_split

x = ratings.copy()
y = ratings["rating"]

# Hold out 26% of the rating rows as the test set, stratified on the rating value.
# NOTE(review): no random_state is passed, so the split (and every score below) varies between runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, 
                                                    test_size=0.26, 
                                                    stratify=y)

# user_id x movie_id matrix built from the training rows only; no fill value
# is applied, so cells without a training rating are NaN.
ratings_matrix = x_train.pivot(
    index="user_id",
    columns="movie_id",
    values="rating"
)

ratings_matrix.head(1)

movie_id,1,2,3,4,5,6,7,8,9,10,...,1671,1672,1673,1674,1675,1676,1677,1678,1679,1680
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,3.0,,3.0,3.0,,4.0,,,3.0,...,,,,,,,,,,


In [13]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity cannot take NaN, so unrated cells are filled with 0 first.
matrix_dummy = ratings_matrix.fillna(0)

# user-user similarity, labelled with user_id on both axes
user_similarity = pd.DataFrame(
    data=cosine_similarity(matrix_dummy),
    index=ratings_matrix.index,
    columns=ratings_matrix.index,
)

user_similarity.head(1)

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.088978,0.052138,0.030152,0.218725,0.327315,0.308537,0.264006,0.050687,0.242015,...,0.230381,0.129382,0.202153,0.125288,0.151505,0.091778,0.266404,0.119847,0.146305,0.298911


In [43]:
# ratings_matrix |user_id, movie_id|
rating_mean = ratings_matrix.mean(axis=1)
# rating_mean |user_id, rating_mean|

# 각 점수에서 평균을 뺀다.
# |movie_id, user_id| - |user_id, rating_mean| -> |movie_id, user_id|
# rating_mean가 broadcast되어서 뺄셈이 수행된다.
rating_bias = (ratings_matrix.T - rating_mean).T

rating_bias

movie_id,1,2,3,4,5,6,7,8,9,10,...,1671,1672,1673,1674,1675,1676,1677,1678,1679,1680
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,-0.607330,,-0.60733,-0.60733,,0.392670,,,-0.607330,...,,,,,,,,,,
2,0.270833,,,,,,,,,-1.729167,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,1.048780,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,0.828571,,...,,,,,,,,,,
940,,,,,,,0.517647,,-0.482353,,...,,,,,,,,,,
941,1.000000,,,,,,0.000000,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


In [46]:
def CF_knn_bias(user_id, movie_id, neighbor_size=0):
    """Predict a rating with user-based CF on mean-centred ratings.

    The prediction is the user's own mean rating plus the similarity-weighted
    average of other users' deviations (rating minus their own mean) for the
    movie.

    Parameters
    ----------
    user_id, movie_id
        Labels looked up in the module-level ``rating_mean``,
        ``user_similarity`` and ``rating_bias`` objects.
    neighbor_size : int
        0 uses every user who rated the movie; otherwise only the
        ``neighbor_size`` most similar raters are used.

    Returns
    -------
    float
        The predicted rating. Falls back to the user's mean rating when the
        movie is missing from ``rating_bias``, when a neighbor-limited
        prediction has at most one rater, or when the selected similarities
        sum to zero (which would otherwise yield NaN from 0/0).

    Raises
    ------
    KeyError
        If ``user_id`` is not present in ``rating_mean``.
    """
    user_mean = rating_mean[user_id]

    # A movie with no column in the training matrix has no deviations to use.
    if movie_id not in rating_bias.columns:
        return user_mean

    # Deviations of the users who rated the movie, and the matching similarities.
    movie_ratings = rating_bias[movie_id].dropna()
    sim_score = user_similarity[user_id].loc[movie_ratings.index]

    sims = np.asarray(sim_score)
    deviations = np.asarray(movie_ratings)

    if neighbor_size != 0:
        # A neighbor-limited prediction needs more than a single rater.
        if len(sims) <= 1:
            return user_mean
        # There may be fewer raters than requested (e.g. 10 wanted, 5 available).
        top_k = min(neighbor_size, len(sims))
        # argsort is ascending, so the most similar raters are at the end.
        order = np.argsort(sims)
        sims = sims[order][-top_k:]
        deviations = deviations[order][-top_k:]

    weight_total = sims.sum()
    if weight_total == 0:
        # No usable similarity weight: avoid the 0/0 division.
        return user_mean

    # weighted deviation + the user's own mean
    return np.dot(sims, deviations) / weight_total + user_mean

In [47]:
# First time the RMSE has dropped below 1.0.
score(CF_knn_bias, neighbor_size=30)

0.9476320043490462