# 1. Import necessary modules and classes

In [1]:
%matplotlib inline
from datetime import datetime
from functools import reduce
from os.path import exists
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
import gc
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sys

- 레퍼런스의 협업필터링 유틸 코드(cf_utils.py) 업로드, Damped Baseline import

In [2]:
# Upload the reference collaborative-filtering helper module through the
# Colab file picker and save it as cf_utils.py so it can be imported below.
from google.colab import files
src = list(files.upload().values())[0]  # content of the first uploaded file
# Write inside a context manager so the file is flushed and closed before the
# import statement reads it (a bare open(...).write(...) never closes the
# handle explicitly).
with open('cf_utils.py', 'wb') as module_file:
    module_file.write(src)
import cf_utils
# Import User + Movie baseline model
from cf_utils import DampedUserMovieBaselineModel

Saving cf_utils.py to cf_utils.py


# 2. Load the Data

In [3]:
# Mount Google Drive at /content/drive; the ratings file loaded later in this
# notebook is read from a path under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Load the ratings file (tab-separated, no header row) and sort it
# chronologically.
ratings_df = pd.read_csv('/content/drive/MyDrive/ml-100k/u.data', sep='\t', header=None,
                         names=['userId', 'movieId', 'rating', 'timestamp'])
# Convert the epoch-second timestamps in one vectorised call. Unlike
# datetime.fromtimestamp, which uses the local time zone of whatever machine
# runs the notebook, this always yields UTC wall-clock times, so the result
# is the same everywhere.
ratings_df['timestamp'] = pd.to_datetime(ratings_df['timestamp'], unit='s')
ratings_df = ratings_df.sort_values('timestamp')
print('First 5:')
display(ratings_df.head())
print()
print('Last 5:')
display(ratings_df.tail())

First 5:


Unnamed: 0,userId,movieId,rating,timestamp
214,259,255,4,1997-09-20 03:05:10
83965,259,286,4,1997-09-20 03:05:27
43027,259,298,4,1997-09-20 03:05:54
21396,259,185,4,1997-09-20 03:06:21
82655,259,173,4,1997-09-20 03:07:23



Last 5:


Unnamed: 0,userId,movieId,rating,timestamp
46773,729,689,4,1998-04-22 23:10:38
73008,729,313,3,1998-04-22 23:10:38
46574,729,328,3,1998-04-22 23:10:38
64312,729,748,4,1998-04-22 23:10:38
79208,729,272,4,1998-04-22 23:10:38


# 3. Write helper function and ALSRecommender class

In [5]:
# Bind the names of the first three columns of ratings_df (user, item, rating).
user_col, item_col, rating_col = ratings_df.columns[:3]

In [6]:
# Display the ratings frame (already sorted by timestamp above).
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp
214,259,255,4,1997-09-20 03:05:10
83965,259,286,4,1997-09-20 03:05:27
43027,259,298,4,1997-09-20 03:05:54
21396,259,185,4,1997-09-20 03:06:21
82655,259,173,4,1997-09-20 03:07:23
...,...,...,...,...
46773,729,689,4,1998-04-22 23:10:38
73008,729,313,3,1998-04-22 23:10:38
46574,729,328,3,1998-04-22 23:10:38
64312,729,748,4,1998-04-22 23:10:38


In [7]:
# Check which column name was bound to item_col.
item_col

'movieId'

In [12]:
def get_rating_matrix(X):
    '''Build a dense rating matrix plus the id -> row/column index mappings.

    Every distinct user id becomes one row and every distinct item id one
    column, both in ascending id order. When the same (user, item) pair
    occurs more than once, the mean of its non-missing ratings is stored.
    Cells without any (non-missing) rating are 0.

    The matrix is filled directly from the index arrays, so its shape always
    equals (len(user_map), len(item_map)) -- even when some pair only has
    missing (NaN) ratings.

    Parameters
    ----------
    X : pandas.DataFrame, shape=(n_ratings,>=3)
        First 3 columns must be in order of user, item, rating.

    Returns
    -------
    rating_matrix : 2d numpy array, shape=(n_users, n_items)
    user_map : pandas Series, shape=(n_users,)
        Mapping from the original user id to an integer in the range [0,n_users)
    item_map : pandas Series, shape=(n_items,)
        Mapping from the original item id to an integer in the range [0,n_items)
    '''
    user_col, item_col, rating_col = X.columns[:3]

    # np.unique returns the sorted distinct ids and, for every row of X, the
    # position of that row's id within the sorted array -- which is exactly
    # the matrix row (or column) index for that rating.
    user_ids, user_inds = np.unique(X[user_col].to_numpy(), return_inverse=True)
    item_ids, item_inds = np.unique(X[item_col].to_numpy(), return_inverse=True)
    n_users, n_items = len(user_ids), len(item_ids)

    user_map = pd.Series(index=user_ids, data=np.arange(n_users), name='user_map')
    item_map = pd.Series(index=item_ids, data=np.arange(n_items), name='item_map')

    # Accumulate the sum and the count of ratings per cell; np.add.at handles
    # repeated (user, item) pairs correctly, which plain fancy-index
    # assignment would not. Missing ratings are left out of both.
    ratings = X[rating_col].to_numpy(dtype=float)
    observed = ~np.isnan(ratings)
    cells = (user_inds[observed], item_inds[observed])
    rating_sum = np.zeros((n_users, n_items))
    rating_cnt = np.zeros((n_users, n_items))
    np.add.at(rating_sum, cells, ratings[observed])
    np.add.at(rating_cnt, cells, 1)

    # Mean rating where at least one rating exists, 0 everywhere else.
    rating_matrix = np.zeros((n_users, n_items))
    filled = rating_cnt > 0
    rating_matrix[filled] = rating_sum[filled] / rating_cnt[filled]
    return rating_matrix, user_map, item_map

In [13]:
# Try the helper on the full ratings frame; the cell output is the returned
# (rating_matrix, user_map, item_map) tuple.
get_rating_matrix(ratings_df)

(array([[5., 3., 4., ..., 0., 0., 0.],
        [4., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [5., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 5., 0., ..., 0., 0., 0.]]),
 1        0
 2        1
 3        2
 4        3
 5        4
       ... 
 939    938
 940    939
 941    940
 942    941
 943    942
 Name: user_map, Length: 943, dtype: int64,
 1          0
 2          1
 3          2
 4          3
 5          4
         ... 
 1678    1677
 1679    1678
 1680    1679
 1681    1680
 1682    1681
 Name: item_map, Length: 1682, dtype: int64)

In [None]:
class ALSRecommender():
    """Recommender based on the Alternating Least Squares (ALS) algorithm.

    Parameters
    ----------
    k : int, default=5
        Number of latent features. Non-integer values are rounded to the
        nearest integer.
    lmbda : float, default=0.1
        Regularization parameter (presumably the L2 penalty coefficient --
        TODO confirm against the fitting code).
    max_epochs : int, default=15
        Max number of iterations to run
    baseline_algo : object, default=None
        Optional baseline model. Its ``fit(X)`` method is called while the
        recommender is being initialized for fitting; when None, the mean
        training rating is stored instead.
        NOTE(review): the original description was cut off after
        "fit(X) and" -- presumably a prediction method is required as well;
        confirm against the baseline classes in cf_utils.
    error_metric : str, default='mae'
        Name of the training error metric. 'mae' is the only value the
        training-error helper accepts.
    verbose : bool, default=True
        Verbosity flag stored on the instance.
    """
    def __init__(self, k=5, lmbda=0.1, max_epochs=15, baseline_algo=None, error_metric='mae', verbose=True):
        # float으로 오는 경우에 대비해 integer 지정
        self.k = int(np.round(k))
        self.lmbda = lmbda
        self.max_epochs = max_epochs
        self.baseline_algo = baseline_algo
        self.error_metric = error_metric
        self.verbose = verbose

        self.U = None
        self.I = None
        self.initialized = False

    def _calc_train_error(self, U, I, R, R_selector=None, error_metric='mae'):
        if R_selector is None: #이게 뭐지? 실제 평점?
            R_selector = (R>0)
        R_hat = np.dot(U.T, I) # 예측평점행렬을 구할 때 note에선 U*I.T 했는데, 여기선 U.T*I
        if error_metric == 'mae':
            error = np.sum( R_selector * np.abs(R_hat - R) ) / np.sum(R_selector)
        else:
            raise ValueError("{} is an unsupprted error metric".format(metric))
        return error

    def _fit_init(self, X):
        """Set up all state needed before the first training iteration.

        Fits the baseline (or stores the mean training rating when there is
        no baseline), builds the rating matrix and id mappings, and creates
        the initial user and item factor matrices.

        Parameters
        ----------
        X : pandas.DataFrame, shape=(n_ratings,>=3)
            First 3 columns must be in order of user, item, rating.

        Raises
        ------
        ValueError
            If ``X`` is not a pandas DataFrame.

        Notes
        -----
        Sets ``R``, ``user_map``, ``item_map``, ``U``, ``I``, ``E``,
        ``epoch``, ``train_errors`` and ``initialized`` (plus ``train_mean``
        when no baseline is used). The factors are drawn from numpy's global
        random state, so results vary between runs unless it is seeded.
        """
        if not isinstance(X, pd.DataFrame):
            raise ValueError("X must be a DataFrame")
        X = X.copy()  # work on a copy so the caller's frame is never touched
        _, _, rating_col = X.columns[:3]
        if self.baseline_algo is None:
            # Without a baseline model, fall back to the plain mean rating.
            self.train_mean = X[rating_col].mean()
        else:
            self.baseline_algo.fit(X)
        self.R, self.user_map, self.item_map = get_rating_matrix(X)
        n_users, n_items = self.R.shape
        # Factors start as uniform random values in [0, 3).
        self.U = 3 * np.random.rand(self.k, n_users)
        self.I = 3 * np.random.rand(self.k, n_items)
        # Seed the first item factor with each movie's mean observed rating.
        # Zeros in R mean "no rating", so they add nothing to the column sums
        # and are excluded from the counts. (Indexing R with the boolean mask
        # and averaging, as was done before, flattens the matrix and yields
        # one global mean for all movies.) Movies without any non-zero rating
        # keep their random value.
        n_rated = (self.R != 0).sum(axis=0)
        has_ratings = n_rated > 0
        self.I[0, has_ratings] = self.R.sum(axis=0)[has_ratings] / n_rated[has_ratings]
        self.E = np.eye(self.k)  # (k x k) identity matrix
        self.epoch = 0
        self.train_errors = []
        self.initialized = True

    def fit
