In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

def get_rmse(R, P, Q, non_zeros):
    error = 0
    # 두 개의 분해된 행렬 P와 Q,T의 내적 곱으로 예측 R 행렬 생성
    full_pred_matrix = np.dot(P, Q.T)

    # 실제 R 행렬에서 널이 아닌 값의 위치 인덱스 추출하여 실제 R행렬과 예측 행렬의 RMSE 추출
    x_non_zero_ind = [non_zero[0] for non_zero in non_zeros]
    y_non_zero_ind = [non_zero[1] for non_zero in non_zeros]
    R_non_zeros = R[x_non_zero_ind, y_non_zero_ind]

    full_pred_matrix_non_zeros = full_pred_matrix[x_non_zero_ind, y_non_zero_ind]

    mse = mean_squared_error(R_non_zeros, full_pred_matrix_non_zeros)
    rmse = np.sqrt(mse)

    return rmse

def matrix_factorization(R, K, steps=200, learning_rate=0.01, r_lambda = 0.01):
    num_users, num_items = R.shape
    # P와 Q 매트릭스의 크기를 지정하고 정규분포를 가진 랜덤한 값으로 입력
    np.random.seed(1)
    P = np.random.normal(scale=1./K, size=(num_users, K))
    Q = np.random.normal(scale=1./K, size=(num_items, K))

    break_count = 0

    # R > 0 인 행 위치, 열 위치, 값을 non_zeros 리스트 객체에 저장.
    non_zeros = [ (i, j, R[i, j]) for i in range(num_users) for j in range(num_items) if R[i, j] > 0]

    for step in range(steps):
        for i, j, r in non_zeros:
            # 실제 값과 예측 값의 차이인 오류 값 구함
            eij = r- np.dot(P[i, :], Q[j, :].T)

            # Regularization을 반영한 SGD 업데이트 공식 적용
            P[i, :] = P[i, :] + learning_rate*(eij * Q[j, :] - r_lambda * P[i, :])
            Q[j, :] = Q[j, :] + learning_rate*(eij * P[i, :] - r_lambda * Q[j, :])

        rmse = get_rmse(R, P, Q, non_zeros)
        if (step % 10) == 0:
            print("### iteration step :", step, " rmse :", rmse)

    return P, Q

movies = pd.read_csv('/content/movies.csv')
ratings = pd.read_csv('/content/ratings.csv')
ratings = ratings[['userId', 'movieId', 'rating']]
ratings_matrix = ratings.pivot_table('rating', index='userId', columns = 'movieId')

# title 컬럼을 얻기 위해 movies 와 조인 수행
rating_movies = pd.merge(ratings, movies, on='movieId')

# columns='title'로 title 컬럼으로 pivot 수행
ratings_matrix = rating_movies.pivot_table('rating', index='userId', columns='title')

P, Q = matrix_factorization(ratings_matrix.values, K=50, steps=200, learning_rate=0.01, r_lambda = 0.01)

pred_matrix = np.dot(P, Q.T)




### iteration step : 0  rmse : 2.9023619751336867
### iteration step : 10  rmse : 0.7335768591017927
### iteration step : 20  rmse : 0.5115539026853442
### iteration step : 30  rmse : 0.37261628282537446
### iteration step : 40  rmse : 0.2960818299181014
### iteration step : 50  rmse : 0.2520353192341642
### iteration step : 60  rmse : 0.22487503275269854
### iteration step : 70  rmse : 0.2068545530233154
### iteration step : 80  rmse : 0.19413418783028685
### iteration step : 90  rmse : 0.18470082002720406
### iteration step : 100  rmse : 0.17742927527209104
### iteration step : 110  rmse : 0.1716522696470749
### iteration step : 120  rmse : 0.16695181946871726
### iteration step : 130  rmse : 0.16305292191997542
### iteration step : 140  rmse : 0.15976691929679646
### iteration step : 150  rmse : 0.1569598699945732
### iteration step : 160  rmse : 0.15453398186715425
### iteration step : 170  rmse : 0.15241618551077643
### iteration step : 180  rmse : 0.1505508073962831
### iteration

In [None]:
ratings_pred_matrix = pd.DataFrame(data=pred_matrix, index= ratings_matrix.index,
                                   columns = ratings_matrix.columns)

ratings_pred_matrix.head(3)

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.055084,4.092018,3.56413,4.502167,3.981215,1.271694,3.603274,2.333266,5.091749,3.972454,...,1.402608,4.208382,3.705957,2.720514,2.787331,3.475076,3.253458,2.161087,4.010495,0.859474
2,3.170119,3.657992,3.308707,4.166521,4.31189,1.275469,4.237972,1.900366,3.392859,3.647421,...,0.973811,3.528264,3.361532,2.672535,2.404456,4.232789,2.911602,1.634576,4.135735,0.725684
3,2.307073,1.658853,1.443538,2.208859,2.229486,0.78076,1.997043,0.924908,2.9707,2.551446,...,0.520354,1.709494,2.281596,1.782833,1.635173,1.323276,2.88758,1.042618,2.29389,0.396941


In [None]:
def get_unseen_movies(ratings_matrix, userId):
    user_rating = ratings_matrix.loc[userId, :]

    # 평점이 0을 초과한 데이터는 이미 본 영화
    already_seen = user_rating[user_rating > 0].index.tolist()

    movies_list = ratings_matrix.columns.tolist()
    unseen_list = [movie for movie in movies_list if movie not in already_seen]

    return unseen_list

In [None]:
def recomm_movie_by_userid(pred_df, userId, unseen_list, top_n = 10):

    # 예측 평점 DataFrame에서 사용자id index와 unseen_list로 들어온 영화면 컬럼을 추출하여
    # 가장 예측 평점이 높은 순으로 정렬함
    recomm_movies = pred_df.loc[userId, unseen_list].sort_values(ascending=False)[:top_n]

    return recomm_movies

In [None]:
unseen_list = get_unseen_movies(ratings_matrix, 9)

# 아이템 기반의 인접 이웃 협업 필터링으로 영화 추천
recomm_movies = recomm_movie_by_userid(ratings_pred_matrix, 9, unseen_list, top_n=10)

# 평점 데이터를 DataFrame으로 생성
recomm_movies = pd.DataFrame(data=recomm_movies.values, index=recomm_movies.index,
                             columns=['pred_score'])
recomm_movies

Unnamed: 0_level_0,pred_score
title,Unnamed: 1_level_1
Rear Window (1954),5.704612
"South Park: Bigger, Longer and Uncut (1999)",5.4511
Rounders (1998),5.298393
Blade Runner (1982),5.244951
Roger & Me (1989),5.191962
Gattaca (1997),5.183179
Ben-Hur (1959),5.130463
Rosencrantz and Guildenstern Are Dead (1990),5.087375
"Big Lebowski, The (1998)",5.03869
Star Wars: Episode V - The Empire Strikes Back (1980),4.989601


# Surprise Package install

In [None]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/772.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.6/772.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━[0m [32m563.2/772.0 kB[0m [31m8.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3156234 sha256=75b706ec912175e519234b97d7ae72390caa512f6b307ca8565a18c56fcc31d4
  Stored in directory: /root/.cache/pip/w

In [None]:
import surprise
from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split

data = Dataset.load_builtin(name='ml-100k', prompt=True)
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)

print(type(trainset))
print(type(testset))

<class 'surprise.trainset.Trainset'>
<class 'list'>


In [None]:
print(testset[0])
print(dir(testset))

('391', '591', 4.0)
['__add__', '__class__', '__class_getitem__', '__contains__', '__delattr__', '__delitem__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__iadd__', '__imul__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__mul__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__reversed__', '__rmul__', '__setattr__', '__setitem__', '__sizeof__', '__str__', '__subclasshook__', 'append', 'clear', 'copy', 'count', 'extend', 'index', 'insert', 'pop', 'remove', 'reverse', 'sort']


In [None]:
algo = SVD()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x780acbb3fb50>

In [None]:
predictions = algo.test(testset)
print(*predictions[:5], sep='\n')

user: 391        item: 591        r_ui = 4.00   est = 3.25   {'was_impossible': False}
user: 181        item: 1291       r_ui = 1.00   est = 1.47   {'was_impossible': False}
user: 637        item: 268        r_ui = 2.00   est = 2.70   {'was_impossible': False}
user: 332        item: 451        r_ui = 5.00   est = 3.91   {'was_impossible': False}
user: 271        item: 204        r_ui = 4.00   est = 3.63   {'was_impossible': False}


In [None]:
[(pred.uid, pred.iid, pred.est) for pred in predictions[:3]]

[('391', '591', 3.2540625006575534),
 ('181', '1291', 1.4730334286644222),
 ('637', '268', 2.7029014751106155)]

In [None]:
uid = str(391)
iid = str(591)

pred = algo.predict(uid, iid)
print(pred)

user: 391        item: 591        r_ui = None   est = 3.25   {'was_impossible': False}


In [None]:
# ratings: userId, movieId, rating, timestamp

from surprise import Reader

ratings = pd.read_csv('/content/ratings.csv')
# 구조 변경을 위해서 헤더를 제외하고 파일로 저장
ratings.to_csv('/content/ratings_noh.csv', index=False, header=False)
ratings = pd.read_csv('/content/ratings_noh.csv')
print(ratings.head())

   1  1.1  4.0  964982703
0  1    3  4.0  964981247
1  1    6  4.0  964982224
2  1   47  5.0  964983815
3  1   50  5.0  964982931
4  1   70  3.0  964982400


In [None]:
reader = Reader(line_format='user item rating timestamp', sep=',', rating_scale=(0.5, 5))
data = Dataset.load_from_file('/content/ratings_noh.csv', reader=reader)

In [None]:
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)
algo = SVD(n_factors=50, random_state=42)
algo.fit(trainset)

predictions = algo.test(testset)
accuracy.rmse(predictions)

RMSE: 0.8785


0.8784769558636133

In [None]:
from surprise.model_selection import cross_validate

# Pandas DataFrame에서 Surprise Dataset으로 데이터 로딩
ratings = pd.read_csv('/content/ratings.csv') # reading data in pandas df reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
algo = SVD(random_state=0)
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8678  0.8694  0.8763  0.8640  0.8824  0.8720  0.0066  
MAE (testset)     0.6663  0.6673  0.6682  0.6670  0.6791  0.6696  0.0048  
Fit time          1.84    2.05    1.75    1.77    1.88    1.86    0.11    
Test time         0.24    0.19    0.24    0.39    0.13    0.24    0.09    


{'test_rmse': array([0.86775044, 0.86940692, 0.87630674, 0.86401833, 0.88239549]),
 'test_mae': array([0.66629731, 0.66734412, 0.66818847, 0.6669541 , 0.67912261]),
 'fit_time': (1.841937780380249,
  2.049347162246704,
  1.7533819675445557,
  1.7659735679626465,
  1.8839116096496582),
 'test_time': (0.2359929084777832,
  0.1913142204284668,
  0.2445073127746582,
  0.38843369483947754,
  0.12961673736572266)}

In [59]:
# HyperParameter tuning - 입력하는 파라미터의 최적값을 찾는 작업
from surprise.model_selection import GridSearchCV

param_grid = {'n_epochs': [20, 40, 60], 'n_factors':[50, 100, 200]}

gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)

print(gs.best_score['rmse'])

0.8763032897552451


In [60]:
print(gs.best_params['rmse'])

{'n_epochs': 20, 'n_factors': 50}
