# SVD (잠재 요인 기법)

In [None]:
!pip install scikit-surprise

In [1]:
import surprise
print(surprise.__version__)

1.1.1


In [2]:
# 고유값 분해(행렬을 더 낮은 차원으로 분해)해서 
# 다시 원본으로 살리면서 잠재적인 값을 추론하는 방법
from surprise import SVD

# SVD를 사용하기 위한 테이터셋을 만들어주는 클래스
# 사용자, 아이템, 평점
from surprise import Dataset

# RMSE, MAE 등을 사용한 정확도 측정
from surprise import accuracy

# 훈련/검증 데이터 분류
from surprise.model_selection import train_test_split

In [3]:
## 1. 데이터셋을 만들어주자(사용자, 아이템, 평점)
data = Dataset.load_builtin('ml-100k')
trainset, testset = train_test_split(data, test_size=.25, random_state=0)

In [4]:
## 2. SVD() 모델 선정
algo = SVD()

In [5]:
## 3. 훈련용 데이터로 fit() (학습)
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2783c247e20>

In [10]:
## 4. 검증용 데이터로 예측
predictions = algo.test(testset)
print('prediction type : ', type(predictions), ' size : ', len(predictions))
print('prediction 결과의 최초 5개 추출')
predictions[:5]

prediction type :  <class 'list'>  size :  25000
prediction 결과의 최초 5개 추출


[Prediction(uid='120', iid='282', r_ui=4.0, est=3.5730363657454958, details={'was_impossible': False}),
 Prediction(uid='882', iid='291', r_ui=4.0, est=3.709466611507368, details={'was_impossible': False}),
 Prediction(uid='535', iid='507', r_ui=5.0, est=3.830851291789567, details={'was_impossible': False}),
 Prediction(uid='697', iid='244', r_ui=5.0, est=3.577959424570562, details={'was_impossible': False}),
 Prediction(uid='751', iid='385', r_ui=4.0, est=3.5518593924892428, details={'was_impossible': False})]

In [11]:
# 5. 정확도 계산
[ (pred.uid, pred.iid, pred.est) for pred in predictions[:3]]

[('120', '282', 3.5730363657454958),
 ('882', '291', 3.709466611507368),
 ('535', '507', 3.830851291789567)]

In [13]:
# 6. 사용자, 아이템 정보 주고 예측 실행
#아이디가 196인 302인 영화를 본사람의 평점을 예측
uid = str(196)
iid = str(302)
pred = algo.predict(uid, iid)
pred

Prediction(uid='196', iid='302', r_ui=None, est=4.219800766235271, details={'was_impossible': False})

## Custom Data로 SVD하기

In [125]:
import time

In [126]:
### 직접 csv 파일로 고유값 분해 해보자
## CSV를 SVD하기 위한 DataSet을 만들기 위해

import pandas as pd
import numpy as np

In [127]:
## 1. df <--- csv
ratings = pd.read_csv('./ratings.csv')
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [128]:
from surprise import Reader
reader = Reader(rating_scale = (0.5, 5.0)) # 최저 0.5, 최대 5 인 데이터셋 만듦

In [129]:
## SVD에서 사용할 수 있는 데이터셋으로 만들어주자.
data = Dataset.load_from_df(ratings[['userId','movieId','rating']], reader)
data

<surprise.dataset.DatasetAutoFolds at 0x2784504a460>

In [130]:
## 데이터 분류
trainset, testset = train_test_split(data, test_size=.25, random_state=0)

In [131]:
from surprise.dataset import DatasetAutoFolds

reader = Reader(line_format='user item rating timestamp', sep=',', rating_scale=(0.5, 5))
# DatasetAutoFolds 클래스를 ratings_noh.csv 파일 기반으로 생성. 
data_folds = DatasetAutoFolds(ratings_file='ratings_noh.csv', reader=reader)

#전체 데이터를 학습데이터로 생성함. 
trainset = data_folds.build_full_trainset()

In [132]:
#객체 생성 , #훈련
algo = SVD(n_factors=20, n_epochs= 30,  random_state=1)
algo.fit(trainset) 
predictions = algo.test( testset )
accuracy.rmse(predictions)

RMSE: 1.0388


1.0387802469854106

In [133]:
#검증
predictions = algo.test(testset)
predictions[:5]

[Prediction(uid=63, iid=2000, r_ui=3.0, est=3.501556983616962, details={'was_impossible': False}),
 Prediction(uid=31, iid=788, r_ui=2.0, est=3.501556983616962, details={'was_impossible': False}),
 Prediction(uid=159, iid=6373, r_ui=4.0, est=3.501556983616962, details={'was_impossible': False}),
 Prediction(uid=105, iid=81564, r_ui=3.0, est=3.501556983616962, details={'was_impossible': False}),
 Prediction(uid=394, iid=480, r_ui=3.0, est=3.501556983616962, details={'was_impossible': False})]

In [134]:
[(pred.uid, pred.iid, pred.est) for pred in predictions[:5]] 

[(63, 2000, 3.501556983616962),
 (31, 788, 3.501556983616962),
 (159, 6373, 3.501556983616962),
 (105, 81564, 3.501556983616962),
 (394, 480, 3.501556983616962)]

In [135]:
#예측
uid = str(123)
iid = str(333)
pred = algo.predict(uid,iid)
pred

Prediction(uid='123', iid='333', r_ui=None, est=3.8064382504471417, details={'was_impossible': False})

In [110]:
from surprise.model_selection import cross_validate
cross_validate(algo, data, measures=['RMSE','MAE'], cv = 5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8836  0.8702  0.8795  0.8794  0.8728  0.8771  0.0049  
MAE (testset)     0.6750  0.6678  0.6762  0.6746  0.6679  0.6723  0.0037  
Fit time          8.82    9.95    10.19   9.21    8.98    9.43    0.54    
Test time         0.31    0.17    0.19    0.31    0.18    0.23    0.06    


{'test_rmse': array([0.88357479, 0.87015348, 0.8794724 , 0.87942439, 0.87277173]),
 'test_mae': array([0.67503673, 0.66780005, 0.67624418, 0.67458958, 0.66791246]),
 'fit_time': (8.824311017990112,
  9.945703744888306,
  10.194490671157837,
  9.214999675750732,
  8.976396083831787),
 'test_time': (0.3093867301940918,
  0.1685473918914795,
  0.18587374687194824,
  0.30736351013183594,
  0.18014216423034668)}

## Surprise 를 이용한 개인화 영화 추천 시스템

In [136]:
#영화에 대한 상세 속성 정보 DataFrame 로딩
movies = pd.read_csv('./movies.csv')

# userId =9 의 movieId 데이터 추출하여 movieId = 42 데이터가 있는지 확인
movieIds = ratings[ratings['userId']==9]['movieId']
if movieIds[movieIds==42].count() == 0:
    print('사용자 아이디 9 는 영화 아이디 42 의 평점 없음')
    
print(movies[movies['movieId']==42])

사용자 아이디 9 는 영화 아이디 42 의 평점 없음
    movieId                   title              genres
38       42  Dead Presidents (1995)  Action|Crime|Drama


In [137]:
uid = str(9)
iid = str(42)

pred = algo.predict(uid,iid, verbose=True)

user: 9          item: 42         r_ui = None   est = 2.93   {'was_impossible': False}


In [138]:
## 내가 안본 영화 리스트를 구해서, 그 중에서 추천

def get_unseen_surprise(movies, ratings, userId):
    ## 1. 전체 영화id 리스트
    total_movies = movies['movieId'].tolist()
    ## 2. 내가 본 영화id 리스트
    seen_movies = ratings[ratings['userId']== userId]['movieId'].tolist()
    ## 3. 안본 영화 리스트 = 1 - 2
    unseen_movies = [ movie for movie in total_movies if movie not in seen_movies]
    
    print('전체 영화 수 >>> ',len(total_movies))
    print('평점 매긴 영화 수 >>> ',len(seen_movies))
    print('안 본 영화 수 >>> ',len(unseen_movies))
    return unseen_movies

unseen_movies = get_unseen_surprise(movies, ratings, 9)

전체 영화 수 >>>  9742
평점 매긴 영화 수 >>>  46
안 본 영화 수 >>>  9696


In [139]:
## 안본 영화 중에서 평점 예측이 높게 나온 5개를 리스트업하는 함수

def recomm_movie_by_surprise(algo, userId, unseen_movies, top_n = 20):
    ## 안본 영화리스트를 하나씩 꺼낸다음 평점을 예측하세요.
    predictions = [ algo.predict(str(userId), str(movieId)) for movieId in unseen_movies]
    
    def sortkey_est(one):
        return one.est
    
    predictions.sort(key= sortkey_est , reverse=True)
    top_predictions = predictions[:top_n]
    
    top_movie_ids = [ int(pred.iid) for pred in top_predictions]
    top_movie_rating = [ pred.est for pred in top_predictions]
    top_movie_titles = movies[movies.movieId.isin(top_movie_ids)]['title']
    print(list(zip(top_movie_ids, top_movie_titles, top_movie_rating)))
    top_movie_preds = [ (id, title, rating) for id, title, rating in zip(top_movie_ids, top_movie_titles, top_movie_rating)]
#     print(len(top_predictions))
#     print(top_predictions)
    return top_movie_preds
    

In [140]:
top_preds = recomm_movie_by_surprise(algo, 9, unseen_movies)
top_preds

[(750, 'Usual Suspects, The (1995)', 4.476281873922615), (1089, 'Pulp Fiction (1994)', 4.322834963507822), (1208, 'Shawshank Redemption, The (1994)', 4.298289575182489), (318, "Schindler's List (1993)", 4.297756703770234), (3030, 'Blade Runner (1982)', 4.278983600619644), (1221, 'Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964)', 4.277241857354926), (1223, 'Godfather, The (1972)', 4.270334230138356), (541, 'Philadelphia Story, The (1940)', 4.265267861176618), (1213, 'Reservoir Dogs (1992)', 4.228429211998829), (858, 'Streetcar Named Desire, A (1951)', 4.222827678536802), (1148, 'Wallace & Gromit: The Wrong Trousers (1993)', 4.221645813815583), (1217, 'Lawrence of Arabia (1962)', 4.221607101511421), (898, 'Apocalypse Now (1979)', 4.211939482048961), (1230, 'Goodfellas (1990)', 4.177738564718491), (50, 'Ran (1985)', 4.1725948948558065), (296, 'Godfather: Part II, The (1974)', 4.169674931341478), (1204, 'Grand Day Out with Wallace and Gromit, A (1989)', 4.168654

[(750, 'Usual Suspects, The (1995)', 4.476281873922615),
 (1089, 'Pulp Fiction (1994)', 4.322834963507822),
 (1208, 'Shawshank Redemption, The (1994)', 4.298289575182489),
 (318, "Schindler's List (1993)", 4.297756703770234),
 (3030, 'Blade Runner (1982)', 4.278983600619644),
 (1221,
  'Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964)',
  4.277241857354926),
 (1223, 'Godfather, The (1972)', 4.270334230138356),
 (541, 'Philadelphia Story, The (1940)', 4.265267861176618),
 (1213, 'Reservoir Dogs (1992)', 4.228429211998829),
 (858, 'Streetcar Named Desire, A (1951)', 4.222827678536802),
 (1148, 'Wallace & Gromit: The Wrong Trousers (1993)', 4.221645813815583),
 (1217, 'Lawrence of Arabia (1962)', 4.221607101511421),
 (898, 'Apocalypse Now (1979)', 4.211939482048961),
 (1230, 'Goodfellas (1990)', 4.177738564718491),
 (50, 'Ran (1985)', 4.1725948948558065),
 (296, 'Godfather: Part II, The (1974)', 4.169674931341478),
 (1204, 'Grand Day Out with Wallace and Gromit,

In [141]:
print(top_preds[1])

(1089, 'Pulp Fiction (1994)', 4.322834963507822)
