In [4]:
%matplotlib inline

import time
import operator

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

In [5]:
# Data Source : https://grouplens.org/datasets/movielens

rating_file_path = "../data/ml-1m/ratings.dat"
movie_file_path = "../data/ml-1m/movies.dat"
user_file_path = "../data/ml-1m/users.dat"

rating_data = pd.io.parsers.read_csv(rating_file_path, 
                                     names=['user_id', 'movie_id', 'rating', 'time'], delimiter='::')
movie_data = pd.io.parsers.read_csv(movie_file_path, 
                                    names=['movie_id', 'title', 'genre'], delimiter='::')
user_data = pd.io.parsers.read_csv(user_file_path, 
                                   names=['user_id', 'gender', 'age', 'occupation', 'zipcode'], delimiter='::')

In [6]:
from surprise import SVD, Dataset, Reader, accuracy
from surprise.model_selection import train_test_split

In [7]:
# SVD 라이브러리를 사용하기 위한 학습 데이터를 생성합니다.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(rating_data[['user_id', 'movie_id', 'rating']], reader)
train_data = data.build_full_trainset()
# SVD 모델을 학습합니
train_start = time.time()
model = SVD(n_factors=8,
            lr_all=0.005,
            reg_all=0.02,
            n_epochs=100)
model.fit(train_data)
train_end = time.time()
print("training time of model: %.2f seconds" % (train_end - train_start))

training time of model: 11.71 seconds


In [8]:
# user_id가 4인 유저의 영화 평가 데이터입니다.
target_user_id = 4
target_user_data = rating_data[rating_data['user_id']==target_user_id]


In [9]:
# user_id 4인 유저가 평가한 영화 히스토리 정보를 추출합니다.
target_user_movie_rating_dict = {}

for index, row in target_user_data.iterrows():
    movie_id = row['movie_id']
    target_user_movie_rating_dict[movie_id] = row['rating']
            
print(target_user_movie_rating_dict)

{3468: 5, 1210: 3, 2951: 4, 1214: 4, 1036: 4, 260: 5, 2028: 5, 480: 4, 1196: 2, 1198: 5, 1954: 5, 1097: 4, 3418: 4, 3702: 4, 2366: 4, 1387: 5, 3527: 1, 1201: 5, 2692: 5, 2947: 5, 1240: 5}


In [10]:
test_data = []
for index, row in movie_data.iterrows():
    movie_id = row['movie_id']
    
    rating = 0
    if movie_id in target_user_movie_rating_dict:
        continue
    test_data.append((target_user_id, movie_id, rating))

test_data

[(4, 1, 0),
 (4, 2, 0),
 (4, 3, 0),
 (4, 4, 0),
 (4, 5, 0),
 (4, 6, 0),
 (4, 7, 0),
 (4, 8, 0),
 (4, 9, 0),
 (4, 10, 0),
 (4, 11, 0),
 (4, 12, 0),
 (4, 13, 0),
 (4, 14, 0),
 (4, 15, 0),
 (4, 16, 0),
 (4, 17, 0),
 (4, 18, 0),
 (4, 19, 0),
 (4, 20, 0),
 (4, 21, 0),
 (4, 22, 0),
 (4, 23, 0),
 (4, 24, 0),
 (4, 25, 0),
 (4, 26, 0),
 (4, 27, 0),
 (4, 28, 0),
 (4, 29, 0),
 (4, 30, 0),
 (4, 31, 0),
 (4, 32, 0),
 (4, 33, 0),
 (4, 34, 0),
 (4, 35, 0),
 (4, 36, 0),
 (4, 37, 0),
 (4, 38, 0),
 (4, 39, 0),
 (4, 40, 0),
 (4, 41, 0),
 (4, 42, 0),
 (4, 43, 0),
 (4, 44, 0),
 (4, 45, 0),
 (4, 46, 0),
 (4, 47, 0),
 (4, 48, 0),
 (4, 49, 0),
 (4, 50, 0),
 (4, 51, 0),
 (4, 52, 0),
 (4, 53, 0),
 (4, 54, 0),
 (4, 55, 0),
 (4, 56, 0),
 (4, 57, 0),
 (4, 58, 0),
 (4, 59, 0),
 (4, 60, 0),
 (4, 61, 0),
 (4, 62, 0),
 (4, 63, 0),
 (4, 64, 0),
 (4, 65, 0),
 (4, 66, 0),
 (4, 67, 0),
 (4, 68, 0),
 (4, 69, 0),
 (4, 70, 0),
 (4, 71, 0),
 (4, 72, 0),
 (4, 73, 0),
 (4, 74, 0),
 (4, 75, 0),
 (4, 76, 0),
 (4, 77, 0),
 (4, 78,

In [11]:
# 타겟 유저의 평점 점수를 예측합니다.
target_user_predictions = model.test(test_data)

# 예측된 점수 중, 타겟 유저의 영화별 점수를 target_user_movie_predict_dict로 저장합니다.
def get_user_predicted_ratings(predictions, user_id, user_history):
    target_user_movie_predict_dict = {}
    print(predictions)
    for uid, mid, rating, predicted_rating, _ in predictions:
        if user_id == uid:
            if mid not in user_history:
                target_user_movie_predict_dict[mid] = predicted_rating
    return target_user_movie_predict_dict
target_user_movie_predict_dict = get_user_predicted_ratings(predictions=target_user_predictions, 
                                                            user_id=target_user_id, 
                                                            user_history=target_user_movie_rating_dict)


[Prediction(uid=4, iid=1, r_ui=0, est=4.490027905776206, details={'was_impossible': False}), Prediction(uid=4, iid=2, r_ui=0, est=3.288866504658201, details={'was_impossible': False}), Prediction(uid=4, iid=3, r_ui=0, est=3.207628645683813, details={'was_impossible': False}), Prediction(uid=4, iid=4, r_ui=0, est=3.5766912009126455, details={'was_impossible': False}), Prediction(uid=4, iid=5, r_ui=0, est=3.2598194740919966, details={'was_impossible': False}), Prediction(uid=4, iid=6, r_ui=0, est=4.291192869531892, details={'was_impossible': False}), Prediction(uid=4, iid=7, r_ui=0, est=3.3705753660930386, details={'was_impossible': False}), Prediction(uid=4, iid=8, r_ui=0, est=3.364069739759959, details={'was_impossible': False}), Prediction(uid=4, iid=9, r_ui=0, est=2.570321720127746, details={'was_impossible': False}), Prediction(uid=4, iid=10, r_ui=0, est=3.1443831613834563, details={'was_impossible': False}), Prediction(uid=4, iid=11, r_ui=0, est=3.868386662056013, details={'was_imp

In [29]:
#데이터 정렬
target_user_predicted = sorted(target_user_movie_predict_dict.items(), 
                                     key=operator.itemgetter(1), reverse=True)

In [13]:
#장르 코드 불러오기
movie_dict1 = {}
for index, row in movie_data.iterrows():
    movie_id = row['movie_id']
    movie_genre = row['genre']
    movie_dict1[movie_id] = movie_genre

In [14]:
#제목 코드 불러오기
movie_dict2 = {}
for index, row in movie_data.iterrows():
    movie_id = row['movie_id']
    movie_title= row['title']
    movie_dict2[movie_id] = movie_title

In [17]:
# 2-1 movie_dict1에 애니메이션이 들어가 있는 영화만 출력
for predicted in target_user_predicted :
    movie_id= predicted[0]
    predicted_rating = predicted[1]
    if movie_dict1[movie_id] in "Animations":
        print(movie_dict2[movie_id])

I Married A Strange Person (1997)
Wallace & Gromit: The Best of Aardman Animation (1996)


In [27]:
# 장르별로 평균 예측 평점 구하기
genres=['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy', 'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror', 'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir', 'Western']
genre_rating={}
for genre in genres :
    i=0
    rating=0
    for predicted in target_user_predicted :
        movie_id= predicted[0]
        p_rating = predicted[1]
        if movie_dict1[movie_id] in genre:
            i+=1
            rating=rating+p_rating
    rating_mean=rating/i        
    genre_rating[genre]=rating_mean

#2-2 액션 평균 점수 부여 
print(genre_rating["Action"])

3.0875590271600943


In [30]:
#value로 내리참순 정렬로 가장 높은 평균평점 장르확인
genre_ratings = sorted(genre_rating.items(), key=operator.itemgetter(1), reverse=True)
#2-3 장르별 평균평점 및 상위 평균평점
print(genre_ratings)
print(genre_ratings[0])

[('Animation', 4.590629665701818), ('Film-Noir', 4.4417578861567035), ('War', 4.278825071808003), ('Fantasy', 4.203133031204315), ('Documentary', 4.171497177087502), ('Western', 3.9921008575057235), ('Drama', 3.9834051604250678), ('Musical', 3.9383243225103097), ('Crime', 3.764682963418222), ('Romance', 3.658387600894698), ('Thriller', 3.63670154651861), ('Mystery', 3.6189353406418747), ('Adventure', 3.532613897135614), ('Sci-Fi', 3.5306662685242634), ('Comedy', 3.390410802737876), ("Children's", 3.1172118380518468), ('Action', 3.0875590271600943), ('Horror', 2.940614182628705)]
('Animation', 4.590629665701818)
