In [1]:
import pandas as pd
import numpy as np
import warnings
# 경고 제거
warnings.filterwarnings("ignore")

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('/content/drive/MyDrive/ml-100k/u.data', sep = '\t', names=r_cols, header = None)
ratings = ratings.drop('timestamp', axis=1) # timestamp 지우기

In [4]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [5]:
len(ratings.user_id.unique()) # 유저 943명 (평가 10만개)

943

In [6]:
len(ratings.movie_id.unique()) # item 1,682개 (평가 10만개)

1682

In [7]:
ratings.rating.unique()

array([3, 1, 2, 4, 5])

In [8]:
ratings.isnull().sum()

user_id     0
movie_id    0
rating      0
dtype: int64

In [9]:
ratings.duplicated().sum()

0

In [10]:
# Rating df의 user_id를 타겟(종속변수, 예측값)으로 train, test stratified split 실시(user_id 기준)
from sklearn.model_selection import train_test_split
x = ratings.copy()
y = ratings['user_id']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, stratify=y, random_state=12)

In [11]:
# "유저별 영화 평점"을 알아보기 위해 pivot table 활용 - train 셋을 full matrix로 변환
rating_matrix = x_train.pivot_table(values = 'rating', index = 'user_id', columns = 'movie_id')
rating_matrix

movie_id,1,2,3,4,5,6,7,8,9,10,...,1669,1670,1672,1675,1676,1677,1678,1679,1680,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,,1.0,5.0,3.0,...,,,,,,,,,,
2,,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,5.0,,...,,,,,,,,,,
940,,,,2.0,,,4.0,5.0,3.0,,...,,,,,,,,,,
941,5.0,,,,,,4.0,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


In [12]:
# 유저들의 영화 평점 데이터프레임의 코사인유사도를 계산
from sklearn.metrics.pairwise import cosine_similarity
matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity = pd.DataFrame(user_similarity, index = rating_matrix.index, columns = rating_matrix.index)
user_similarity

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.154837,0.033459,0.013666,0.296253,0.342354,0.331744,0.244745,0.068384,0.268612,...,0.316956,0.127985,0.231527,0.143976,0.190169,0.108120,0.196504,0.141111,0.134674,0.293023
2,0.154837,1.000000,0.072256,0.106733,0.081576,0.182561,0.114109,0.046109,0.099550,0.145810,...,0.060035,0.139669,0.309276,0.376965,0.220621,0.217525,0.131795,0.070756,0.113498,0.052893
3,0.033459,0.072256,1.000000,0.311484,0.027719,0.014601,0.039432,0.070050,0.081034,0.076301,...,0.031871,0.053479,0.132740,0.063880,0.090194,0.018359,0.094752,0.083607,0.063215,0.019423
4,0.013666,0.106733,0.311484,1.000000,0.013465,0.033774,0.058527,0.157532,0.062480,0.033595,...,0.039318,0.000000,0.089554,0.137911,0.046025,0.000000,0.105633,0.141821,0.077759,0.015725
5,0.296253,0.081576,0.027719,0.013465,1.000000,0.145269,0.273056,0.201649,0.070948,0.149909,...,0.256210,0.059592,0.082490,0.055522,0.160808,0.020458,0.211167,0.074864,0.130234,0.223876
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.108120,0.217525,0.018359,0.000000,0.020458,0.120078,0.084191,0.100528,0.000000,0.066687,...,0.031923,0.427589,0.237385,0.256477,0.298950,1.000000,0.068785,0.156469,0.016275,0.127225
940,0.196504,0.131795,0.094752,0.105633,0.211167,0.252028,0.230838,0.189545,0.081321,0.292762,...,0.265112,0.051482,0.153021,0.146790,0.103424,0.068785,1.000000,0.124923,0.153303,0.149331
941,0.141111,0.070756,0.083607,0.141821,0.074864,0.110166,0.046488,0.107442,0.055583,0.087973,...,0.040989,0.200604,0.191509,0.079746,0.218499,0.156469,0.124923,1.000000,0.063528,0.126774
942,0.134674,0.113498,0.063215,0.077759,0.130234,0.175661,0.196002,0.116145,0.097490,0.135871,...,0.163462,0.026337,0.056572,0.146478,0.061057,0.016275,0.153303,0.063528,1.000000,0.111918


In [13]:
# 한 user_id에게 한 영화(movie_id)를 평가한 사용자들을 바탕으로 가중평균 예측 평점을 계산하는 함수, 실수값 리턴
def cf_simple(user_id, movie_id):
    if movie_id in ratings: # 해당 movie_id가 유저-영화 rating_matrix에 존재하는지 확인(movie_id는 컬럼명. 인덱스와 컬럼을 다 확인하나?)

        # 현재 사용자와 다른 사용자 간의 (cosine)유사도 가져오기 (총 943개)
        sim_scores = user_similarity[user_id]

        # 현재 영화에 대한 모든 사용자의 평점 가져오기
        this_movie_ratings = ratings[movie_id]

        # 현재 영화를 평가하지 않은 사용자 index 가져오기
        none_rating_idx = this_movie_ratings[this_movie_ratings.isnull()].index

        # 현재 영화를 평가하지 않은 사용자의 rating (null) 제거
        this_movie_ratings = this_movie_ratings.dropna()

        # 현재 영화를 평가하지 않은 사용자의 similarity 제거
        sim_scores = sim_scores.drop(none_rating_idx)

        # 특정 사용자의 영화 예상평점 = 평가한 모든 사용자의 가중평균값
        mean_rating = np.dot(sim_scores, this_movie_ratings) / sim_scores.sum()

    else: # 유저-영화 rating matrix에 해당 영화가 없다면 기본값 3.0을 예측치로 돌려줌
        mean_rating = 3.0

    return mean_rating

# RMSE 계산함수 (오차(true-pred) 제곱 평균의 제곱근)
def RMSE(y_true, y_pred):
    return np.sqrt( np.mean(( np.array(y_true) - np.array(y_pred) )**2 ) )

# Score 함수 : model을 입력으로 받음
def score(model):
    id_pairs = zip(x_test['user_id'], x_test['movie_id'])
    y_pred = np.array( [model(user, movie) for (user, movie) in id_pairs] )
    y_true = np.array( x_test['rating'] )
    return RMSE(y_true, y_pred)

In [14]:
score(cf_simple)

1.2411285187280163

- 특정 사용자에게 추천 실시 : 한 사용자의 모든 영화에 대한 예측값을 계산하고 그중에서 값이 높은 상위 n개만 추출해서 보여줌

In [15]:
# 추천을 위해 데이터를 다시 로딩한다.
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('/content/drive/MyDrive/ml-100k/u.data', sep = '\t', names=r_cols, header = None)
ratings = ratings.drop('timestamp', axis=1) # timestamp 지우기
rating_matrix = ratings.pivot(values = 'rating', index = 'user_id', columns = 'movie_id') # user-movie rating matrix

In [16]:
# 영화 제목 가져오기용 df : movies
i_cols = ['movie_id', 'title', 'release date', 'video release date', 'IMDB URL', 
          'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 
          'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 
          'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
movies = pd.read_csv('/content/drive/MyDrive/ml-100k/u.item', sep='|', names=i_cols, encoding='latin-1')
movies = movies[['movie_id', 'title']]
movies = movies.set_index('movie_id')

In [17]:
# User Cosine similarity 계산
from sklearn.metrics.pairwise import cosine_similarity
matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity = pd.DataFrame(user_similarity, index=rating_matrix.index, columns=rating_matrix.index) # user-similarity

In [18]:
# 추천 함수
def recommender(user, n_items = 10):
    # 현재 사용자의 모든 아이템에 대한 예상 평점 계산
    predictions = []

    # iloc : index location, loc : label location(행 값과 열 값 자체)
    # 현재 사용자가 이미 평가한(= column 기준 not null) 영화의 열 인덱스 List 추출 -> 추천시 제외하기 위해
    rated_movie_index = rating_matrix.loc[user][rating_matrix.loc[user].notnull()].index

    # 해당 사용자가 평가하지 않은 영화들 : 열 index(추천 후보들의 'movie_id') 와 평점(NaN)
    items = rating_matrix.loc[user].drop(rated_movie_index)

    # 예상평점 계산
    for item in items.index: # 추천 후보들의 'movie_id'
        predictions.append( cf_simple(user, item) )

    recommend = pd.Series(data = predictions, index = items.index, dtype = float) # 추천 후보들의 'movie_id' 별 예상평점(가중평균)
    recommend = recommend.sort_values(ascending = False)[:n_items] # 내림차순으로 n_items 개수 추천
    recommended_items = movies.loc[recommend.index]['title']

    return recommended_items

In [19]:
recommender(2,10)

movie_id
2                    GoldenEye (1995)
1140    Road to Wellville, The (1994)
1150                Last Dance (1996)
1149                 Walkabout (1971)
1148                 Tom & Viv (1994)
1147                 My Family (1995)
1146             Calendar Girl (1993)
1145                Blue Chips (1994)
1144           Quiet Room, The (1996)
1143                Hard Eight (1996)
Name: title, dtype: object

• 실제 추천을 할 때는, train/test 나눌 필요 없이 모든 데이터로 하는게 더 정확하다

• rated_index 에서 해당 사용자가 이미 평가한 영화는 제외함 

https://yeong-jin-data-blog.tistory.com/m/entry/%ED%98%91%EC%97%85%ED%95%84%ED%84%B0%EB%A7%81-Collaborative-Filtering