In [1]:
import os
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares

In [2]:
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["KMP_DUPLICATE_LIB_OK"] = "True"
os.environ["MKL_NUM_THREADS"] = "1"

In [3]:
# data
rating_file_path = "./data/ratings.dat"
# column
rating_cols = ["user_id", "movie_id", "rating", "timestamp"]
# rating 데이터 불러오기
ratings = pd.read_csv(rating_file_path, sep="::", names=rating_cols, engine="python")
# data 길이 저장
original_data_size = len(ratings)

In [4]:
# HEAD 찍어보기
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [5]:
# 3점 이상만 남김
ratings = ratings[ratings["rating"] >=3]
# 3점 이상 데이터 길이
filtered_data_size = len(ratings)

In [6]:
# 데이터 길이 비교
print(f"original data size : {original_data_size}, filtered_data_size : {filtered_data_size}")
print(f"Ratio of Remaining Data is {filtered_data_size / original_data_size:.2%} ")

original data size : 1000209, filtered_data_size : 836478
Ratio of Remaining Data is 83.63% 


In [7]:
# ratinngs 컬럼의 이름을 count로 변환
ratings.rename(columns={"rating" : "count"}, inplace=True)

In [8]:
# 영화 제목을 보기 위해 메타 데이터 읽어옵니다
movie_file_path = "./data/movies.dat"
# column 지정
cols = ['movie_id', 'title', 'genre']
movies = pd.read_csv(movie_file_path, sep="::", names=cols, engine="python")

In [9]:
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [10]:
# 원본 보존
ratings_original = ratings.copy()

In [11]:
# ratings, movie merge
ratings = pd.merge(ratings, movies, how="left", on="movie_id")

In [12]:
# merge가 잘 되었는지 확인
ratings.head()

Unnamed: 0,user_id,movie_id,count,timestamp,title,genre
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,1,661,3,978302109,James and the Giant Peach (1996),Animation|Children's|Musical
2,1,914,3,978301968,My Fair Lady (1964),Musical|Romance
3,1,3408,4,978300275,Erin Brockovich (2000),Drama
4,1,2355,5,978824291,"Bug's Life, A (1998)",Animation|Children's|Comedy


In [13]:
# unique한 영화 수
num_unique_movie = ratings["movie_id"].nunique()

In [14]:
# unique 유저 수
num_unique_user = ratings["user_id"].nunique()

In [15]:
print(num_unique_movie, num_unique_user)

3628 6039


In [16]:
# groupby를 사용해서 가장 인기있는 영화 ID top30 확인
# ascending -> 오름차순
ratings.groupby("movie_id")["count"].sum().value_counts().sort_values(ascending=False)[:30]

3      60
4      43
7      28
6      24
8      22
13     20
10     20
11     20
12     16
5      16
60     15
43     15
9      15
15     14
34     14
20     14
16     14
56     13
62     13
28     13
27     13
77     13
105    12
55     12
138    12
23     12
47     11
76     11
95     11
31     11
Name: count, dtype: int64

In [17]:
# 내 영화 평점 추가
# timestamp col 삭제
ratings.drop('timestamp', axis=1, inplace=True)

add_1 = {'user_id': 6041, 'movie_id': 1, 'count': 5 , 'title':'Toy Story (1995)'}
add_2 = {'user_id': 6041, 'movie_id': 5, 'count': 3 , 'title':'Father of the Bride Part II (1995)'}
add_3 = {'user_id': 6041, 'movie_id': 105, 'count': 3 , 'title':'Bridges of Madison County, The (1995)'}
add_4 = {'user_id': 6041, 'movie_id': 55, 'count': 3 , 'title':'Georgia (1995)'}
add_5 = {'user_id': 6041, 'movie_id': 138, 'count': 4, 'title':'Neon Bible, The (1995)'}

ratings = ratings.append(add_1, ignore_index=True)
ratings = ratings.append(add_2, ignore_index=True)
ratings = ratings.append(add_3, ignore_index=True)
ratings = ratings.append(add_4, ignore_index=True)
ratings = ratings.append(add_5, ignore_index=True)

In [18]:
# movie_id 번호 순서대로 맞춰주기
movie_ids = np.sort(ratings["movie_id"].unique())
# key(movie_id) : value(index)
movie_ids_list = { j:i for i,j in enumerate(movie_ids) }
# id가 0부터 시작함
ratings["movie_id"] = ratings["movie_id"].map(lambda x : movie_ids_list[x])

In [19]:
# id to title
i2t = dict(zip(ratings['movie_id'], ratings['title']))
# title to id
t2i = dict(zip(ratings['title'] , ratings['movie_id']))

In [20]:
# 범주형 변환
ratings["user_id"] = pd.Categorical(ratings["user_id"])
ratings["movie_id"] = pd.Categorical(ratings["movie_id"])

In [21]:
# csr_matrix value, row, col 값 선언 (movie, user)
value = ratings['count'].values.astype(np.int32)
row = ratings['movie_id'].cat.codes.values.astype(np.int32)
col = ratings['user_id'].cat.codes.values.astype(np.int32)

In [22]:
movie_num = ratings["movie_id"].nunique()
user_num = ratings["user_id"].nunique()

In [23]:
movie_num, user_num

(3628, 6040)

In [24]:
# sparse_matrix 만들기
csr = csr_matrix((value, (row,col)), shape=(movie_num, user_num))

In [25]:
# als 학습
als_model = AlternatingLeastSquares(factors=300, regularization=0.02, use_gpu=False, iterations=100, dtype=np.float32)

In [26]:
# 학습 시작
als_model.fit(csr)

HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))




In [27]:
movie_name = 'Toy Story (1995)'
t2i[movie_name]

0

In [28]:
# 비슷한 영화 찾기
similar_movie = als_model.similar_items(t2i[movie_name])

In [29]:
similar_movie

[(0, 0.9999999),
 (2845, 0.36057732),
 (568, 0.23517507),
 (2114, 0.21270286),
 (3003, 0.19675072),
 (131, 0.19290094),
 (350, 0.17835541),
 (33, 0.17407694),
 (1147, 0.1630439),
 (885, 0.1546073)]

In [30]:
# 추천 결과 확인
# 토이스토리를 본 사람에게 추천하는 것들이 나쁘지 않은듯?
[i2t[i[0]] for i in similar_movie][1:]

['Toy Story 2 (1999)',
 'Aladdin (1992)',
 "Bug's Life, A (1998)",
 'Soft Toilet Seats (1999)',
 'Neon Bible, The (1995)',
 'Lion King, The (1994)',
 'Babe (1995)',
 'Groundhog Day (1993)',
 'Walk in the Sun, A (1945)']

In [31]:
user_to_idx = {v:k for k,v in enumerate(ratings['user_id'].unique())}
movie_to_idx = {v:k for k,v in enumerate(ratings['title'].unique())}

In [32]:
minchae, toy = user_to_idx[6041], movie_to_idx[movie_name]
minchae_vector, toy_vector = als_model.user_factors[minchae], als_model.item_factors[toy]

In [33]:
np.dot(minchae_vector, toy_vector)

0.00028027315