In [5]:
import pandas as pd
import os
import numpy as np
import scipy
import implicit

print(np.__version__)
print(scipy.__version__)
print(implicit.__version__)

1.21.4
1.7.1
0.4.8


## 1) 데이터 준비와 전처리

In [6]:
rating_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'ratings', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python', encoding = "ISO-8859-1")
orginal_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,ratings,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [19]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 836478 entries, 0 to 1000208
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype
---  ------     --------------   -----
 0   user_id    836478 non-null  int64
 1   movie_id   836478 non-null  int64
 2   counts     836478 non-null  int64
 3   timestamp  836478 non-null  int64
dtypes: int64(4)
memory usage: 31.9 MB


In [7]:
# 3점 이상만 남깁니다.
ratings = ratings[ratings['ratings']>=3]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [8]:
# ratings 컬럼의 이름을 counts로 바꿉니다.
ratings.rename(columns={'ratings':'counts'}, inplace=True)

In [9]:
ratings['counts']

0          5
1          3
2          3
3          4
4          5
          ..
1000203    3
1000205    5
1000206    5
1000207    4
1000208    4
Name: counts, Length: 836478, dtype: int64

In [10]:
# 영화 제목을 보기 위해 메타 데이터를 읽어옵니다.
movie_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python', encoding='ISO-8859-1')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [14]:
max(ratings['user_id'])

6040

In [21]:
ratings['movie_id'].nunique

<bound method IndexOpsMixin.nunique of 0          1193
1           661
2           914
3          3408
4          2355
           ... 
1000203    1090
1000205    1094
1000206     562
1000207    1096
1000208    1097
Name: movie_id, Length: 836478, dtype: int64>

## 2) 분석

In [31]:
print('ratings에 있는 유니크한 영화 개수:', ratings['movie_id'].nunique())

ratings에 있는 유니크한 영화 개수: 3628


In [32]:
print('ratings에 있는 유니크한 사용자 수:', ratings['user_id'].nunique())

ratings에 있는 유니크한 사용자 수: 6039


In [41]:
grouped_ratings = ratings.groupby('movie_id').count()
sorted_ratings = grouped_ratings.sort_values(by='counts', ascending=False)
merged_ratings = pd.merge(sorted_ratings, movies, how='left', on='movie_id')
merged_ratings.head()

print('가장 인기있는 영화 30개(인기순):')
merged_ratings.iloc[:30]

가장 인기있는 영화 30개(인기순):


Unnamed: 0,movie_id,user_id,counts,timestamp,title,genre
0,2858,3211,3211,3211,American Beauty (1999),Comedy|Drama
1,260,2910,2910,2910,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi
2,1196,2885,2885,2885,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Drama|Sci-Fi|War
3,1210,2716,2716,2716,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Romance|Sci-Fi|War
4,2028,2561,2561,2561,Saving Private Ryan (1998),Action|Drama|War
5,589,2509,2509,2509,Terminator 2: Judgment Day (1991),Action|Sci-Fi|Thriller
6,593,2498,2498,2498,"Silence of the Lambs, The (1991)",Drama|Thriller
7,1198,2473,2473,2473,Raiders of the Lost Ark (1981),Action|Adventure
8,1270,2460,2460,2460,Back to the Future (1985),Comedy|Sci-Fi
9,2571,2434,2434,2434,"Matrix, The (1999)",Action|Sci-Fi|Thriller


## 3) 내가 선호하는 영화를 5가지 골라서 ratings에 추가

In [48]:
new_user_id = max(ratings['user_id'])+1
new_movie_ids = [260, 1196, 1210, 2571, 1240]
my_rating = 5
new_timestamp = max(ratings['timestamp']) + 1

In [50]:
data_to_add = [[new_user_id, mvid, my_rating, new_timestamp] for mvid in new_movie_ids]
data_to_add = pd.DataFrame(cols=ratings_cols, )
    



Unnamed: 0,0,1,2,3
0,6041,260,5,1046454591
1,6041,1196,5,1046454591
2,6041,1210,5,1046454591
3,6041,2571,5,1046454591
4,6041,1240,5,1046454591


## 4) CSR matrix를 직접 만들기

In [51]:
from scipy.sparse import csr_matrix

num_user = max(ratings['user_id'])+1
num_movies = max(ratings['movie_id'])+1

csr_data = csr_matrix((ratings.counts, (ratings.user_id, ratings.movie_id)), shape= (num_user, num_movies))
csr_data

<6041x3953 sparse matrix of type '<class 'numpy.int64'>'
	with 836478 stored elements in Compressed Sparse Row format>

## 5) als_model = AlternatingLeastSquares 모델을 구성하여 훈련


In [52]:
from implicit.als import AlternatingLeastSquares
import os
import numpy as np

# implicit 라이브러리에서 권장하고 있는 부분입니다. 학습 내용과는 무관합니다.
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

In [53]:
# Implicit AlternatingLeastSquares 모델의 선언
als_model = AlternatingLeastSquares(factors=100, regularization=0.01, use_gpu=False, iterations=15, dtype=np.float32)

In [54]:
# als 모델은 input으로 (item X user 꼴의 matrix를 받기 때문에 Transpose해줍니다.)
csr_data_transpose = csr_data.T
csr_data_transpose

<3953x6041 sparse matrix of type '<class 'numpy.int64'>'
	with 836478 stored elements in Compressed Sparse Column format>

In [55]:
# 모델 훈련
als_model.fit(csr_data_transpose)

  0%|          | 0/15 [00:00<?, ?it/s]

## 6) 내가 선호하는 5가지 영화 중 하나와 그 외의 영화 하나를 골라 훈련된 모델이 예측한 나의 선호도를 파악


## 7) 내가 좋아하는 영화와 비슷한 영화를 추천받기


## 8) 내가 가장 좋아할 만한 영화들을 추천받기
