# Movielens 영화 추천 실습 
### 달성 목표
1. CSR matrix가 정상적으로 만들어졌다 : 사용자와 아이템 개수
2. MF 모델이 정상적으로 훈련되어 사용자와 아이템 벡터 내적 수치가 의미있게 형성되었다
3. 비슷한 영화 찾기와 유저에게 추천하기 : MF 모델이 예측한 유저 선호도 및 아이템 간 유사도, 기여도 측정 


## 데이터 준비와 전처리

In [1]:
import os
import pandas as pd
rating_file_path = os.getenv("HOME") + "/aiffel/recommendata_iu/data/ml-1m/ratings.dat"
ratings_cols = ['user_id', 'movie_id', 'ratings', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python', encoding='ISO-8859-1')
original_data_size = len(ratings)
ratings.head(3)

Unnamed: 0,user_id,movie_id,ratings,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968


In [2]:
ratings = ratings[ratings['ratings']>=3]
filtered_data_size = len(ratings)

print(f'original_data_size: {original_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Ramaining Data is {filtered_data_size / original_data_size:.2%}')

original_data_size: 1000209, filtered_data_size: 836478
Ratio of Ramaining Data is 83.63%


In [3]:
# ratings 컬럼의 이름을 counts로 바꿉니다
ratings.rename(columns={'ratings':'counts'}, inplace=True)

In [4]:
ratings

Unnamed: 0,user_id,movie_id,counts,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000203,6040,1090,3,956715518
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [5]:
ratings['user_id'].nunique()

6039

---

In [6]:
# 영화 제목을 보기 위해 메타 데이터를 읽어옵니다.
movie_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python', encoding='ISO-8859-1')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
ratings.tail(10)

Unnamed: 0,user_id,movie_id,counts,timestamp
1000198,6040,2021,3,956716374
1000199,6040,2022,5,956716207
1000200,6040,2028,5,956704519
1000201,6040,1080,4,957717322
1000202,6040,1089,4,956704996
1000203,6040,1090,3,956715518
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648
1000208,6040,1097,4,956715569


In [8]:
ratings['user_id'].nunique()

6039

In [9]:
# top 5
# 1: Toy Story (1995)
# 4: Waiting to Exhale (1995)
# 6: Heat (1995)
# 12: Dracula: Dead and Loving It (1995)
# 29: City of Lost Children, The (1995)

my_favorite = [1,4,6,12,29]
my_playlist = pd.DataFrame({'user_id': ['6041']*5, 'movie_id': my_favorite, 'counts':[5]*5})

if not ratings.isin({'user_id':['6041']})['user_id'].any():  # user_id에 '6041(me)'이라는 데이터가 없다면
    ratings = ratings.append(my_playlist)                           # 위에 임의로 만든 my_favorite 데이터를 추가해 줍니다. 

ratings.tail(10)

Unnamed: 0,user_id,movie_id,counts,timestamp
1000203,6040,1090,3,956715518.0
1000205,6040,1094,5,956704887.0
1000206,6040,562,5,956704746.0
1000207,6040,1096,4,956715648.0
1000208,6040,1097,4,956715569.0
0,6041,1,5,
1,6041,4,5,
2,6041,6,5,
3,6041,12,5,
4,6041,29,5,


In [13]:
num_movies = ratings['movie_id'].nunique()
num_users = ratings['user_id'].nunique() + 1
print(num_movies, num_users)

most_popular = ratings.groupby('movie_id')['counts'].mean()
most_popular.sort_values(ascending=False).head(30)

3628 6041


movie_id
1830    5.000000
3607    5.000000
3800    5.000000
3280    5.000000
989     5.000000
1360    5.000000
3656    5.000000
687     5.000000
3881    5.000000
787     5.000000
1420    5.000000
1787    5.000000
1553    5.000000
3382    5.000000
3205    5.000000
3236    5.000000
572     5.000000
3172    5.000000
3233    5.000000
3245    4.800000
53      4.750000
3232    4.750000
2503    4.666667
2197    4.666667
3866    4.666667
2905    4.647059
2019    4.629690
670     4.603774
858     4.598523
318     4.596627
Name: counts, dtype: float64

In [14]:
print(len(ratings.counts))
print(len(ratings.user_id))
print(len(ratings.movie_id))
print(num_users, num_movies)

836483
836483
836483
6041 3628


In [22]:
from scipy.sparse import csr_matrix

num_users = ratings['user_id'].nunique()
num_movies = ratings['movie_id'].nunique()

print(num_users, num_movies)

csr_data = csr_matrix((ratings.counts, (ratings.user_id, ratings.movie_id)), shape=(num_users, num_movies))
csr_data

6040 3628


ValueError: row index exceeds matrix dimensions

In [None]:
from implicit.als import AlternatingLeastSquares
import os
import numpy as np

os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

# Implicit AlternatingLeastSquares 모델 선언
als_model = AlternatingLeastSquares(factors=100, 
                                    regularization=0.01,
                                    use_gpu=False,
                                    dtype=np.float32)
# als 모델은 input으로 (item X user 꼴의 matrix를 때문에 닸ranspose 해줍니다)
csr_data_transpose = csr_data.T
csr_data_transpose

als_model.fit(csr_data_transpose)

In [None]:
favorite_movie_id = 6 # heat(1995
similar_artist = als_model.similar_items(artist_id, N=15)
similar_artist

In [None]:
favorite_movie_id = 6 # heat(1995)
similar_movie = als_model.similar_items(favortie_movie_id, N=15)
similar_movie

In [None]:
def get_similar_movie(movie_name: str):
    movie_id = movie_name
    similar_movie = als_model.similar_items(movie_id)
    similar_movie = [i[0] for i in similar_artist]
    return similar_artist

get_similar_movie(6) # heat (1995)

---

In [23]:
data['artist'] = data['artist'].str.lower()
data.head()

Unnamed: 0,user_id,artist,play
0,00000c289a1829a808ac09c00daf10bc3c4e223b,betty blowtorch,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die ärzte,1099
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897
3,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717
4,00000c289a1829a808ac09c00daf10bc3c4e223b,juliette & the licks,706


첫 번째 유저 정보 출력

In [24]:
condition = (data['user_id']==data.loc[0, 'user_id'])
data.loc[condition]

Unnamed: 0,user_id,artist,play
0,00000c289a1829a808ac09c00daf10bc3c4e223b,betty blowtorch,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die ärzte,1099
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897
3,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717
4,00000c289a1829a808ac09c00daf10bc3c4e223b,juliette & the licks,706
5,00000c289a1829a808ac09c00daf10bc3c4e223b,red hot chili peppers,691
6,00000c289a1829a808ac09c00daf10bc3c4e223b,magica,545
7,00000c289a1829a808ac09c00daf10bc3c4e223b,the black dahlia murder,507
8,00000c289a1829a808ac09c00daf10bc3c4e223b,the murmurs,424
9,00000c289a1829a808ac09c00daf10bc3c4e223b,lunachicks,403


### 데이터 탐색
```df.nunique()```

In [29]:
# 유저수
data['user_id'].nunique()

358868

In [31]:
# 아티스트 수
data['artist'].nunique()

292363

In [33]:
# 인기 많은 아티스트
artist_count = data.groupby('artist')['user_id'].count()
artist_count.sort_values(ascending=False).head(10)

artist
radiohead                77254
the beatles              76245
coldplay                 66658
red hot chili peppers    48924
muse                     46954
metallica                45233
pink floyd               44443
the killers              41229
linkin park              39773
nirvana                  39479
Name: user_id, dtype: int64

In [35]:
# 유저별 몇명의 아스트를 듣고 있느니 에 대한 통계
user_count = data.groupby('user_id')['artist'].count()
user_count.describe()

count    358868.000000
mean         48.863234
std           8.524272
min           1.000000
25%          46.000000
50%          49.000000
75%          51.000000
max         166.000000
Name: artist, dtype: float64

In [36]:
# 유저별 play 횟수 중앙값 통계
user_median = data.groupby('user_id')['play'].median()
user_median.describe()

count    358868.000000
mean        142.187676
std         213.089902
min           1.000000
25%          32.000000
50%          83.000000
75%         180.000000
max       50142.000000
Name: play, dtype: float64

In [37]:
# 본인이 좋아하시는 아티스트 데이터로 바꿔서 추가하셔도 됩니다! 단, 이름은 꼭 데이터셋에 있는 것과 동일하게 맞춰주세요. 
my_favorite = ['black eyed peas' , 'maroon5' ,'jason mraz' ,'coldplay' ,'beyoncé']

# 'zimin'이라는 user_id가 위 아티스트의 노래를 30회씩 들었다고 가정하겠습니다.
my_playlist = pd.DataFrame({'user_id': ['zimin']*5, 'artist': my_favorite, 'play':[30]*5})

if not data.isin({'user_id':['zimin']})['user_id'].any():  # user_id에 'zimin'이라는 데이터가 없다면
    data = data.append(my_playlist)                           # 위에 임의로 만든 my_favorite 데이터를 추가해 줍니다. 

data.tail(10)       # 잘 추가되었는지 확인해 봅시다.

Unnamed: 0,user_id,artist_MBID,artist,play
17535650,"sep 20, 2008",7ffd711a-b34d-4739-8aab-25e045c246da,turbostaat,12
17535651,"sep 20, 2008",9201190d-409f-426b-9339-9bd7492443e2,cuba missouri,11
17535652,"sep 20, 2008",e7cf7ff9-ed2f-4315-aca8-bcbd3b2bfa71,little man tate,11
17535653,"sep 20, 2008",f6f2326f-6b25-4170-b89d-e235b25508e8,sigur rós,10
17535654,"sep 20, 2008",40f5d9e4-2de7-4f2d-ad41-e31a9a9fea27,the smiths,10
0,zimin,,black eyed peas,30
1,zimin,,maroon5,30
2,zimin,,jason mraz,30
3,zimin,,coldplay,30
4,zimin,,beyoncé,30


In [38]:
user_unique = data['user_id'].unique()
artist_unique = data['artist'].unique()

user_to_idx = {v: k for k,v in enumerate(user_unique)}
artist_to_idx = {v: k for k,v in enumerate(artist_unique)}

In [39]:
print(user_to_idx['zimin'])
print(artist_to_idx['black eyed peas'])

358868
376


In [40]:
# indexing 을 통해 테이터 칼럼 내 값을 바꾸는 코드

# user_to_idx.get 을 통해 user_id 컬럼의 모든 값을 인덱싱한 Series를 구해 봅시다.
# 혹시 정상적으로 인덱싱되지 않은 row가 있다면 인덱스가 NaN이 될테니 dropna()로 제거합니다

temp_user_data = data['user_id'].map(user_to_idx.get).dropna()
if len(temp_user_data) == len(data):
    print('user_id column indexing OK!!!')
    data['user_id'] = temp_user_data
else:
    print('user_id column indexing Fail!!')
    
temp_artist_data = data['artist'].map(artist_to_idx.get).dropna()
if len(temp_artist_data) == len(data):
    print('artist column indexing OK!!')
    data['artist']=temp_artist_data
else:
    print('artist column indexing FAIL!!!')

data

user_id column indexing OK!!!
artist column indexing OK!!


Unnamed: 0,user_id,artist_MBID,artist,play
0,0,3bd73256-3905-4f3a-97e2-8b341527f805,0,2137
1,0,f2fb0ff0-5679-42ec-a55c-15109ce6e320,1,1099
2,0,b3ae82c2-e60b-4551-a76d-6620f1b456aa,2,897
3,0,3d6bbeb7-f90e-4d10-b440-e153c0d10b53,3,717
4,0,bbd2ffd7-17f4-4506-8572-c1ea58c3f9a8,4,706
...,...,...,...,...
0,358868,,376,30
1,358868,,271017,30
2,358868,,3746,30
3,358868,,62,30


In [41]:
# 1회만 play 한 데이터의 비율을 보는 코드
only_one = data[data['play']<2]
one, all_data = len(only_one), len(data)
print(f'{one},{all_data}')
print(f'Ratio of only_one over all data is {one/all_data:.2%}')

147740,17535660
Ratio of only_one over all data is 0.84%


### CSR(Compressed Sparse Row) Matrix

'indprt' Index pointers is linked list of pointers to
'indices'(column index Pointers)...
link: https://stackoverflow.com/questions/53254104/cant-understand-scipy-sparse-csr-matrix-example/62118005#62118005

In [43]:
from scipy.sparse import csr_matrix

num_user = data['user_id'].nunique()
num_artist = data['artist'].nunique()

csr_data = csr_matrix((data.play, (data.user_id, data.artist)), shape=(num_user, num_artist))
csr_data

<358869x292364 sparse matrix of type '<class 'numpy.longlong'>'
	with 17535585 stored elements in Compressed Sparse Row format>

Matrix Fatorization -> Implicit(암묵적) dataset package
als(AlternatingLeastSquares) 모델 : 두 feature matrix 중 한쪽을 고정시고 다른 쪽을 학습하는 방식을 번갈아 수행함

In [44]:
from implicit.als import AlternatingLeastSquares
import os
import numpy as np

os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

In [45]:
# Implicit AlternatingLeastSquares 모델 선언
als_model = AlternatingLeastSquares(factors=100, 
                                    regularization=0.01,
                                    use_gpu=False,
                                    dtype=np.float32)

In [46]:
# als 모델은 input으로 (item X user 꼴의 matrix를 때문에 닸ranspose 해줍니다)
csr_data_transpose = csr_data.T
csr_data_transpose

<292364x358869 sparse matrix of type '<class 'numpy.longlong'>'
	with 17535585 stored elements in Compressed Sparse Column format>

In [47]:
als_model.fit(csr_data_transpose)

  0%|          | 0/15 [00:00<?, ?it/s]

In [48]:
zimin, black_eyed_peas = user_to_idx['zimin'], artist_to_idx['black eyed peas']
zimin_vector, black_eyed_peas_vector = als_model.user_factors[zimin], als_model.item_factors[black_eyed_peas]

In [49]:
zimin_vector

array([-5.58084488e-01, -1.38451368e-01, -2.37757847e-01,  8.32318783e-01,
       -5.70116527e-02, -1.35720563e+00, -2.77664572e-01, -6.30040169e-01,
       -1.45044699e-01, -1.01854146e+00,  4.58521307e-01, -1.06646597e+00,
       -7.97158387e-03,  1.31048059e+00,  3.75720978e-01, -3.76204789e-01,
       -1.14018805e-01,  1.11928098e-01,  8.25471759e-01, -7.59567507e-03,
        1.55405772e+00,  3.49292427e-01, -1.00414634e-01,  5.20219028e-01,
       -3.22346926e-01, -1.04816461e+00,  1.10404216e-01, -4.93673414e-01,
        4.41059992e-02, -8.32776964e-01,  2.71921664e-01,  1.71762764e+00,
       -6.68184400e-01, -4.73797992e-02, -2.94839472e-01,  8.03386331e-01,
       -6.52597308e-01, -1.14947832e+00, -1.74510896e+00, -4.52875465e-01,
       -7.11654872e-02, -1.27276218e+00,  4.34712887e-01,  6.23202384e-01,
        1.90261424e-01,  1.28737614e-01,  6.73704967e-02,  5.83447158e-01,
        5.11845648e-01,  5.34697950e-01,  1.90781400e-01, -4.82020378e-01,
       -9.69070733e-01, -

In [50]:
black_eyed_peas_vector

array([-0.00180914,  0.02359356, -0.00662939,  0.00653167,  0.01748352,
        0.00199991, -0.01224809, -0.00751665,  0.00158728, -0.00553667,
        0.00779487, -0.00515218,  0.00535332,  0.0123032 ,  0.01012068,
       -0.00313465, -0.01363786,  0.01954222,  0.01496124, -0.00410942,
        0.02972317,  0.00473042, -0.00124717,  0.01315532,  0.0055931 ,
       -0.01136929,  0.01020056, -0.00328502,  0.00919188, -0.00367534,
        0.00474358,  0.01867747, -0.00164955,  0.00038741,  0.00595106,
        0.01653088, -0.00325221, -0.01138331, -0.0058842 ,  0.01511716,
        0.0090636 , -0.01085387,  0.01367447,  0.01457198,  0.00945232,
        0.01530872,  0.01865887,  0.02127362,  0.01183216,  0.0131022 ,
        0.02224982,  0.00803497, -0.00751463,  0.00323679,  0.01564962,
        0.00597247, -0.00731711,  0.00019252, -0.01721236,  0.00827466,
        0.01930525,  0.00877372,  0.01899979,  0.01202327,  0.01273281,
        0.00926096,  0.01641816,  0.00815576,  0.00355271,  0.00

In [51]:
# zimin, black_eyed_peas 내적
np.dot(zimin_vector, black_eyed_peas_vector)

0.5124239

In [55]:
queen = artist_to_idx['queen']
queen_vector = als_model.item_factors[queen]
np.dot(zimin_vector, queen_vector)

0.3008922

### 비슷한 아티스트 찾기 + 유저에게 추천하기
```AlternatingLeastSquares.similar_items()```

In [56]:
favorite_artist = 'coldplay'
artist_id = artist_to_idx[favorite_artist]
similar_artist = als_model.similar_items(artist_id, N=15)
similar_artist

[(62, 0.9999999),
 (277, 0.98835284),
 (5, 0.97690046),
 (28, 0.9720171),
 (217, 0.9694139),
 (473, 0.96598005),
 (490, 0.9607386),
 (247, 0.9589004),
 (910, 0.9541248),
 (418, 0.9532959),
 (694, 0.9497932),
 (782, 0.9425723),
 (268, 0.9386042),
 (1018, 0.9379625),
 (531, 0.93480754)]

In [58]:
#artist_to_idx를 뒤집어, index로부터 artist 이름을 얻는 dict를 생성합니다
idx_to_artist = {v:k for k,v in artist_to_idx.items()}
[idx_to_artist[i[0]] for i in similar_artist]

['coldplay',
 'muse',
 'red hot chili peppers',
 'the killers',
 'radiohead',
 'placebo',
 'oasis',
 'the beatles',
 'nirvana',
 'u2',
 'foo fighters',
 'the white stripes',
 'pink floyd',
 'the smashing pumpkins',
 'depeche mode']

In [59]:
def get_similar_artist(artist_name: str):
    artist_id = artist_to_idx[artist_name]
    similar_artist = als_model.similar_items(artist_id)
    similar_artist = [idx_to_artist[i[0]] for i in similar_artist]
    return similar_artist

In [62]:
get_similar_artist('lady gaga')

['lady gaga',
 'britney spears',
 'katy perry',
 'rihanna',
 'beyoncé',
 'the pussycat dolls',
 'kelly clarkson',
 'christina aguilera',
 'justin timberlake',
 'leona lewis']

__유저에게 아티스트 추천하기__
```
AlternatingLeastSquares.recommmend()
AlternatingLeastSquares.filter_already_liked_items()
```

In [63]:
user = user_to_idx['zimin']
# recommend -> user*item CSR Matrix
artist_recommended = als_model.recommend(user,csr_data, N=20, 
                                         filter_already_liked_items=True)
artist_recommended

[(350, 0.44454262),
 (369, 0.43977642),
 (550, 0.43252516),
 (627, 0.4151128),
 (1800, 0.41475344),
 (2249, 0.414375),
 (274, 0.3913468),
 (354, 0.38966948),
 (382, 0.38825563),
 (355, 0.3849309),
 (391, 0.38430375),
 (5556, 0.3738802),
 (409, 0.3715177),
 (564, 0.3661566),
 (901, 0.3645937),
 (618, 0.36204734),
 (358, 0.35461217),
 (24, 0.35427287),
 (724, 0.34940523),
 (944, 0.34871125)]

In [64]:
[idx_to_artist[i[0]] for i in artist_recommended]

['rihanna',
 'justin timberlake',
 'britney spears',
 'maroon 5',
 'lady gaga',
 'katy perry',
 'michael jackson',
 'nelly furtado',
 'mika',
 'madonna',
 'christina aguilera',
 'timbaland',
 'amy winehouse',
 'kanye west',
 'pink',
 'the pussycat dolls',
 'kelly clarkson',
 'jack johnson',
 'lily allen',
 'avril lavigne']

In [65]:
rihanna = artist_to_idx['rihanna']
explain = als_model.explain(user, csr_data, itemid=rihanna)

In [68]:
[(idx_to_artist[i[0]], i[1]) for i in explain[1]]

[('beyoncé', 0.219924769100042),
 ('black eyed peas', 0.14042903441010368),
 ('jason mraz', 0.04668546196520325),
 ('coldplay', 0.03947152519784778),
 ('maroon5', -2.9961741062687594e-06)]