# 영화 추천 

This tutorial shows the major functionality of the [implicit](https://github.com/benfred/implicit) library by building a music recommender system using the the [last.fm 360K dataset](http://ocelma.net/MusicRecommendationDataset/lastfm-360K.html).

### Getting the Dataset

Implicit includes code to access several different popular recommender datasets in the  ```implicit.datasets``` module. The following code will both download the lastfm dataset locally, as well as load it up into memory:

In [75]:

import sys
import pandas as pd
import numpy as np
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
import random
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
import implicit

In [76]:
movies = pd.read_csv('/data/ephemeral/ml/input/code/HQ/Data/ml-2m/train_ratings.csv')
titles = pd.read_csv('/data/ephemeral/ml/input/data/train/titles.tsv',sep='\t')

In [77]:
movies=movies.merge(titles, on='item')
movies

Unnamed: 0,user,item,time,title
0,11,4643,1230782529,Planet of the Apes (2001)
1,11,170,1230782534,Hackers (1995)
2,11,531,1230782539,"Secret Garden, The (1993)"
3,11,616,1230782542,"Aristocats, The (1970)"
4,11,2140,1230782563,"Dark Crystal, The (1982)"
...,...,...,...,...
5154466,138493,44022,1260209449,Ice Age 2: The Meltdown (2006)
5154467,138493,4958,1260209482,Behind Enemy Lines (2001)
5154468,138493,68319,1260209720,X-Men Origins: Wolverine (2009)
5154469,138493,40819,1260209726,Walk the Line (2005)


In [78]:
movies

Unnamed: 0,user,item,time,title
0,11,4643,1230782529,Planet of the Apes (2001)
1,11,170,1230782534,Hackers (1995)
2,11,531,1230782539,"Secret Garden, The (1993)"
3,11,616,1230782542,"Aristocats, The (1970)"
4,11,2140,1230782563,"Dark Crystal, The (1982)"
...,...,...,...,...
5154466,138493,44022,1260209449,Ice Age 2: The Meltdown (2006)
5154467,138493,4958,1260209482,Behind Enemy Lines (2001)
5154468,138493,68319,1260209720,X-Men Origins: Wolverine (2009)
5154469,138493,40819,1260209726,Walk the Line (2005)


In [79]:
movies.value_counts('item')

item
2571      19699
2959      18437
296       18202
318       18168
356       17339
          ...  
117881       38
126          36
2555         34
4241         34
51372        27
Name: count, Length: 6807, dtype: int64

In [80]:
popularity=movies.value_counts('item')

In [81]:
popularity

item
2571      19699
2959      18437
296       18202
318       18168
356       17339
          ...  
117881       38
126          36
2555         34
4241         34
51372        27
Name: count, Length: 6807, dtype: int64

In [82]:
movies=movies.merge(popularity, on='item')

In [83]:
movies['count'] = (movies['count']-movies["count"].min())/(movies["count"].max()-movies["count"].min()) * 5

In [84]:
movies = movies[movies.user.notna()]
movies = movies.sort_values(by=['user','item'])
grouped_df = movies[['user', 'item', 'count', 'title']].sort_values(by=['user','item'])#.groupby(['user', 'item']).reset_index()
grouped_df
#grouped_df.loc[grouped_df['Quantity'] == 0, ['Quantity']] = 1
#grouped_df = grouped_df.loc[grouped_df['Quantity'] > 0]

Unnamed: 0,user,item,count,title
267,11,1,3.098312,Toy Story (1995)
33,11,19,0.564254,Ace Ventura: When Nature Calls (1995)
17,11,32,2.910228,Twelve Monkeys (a.k.a. 12 Monkeys) (1995)
291,11,39,0.786651,Clueless (1995)
205,11,110,3.134404,Braveheart (1995)
...,...,...,...,...
5154176,138493,66762,0.005846,Paris (2008)
5154468,138493,68319,0.359902,X-Men Origins: Wolverine (2009)
5154416,138493,68954,1.667344,Up (2009)
5154463,138493,69526,0.186560,Transformers: Revenge of the Fallen (2009)


In [85]:
item  = {a:b for a,b in enumerate(movies.item.unique())}
item

{0: 1,
 1: 19,
 2: 32,
 3: 39,
 4: 110,
 5: 150,
 6: 153,
 7: 158,
 8: 160,
 9: 165,
 10: 170,
 11: 172,
 12: 173,
 13: 185,
 14: 208,
 15: 231,
 16: 253,
 17: 256,
 18: 260,
 19: 296,
 20: 316,
 21: 318,
 22: 344,
 23: 356,
 24: 364,
 25: 367,
 26: 377,
 27: 380,
 28: 405,
 29: 410,
 30: 442,
 31: 480,
 32: 500,
 33: 527,
 34: 531,
 35: 541,
 36: 546,
 37: 551,
 38: 586,
 39: 587,
 40: 588,
 41: 589,
 42: 592,
 43: 593,
 44: 595,
 45: 597,
 46: 608,
 47: 610,
 48: 611,
 49: 616,
 50: 648,
 51: 653,
 52: 673,
 53: 736,
 54: 741,
 55: 761,
 56: 780,
 57: 784,
 58: 828,
 ...}

In [86]:
grouped_df.item.nunique()

6807

In [87]:

unique_customers = grouped_df.user.unique()
customer_ids = dict(zip(unique_customers, np.arange(unique_customers.shape[0], dtype=np.int32)))
print(customer_ids)
unique_items = grouped_df.item.unique()
item_ids = dict(zip(unique_items, np.arange(unique_items.shape[0], dtype=np.int32))) # nunique와 unique_items.shape[0]은 똑같음 
print(item_ids)
grouped_df['user'] = grouped_df.user.apply(lambda i: customer_ids[i])
grouped_df['item'] = grouped_df.item.apply(lambda i: item_ids[i])

sparse_item_customer = sparse.csr_matrix((grouped_df['count'].astype(float), (grouped_df['item'], grouped_df['user'])))
sparse_customer_item = sparse.csr_matrix((grouped_df['count'].astype(float), (grouped_df['user'], grouped_df['item'])))

model = implicit.nearest_neighbours.ItemItemRecommender()

#alpha = 15
#data = (sparse_item_customer * alpha).astype('double')

model.fit(sparse_customer_item)

{11: 0, 14: 1, 18: 2, 25: 3, 31: 4, 35: 5, 43: 6, 50: 7, 58: 8, 60: 9, 61: 10, 65: 11, 72: 12, 77: 13, 82: 14, 85: 15, 90: 16, 91: 17, 96: 18, 98: 19, 99: 20, 102: 21, 116: 22, 121: 23, 124: 24, 129: 25, 132: 26, 133: 27, 135: 28, 136: 29, 147: 30, 152: 31, 154: 32, 155: 33, 162: 34, 163: 35, 168: 36, 175: 37, 182: 38, 189: 39, 190: 40, 201: 41, 204: 42, 205: 43, 206: 44, 208: 45, 209: 46, 211: 47, 213: 48, 215: 49, 218: 50, 220: 51, 232: 52, 237: 53, 239: 54, 241: 55, 248: 56, 252: 57, 254: 58, 258: 59, 264: 60, 266: 61, 271: 62, 279: 63, 284: 64, 285: 65, 294: 66, 304: 67, 312: 68, 313: 69, 316: 70, 317: 71, 318: 72, 337: 73, 340: 74, 342: 75, 348: 76, 351: 77, 359: 78, 361: 79, 367: 80, 370: 81, 372: 82, 375: 83, 379: 84, 383: 85, 387: 86, 388: 87, 394: 88, 395: 89, 398: 90, 407: 91, 409: 92, 413: 93, 419: 94, 421: 95, 422: 96, 425: 97, 427: 98, 430: 99, 431: 100, 436: 101, 440: 102, 442: 103, 448: 104, 451: 105, 455: 106, 457: 107, 459: 108, 462: 109, 466: 110, 469: 111, 471: 112, 

  0%|          | 0/6807 [00:00<?, ?it/s]

In [88]:
# calculate the top recommendations for a single user
ids, scores = model.recommend(1, sparse_customer_item[1])

# # calculate the top recommendations for a batch of users
# userids = np.arange(10)
# ids, scores = model.recommend(userids, user_items[userids])

In [89]:
for i in ids:
    print(i)
    print(grouped_df[grouped_df['item']==i]['title'].unique())

122
['Matrix, The (1999)']
21
['Shawshank Redemption, The (1994)']
141
['Fight Club (1999)']
19
['Pulp Fiction (1994)']
194
['Lord of the Rings: The Fellowship of the Ring, The (2001)']
246
['Lord of the Rings: The Return of the King, The (2003)']
220
['Lord of the Rings: The Two Towers, The (2002)']
43
['Silence of the Lambs, The (1991)']
71
['Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)']
139
['American Beauty (1999)']


In [90]:
ids, scores = model.recommend(0, sparse_customer_item[0], N=10, filter_already_liked_items=True, filter_items=None, recalculate_user=False, items=None)

In [91]:
for i in ids:
    print(i,item[i])

722 47
586 50
522 858
754 7438
497 8961
733 2329
1336 4963
767 32587
463 4886
743 5418


4370
3156
2
4886
3986
37386
58025
8961
2174
47

2424
1198
1028
4016
4993
4857
1907
5952
7153
34

1193
46578
2324
63082
296
46723
60950
48738
56367
912

1259
1270
1923
2997
2762
47
1073
2329
1206
608

In [92]:
grouped_df.item.unique().shape[0]#nunique()

6807

In [93]:
grouped_df[grouped_df['item']==10488]

Unnamed: 0,user,item,count,title


### 모든 user에 대해서 10개씩 추천하기

In [94]:
users = unique_customers.repeat(10)
items = []
for i in range(len(unique_customers)):
    ids, scores = model.recommend(i, sparse_customer_item[i], N=10, filter_already_liked_items=True, filter_items=None, recalculate_user=False, items=None)
    for j in ids: items.append(item[j])

In [95]:
out = pd.DataFrame(zip(users, items),columns=['user','item'])
out.to_csv('out.csv',index=False)

## Evaluation

In [96]:
from implicit.evaluation import precision_at_k, train_test_split, ranking_metrics_at_k, ndcg_at_k
from implicit.als import AlternatingLeastSquares

In [97]:
sparse_customer_item = sparse.csr_matrix((grouped_df['count'].astype(float), (grouped_df['user'], grouped_df['item'])))

In [98]:
train, test = train_test_split(sparse_customer_item)

In [99]:
train, test = train_test_split(sparse_customer_item)

model = AlternatingLeastSquares(factors=128, regularization=20, iterations=1)
model.fit(train)

  0%|          | 0/1 [00:00<?, ?it/s]

In [100]:

p = ndcg_at_k(model, train.tocsr(), test.tocsr(), K=10, num_threads=4)

  0%|          | 0/6806 [00:00<?, ?it/s]

IndexError: index 6844 is out of bounds for axis 1 with size 6807