# Source Code

In [19]:
import os
import pandas as pd
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares
import numpy as np

### Load Data

In [2]:
rating_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'ratings', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python', encoding = "ISO-8859-1")

orginal_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,ratings,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [3]:
movie_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python', encoding='ISO-8859-1')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


### Data Preprocessing

In [4]:
ratings = ratings[ratings['ratings']>=3]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


- ratings 가 3 미만인 데이터는 고려하지 않기 위해 제거합니다.

In [5]:
ratings.rename(columns={'ratings':'counts'}, inplace=True)

- ratings 를 시청 횟수로 간주하고, 컬럼명을 변경해주었습니다.

In [6]:
del ratings['timestamp']
del movies['genre']

- timestamp 열을 사용하지 않기 위해 삭제해주었습니다.
- genre 열을 사용하지 않기 위해 삭제해주었습니다.

In [7]:
data = pd.merge(ratings, movies, how='left', on='movie_id')
del data['movie_id']

data

Unnamed: 0,user_id,counts,title
0,1,5,One Flew Over the Cuckoo's Nest (1975)
1,1,3,James and the Giant Peach (1996)
2,1,3,My Fair Lady (1964)
3,1,4,Erin Brockovich (2000)
4,1,5,"Bug's Life, A (1998)"
...,...,...,...
836473,6040,3,Platoon (1986)
836474,6040,5,"Crying Game, The (1992)"
836475,6040,5,Welcome to the Dollhouse (1995)
836476,6040,4,Sophie's Choice (1982)


- movie_id 를 기준으로 두 데이터프레임을 left merge 해주었습니다.
- 이후, movie_id 컬럼을 제거해주었습니다.

### Analysis

- 유니크한 영화의 수

In [8]:
ratings['movie_id'].nunique()

3628

- 유니크한 사용자의 수

In [9]:
ratings['user_id'].nunique()

6039

- 가장 인기있는 영화 30 개

In [10]:
top_movie = ratings.groupby('movie_id')['user_id'].count()
top_movie.sort_values(ascending=False).head(30)

movie_id
2858    3211
260     2910
1196    2885
1210    2716
2028    2561
589     2509
593     2498
1198    2473
1270    2460
2571    2434
480     2413
2762    2385
608     2371
110     2314
1580    2297
527     2257
1197    2252
2396    2213
1617    2210
318     2194
858     2167
1265    2121
1097    2102
2997    2066
2716    2051
296     2030
356     2022
1240    2019
1       2000
457     1941
Name: user_id, dtype: int64

- 사용자별 시청한 영화의 수

In [11]:
user_count = ratings.groupby('user_id')['movie_id'].count()
user_count.describe()

count    6039.000000
mean      138.512668
std       156.241599
min         1.000000
25%        38.000000
50%        81.000000
75%       177.000000
max      1968.000000
Name: movie_id, dtype: float64

### Set User Preference

In [12]:
my_favorite = ['Jurassic Park (1993)', 'Sixth Sense, The (1999)',
               'Saving Private Ryan (1998)', 'Die Hard (1988)',
               'Mission: Impossible (1996)']

my_movies = pd.DataFrame({'user_id':['Lee']*5,
                          'counts':[5]*5,
                          'title': my_favorite})

data = data.append(my_movies, ignore_index=True)

data.tail(10)

Unnamed: 0,user_id,counts,title
836473,6040,3,Platoon (1986)
836474,6040,5,"Crying Game, The (1992)"
836475,6040,5,Welcome to the Dollhouse (1995)
836476,6040,4,Sophie's Choice (1982)
836477,6040,4,E.T. the Extra-Terrestrial (1982)
836478,Lee,5,Jurassic Park (1993)
836479,Lee,5,"Sixth Sense, The (1999)"
836480,Lee,5,Saving Private Ryan (1998)
836481,Lee,5,Die Hard (1988)
836482,Lee,5,Mission: Impossible (1996)


- 5 개 영화에 모두 5 점을 부여하였습니다.

In [13]:
user_id = data['user_id'].unique()

user_id_dict = {user_id:idx for idx, user_id in enumerate(user_id)}

data['user_id'] = data['user_id'].map(lambda x: user_id_dict[x])

- user_id 를 인덱싱하고, data 에 적용해줍니다.

In [14]:
movie_id = data['title'].unique()

movie_id_dict = {title:idx for idx, title in enumerate(movie_id)}

data['title'] = data['title'].map(lambda x: movie_id_dict[x])

- title 을 인덱싱하고, data 에 적용해줍니다.

In [15]:
# Test
data

Unnamed: 0,user_id,counts,title
0,0,5,0
1,0,3,1
2,0,3,2
3,0,4,3
4,0,5,4
...,...,...,...
836478,6039,5,107
836479,6039,5,38
836480,6039,5,48
836481,6039,5,194


### CSR Matrix

In [18]:
num_user = data['user_id'].nunique()
num_title = data['title'].nunique()

csr_data = csr_matrix((data.counts, (data.user_id, data.title)), shape=(num_user, num_title))

csr_data

<6040x3628 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Row format>

### Modeling & Training

In [20]:
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

In [21]:
als_model = AlternatingLeastSquares(factors=100, regularization=0.01, use_gpu=False, iterations=15, dtype=np.float32)

In [22]:
csr_data_transpose = csr_data.T
csr_data_transpose

<3628x6040 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Column format>

In [23]:
als_model.fit(csr_data_transpose)

  0%|          | 0/15 [00:00<?, ?it/s]

### Evaluate

- my_favorite
    - 'Jurassic Park (1993)' 
    - 'Sixth Sense, The (1999)',
    - 'Saving Private Ryan (1998)' 
    - 'Die Hard (1988)',
    - 'Mission: Impossible (1996)'

In [38]:
def dot_product(my_favorite):
    preferences = []
    lee_vector = als_model.user_factors[user_id_dict['Lee']]
    for movie in my_favorite:
        movie_vector = als_model.item_factors[movie_id_dict[movie]]
        preference = np.dot(lee_vector, movie_vector)
        preferences.append(preference)
        print(f"movie: {movie} --> preference: {preference}")
    
    return preferences

In [39]:
dot_product(my_favorite)

movie: Jurassic Park (1993) --> preference: 0.5575769543647766
movie: Sixth Sense, The (1999) --> preference: 0.5560243129730225
movie: Saving Private Ryan (1998) --> preference: 0.5561578869819641
movie: Die Hard (1988) --> preference: 0.36002877354621887
movie: Mission: Impossible (1996) --> preference: 0.45148298144340515


[0.55757695, 0.5560243, 0.5561579, 0.36002877, 0.45148298]

- my_favorite 에 대한 'Lee' 사용자의 선호도가 위와 같이 도출되었습니다.
<br>
<br>
- 모든 시청횟수를 5 로 주었지만, 약 0.5 의 선호도가 도출되고 있습니다.
<br>
<br>
- 모델 학습이 부족하다고 판단되어 iterations 와 factors 의 수를 늘려보도록 하겠습니다.

### Modeling & Training

In [40]:
als_model = AlternatingLeastSquares(factors=200, regularization=0.02, use_gpu=False, iterations=30, dtype=np.float32)

In [41]:
als_model.fit(csr_data_transpose)

  0%|          | 0/30 [00:00<?, ?it/s]

### Evaluate

In [42]:
dot_product(my_favorite)

movie: Jurassic Park (1993) --> preference: 0.7367327809333801
movie: Sixth Sense, The (1999) --> preference: 0.8288918137550354
movie: Saving Private Ryan (1998) --> preference: 0.8103535175323486
movie: Die Hard (1988) --> preference: 0.49985459446907043
movie: Mission: Impossible (1996) --> preference: 0.643250048160553


[0.7367328, 0.8288918, 0.8103535, 0.4998546, 0.64325005]

- 다이하드와 미션임파서블에 대한 선호도는 아직 낮지만, 다른 영화들의 선호도는 높게 도출되었습니다.

In [43]:
movie_list = ["My Fair Lady (1964)", "Bug's Life, A (1998)", "E.T. the Extra-Terrestrial (1982)"]

In [44]:
dot_product(movie_list)

movie: My Fair Lady (1964) --> preference: -0.011374170891940594
movie: Bug's Life, A (1998) --> preference: -0.08798056840896606
movie: E.T. the Extra-Terrestrial (1982) --> preference: -0.00992459524422884


[-0.011374171, -0.08798057, -0.009924595]

- 이번에는 my_favorite 과는 비교적 다른 장르의 영화에 대해 모델이 예측한 선호도를 확인해보았습니다.
<br>
<br>
- 로맨스, 애니메이션, 판타지와 같은 장르에 대해서는 음수값의 선호도가 도출되었습니다.

### Similar Movies

In [47]:
id_movie_dict = {idx:movie for movie, idx in movie_id_dict.items()}

In [48]:
def get_similar_movie(movie:str):
    movie_id = movie_id_dict[movie]
    similar_movie = als_model.similar_items(movie_id)
    similar_movie = [id_movie_dict[i[0]] for i in similar_movie]
    return similar_movie

In [49]:
get_similar_movie('Die Hard (1988)')

['Die Hard (1988)',
 'Terminator, The (1984)',
 'Indiana Jones and the Last Crusade (1989)',
 'Fugitive, The (1993)',
 'Die Hard 2 (1990)',
 'Predator (1987)',
 'Rocky (1976)',
 'Lethal Weapon (1987)',
 'Raiders of the Lost Ark (1981)',
 'Untouchables, The (1987)']

- 이번에는 영화 다이하드와 유사한 영화들을 추출해보았습니다.
<br>
<br>
- 터미네이터, 인디애나 존스, 도망자, 다이하드 2, 프레데터 등 액션 및 스릴러 영화들이 정확하게 추출되었습니다.

### Recommend movies to User

In [50]:
user = user_id_dict['Lee']

movies_recommend = als_model.recommend(user, csr_data, N=20, filter_already_liked_items=True)
movies_recommend

[(141, 0.44540688),
 (92, 0.2967397),
 (82, 0.28643933),
 (200, 0.264119),
 (87, 0.25722253),
 (172, 0.25443366),
 (124, 0.22357167),
 (121, 0.22002989),
 (648, 0.21641351),
 (120, 0.21604922),
 (175, 0.21193206),
 (131, 0.20669541),
 (746, 0.18848243),
 (75, 0.17130333),
 (23, 0.1687147),
 (220, 0.15545706),
 (233, 0.15454194),
 (116, 0.15327546),
 (248, 0.15081534),
 (15, 0.14645037)]

In [51]:
movies_recommend = [id_movie_dict[i[0]] for i in movies_recommend]
movies_recommend

['Fugitive, The (1993)',
 'Terminator 2: Judgment Day (1991)',
 'Lost World: Jurassic Park, The (1997)',
 'Terminator, The (1984)',
 'Braveheart (1995)',
 'Indiana Jones and the Last Crusade (1989)',
 'Matrix, The (1999)',
 'Silence of the Lambs, The (1991)',
 'Lethal Weapon (1987)',
 'Raiders of the Lost Ark (1981)',
 'Men in Black (1997)',
 'Rocky (1976)',
 'Perfect Storm, The (2000)',
 'Hunt for Red October, The (1990)',
 "Schindler's List (1993)",
 'Seven (Se7en) (1995)',
 'Usual Suspects, The (1995)',
 'Dances with Wolves (1990)',
 'Good Will Hunting (1997)',
 'Airplane! (1980)']

- 20 개의 영화들이 추천되었습니다.
<br>
<br>
- 각 영화들이 대체적으로 액션 및 스릴러 장르에 포함되고 있음을 알 수 있습니다.
<br>
<br>
- 하지만, 예측된 선호도는 0.2 에서 0.1 사이로 낮은 선호도를 보여주고 있습니다.
<br>
<br>
- 이는 모델이 과적합되어 발생한 현상으로 판단됩니다.
<br>
<br>
- 가장 밑에서 두 번째에 위치한 '굿 윌 헌팅' 에 대해 추천 기여도를 확인해보겠습니다.

In [55]:
goodWillHunting = movie_id_dict['Good Will Hunting (1997)']
explain = als_model.explain(user, csr_data, itemid=goodWillHunting)

In [56]:
explain = [(id_movie_dict[i[0]], i[1]) for i in explain[1]]
explain

[('Saving Private Ryan (1998)', 0.07366990212595911),
 ('Sixth Sense, The (1999)', 0.05650416401215226),
 ('Jurassic Park (1993)', 0.020327236886619228),
 ('Mission: Impossible (1996)', 0.011865145279231847),
 ('Die Hard (1988)', -0.012576364458786262)]

- 영화 '라이언 일병 구하기' 와 '식스 센스' 가 가장 높은 기여도를 보여주었습니다.