In [37]:
import pandas as pd
import numpy as np
import warnings; warnings.filterwarnings('ignore')

In [38]:
# Load the movie metadata CSV (path is relative to the working directory)
# and display its (rows, columns) shape.
mv = pd.read_csv("tmdb_5000_movies.csv")
mv.shape

(4803, 20)

### 전처리
* genres, keywords 칼럼 수정
* 영화별 장르, 키워드 추출

In [39]:
# Keep only the columns used in the rest of the notebook.
# .copy() makes `mv` an independent frame, so the column assignments in later
# cells (mv['genres'] = ...) do not write into a slice of the originally loaded
# frame (the situation pandas reports as SettingWithCopyWarning).
mv = mv[['id', 'title', 'genres', 'vote_average', 'vote_count', 'popularity', 'keywords', 'overview']].copy()
mv.shape

(4803, 8)

In [40]:
# genres / keywords contain list[dict1, dict2, ...] data, but after read_csv
# they are plain strings. Show the first row of both columns to confirm.
mv[['genres', 'keywords']][:1]

Unnamed: 0,genres,keywords
0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":..."


In [41]:
# Use ast.literal_eval() to turn the string representation into list[dict1, dict2, ...].
# Only str values are parsed: literal_eval raises ValueError when handed an
# already-parsed list, so this guard keeps the cell safe to re-run.
from ast import literal_eval

mv['genres'] = mv['genres'].apply(lambda x: literal_eval(x) if isinstance(x, str) else x)
mv['keywords'] = mv['keywords'].apply(lambda x: literal_eval(x) if isinstance(x, str) else x)

In [42]:
# First row of the two parsed columns (now lists of dicts rather than strings)
mv[['genres', 'keywords']].head(1)

Unnamed: 0,genres,keywords
0,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...","[{'id': 1463, 'name': 'culture clash'}, {'id':..."


In [43]:
# Keep only the value stored under the 'name' key of each dict.
# Items that are not dicts are passed through unchanged, so re-running this
# cell after the names were already extracted is a no-op rather than a
# TypeError from indexing a str with 'name'.
mv['genres'] = mv['genres'].apply(lambda x: [y['name'] if isinstance(y, dict) else y for y in x])
mv['keywords'] = mv['keywords'].apply(lambda x: [y['name'] if isinstance(y, dict) else y for y in x])

mv[['genres', 'keywords']][:1]

Unnamed: 0,genres,keywords
0,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon..."


In [44]:
# Preview the first five rows of the frame after the genres/keywords preprocessing
mv.head(5)

Unnamed: 0,id,title,genres,vote_average,vote_count,popularity,keywords,overview
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]",7.2,11800,150.437577,"[culture clash, future, space war, space colon...","In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]",6.9,4500,139.082615,"[ocean, drug abuse, exotic island, east india ...","Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,"[Action, Adventure, Crime]",6.3,4466,107.376788,"[spy, based on novel, secret agent, sequel, mi...",A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]",7.6,9106,112.31295,"[dc comics, crime fighter, terrorist, secret i...",Following the death of District Attorney Harve...
4,49529,John Carter,"[Action, Adventure, Science Fiction]",6.1,2124,43.926995,"[based on novel, mars, medallion, space travel...","John Carter is a war-weary, former military ca..."


### 장르 유사도 측정

* genres list -> str
* count 기반으로 피처벡터화 변환
* 코사인 유사도 사용하여 비교

In [45]:
# Flatten each movie's genre list into one space-separated string
# (an empty genre list becomes an empty string)
mv['str_genres'] = mv['genres'].map(' '.join)
mv['str_genres']

0       Action Adventure Fantasy Science Fiction
1                       Adventure Fantasy Action
2                         Action Adventure Crime
3                    Action Crime Drama Thriller
4               Action Adventure Science Fiction
                          ...                   
4798                       Action Crime Thriller
4799                              Comedy Romance
4800               Comedy Drama Romance TV Movie
4801                                            
4802                                 Documentary
Name: str_genres, Length: 4803, dtype: object

In [46]:
# Feature vectorization: turn the genre strings into a count matrix.
# ngram_range = (1, 2) counts single words and pairs of adjacent words.
from sklearn.feature_extraction.text import CountVectorizer

cnt_vec = CountVectorizer(min_df = 0.0, ngram_range = (1, 2))
genre_matrix = cnt_vec.fit_transform(mv['str_genres'])
genre_matrix.shape

(4803, 276)

In [47]:
# CountVectorizer example (source: https://taptorestart.tistory.com/entry/sklearn-textCountVectorizer%EC%97%90%EC%84%9C-ngramrange-%EC%9D%98%EB%AF%B8%EB%A5%BC-%ED%8C%8C%EC%95%85%ED%95%A0-%EC%88%98-%EC%9E%88%EB%8A%94-%EC%98%88%EC%A0%9C)
# min_df: minimum frequency
# ngram_range(min_n, max_n): how many consecutive words are combined into one token?
# The same toy corpus is vectorized twice to compare word-only features (cv1)
# with word + two-word features (cv2).

fruit = ['사과 딸기', '딸기 바나나', '수박', '수박 수박']
cv1 = CountVectorizer(min_df=0.0, ngram_range=(1,1))
cv2 = CountVectorizer(min_df=0.0, ngram_range=(1,2))
fv1 = cv1.fit_transform(fruit)
fv2 = cv2.fit_transform(fruit)

print(fv1.toarray())
print(fv2.toarray())

[[1 0 1 0]
 [1 1 0 0]
 [0 0 0 1]
 [0 0 0 2]]
[[1 0 0 1 1 0 0]
 [1 1 1 0 0 0 0]
 [0 0 0 0 0 1 0]
 [0 0 0 0 0 2 1]]


In [48]:
'''
ngram_range = (1,2)
            딸기   딸기 바나나   바나나   사과   사과 딸기   수박   수박 수박
사과 딸기     1         0          0      1         1       0        0
딸기 바나나   1         1          1      0         0       0        0
수박         0         0          0      0         0       1        0
수박 수박     0         0          0      0         0       2        1       
'''

'\nngram_range = (1,2)\n            딸기   딸기 바나나   바나나   사과   사과 딸기   수박   수박 수박\n사과 딸기     1         0          0      1         1       0        0\n딸기 바나나   1         1          1      0         0       0        0\n수박         0         0          0      0         0       1        0\n수박 수박     0         0          0      0         0       2        1       \n'

In [49]:
# Cosine similarity between every pair of rows of the genre count matrix
from sklearn.metrics.pairwise import cosine_similarity

# With the second argument omitted, the matrix is compared against itself
sim = cosine_similarity(genre_matrix)
sim

array([[1.        , 0.59628479, 0.4472136 , ..., 0.        , 0.        ,
        0.        ],
       [0.59628479, 1.        , 0.4       , ..., 0.        , 0.        ,
        0.        ],
       [0.4472136 , 0.4       , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [57]:
# For each row, the indices of the compared rows ordered from highest to lowest similarity:
# sort ascending along each row, then reverse the column order.
ind = np.fliplr(np.argsort(sim, axis=1))
ind[:1] # row 0 -> apart from itself, record 3494 has the highest similarity

array([[   0, 3494,  813, ..., 3038, 3037, 2401]], dtype=int64)

### 장르 콘텐츠 필터링을 이용한 영화 추천

* 영화 추천 함수 만들기

In [58]:
def find_movie(movie_df, idx, movie_title, top_n = 10):
    """Return the rows of ``movie_df`` ranked as most similar to ``movie_title``.

    Parameters
    ----------
    movie_df : pandas.DataFrame
        Data to pick recommendations from; must have a 'title' column.
    idx : 2-D integer array
        Similarity ranking: row ``i`` holds row positions of ``movie_df``
        ordered from most to least similar to row ``i`` (e.g. a reversed
        argsort of a similarity matrix).
    movie_title : str
        Title of the reference movie (exact match against 'title').
    top_n : int, default 10
        Maximum number of recommendations per matching row.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``movie_df`` for every row whose title equals
        ``movie_title``, in ranking order. The reference row itself is never
        included. An empty frame is returned when no title matches.
    """
    # Locate matches by row position, not index label: both `idx` and `iloc`
    # are positional, so labels are only correct for a default RangeIndex.
    title_pos = np.flatnonzero((movie_df['title'] == movie_title).to_numpy())

    sim_idxs = []
    for pos in title_pos:
        ranked = np.asarray(idx[pos])
        # Drop the reference movie wherever it sits in the ranking; with tied
        # scores it is not guaranteed to be the first entry.
        sim_idxs.extend(ranked[ranked != pos][:top_n])

    return movie_df.iloc[sim_idxs]

In [59]:
# Ask for 10 genre-based recommendations for 'The Godfather' and show
# each recommended title with its average vote.
movies = find_movie(mv, ind, 'The Godfather', 10)
movies[['title', 'vote_average']]

Unnamed: 0,title,vote_average
2731,The Godfather: Part II,8.3
1243,Mean Streets,7.2
3636,Light Sleeper,5.7
1946,The Bad Lieutenant: Port of Call - New Orleans,6.0
2640,Things to Do in Denver When You're Dead,6.7
4065,Mi America,0.0
1847,GoodFellas,8.2
4217,Kids,6.8
883,Catch Me If You Can,7.7
3866,City of God,8.1
