# Content-based Filtering

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the three CSV tables (paths are relative to the working directory).
movies = pd.read_csv('movies.csv')
links = pd.read_csv('links.csv')
ratings = pd.read_csv('ratings.csv')

In [None]:
# Preview the first rows of the movies table (movieId, title, genres).
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
# Preview the first rows of the links table (movieId alongside imdbId and tmdbId).
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [None]:
# Preview the first rows of the ratings table (userId, movieId, rating, timestamp).
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [None]:
# Inspect the raw genres column: each value is a single '|'-delimited string of genre labels.
movies['genres']

0       Adventure|Animation|Children|Comedy|Fantasy
1                        Adventure|Children|Fantasy
2                                    Comedy|Romance
3                              Comedy|Drama|Romance
4                                            Comedy
                           ...                     
9737                Action|Animation|Comedy|Fantasy
9738                       Animation|Comedy|Fantasy
9739                                          Drama
9740                               Action|Animation
9741                                         Comedy
Name: genres, Length: 9742, dtype: object

In [None]:
# Show the rows whose genres value is present. The result is only displayed;
# `movies` itself is not modified.
movies.loc[movies['genres'].notna()]

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [None]:
# (rows, columns) of the movies table.
movies.shape

(9742, 3)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Treat every whitespace-delimited genre label as one token. The default
# token_pattern splits on punctuation, so "Sci-Fi" and "Film-Noir" would each
# become two features and weigh double in the similarity computation.
vect = CountVectorizer(token_pattern=r'\S+')

In [None]:
# Break each '|'-delimited genre string into a list of labels. The labels are
# re-joined with spaces further down, because the count vectorizer needs
# whitespace-separated text.
movies_genres = movies['genres'].str.split('|')
movies_genres


0       [Adventure, Animation, Children, Comedy, Fantasy]
1                          [Adventure, Children, Fantasy]
2                                       [Comedy, Romance]
3                                [Comedy, Drama, Romance]
4                                                [Comedy]
                              ...                        
9737                 [Action, Animation, Comedy, Fantasy]
9738                         [Animation, Comedy, Fantasy]
9739                                              [Drama]
9740                                  [Action, Animation]
9741                                             [Comedy]
Name: genres, Length: 9742, dtype: object

In [None]:
# Overwrite the genres column with the list-of-labels version, then display the frame.
movies['genres'] = movies_genres
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),[Comedy]
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),"[Action, Animation, Comedy, Fantasy]"
9738,193583,No Game No Life: Zero (2017),"[Animation, Comedy, Fantasy]"
9739,193585,Flint (2017),[Drama]
9740,193587,Bungo Stray Dogs: Dead Apple (2018),"[Action, Animation]"


In [None]:
# Join each movie's genre list into one space-separated string for the vectorizer.
# The value is built from `movies_genres` rather than from movies['genres'] itself
# so that re-running this cell gives the same result; joining an already-joined
# string would insert a space between every character.
movies['genres'] = movies_genres.str.join(' ')
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy Romance
3,4,Waiting to Exhale (1995),Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action Animation Comedy Fantasy
9738,193583,No Game No Life: Zero (2017),Animation Comedy Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action Animation


In [None]:
# Learn the token vocabulary from the genre strings and encode every movie
# as a row of token counts.
count_vect = vect.fit_transform(movies['genres'])

In [None]:
# Display the sparse count matrix summary (shape, dtype, number of stored elements).
count_vect

<9742x24 sparse matrix of type '<class 'numpy.int64'>'
	with 23219 stored elements in Compressed Sparse Row format>

In [None]:
# Mapping from each learned token to its column index in the count matrix.
vect.vocabulary_

{'adventure': 1,
 'animation': 2,
 'children': 3,
 'comedy': 4,
 'fantasy': 8,
 'romance': 19,
 'drama': 7,
 'action': 0,
 'crime': 5,
 'thriller': 21,
 'horror': 12,
 'mystery': 16,
 'sci': 20,
 'fi': 9,
 'war': 22,
 'musical': 15,
 'documentary': 6,
 'imax': 13,
 'western': 23,
 'film': 10,
 'noir': 18,
 'no': 17,
 'genres': 11,
 'listed': 14}

In [None]:
# Dense DataFrame of token counts, one column per vocabulary token.
# vocabulary_ maps token -> column index, so ordering the tokens by that index
# gives labels that line up with the matrix columns without relying on the
# indices happening to follow alphabetical order.
count_vect_df = pd.DataFrame(
    count_vect.toarray(),
    columns=sorted(vect.vocabulary_, key=vect.vocabulary_.get),
)
count_vect_df.head()

Unnamed: 0,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,fi,...,listed,musical,mystery,no,noir,romance,sci,thriller,war,western
0,0,1,1,1,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between every pair of movies' genre-count vectors.
genres_sim = cosine_similarity(count_vect_df, count_vect_df)
genres_sim[:5]

array([[1.        , 0.77459667, 0.31622777, ..., 0.        , 0.31622777,
        0.4472136 ],
       [0.77459667, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.31622777, 0.        , 1.        , ..., 0.        , 0.        ,
        0.70710678],
       [0.25819889, 0.        , 0.81649658, ..., 0.57735027, 0.        ,
        0.57735027],
       [0.4472136 , 0.        , 0.70710678, ..., 0.        , 0.        ,
        1.        ]])

In [None]:
# Check the dimensions of the similarity matrix (one row and one column per movie).
genres_sim.shape

(9742, 9742)

In [None]:
# For each movie (row), the positions of all movies ordered from most to least
# similar: argsort sorts ascending, so every row is reversed afterwards.
genre_sim_desc = np.argsort(genres_sim, axis=1)[:, ::-1]
genre_sim_desc[:5]

array([[   0, 6948, 1706, ..., 4562, 4563, 4870],
       [ 767, 6655, 7478, ..., 6042, 6044, 4870],
       [4390, 4352, 8622, ..., 6889, 6888, 4870],
       [7075, 2541, 4704, ..., 4827, 2278, 8784],
       [9741, 8753, 8755, ..., 4053, 4054, 4870]])

In [None]:
# Find the movies ranked most similar to a given title.

def find_sim_movies(df, sim_df, title, top_n=10):
    """Return the rows of ``df`` ranked most similar to the movie called ``title``.

    Parameters
    ----------
    df : pandas.DataFrame
        Movie table with a ``'title'`` column. Its row order must match the
        row order of ``sim_df``.
    sim_df : numpy.ndarray
        2-D integer array; row ``i`` holds row positions of ``df`` ordered
        from most to least similar to row ``i``.
    title : str
        Exact title to look up.
    top_n : int, default 10
        Number of positions taken from the start of each matching row.

    Returns
    -------
    pandas.DataFrame
        The selected rows of ``df`` in ranked order. The queried movie can be
        among them. Empty when no row has that title; if several rows share
        the title, their results are concatenated.
    """
    # Row positions (not index labels) of the matching titles. sim_df and
    # df.iloc are both positional, so index labels would only work by
    # coincidence when df carries a default 0..n-1 index.
    title_positions = np.flatnonzero((df['title'] == title).to_numpy())

    # Indexing with an array keeps the result 2-D: (matches, top_n).
    similar_indexes = sim_df[title_positions, :top_n]

    # Print the selected top_n positions, then flatten them to 1-D so they
    # can be used to index the DataFrame.
    print(similar_indexes)
    similar_indexes = similar_indexes.reshape(-1)

    return df.iloc[similar_indexes]
        

In [None]:
# Look up the 10 top-ranked movies for Toy Story; the queried title itself can appear in the result.
find_sim_movies(movies,genre_sim_desc,"Toy Story (1995)",10)

[[   0 6948 1706 8927 2809 8219 6486 3568 7760 2355]]


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy
6948,65577,"Tale of Despereaux, The (2008)",Adventure Animation Children Comedy Fantasy
1706,2294,Antz (1998),Adventure Animation Children Comedy Fantasy
8927,136016,The Good Dinosaur (2015),Adventure Animation Children Comedy Fantasy
2809,3754,"Adventures of Rocky and Bullwinkle, The (2000)",Adventure Animation Children Comedy Fantasy
8219,103755,Turbo (2013),Adventure Animation Children Comedy Fantasy
6486,53121,Shrek the Third (2007),Adventure Animation Children Comedy Fantasy
3568,4886,"Monsters, Inc. (2001)",Adventure Animation Children Comedy Fantasy
7760,91355,Asterix and the Vikings (Astérix et les Viking...,Adventure Animation Children Comedy Fantasy
2355,3114,Toy Story 2 (1999),Adventure Animation Children Comedy Fantasy
