In [32]:
docs = [
    '여름엔 과일이 맛있다',
    '여름엔 수박이 맛있다',
    '여름엔 바다지',
    '여름엔 과일이 맛없다'
]

In [33]:
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer()
vect.fit(docs)
print(vect.vocabulary_)

{'여름엔': 5, '과일이': 0, '맛있다': 2, '수박이': 4, '바다지': 3, '맛없다': 1}


In [34]:
vect_arr = vect.transform(docs).toarray()
print(vect_arr)

[[1 0 1 0 0 1]
 [0 0 1 0 1 1]
 [0 0 0 1 0 1]
 [1 1 0 0 0 1]]


In [35]:
import numpy as np

doc1 = vect_arr[0]
doc1_square = np.square(doc1)
doc1_square_sum = np.sum(doc1_square)
doc1_square_sum_sqrt = np.sqrt(doc1_square_sum)

for val in vect_arr:
    val_square = np.square(val)
    val_square_sum = np.sum(val_square)
    result = np.sqrt(val_square_sum)
    
    denominator = doc1_square_sum_sqrt * result
    numerator = np.dot(doc1, val)
    cos_sim = numerator / denominator
    print(cos_sim)

1.0000000000000002
0.6666666666666667
0.40824829046386296
0.6666666666666667


> 중복 유무에 따른 결과를 비교하려 두 값을 확인했지만 결국은 같은 결과가 나타났다. 이로 인해 중복된 값이 결과에 영향을 미치지는 않는다는 것이다.

In [36]:
docs = [
    '여름엔 과일이 맛있다 여름엔 과일이 맛있다',
    '여름엔 수박이 맛있다',
    '여름엔 바다지',
    '여름엔 과일이 맛없다'
]

In [37]:
def norm(v):
    return np.sqrt(np.sum(np.square(v)))

def cos_similarity(v1, v2):
    return np.dot(v1, v2) / (norm(v1) * norm(v2))

>위의 식을 이용하여 간단하게 함수를 만들었다. 
* dot(a, b) : a와 b의 곱의 합
* square() : 제곱
* sum() : 합
* sqrt : 루트

In [38]:
from sklearn.metrics.pairwise import cosine_similarity

sim = cosine_similarity(vect_arr, vect_arr)
sim

array([[1.        , 0.66666667, 0.40824829, 0.66666667],
       [0.66666667, 1.        , 0.40824829, 0.33333333],
       [0.40824829, 0.40824829, 1.        , 0.40824829],
       [0.66666667, 0.33333333, 0.40824829, 1.        ]])

In [39]:
sim_sorted = sim.argsort()
sim_sorted

array([[2, 1, 3, 0],
       [3, 2, 0, 1],
       [0, 1, 3, 2],
       [1, 2, 0, 3]])

In [40]:
sim.argsort()[::-1]

array([[1, 2, 0, 3],
       [0, 1, 3, 2],
       [3, 2, 0, 1],
       [2, 1, 3, 0]])

In [41]:
sim_sorted = sim.argsort()[:, ::-1]
sim_sorted

array([[0, 3, 1, 2],
       [1, 0, 2, 3],
       [2, 3, 1, 0],
       [3, 0, 2, 1]])

# Quiz

1. tmdb_5000_movies.csv를 이용하여 코사인 유사도로 장르가 비슷한 영화 찾기
2. 비슷한 영화에서 검색한 영화를 제외하고 영화 찾기 (10개)
3. 비슷한 영화에서 평점이 높은 순으로 영화 찾기 (10개)

In [42]:
import pandas as pd

file = 'data/tmdb_5000_movies.csv'
df = pd.read_csv(file)
df.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [44]:
movie_features = ['id', 'title', 'genres','vote_average', 'vote_count', 'popularity', 
                    'keywords', 'overview']
df_movies = df[movie_features]
df_movies.head(2)

Unnamed: 0,id,title,genres,vote_average,vote_count,popularity,keywords,overview
0,19995,Avatar,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",7.2,11800,150.437577,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",6.9,4500,139.082615,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","Captain Barbossa, long believed to be dead, ha..."


In [45]:
df_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            4803 non-null   int64  
 1   title         4803 non-null   object 
 2   genres        4803 non-null   object 
 3   vote_average  4803 non-null   float64
 4   vote_count    4803 non-null   int64  
 5   popularity    4803 non-null   float64
 6   keywords      4803 non-null   object 
 7   overview      4800 non-null   object 
dtypes: float64(2), int64(2), object(4)
memory usage: 300.3+ KB


In [46]:
df_movies = df_movies.dropna()
df_movies.reset_index(drop=True, inplace=True)
df_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4800 entries, 0 to 4799
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            4800 non-null   int64  
 1   title         4800 non-null   object 
 2   genres        4800 non-null   object 
 3   vote_average  4800 non-null   float64
 4   vote_count    4800 non-null   int64  
 5   popularity    4800 non-null   float64
 6   keywords      4800 non-null   object 
 7   overview      4800 non-null   object 
dtypes: float64(2), int64(2), object(4)
memory usage: 300.1+ KB


In [47]:
# 코사인 유사도로 장르가 비슷한 영화 찾기

In [48]:
df_movies['genres'][0]

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [49]:
from ast import literal_eval
df_movies['genres'] = df_movies['genres'].apply(literal_eval)
df_movies['genres'][0]

[{'id': 28, 'name': 'Action'},
 {'id': 12, 'name': 'Adventure'},
 {'id': 14, 'name': 'Fantasy'},
 {'id': 878, 'name': 'Science Fiction'}]

In [50]:
df_movies['genres'] = df_movies['genres'].apply(lambda x: [genre['name'] for genre in x])
df_movies['genres'][0]

['Action', 'Adventure', 'Fantasy', 'Science Fiction']

In [51]:
df_movies['genres_str'] = df_movies['genres'].apply(lambda x: (' ').join(x))
df_movies['genres_str'][0]

'Action Adventure Fantasy Science Fiction'

In [52]:
vect = CountVectorizer(ngram_range=(1, 2))
vect_arr = vect.fit_transform(df_movies['genres_str']).toarray()
print(vect_arr.shape)
print(vect.vocabulary_)

(4800, 276)
{'action': 0, 'adventure': 16, 'fantasy': 124, 'science': 232, 'fiction': 138, 'action adventure': 1, 'adventure fantasy': 24, 'fantasy science': 135, 'science fiction': 233, 'fantasy action': 125, 'crime': 64, 'adventure crime': 20, 'drama': 90, 'thriller': 234, 'action crime': 4, 'crime drama': 68, 'drama thriller': 106, 'adventure science': 29, 'animation': 33, 'family': 109, 'animation family': 38, 'fantasy family': 130, 'action science': 12, 'adventure action': 17, 'action thriller': 13, 'thriller crime': 238, 'western': 265, 'adventure western': 32, 'adventure family': 23, 'family fantasy': 115, 'fiction action': 139, 'action fantasy': 7, 'comedy': 44, 'action comedy': 3, 'comedy science': 59, 'adventure drama': 22, 'drama action': 91, 'romance': 214, 'drama romance': 104, 'romance thriller': 228, 'thriller action': 235, 'fiction thriller': 150, 'adventure thriller': 30, 'fantasy adventure': 126, 'family adventure': 111, 'adventure comedy': 19, 'thriller science': 247

In [53]:
sim = cosine_similarity(vect_arr, vect_arr)
sim_sorted = sim.argsort()[:, ::-1]
sim_sorted[:10]

array([[  14,    0,  813, ..., 4736, 4737, 4739],
       [2390, 2343,   12, ...,   34,   42, 4443],
       [   2, 1542, 1740, ..., 4115, 4787, 4121],
       ...,
       [4041,    7,   47, ...,   42, 4736, 4789],
       [ 114,  341,    8, ..., 4797, 4798, 4778],
       [  19,   20,   38, ...,   34,   42, 4443]])

In [54]:
def genre_sim(title):
    idx = df_movies[df_movies['title'].str.lower() == title.lower()].index[0]
    sim_idx = sim_sorted[idx]
    return df_movies.iloc[sim_idx]

In [61]:
genre_sim('avatar')

Unnamed: 0,id,title,genres,vote_average,vote_count,popularity,keywords,overview,genres_str
14,49521,Man of Steel,"[Action, Adventure, Fantasy, Science Fiction]",6.5,6359,99.398009,"[{""id"": 83, ""name"": ""saving the world""}, {""id""...",A young boy learns that he has extraordinary p...,Action Adventure Fantasy Science Fiction
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]",7.2,11800,150.437577,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","In the 22nd century, a paraplegic Marine is di...",Action Adventure Fantasy Science Fiction
813,1924,Superman,"[Action, Adventure, Fantasy, Science Fiction]",6.9,1022,48.507081,"[{""id"": 83, ""name"": ""saving the world""}, {""id""...",Mild-mannered Clark Kent works as a reporter a...,Action Adventure Fantasy Science Fiction
870,8536,Superman II,"[Action, Adventure, Fantasy, Science Fiction]",6.5,629,30.515175,"[{""id"": 83, ""name"": ""saving the world""}, {""id""...",Three escaped criminals from the planet Krypto...,Action Adventure Fantasy Science Fiction
46,127585,X-Men: Days of Future Past,"[Action, Adventure, Fantasy, Science Fiction]",7.5,6032,118.078691,"[{""id"": 1228, ""name"": ""1970s""}, {""id"": 1852, ""...",The ultimate X-Men ensemble fights a war for t...,Action Adventure Fantasy Science Fiction
...,...,...,...,...,...,...,...,...,...
34,62211,Monsters University,"[Animation, Family]",7.0,3528,89.186492,"[{""id"": 1299, ""name"": ""monster""}, {""id"": 5984,...",A look at the relationship between Mike and Su...,Animation Family
42,10193,Toy Story 3,"[Animation, Family, Comedy]",7.6,4597,59.995418,"[{""id"": 1562, ""name"": ""hostage""}, {""id"": 3616,...","Woody, Buzz, and the rest of Andy's toys haven...",Animation Family Comedy
4736,90414,"I Love You, Don't Touch Me!","[Comedy, Romance]",6.0,1,0.020839,"[{""id"": 187056, ""name"": ""woman director""}]","The story of a 25 year old virgin girl, lookin...",Comedy Romance
4737,111794,20 Dates,"[Romance, Comedy]",3.7,3,0.408073,"[{""id"": 3979, ""name"": ""hidden camera""}, {""id"":...",Myles is divorced in L.A. He wants a love life...,Romance Comedy


In [63]:
# 장르 비슷한 영화 중 검색한 영화를 제외하고 영화 10개 찾기
def genre_sim10(title):
    movie_sorted = genre_sim(title)
    top11 = movie_sorted[:11]
    return top11[top11['title'] != title]['title']


In [64]:
genre_sim10('avatar')

14                                   Man of Steel
0                                          Avatar
813                                      Superman
870                                   Superman II
46                     X-Men: Days of Future Past
3493    Beastmaster 2: Through the Portal of Time
1296                                 Superman III
1652                         Dragonball Evolution
419                                        Jumper
420                   Hellboy II: The Golden Army
1191                               Small Soldiers
Name: title, dtype: object

In [65]:
# 장르 비슷한 영화 중 검색한 영화를 제외하고 평점 상위 10개 찾기

In [66]:
def topVoted_genre_sim10(title):
    movie_sorted = genre_sim(title)
    top11 = movie_sorted[:11]
    return top11[top11['title'] != title][['title', 'vote_average']].sort_values(by='vote_average', ascending=False)

In [67]:
topVoted_genre_sim10('avatar')

Unnamed: 0,title,vote_average
46,X-Men: Days of Future Past,7.5
0,Avatar,7.2
813,Superman,6.9
14,Man of Steel,6.5
870,Superman II,6.5
420,Hellboy II: The Golden Army,6.5
1191,Small Soldiers,6.2
419,Jumper,5.9
1296,Superman III,5.3
3493,Beastmaster 2: Through the Portal of Time,4.6
