In [1]:
import numpy as np
from numpy import dot
from numpy.linalg import norm

def cos_sim(A, B):
    return dot(A, B)/(norm(A) * norm(B))

doc1 = np.array([0, 1, 1, 1])
doc2 = np.array([1, 0, 1, 1])
doc3 = np.array([2, 0, 2, 2])

print(f"Doc1 & Doc2 Similarity : {cos_sim(doc1, doc2)}")
print(f"Doc1 & Doc3 Similarity : {cos_sim(doc1, doc3)}")
print(f"Doc2 & Doc3 Similarity : {cos_sim(doc2, doc3)}")

Doc1 & Doc2 Similarity : 0.6666666666666667
Doc1 & Doc3 Similarity : 0.6666666666666667
Doc2 & Doc3 Similarity : 1.0000000000000002


## Recommendation System Implementation using Similarity

In [21]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

data = pd.read_csv("movies_metadata.csv", low_memory=False)
data.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [9]:
data = data.head(20000)
print(f"overview 열의 결측값의 수 : {data['overview'].isnull().sum()}")
data['overview'] = data['overview'].fillna('')
print(f"overview 열의 결측값의 수 : {data['overview'].isnull().sum()}")

overview 열의 결측값의 수 : 135
overview 열의 결측값의 수 : 0


In [22]:
data['overview'][0]

"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences."

In [10]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(data['overview'])
print(f"TF-IDF's Matrix Size : {tfidf_matrix.shape}")

TF-IDF's Matrix Size : (20000, 47487)


In [18]:
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
print(f"Cosine Similarity Result : {cosine_sim.shape}")
print(cosine_sim)

Cosine Similarity Result : (20000, 20000)
[[1.         0.01575748 0.         ... 0.         0.         0.        ]
 [0.01575748 1.         0.04907345 ... 0.         0.         0.        ]
 [0.         0.04907345 1.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 1.         0.         0.08375766]
 [0.         0.         0.         ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.08375766 0.         1.        ]]


In [13]:
title_to_index = dict(zip(data['title'], data.index))

idx = title_to_index['Father of the Bride Part II']
print(idx)

4


In [14]:
def get_recommendations(title, cosine_sim=cosine_sim):
    idx = title_to_index[title]

    sim_scores = list(enumerate(cosine_sim[idx]))

    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    movie_indices = [idx[0] for idx in sim_scores]

    return data['title'].iloc[movie_indices]

In [15]:
get_recommendations('The Dark Knight Rises')

18252                                The Dark Knight Rises
12481                                      The Dark Knight
150                                         Batman Forever
1328                                        Batman Returns
15511                           Batman: Under the Red Hood
                               ...                        
19992                          How to Make Love to a Woman
19994                               Violeta Went to Heaven
19996                                           Versailles
19998    Lotte Reiniger: Homage to the Inventor of the ...
19999    RKO Production 601: The Making of 'Kong, the E...
Name: title, Length: 20000, dtype: object

### Euclide Distance -> 간단하게 두 점 사이의 거리를 구하는 공식에서 다차원

In [23]:
import numpy as np

def dist(x, y):
    return np.sqrt(np.sum((x-y)**2))

doc1 = np.array((2, 3, 0, 1))
doc2 = np.array((1, 2, 3, 1))
doc3 = np.array((2, 1, 2, 2))
docQ = np.array((1, 1, 0, 1))

print(f'Doc1 & DocQ Distance {dist(doc1, docQ)}')
print(f'Doc2 & DocQ Distance {dist(doc2, docQ)}')
print(f'Doc3 & DocQ Distance {dist(doc3, docQ)}')

Doc1 & DocQ Distance 2.23606797749979
Doc2 & DocQ Distance 3.1622776601683795
Doc3 & DocQ Distance 2.449489742783178


### Jaccard Similarity -> 합집합에서의 교집합의 비율

In [24]:
doc1 = "apple banana everyone like likey watch card holder"
doc2 = "apple banana coupon passport love you"

tokenized_doc1 = doc1.split()
tokenized_doc2 = doc2.split()

print(f"Doc1 : {tokenized_doc1}")
print(f"Doc2 : {tokenized_doc2}")

Doc1 : ['apple', 'banana', 'everyone', 'like', 'likey', 'watch', 'card', 'holder']
Doc2 : ['apple', 'banana', 'coupon', 'passport', 'love', 'you']


In [26]:
union = set(tokenized_doc1).union(set(tokenized_doc2))
print(f"Union of Doc1 & Doc2 : {union}")

intersection = set(tokenized_doc1).intersection(set(tokenized_doc2))
print(f'Intersection of Doc1 & Doc2 : {intersection}')

Union of Doc1 & Doc2 : {'holder', 'passport', 'likey', 'coupon', 'apple', 'banana', 'watch', 'everyone', 'love', 'card', 'you', 'like'}
Intersection of Doc1 & Doc2 : {'apple', 'banana'}


In [27]:
print(f"Jaccard Similarity : {len(intersection)/len(union)}")

Jaccard Similarity : 0.16666666666666666
