# TF-IDF & Cosine Similarity

In [1]:
# Silence every Python warning for the rest of the session; note that this
# also hides deprecation and numerical warnings raised by later cells.
import warnings
warnings.filterwarnings('ignore')

# I. TF-IDF Vectorization

In [2]:
import numpy as np
# Allow 200 characters per printed line and show 5 decimal places, so each
# TF-IDF row below prints on a single line.
np.set_printoptions(linewidth = 200, precision = 5)

> ## 1) 문장 3개 지정

In [3]:
# Toy corpus of three sentences; each string is treated as one document.
doc_list = ['if you take the blue pill, the story ends' ,
            'if you take the red pill, you stay in Wonderland',
            'if you take the red pill, I show you how deep the rabbit hole goes']

In [4]:
# Echo the corpus to confirm its contents.
doc_list

['if you take the blue pill, the story ends',
 'if you take the red pill, you stay in Wonderland',
 'if you take the red pill, I show you how deep the rabbit hole goes']

> ## 2) Tokenization & Vectorization

* TF-IDF

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the vocabulary and IDF weights from the corpus using default settings.
tfidf_TV = TfidfVectorizer()
# NOTE: fit() returns the vectorizer itself, so despite its name feature_vec is
# the fitted vectorizer (the same object as tfidf_TV), not a feature matrix.
feature_vec = tfidf_TV.fit(doc_list)

* 결과 확인

In [6]:
# Show the learned token -> column-index mapping.
print(feature_vec.vocabulary_)

{'if': 6, 'you': 17, 'take': 14, 'the': 15, 'blue': 0, 'pill': 8, 'story': 13, 'ends': 2, 'red': 10, 'stay': 12, 'in': 7, 'wonderland': 16, 'show': 11, 'how': 5, 'deep': 1, 'rabbit': 9, 'hole': 4, 'goes': 3}


> ## 3) 행렬 변환

* .toarray( )

In [7]:
# Transform the corpus with the fitted vectorizer and convert the result to a
# dense array (one row per document, one column per vocabulary term).
feature_vec_matrix = feature_vec.transform(doc_list).toarray()

* .shape

In [8]:
# Sanity check: (number of documents, vocabulary size).
feature_vec_matrix.shape

(3, 18)

* 결과 확인

In [9]:
# Print the whole TF-IDF matrix; it is small enough here to show in full.
print(feature_vec_matrix)

[[0.41556 0.      0.41556 0.      0.      0.      0.24544 0.      0.24544 0.      0.      0.      0.      0.41556 0.24544 0.49088 0.      0.24544]
 [0.      0.      0.      0.      0.      0.      0.23403 0.39624 0.23403 0.      0.30135 0.      0.39624 0.      0.23403 0.23403 0.39624 0.46806]
 [0.      0.30986 0.      0.30986 0.30986 0.30986 0.18301 0.      0.18301 0.30986 0.23565 0.30986 0.      0.      0.18301 0.36601 0.      0.36601]]


> ## 4) 개별 Feature Vector 추출

In [10]:
# Copy each of the three document rows out of the TF-IDF matrix as its own
# flat 1-D vector.
vec1, vec2, vec3 = (
    np.array(feature_vec_matrix[row]).reshape(-1,) for row in range(3)
)

# Display the vectors in document order.
for doc_vec in (vec1, vec2, vec3):
    print(doc_vec)

[0.41556 0.      0.41556 0.      0.      0.      0.24544 0.      0.24544 0.      0.      0.      0.      0.41556 0.24544 0.49088 0.      0.24544]
[0.      0.      0.      0.      0.      0.      0.23403 0.39624 0.23403 0.      0.30135 0.      0.39624 0.      0.23403 0.23403 0.39624 0.46806]
[0.      0.30986 0.      0.30986 0.30986 0.30986 0.18301 0.      0.18301 0.30986 0.23565 0.30986 0.      0.      0.18301 0.36601 0.      0.36601]


# II. cos_similarity( )

* 두 벡터의 크기와 상관없이, 상호 방향성이 얼마나 유사한지에 기반
 - 두 벡터의 사잇각을 계산하여 유사도 측정
<br>
<br>
* 두 벡터의 내적을 두 벡터 크기(L2 Norm)의 곱으로 나누어 정규화
 - dot_product: 두 벡터의 내적
 - l2_norm: 두 벡터 크기(L2 Norm)의 곱

In [11]:
import numpy as np

def cos_similarity(v1, v2):
    """Return the cosine similarity between two vectors.

    The similarity is the dot product of the two vectors divided by the
    product of their L2 norms, so it reflects only how closely the vectors
    point in the same direction, not how long they are.

    Args:
        v1: First vector. Any array-like that NumPy can convert (list,
            ndarray, eager tensor, ...). It is flattened to 1-D before use.
        v2: Second vector, with the same number of elements as ``v1``.

    Returns:
        The cosine similarity as a float. When either vector has zero length
        the angle is undefined; 0.0 is returned instead of ``nan``, which
        matches ``sklearn.metrics.pairwise.cosine_similarity``.

    Raises:
        ValueError: If the two vectors do not have the same number of elements.
    """
    a = np.asarray(v1, dtype=float).ravel()
    b = np.asarray(v2, dtype=float).ravel()

    dot_product = np.dot(a, b)
    # np.linalg.norm is vectorized; the built-in sum() used previously walked
    # the array one element at a time in Python.
    l2_norm = np.linalg.norm(a) * np.linalg.norm(b)

    # Guard the 0 / 0 case so a zero vector yields 0.0 rather than nan.
    if l2_norm == 0:
        return 0.0

    return float(dot_product / l2_norm)

> ## 1) 'vec1', 'vec2' 코사인 유사도

In [12]:
# Compare document 1 and document 2 with the hand-written helper.
similarity_simple = cos_similarity(vec1, vec2)

print('vec1, vec2 코사인 유사도: {0:.5f}'.format(similarity_simple))

vec1, vec2 코사인 유사도: 0.40208


> ## 2) 'vec1', 'vec3' 코사인 유사도

In [13]:
# Compare document 1 and document 3 with the hand-written helper.
similarity_simple = cos_similarity(vec1, vec3)

print('vec1, vec3 코사인 유사도: {0:.5f}'.format(similarity_simple))

vec1, vec3 코사인 유사도: 0.40425


> ## 3) 'vec2', 'vec3' 코사인 유사도

In [14]:
# Compare document 2 and document 3 with the hand-written helper.
similarity_simple = cos_similarity(vec2, vec3)

print('vec2, vec3 코사인 유사도: {0:.5f}'.format(similarity_simple))

vec2, vec3 코사인 유사도: 0.45647


# III. sklearn - cosine_similarity( )

In [15]:
from sklearn.metrics.pairwise import cosine_similarity

# Passing the same matrix for both arguments yields the full
# document-by-document similarity matrix in one call.
print(cosine_similarity(feature_vec_matrix, feature_vec_matrix))

[[1.      0.40208 0.40425]
 [0.40208 1.      0.45647]
 [0.40425 0.45647 1.     ]]


# IV. Topic Problem
- 단어매칭방식의 경우, 동일한 의미지만 서로 다른 단어로 구성된 문장들의 유사도를 알 수 없는 문제

> ## 1) 문장 지정

In [16]:
# Sentences 1 and 2 are meant to be close in meaning but share no words;
# sentences 1 and 3 share most of their words.
sent_list = ['I eat an apple',
             'Koo have fruit',
             'I sell an apple']

> ## 2) 벡터 변환

In [17]:
tfidf_vec = TfidfVectorizer()
# NOTE: this rebinds feature_vec, which earlier held the fitted vectorizer, to
# the transformed TF-IDF matrix for sent_list.
feature_vec = tfidf_vec.fit_transform(sent_list)

In [18]:
# Convert to a dense array for display; each row is one sentence.
print(feature_vec.toarray())

[[0.51786 0.51786 0.68092 0.      0.      0.      0.     ]
 [0.      0.      0.      0.57735 0.57735 0.57735 0.     ]
 [0.51786 0.51786 0.      0.      0.      0.      0.68092]]


> ## 3) 문장1 vs. 문장2

In [19]:
# Sentence 1 and sentence 2 have no vocabulary terms in common, so the
# word-matching similarity comes out as zero.
print(cosine_similarity(feature_vec[0], feature_vec[1]))

[[0.]]


> ## 4) 문장1 vs. 문장3

In [20]:
# Sentence 1 and sentence 3 overlap on two vocabulary terms, so they score as
# similar even though their verbs differ.
print(cosine_similarity(feature_vec[0], feature_vec[2]))

[[0.53635]]


# V. Word2Vec


* 벡터 공간상에 비슷한 위치에 있다면 유사도(비슷한 의미)가 높음



> ## 1) Load Pretrained Word2Vec

In [21]:
import tensorflow_hub as hub

# Load the pretrained Wiki-words-250 embedding module from TF Hub. It is
# referenced by URL, so network access is presumably needed on first load.
embed = hub.load('https://tfhub.dev/google/Wiki-words-250/2')

> ## 2) 'words' List 정의

In [22]:
# Content words taken from the three example sentences.
words = ['apple', 'eat', 'fruit', 'have', 'sell']

> ## 3) Word2Vec Embedding

In [23]:
# Embed all words in one call; embeddings[i] is used below as the vector for words[i].
embeddings = embed(words)

> ## 4) Cosine Similarity

In [24]:
# Score every unordered word pair (self-pairs included) with the hand-written
# cosine helper and print one line per pair.
n_words = len(words)
for i in range(n_words):
    for j in range(i, n_words):
        pair_score = cos_similarity(embeddings[i], embeddings[j])
        print("(", words[i], ",", words[j], ")", pair_score)

( apple , apple ) 0.9999999931909316
( apple , eat ) 0.489093062896653
( apple , fruit ) 0.7875376298521813
( apple , have ) 0.13348328970070875
( apple , sell ) 0.1062324041549029
( eat , eat ) 0.9999999971373392
( eat , fruit ) 0.5329400280904417
( eat , have ) 0.32322418518239215
( eat , sell ) 0.2691977909422276
( fruit , fruit ) 0.9999999953289063
( fruit , have ) 0.13598027740914803
( fruit , sell ) 0.11212407751092707
( have , have ) 0.9999999922515387
( have , sell ) 0.21071003257663778
( sell , sell ) 0.99999999637987


# 
# 
# 
# The End
# 
# 
# 