# 단어의 임베딩
- 빈도수 계산: 빈도기반 - TF 상대빈도
- TDM: matrix TF를 행렬로 만든 것, 사전을 이용한 단순빈도
- TF-IDF: TF*IDF 다른 문서에서 나온 빈도수와 확인 -> 문서 안에서 중요한 단어인지?
- IDF: 역문서빈도


In [1]:
text = "John likes to watch movies. Mary likes movies too.\
Mary also likes to watch football games."
words = text.replace('.','').split()
print(words)

['John', 'likes', 'to', 'watch', 'movies', 'Mary', 'likes', 'movies', 'tooMary', 'also', 'likes', 'to', 'watch', 'football', 'games']


In [2]:
import numpy as np
word_count = np.unique(words, return_counts=True)
word_count

(array(['John', 'Mary', 'also', 'football', 'games', 'likes', 'movies',
        'to', 'tooMary', 'watch'], dtype='<U8'),
 array([1, 1, 1, 1, 1, 3, 2, 2, 1, 2]))

In [3]:
# 딕셔너리 TF 생성
word_to_cnt = {}
for word, cnt in zip(*word_count):
    word_to_cnt[word] = cnt

word_to_cnt

{'John': 1,
 'Mary': 1,
 'also': 1,
 'football': 1,
 'games': 1,
 'likes': 3,
 'movies': 2,
 'to': 2,
 'tooMary': 1,
 'watch': 2}

In [4]:
word_to_cnt['movies'] # movies 빈도수

2

In [5]:
!pip install scikit-learn pandas



In [6]:
corpus = [
    "John likes to watch movies. Mary likes movies too.",
    "Mary also likes to watch football games."
]

# TDM: 문장 내에 나타난 빈도수 확인

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
vector = CountVectorizer()
dtm_array = vector.fit_transform(corpus).toarray()
dtm_array

array([[0, 0, 0, 1, 2, 1, 2, 1, 1, 1],
       [1, 1, 1, 0, 1, 1, 0, 1, 0, 1]])

In [8]:
tf_dic = vector.vocabulary_ # 단어 사전 만들기(문서 전체)
print(tf_dic)

{'john': 3, 'likes': 4, 'to': 7, 'watch': 9, 'movies': 6, 'mary': 5, 'too': 8, 'also': 0, 'football': 1, 'games': 2}


In [9]:
import pandas as pd
tf_dic_sorted = dict(sorted(tf_dic.items(), key=lambda item:item[1]))
# tf_dic_sorted
df = pd.DataFrame(dtm_array, columns=tf_dic_sorted.keys())
df # key에 따라 달라지는 행렬->tdm

Unnamed: 0,also,football,games,john,likes,mary,movies,to,too,watch
0,0,0,0,1,2,1,2,1,1,1
1,1,1,1,0,1,1,0,1,0,1


# TF-IDF

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vec = TfidfVectorizer()
tfidf_array = tfidf_vec.fit_transform(corpus).toarray()
tfidf_dic = tfidf_vec.vocabulary_
tfidf_dic_sorted = dict(sorted(tfidf_dic.items(),
                               key=lambda item: item[1]))
tfidf_dtm = pd.DataFrame(tfidf_array, columns=tfidf_dic_sorted.keys())
tfidf_dtm

Unnamed: 0,also,football,games,john,likes,mary,movies,to,too,watch
0,0.0,0.0,0.0,0.323699,0.460629,0.230315,0.647398,0.230315,0.323699,0.230315
1,0.446101,0.446101,0.446101,0.0,0.317404,0.317404,0.0,0.317404,0.0,0.317404


In [20]:
!pip install --upgrade gensim

Collecting gensim
  Downloading gensim-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.2 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
Collecting smart-open>=1.8.1 (from gensim)
  Downloading smart_open-7.3.0.post1-py3-none-any.whl.metadata (24 kB)
Downloading gensim-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.5/26.5 MB[0m [31m10.6 MB/s[0m  [33m0:00:02[0m eta [36m0:00:01[0m
[?25hDownloading numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.2/18.2 MB[0m [31m2.1 MB/s[0m  [33m0:00:08[0mm0:00:01[0m00:01[0m


In [11]:
corpust = [
    "John likes to watch movies. Mary likes movies too.",
    "Mary also likes to watch football games."   
]
word_list = []
for word in corpus:
    word_list.append(word.replace('.', '').split())

In [None]:
from gensim.models import Word2Vec
model = Word2Vec(word_list, sg=0, vector_size=100,window=3, min_count=1)
print(model.wv.most_similar('likes'))
print(model.wv.similarity('movies','games'))

[('John', 0.21617145836353302), ('also', 0.09291718155145645), ('too', 0.027057474479079247), ('football', 0.01613466814160347), ('Mary', -0.01084057241678238), ('to', -0.02775036357343197), ('movies', -0.052346743643283844), ('games', -0.05987629294395447), ('watch', -0.111670583486557)]
0.0640898


In [19]:
model = Word2Vec(word_list, sg=0, vector_size=100,window=3, min_count=1)
model.wv.most_similar('games')

[('to', 0.13887982070446014),
 ('watch', 0.13149002194404602),
 ('movies', 0.06408978253602982),
 ('too', 0.06059185042977333),
 ('football', 0.019152285531163216),
 ('Mary', 0.009398205205798149),
 ('also', -0.05774582177400589),
 ('likes', -0.05987628549337387),
 ('John', -0.10513809323310852)]