TF-IDF
1. tfidf(t,d,D) = tf(t,d) * idf(t,D)
2. 단어 빈도 - 역문서 빈도
3. TDM 내 각 단어의 중요성을 가중치로 표현
4. tf(t,d) : 특정 문서 d에서 특정 단어 t의 등장 횟수
5. df(t) : 특정 단어 t가 등장한 문서의 수
6. idf(t,D) : df(t)의 역수에 로그를 취한 값, 즉 log(N / df(t)) (N은 전체 문서 수)

In [1]:
#TF-IDF 계산 절차
# 토큰 Index 생성 -> TF계산 -> IDF계산 -> TF-IDF계산

In [2]:
d1 = 'The cat sat on my face I hate a cat'
d2 = 'The dog sat on my bed I love a dog'

In [3]:
import numpy as np

In [4]:
def tf(t, d):
    """Return the term frequency of ``t`` in document ``d``.

    The value is ``d.count(t)`` divided by ``len(d)``, so ``d`` may be any
    sequence that supports both (e.g. a list of tokens).

    Raises:
        ZeroDivisionError: if ``d`` is empty.
    """
    occurrences = d.count(t)
    size = len(d)
    return occurrences / size

def idf(t, D):
    """Return the inverse document frequency of ``t`` over the corpus ``D``.

    Computed as ``log(N / n)``, where ``N`` is the number of documents in
    ``D`` and ``n`` is the number of documents ``d`` for which ``t in d``
    holds.

    Raises:
        ZeroDivisionError: if no document contains ``t``.
    """
    total_docs = len(D)
    docs_with_term = sum(1 for doc in D if t in doc)
    return np.log(total_docs / docs_with_term)

def tfidf(t, d, D):
    """Return the TF-IDF weight of ``t``: ``tf(t, d) * idf(t, D)``."""
    term_frequency = tf(t, d)
    inverse_doc_frequency = idf(t, D)
    return term_frequency * inverse_doc_frequency

def tokenizer(d):
    """Split the document ``d`` into tokens on runs of whitespace."""
    tokens = d.split()
    return tokens

def tfidf_scorer(D):
    """Score every token of every document in ``D`` with TF-IDF.

    Each document is tokenized with ``tokenizer``; the tokenized corpus is
    then used for both the term and the document frequencies.

    Returns:
        One list per document, holding a ``(token, score)`` pair for each
        token occurrence in its original order (repeated tokens repeat).
    """
    tokenized = [tokenizer(raw) for raw in D]
    return [
        [(term, tfidf(term, doc, tokenized)) for term in doc]
        for doc in tokenized
    ]

In [5]:
tfidf_scorer([d1,d2])

[[('The', 0.0),
  ('cat', 0.13862943611198905),
  ('sat', 0.0),
  ('on', 0.0),
  ('my', 0.0),
  ('face', 0.06931471805599453),
  ('I', 0.0),
  ('hate', 0.06931471805599453),
  ('a', 0.0),
  ('cat', 0.13862943611198905)],
 [('The', 0.0),
  ('dog', 0.13862943611198905),
  ('sat', 0.0),
  ('on', 0.0),
  ('my', 0.0),
  ('bed', 0.06931471805599453),
  ('I', 0.0),
  ('love', 0.06931471805599453),
  ('a', 0.0),
  ('dog', 0.13862943611198905)]]

In [6]:
#sklearn 활용
from sklearn.feature_extraction.text import CountVectorizer

docs = [d1, d2]
count_vect = CountVectorizer()
countv = count_vect.fit_transform(docs)

In [7]:
print(countv.toarray())
print(count_vect.vocabulary_)

[[0 2 0 1 1 0 1 1 1 1]
 [1 0 2 0 0 1 1 1 1 1]]
{'the': 9, 'cat': 1, 'sat': 8, 'on': 7, 'my': 6, 'face': 3, 'hate': 4, 'dog': 2, 'bed': 0, 'love': 5}


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer()
tfidfv = tfidf_vect.fit_transform(docs)
print(tfidfv.toarray())
print(tfidf_vect.vocabulary_)

[[0.         0.70600557 0.         0.35300279 0.35300279 0.
  0.25116439 0.25116439 0.25116439 0.25116439]
 [0.35300279 0.         0.70600557 0.         0.         0.35300279
  0.25116439 0.25116439 0.25116439 0.25116439]]
{'the': 9, 'cat': 1, 'sat': 8, 'on': 7, 'my': 6, 'face': 3, 'hate': 4, 'dog': 2, 'bed': 0, 'love': 5}


In [9]:
# gensim을 활용한 tfidf 계산
from gensim.models import TfidfModel
from gensim import corpora

# Tokenize each document on whitespace, then map tokens to integer ids and
# convert every document to its (token_id, count) bag-of-words form.
doc_ls = [d.split() for d in docs]
id2word = corpora.Dictionary(doc_ls)
bow = [id2word.doc2bow(d) for d in doc_ls]

# Keep the model under its own name: binding it to `tfidf` would replace the
# tfidf(t, d, D) function defined in an earlier cell and break tfidf_scorer()
# if that cell is run again.
tfidf_model = TfidfModel(bow)
tfidf_model[bow[0]]

[(3, 0.8164965809277261), (4, 0.4082482904638631), (5, 0.4082482904638631)]

In [10]:
id2word[3]

'cat'

BoW(Bag of Words) : 문서 내 단어 출현 순서는 무시, 빈도수 기반으로 문서 표현하는 방법

In [11]:
#BoW 구현
docs = ['오늘 동물원에서 원숭이를 봤어',
       '오늘 동물원에서 코끼리를 봤어 봤어',
       '동물원에서 원숭이에게 바나나를 줬어 바나나를']

In [12]:
# 1. 띄어쓰기 단위로 토큰화
doc_ls = [d.split() for d in docs]
doc_ls

[['오늘', '동물원에서', '원숭이를', '봤어'],
 ['오늘', '동물원에서', '코끼리를', '봤어', '봤어'],
 ['동물원에서', '원숭이에게', '바나나를', '줬어', '바나나를']]

In [13]:
# 2. 각 고유 토큰에 Index 지정
from collections import defaultdict

# The default factory returns the current size of the mapping, so the first
# lookup of an unseen token assigns it the next free integer id.
word2id = defaultdict(lambda: len(word2id))
for d in doc_ls:
    for t in d:
        # The lookup itself registers the token; the returned id is not needed.
        _ = word2id[t]
word2id

defaultdict(<function __main__.<lambda>()>,
            {'오늘': 0,
             '동물원에서': 1,
             '원숭이를': 2,
             '봤어': 3,
             '코끼리를': 4,
             '원숭이에게': 5,
             '바나나를': 6,
             '줬어': 7})

In [14]:
# 3. BoW 생성
import numpy as np

# Build one count vector per document; position k holds the number of times
# the token with id k occurs in that document.
bow_ls = []
for d in doc_ls:
    bow = np.zeros(len(word2id), dtype=int)
    for t in d:
        bow[word2id[t]] += 1
    # Store plain Python lists rather than numpy arrays.
    bow_ls.append(bow.tolist())

In [15]:
bow_ls

[[1, 1, 1, 1, 0, 0, 0, 0], [1, 1, 0, 2, 1, 0, 0, 0], [0, 1, 0, 0, 0, 1, 2, 1]]

In [16]:
# sklearn활용 BoW
docs = ['오늘 동물원에서 원숭이를 봤어',
       '오늘 동물원에서 코끼리를 봤어 봤어',
       '동물원에서 원숭이에게 바나나를 줬어 바나나를']

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()
BoW = count_vect.fit_transform(docs)

In [18]:
BoW.toarray()

array([[1, 0, 1, 1, 1, 0, 0, 0],
       [1, 0, 2, 1, 0, 0, 0, 1],
       [1, 2, 0, 0, 0, 1, 1, 0]])

In [19]:
count_vect.vocabulary_

{'오늘': 3,
 '동물원에서': 0,
 '원숭이를': 4,
 '봤어': 2,
 '코끼리를': 7,
 '원숭이에게': 5,
 '바나나를': 1,
 '줬어': 6}

In [20]:
# gensim 활용 BoW 구현
docs = ['오늘 동물원에서 원숭이를 봤어',
       '오늘 동물원에서 코끼리를 봤어 봤어',
       '동물원에서 원숭이에게 바나나를 줬어 바나나를']

In [21]:
import gensim
from gensim import corpora

doc_ls = [d.split() for d in docs]
id2word = corpora.Dictionary(doc_ls)
bow = [id2word.doc2bow(d) for d in doc_ls]

In [22]:
bow[0]

[(0, 1), (1, 1), (2, 1), (3, 1)]

In [23]:
# TDM 구현
docs = ['오늘 동물원에서 원숭이를 봤어',
       '오늘 동물원에서 코끼리를 봤어 봤어',
       '동물원에서 원숭이에게 바나나를 줬어 바나나를']

In [24]:
# 1. 띄어쓰기 단위로 토큰화
doc_ls = [d.split() for d in docs]
doc_ls

[['오늘', '동물원에서', '원숭이를', '봤어'],
 ['오늘', '동물원에서', '코끼리를', '봤어', '봤어'],
 ['동물원에서', '원숭이에게', '바나나를', '줬어', '바나나를']]

In [25]:
# 2. 각 고유 토큰에 Index 지정
from collections import defaultdict

# Unseen tokens receive the next free integer id: the default factory
# evaluates to the size of the mapping at the moment of the first lookup.
word2id = defaultdict(lambda: len(word2id))
for d in doc_ls:
    for t in d:
        # Looking the token up is what inserts it; the id is discarded here.
        _ = word2id[t]
word2id

defaultdict(<function __main__.<lambda>()>,
            {'오늘': 0,
             '동물원에서': 1,
             '원숭이를': 2,
             '봤어': 3,
             '코끼리를': 4,
             '원숭이에게': 5,
             '바나나를': 6,
             '줬어': 7})

In [26]:
# 3. TDM 생성
import numpy as np

# Term-document matrix: one row per token id, one column per document.
n_terms, n_docs = len(word2id), len(doc_ls)
TDM = np.zeros((n_terms, n_docs), dtype=int)
for doc_idx, tokens in enumerate(doc_ls):
    for token in tokens:
        # Count each occurrence of the token in this document's column.
        TDM[word2id[token], doc_idx] += 1

In [27]:
TDM

array([[1, 1, 0],
       [1, 1, 1],
       [1, 0, 0],
       [1, 2, 0],
       [0, 1, 0],
       [0, 0, 1],
       [0, 0, 2],
       [0, 0, 1]])

In [28]:
# sklearn 활용 DTM 구현 (CountVectorizer는 문서 x 단어 행렬을 반환)
docs = ['오늘 동물원에서 원숭이를 봤어',
       '오늘 동물원에서 코끼리를 봤어 봤어',
       '동물원에서 원숭이에게 바나나를 줬어 바나나를']

In [29]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()
DTM = count_vect.fit_transform(docs)

In [30]:
DTM.toarray()

array([[1, 0, 1, 1, 1, 0, 0, 0],
       [1, 0, 2, 1, 0, 0, 0, 1],
       [1, 2, 0, 0, 0, 1, 1, 0]])

In [31]:
# gensim 활용 TDM 구현
docs = ['오늘 동물원에서 원숭이를 봤어',
       '오늘 동물원에서 코끼리를 봤어 봤어',
       '동물원에서 원숭이에게 바나나를 줬어 바나나를']

In [32]:
import gensim
from gensim import corpora

doc_ls = [d.split() for d in docs]
id2word = corpora.Dictionary(doc_ls)
TDM = [id2word.doc2bow(d) for d in doc_ls]

In [33]:
TDM

[[(0, 1), (1, 1), (2, 1), (3, 1)],
 [(0, 1), (1, 2), (2, 1), (4, 1)],
 [(0, 1), (5, 2), (6, 1), (7, 1)]]

In [34]:
# TFIDF 구현하기
d1 = 'The cat sat on my face I hate a cat'
d2 = 'The dog sat on my bed I love a dog'
doc_ls = [d1, d2]

In [35]:
import numpy as np
from collections import defaultdict

def tf(t, d):
    """Return ``d.count(t) / len(d)``, the term frequency of ``t`` in ``d``.

    Raises:
        ZeroDivisionError: if ``d`` has length zero.
    """
    n_hits, n_total = d.count(t), len(d)
    return n_hits / n_total

def idf(t, D):
    """Return ``log(N / n)`` for term ``t`` over the corpus ``D``.

    ``N`` is ``len(D)`` and ``n`` counts the documents ``d`` in ``D`` for
    which ``t in d`` is true.

    Raises:
        ZeroDivisionError: if ``t`` is found in no document.
    """
    n_docs = len(D)
    n_containing = 0
    for doc in D:
        if t in doc:
            n_containing += 1
    return np.log(n_docs / n_containing)

def tfidf(t, d, D):
    """Return the product of ``tf(t, d)`` and ``idf(t, D)``."""
    weight = tf(t, d)
    scale = idf(t, D)
    return weight * scale

def tokenizer(d):
    """Return the whitespace-separated tokens of the document ``d``."""
    # sep=None is the default: split on any run of whitespace.
    return d.split(sep=None)

def tfidfScorer(D):
    """Build a dense TF-IDF matrix for the raw documents in ``D``.

    Args:
        D: sequence of document strings; each one is tokenized with
            ``tokenizer``.

    Returns:
        A ``(tfidf_mat, vocab)`` tuple. ``tfidf_mat`` is a float array with
        one row per document and one column per distinct token. ``vocab`` is
        a keys view of the tokens in first-seen order, which is also the
        column order of the matrix.
    """
    doc_ls = [tokenizer(d) for d in D]

    # Number the tokens in first-seen order; the id is the matrix column.
    word2id = {}
    for d in doc_ls:
        for t in d:
            word2id.setdefault(t, len(word2id))

    tfidf_mat = np.zeros((len(doc_ls), len(word2id)))
    for i, d in enumerate(doc_ls):
        for t in d:
            # Score against the tokenized corpus, not the raw strings in D:
            # against a raw string the `t in d` check inside idf() is a
            # substring test, so a token could match inside a longer word.
            tfidf_mat[i, word2id[t]] = tfidf(t, d, doc_ls)

    return tfidf_mat, word2id.keys()

In [36]:
mat, vocab = tfidfScorer(doc_ls)

In [37]:
import pandas as pd
pd.DataFrame(mat, columns=vocab)

Unnamed: 0,The,cat,sat,on,my,face,I,hate,a,dog,bed,love
0,0.0,0.138629,0.0,0.0,0.0,0.069315,0.0,0.069315,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.138629,0.069315,0.069315


In [38]:
# sklearn 활용 TFIDF 구현
d1 = 'The cat sat on my face I hate a cat'
d2 = 'The dog sat on my bed I love a dog'
doc_ls = [d1, d2]

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TfidfVectorizer with default settings on the raw document strings.
tfidf_vect = TfidfVectorizer()
# NOTE(review): this assignment rebinds `tfidf`, which an earlier cell defines
# as the tfidf(t, d, D) function used by tfidfScorer(); running tfidfScorer()
# again after this cell would call this matrix instead of the function.
tfidf = tfidf_vect.fit_transform(doc_ls)
tfidf.todense()

matrix([[0.        , 0.70600557, 0.        , 0.35300279, 0.35300279,
         0.        , 0.25116439, 0.25116439, 0.25116439, 0.25116439],
        [0.35300279, 0.        , 0.70600557, 0.        , 0.        ,
         0.35300279, 0.25116439, 0.25116439, 0.25116439, 0.25116439]])

In [40]:
import pandas as pd
pd.DataFrame(tfidf.todense(), columns=tfidf_vect.get_feature_names_out())

Unnamed: 0,bed,cat,dog,face,hate,love,my,on,sat,the
0,0.0,0.706006,0.0,0.353003,0.353003,0.0,0.251164,0.251164,0.251164,0.251164
1,0.353003,0.0,0.706006,0.0,0.0,0.353003,0.251164,0.251164,0.251164,0.251164


In [54]:
# gensim 활용 tfidf구현
d1 = 'The cat sat on my face I hate a cat'
d2 = 'The dog sat on my bed I love a dog'
docs = [d1, d2]

In [55]:
import gensim
from gensim import corpora
from gensim.models import TfidfModel

doc_ls = [d.split() for d in docs]
id2word = corpora.Dictionary(doc_ls)
TDM = [id2word.doc2bow(d) for d in doc_ls]
model = TfidfModel(TDM)

In [56]:
model[TDM][0]

[(3, 0.8164965809277261), (4, 0.4082482904638631), (5, 0.4082482904638631)]

In [57]:
from gensim.matutils import sparse2full

TDM_matrix = [sparse2full(d, len(id2word)).tolist() for d in model[TDM]]

In [58]:
TDM_matrix

[[0.0,
  0.0,
  0.0,
  0.8164966106414795,
  0.40824830532073975,
  0.40824830532073975,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 [0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.40824830532073975,
  0.8164966106414795,
  0.40824830532073975]]

In [59]:
import pandas as pd
pd.DataFrame(TDM_matrix, columns=id2word.values())

Unnamed: 0,I,The,a,cat,face,hate,my,on,sat,bed,dog,love
0,0.0,0.0,0.0,0.816497,0.408248,0.408248,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.408248,0.816497,0.408248
