# BOW ( Bag of Words)
문장의 원래 순서는 무시한 채 단어 집합의 순서에 맞춰 빈도수나 어떠한 벡터형태로 나타 낸 것

* DTM
* TF-IDF
 * $tf(d, t)$ : 특정 문서 $d$에서 특정 단어 $t$ 의 등장 횟수. 즉 DTM 상에서의 단어들의 값
  * $df(t)$ : 단어 t가 등장한 문서의 수

## Idf 식

$$
idf(t) = log(\frac{n}{1+df(t)})
$$

In [43]:
# n 번 문서 (document) 에서 단어 (term) 이 등장한 횟수

def term_frequency(term, document):
  return document.count(term)


# 단어가 문서들에서 몇번 등장했는지 세어주기
def document_frequency(term, documents):
  term_count = 0
  for document in documents:
    term_count += term in document

  return term_count

def inverse_document_frequency(term, documents):
  from math import log
  N = len(documents) # 전체 문서의 개수
  df = document_frequency(term, documents) # term 이 등장한 문서의 개수
  
  # idf 구하기
  return log(N / (df+1))

# 검사할 단어, 문서리스트, 문서 인덱스
def tf_idf(term, documents, idx):
  document = documents[idx]
  return term_frequency(term, document) * inverse_document_frequency(term, document)

In [44]:
docs = [
        '어제 하루종일 컴퓨터 작업 컴퓨터 너무 싫어',
        '어제 식사 소고기 연어',
        '어제 안드로이드 강의 자료 작업',
        '오늘 딥러닝 자연어 처리 작업'
]

In [45]:
# 하나의 문장에서 반복된 단어가 나오면 중요한 단어로 체크, 여러 문장에서 반복된 단어가 나오면 중요하지 않은 단어로 체크

vocab = list(set(w for doc in docs for w in doc.split()))
vocab.sort()
vocab

['강의',
 '너무',
 '딥러닝',
 '소고기',
 '식사',
 '싫어',
 '안드로이드',
 '어제',
 '연어',
 '오늘',
 '자료',
 '자연어',
 '작업',
 '처리',
 '컴퓨터',
 '하루종일']

In [46]:
# term frequency 표현하기 = DTM 보기
import pandas as pd

result = []
for i in range(len(docs)):
  result.append([])
  d = docs[i]

  for j in range(len(vocab)):
    t = vocab[j]
    result[-1].append(term_frequency(t, d))

tf_ = pd.DataFrame(result, columns = vocab)
tf_

Unnamed: 0,강의,너무,딥러닝,소고기,식사,싫어,안드로이드,어제,연어,오늘,자료,자연어,작업,처리,컴퓨터,하루종일
0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,2,1
1,0,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0
2,1,0,0,0,0,0,1,1,0,0,1,0,1,0,0,0
3,0,0,1,0,0,0,0,0,1,1,0,1,1,1,0,0


In [47]:
result = []
for j in range(len(vocab)):
  t = vocab[j]
  result.append(inverse_document_frequency(t, docs))

idf_ = pd.DataFrame(result, index=vocab, columns=["IDF"])
idf_

Unnamed: 0,IDF
강의,0.693147
너무,0.693147
딥러닝,0.693147
소고기,0.693147
식사,0.693147
싫어,0.693147
안드로이드,0.693147
어제,0.0
연어,0.287682
오늘,0.693147


In [49]:
result = []
N = len(docs)
for i in range(N):
  result.append([])
  for j in range(len(vocab)):
    t = vocab[j]
    result[-1].append(tf_idf(t, docs, i))

tfidf_ = pd.DataFrame(result, columns=vocab)
tfidf_

Unnamed: 0,강의,너무,딥러닝,소고기,식사,싫어,안드로이드,어제,연어,오늘,자료,자연어,작업,처리,컴퓨터,하루종일
0,0.0,3.178054,0.0,0.0,0.0,3.178054,0.0,3.178054,0.0,0.0,0.0,0.0,3.178054,0.0,6.356108,3.178054
1,0.0,0.0,0.0,2.484907,2.484907,0.0,0.0,2.484907,2.484907,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2.833213,0.0,0.0,0.0,0.0,0.0,2.833213,2.833213,0.0,0.0,2.833213,0.0,2.833213,0.0,0.0,0.0
3,0.0,0.0,2.772589,0.0,0.0,0.0,0.0,0.0,2.772589,2.772589,0.0,2.772589,2.772589,2.772589,0.0,0.0
