In [1]:
# TF-IDF는 특정단어가 문서들에서 자주 등장할수록 가중치는 낮아진다.
# 동일 문서라면 가중치는 올라간다.(TF가 크기 때문)

In [3]:
import pandas as pd # 데이터프레임 사용을 위해
from math import log # IDF 계산을 위해

docs = [
  '먹고 싶은 사과',
  '먹고 싶은 바나나',
  '길고 노란 바나나 바나나',
  '저는 과일이 좋아요'
] 
vocab = list(set(w for doc in docs for w in doc.split())) #단어집합
vocab.sort()

In [5]:
# TF, IDF 구하기
# 총 문서의 수
n = len(docs)

def tf(t,d):
    return d.count(t)

def idf(t):
    df = 0 #특정 단어의 등장한 문서수
    for doc in docs:
        if t in doc:
            df+=1
    return log(n/(1+df))

def tfidf(t,d):
    return tf(t,d)*idf(t)
    

In [7]:
result = []

# 각 문서에 대해서 아래 연산을 반복
for i in range(n):
  result.append([])
  d = docs[i] #각문서에 대해
  for j in range(len(vocab)): #단어사전 크기만큼 for문
    t = vocab[j] #한단어에 대해
    result[-1].append(tf(t, d)) #result 마지막 리스트의 원소로 빈도순 -> DTM

tf_ = pd.DataFrame(result, columns = vocab)

In [8]:
tf_

Unnamed: 0,과일이,길고,노란,먹고,바나나,사과,싶은,저는,좋아요
0,0,0,0,1,0,1,1,0,0
1,0,0,0,1,1,0,1,0,0
2,0,1,1,0,2,0,0,0,0
3,1,0,0,0,0,0,0,1,1


In [9]:
result=[]
for i in range(len(vocab)):
    t=vocab[j]
    result.append(idf(t))
    
idf_ = pd.DataFrame(result, index=vocab, columns=['IDF'])
idf_

Unnamed: 0,IDF
과일이,0.693147
길고,0.693147
노란,0.693147
먹고,0.693147
바나나,0.693147
사과,0.693147
싶은,0.693147
저는,0.693147
좋아요,0.693147


In [11]:
#TF-IDF 행렬
result = []
for i in range(n): # 총 문서의 개수 만큼
    result.append([])
    d = docs[i]
    for j in range(len(vocab)): #단어집합의 크기만큼
        t = vocab[j]
        result[-1].append(tfidf(t,d))
        
tfidf_ = pd.DataFrame(result,columns=vocab)
tfidf_
        
    


Unnamed: 0,과일이,길고,노란,먹고,바나나,사과,싶은,저는,좋아요
0,0.0,0.0,0.0,0.287682,0.0,0.693147,0.287682,0.0,0.0
1,0.0,0.0,0.0,0.287682,0.287682,0.0,0.287682,0.0,0.0
2,0.0,0.693147,0.693147,0.0,0.575364,0.0,0.0,0.0,0.0
3,0.693147,0.0,0.0,0.0,0.0,0.0,0.0,0.693147,0.693147


# 2.사이킷런을 이용한 DTM과 TM-IDF실습

In [16]:
# 1. CounterVectorizer를 사용하면 DTM을 만들 수 있다.
from sklearn.feature_extraction.text import CountVectorizer
#CounterVector자체가 한 단어 문자를 삭제

corpus = [
    'you know I want your love',
    'I like you, I',
    'what should I do ',    
]

vector = CountVectorizer()

#코퍼스로 부터 단어수 기록

vector.fit_transform(corpus).toarray()

array([[0, 1, 0, 1, 0, 1, 0, 1, 1],
       [0, 0, 1, 0, 0, 0, 0, 1, 0],
       [1, 0, 0, 0, 1, 0, 1, 0, 0]], dtype=int64)

In [17]:
vector.vocabulary_

{'you': 7,
 'know': 1,
 'want': 5,
 'your': 8,
 'love': 3,
 'like': 2,
 'what': 6,
 'should': 4,
 'do': 0}

In [18]:
# 코퍼스로부터 각 단어의 빈도수를 기록
print(vector.fit_transform(corpus).toarray())

# 각 단어와 맵핑된 인덱스 출력
print(vector.vocabulary_)

[[0 1 0 1 0 1 0 1 1]
 [0 0 1 0 0 0 0 1 0]
 [1 0 0 0 1 0 1 0 0]]
{'you': 7, 'know': 1, 'want': 5, 'your': 8, 'love': 3, 'like': 2, 'what': 6, 'should': 4, 'do': 0}


In [19]:
#사이킷 런 제공 = Tfidf벡터화
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    'you know I want your love',
    'I like you',
    'what should I do ',    
]

tfidfv = TfidfVectorizer().fit(corpus)
print(tfidfv.transform(corpus).toarray())
print(tfidfv.vocabulary_)

[[0.         0.46735098 0.         0.46735098 0.         0.46735098
  0.         0.35543247 0.46735098]
 [0.         0.         0.79596054 0.         0.         0.
  0.         0.60534851 0.        ]
 [0.57735027 0.         0.         0.         0.57735027 0.
  0.57735027 0.         0.        ]]
{'you': 7, 'know': 1, 'want': 5, 'your': 8, 'love': 3, 'like': 2, 'what': 6, 'should': 4, 'do': 0}
