In [1]:
import pandas as pd
from math import log

In [2]:
docs = ['i like this movie', 'i like this pasta', 'i love sam']

In [3]:
vocab = list(set(w for doc in docs for w in doc.split()))

In [4]:
print(vocab)

['like', 'love', 'i', 'this', 'sam', 'pasta', 'movie']


In [5]:
def get_tf(word, doc):
    doc_word_list = doc.split()
    return doc_word_list.count(word)

def get_idf(word):
    df = 0
    for doc in docs:
        doc_word_list = doc.split()
        df += word in doc_word_list
    return log((len(docs) + 1)/(df + 1)) + 1

def get_tfidf(word, doc):
    return get_tf(word,doc) * get_idf(word)

In [6]:
def get_tf_matrix(docs, vocab):
    tf_matrix = []
    for doc in docs:
        tf_matrix.append([])
        for word in vocab:
            tf_score = get_tf(word, doc)
            tf_matrix[-1].append(tf_score)
    return tf_matrix
        

In [7]:
def get_idf_matrix(docs, vocab):
    idf_matrix = []
    for word in vocab:
        idf_score = get_idf(word)
        idf_matrix.append(idf_score)
    return idf_matrix
        

In [8]:
def get_tfidf_matrix(docs, vocab):
    tfidf_matrix = []
    for doc in docs:
        tfidf_matrix.append([])
        for word in vocab:
            tfidf_score = get_tfidf(word, doc)
            tfidf_matrix[-1].append(tfidf_score)
    return tfidf_matrix

In [9]:
tf_matrix = get_tf_matrix(docs, vocab)
idf_matrix = get_idf_matrix(docs, vocab)

print(pd.DataFrame(tf_matrix, columns= vocab))
print(pd.DataFrame(idf_matrix, index= vocab, columns=["IDF"]))

tfidf_matrix = get_tfidf_matrix(docs, vocab)
print(docs)
print(pd.DataFrame(tfidf_matrix, columns = vocab))

   like  love  i  this  sam  pasta  movie
0     1     0  1     1    0      0      1
1     1     0  1     1    0      1      0
2     0     1  1     0    1      0      0
            IDF
like   1.287682
love   1.693147
i      1.000000
this   1.287682
sam    1.693147
pasta  1.693147
movie  1.693147
['i like this movie', 'i like this pasta', 'i love sam']
       like      love    i      this       sam     pasta     movie
0  1.287682  0.000000  1.0  1.287682  0.000000  0.000000  1.693147
1  1.287682  0.000000  1.0  1.287682  0.000000  1.693147  0.000000
2  0.000000  1.693147  1.0  0.000000  1.693147  0.000000  0.000000


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [11]:
docs = ['i like this movie', 'i like this pasta', 'i love sam']

In [12]:
tfidf_vectorizer = TfidfVectorizer()

# 출처: https://chan-lab.tistory.com/24 [은공지능 공작소]

In [13]:
tfidf_vectorizer.fit(docs)

TfidfVectorizer()

In [14]:
print(tfidf_vectorizer.vocabulary_)

{'like': 0, 'this': 5, 'movie': 2, 'pasta': 3, 'love': 1, 'sam': 4}


In [15]:
print(tfidf_vectorizer.transform(docs).toarray())

[[0.51785612 0.         0.68091856 0.         0.         0.51785612]
 [0.51785612 0.         0.         0.68091856 0.         0.51785612]
 [0.         0.70710678 0.         0.         0.70710678 0.        ]]


In [16]:
# https://chan-lab.tistory.com/24