# TF-IDF算法

In [1]:
import numpy as np
import pandas as pd
import math

## 数据准备

In [2]:
# Two toy sentences that together form the whole corpus for this walkthrough.
# NOTE(review): 'bad' is presumably a typo for 'bed'; it is left unchanged
# because every recorded output below uses the key 'bad'.
a = 'the cat sat on my bad'
b = 'the dog sat on my feet'

## 构建词袋和字典

In [3]:
# Tokenise each sentence into its bag of words. split() with no argument
# splits on any run of whitespace, so repeated, leading or trailing spaces
# never produce empty-string tokens (split(' ') would).
bowa = a.split()
bowb = b.split()
# Vocabulary: every distinct word that appears in either sentence.
word_dict = set(bowa).union(bowb)

In [4]:
bowa

['the', 'cat', 'sat', 'on', 'my', 'bad']

In [5]:
word_dict

{'bad', 'cat', 'dog', 'feet', 'my', 'on', 'sat', 'the'}

In [6]:
# Build the count dictionaries for a and b: one zero-initialised entry per vocabulary word
dicta = {word: 0 for word in word_dict}
dictb = {word: 0 for word in word_dict}

In [7]:
dicta

{'bad': 0, 'feet': 0, 'sat': 0, 'the': 0, 'on': 0, 'dog': 0, 'my': 0, 'cat': 0}

In [8]:
# Count each sentence's words into its dictionary.
# The dictionaries are rebuilt from zero here so that re-running this cell
# gives the same counts instead of adding to the previous run's totals.
dicta = dict.fromkeys(word_dict, 0)
for word in bowa:
    dicta[word] += 1

dictb = dict.fromkeys(word_dict, 0)
for word in bowb:
    dictb[word] += 1


In [9]:
pd.DataFrame([dicta, dictb]) # Preview the word counts as a table: one row per sentence, one column per word

Unnamed: 0,bad,feet,sat,the,on,dog,my,cat
0,1,0,1,1,1,0,1,1
1,0,1,1,1,1,1,1,0


## 计算TF值

In [10]:
def compute_TF(word_dict, bow):
    """Compute the term frequency (TF) of every word for one sentence.

    Args:
        word_dict: Mapping of word -> number of occurrences in the sentence.
        bow: The sentence's bag of words (its tokens); only its length is used.

    Returns:
        dict mapping each word in ``word_dict`` to ``count / len(bow)``.
        When ``bow`` is empty every word maps to 0.0 instead of raising
        ZeroDivisionError.
    """
    total = len(bow)
    if total == 0:
        # No tokens at all: define every frequency as zero.
        return {word: 0.0 for word in word_dict}
    # tf = cnt / len
    return {word: cnt / total for word, cnt in word_dict.items()}


In [11]:
# Term frequencies for sentence a and sentence b, each divided by its own token count.
tfa = compute_TF(dicta, bowa)

tfb = compute_TF(dictb, bowb)


In [12]:
tfa

{'bad': 0.16666666666666666,
 'feet': 0.0,
 'sat': 0.16666666666666666,
 'the': 0.16666666666666666,
 'on': 0.16666666666666666,
 'dog': 0.0,
 'my': 0.16666666666666666,
 'cat': 0.16666666666666666}

## 计算IDF值

In [13]:
def compute_IDF(dict_list):
    """Compute the smoothed inverse document frequency (IDF) of every word.

    The vocabulary is taken from the keys of the dictionaries in
    ``dict_list`` itself, so the function does not depend on any
    module-level variable.

    Args:
        dict_list: List of word -> count dictionaries, one per document.

    Returns:
        dict mapping each word to ``log10((N + 1) / (n_word + 1))`` where
        ``N`` is ``len(dict_list)`` and ``n_word`` is the number of
        documents in which the word has a non-zero count.
    """
    doc_freq = {}
    for counts in dict_list:  # walk every document's count dictionary
        for word, cnt in counts.items():
            # Register the word even when its count is zero so that it
            # still receives an IDF value.
            doc_freq.setdefault(word, 0)
            if cnt != 0:
                doc_freq[word] += 1

    n_docs = len(dict_list)
    # The +1 in numerator and denominator avoids division by zero for words
    # that appear in no document.
    return {
        word: math.log10((n_docs + 1) / (freq + 1))
        for word, freq in doc_freq.items()
    }


In [14]:
# Compute IDF over the two-sentence corpus; the bare `idf` on the last line displays the result.
idf = compute_IDF([dicta, dictb])
idf

{'bad': 0.17609125905568124,
 'feet': 0.17609125905568124,
 'sat': 0.0,
 'the': 0.0,
 'on': 0.0,
 'dog': 0.17609125905568124,
 'my': 0.0,
 'cat': 0.17609125905568124}

## 计算TF-IDF值

In [15]:
def compute_TFIDF(tf, idf):
    """Combine one sentence's TF values with the IDF values.

    Args:
        tf: Mapping of word -> term frequency for one sentence.
        idf: Mapping of word -> inverse document frequency; it must contain
            every word present in ``tf`` (a missing word raises KeyError).

    Returns:
        dict mapping each word in ``tf`` to ``tf[word] * idf[word]``.
    """
    # tf-idf = tf * idf, evaluated word by word
    return {term: freq * idf[term] for term, freq in tf.items()}


In [16]:
# Combine each sentence's TF values with the shared IDF values.
tf_idfa = compute_TFIDF(tfa, idf)

tf_idfb = compute_TFIDF(tfb, idf)


In [17]:
tf_idfa

{'bad': 0.029348543175946873,
 'feet': 0.0,
 'sat': 0.0,
 'the': 0.0,
 'on': 0.0,
 'dog': 0.0,
 'my': 0.0,
 'cat': 0.029348543175946873}

In [18]:
# One row per sentence (row 0 = a, row 1 = b), one column per vocabulary word.
df = pd.DataFrame([tf_idfa, tf_idfb])
df

Unnamed: 0,bad,feet,sat,the,on,dog,my,cat
0,0.029349,0.0,0.0,0.0,0.0,0.0,0.0,0.029349
1,0.0,0.029349,0.0,0.0,0.0,0.029349,0.0,0.0


In [23]:
df.T

Unnamed: 0,0,1
bad,0.029349,0.0
feet,0.0,0.029349
sat,0.0,0.0
the,0.0,0.0
on,0.0,0.0
dog,0.0,0.029349
my,0.0,0.0
cat,0.029349,0.0


In [20]:
def save_file(df, file_name):
    """Write a DataFrame to ``./result/<file_name>result.csv`` without the index column.

    The ``./result/`` directory is created when it does not exist yet, so
    the function also works on a fresh checkout.

    Args:
        df: DataFrame to save.
        file_name: Prefix placed in front of the fixed ``result.csv`` suffix.
    """
    import os  # local import keeps this cell self-contained

    out_dir = './result/'
    # to_csv does not create missing directories, so ensure the target exists.
    os.makedirs(out_dir, exist_ok=True)
    df.to_csv(out_dir + file_name + 'result.csv', index=False)

In [22]:
%%time
save_file(df, 'tf_idf_')

CPU times: user 2.48 ms, sys: 2.21 ms, total: 4.68 ms
Wall time: 4.39 ms
