In [1]:
# -*- coding: utf-8 -*-
from collections import defaultdict
import math
import operator
 
def loadDataSet():
    """Build the small hard-coded sample corpus.

    Returns:
        tuple: ``(dataset, classVec)`` where

        * ``dataset`` is a list of six tokenized documents (each a list
          of word strings), and
        * ``classVec`` is a list of six integer class labels (0 or 1),
          one per document, in the same order.
    """
    # Each entry keeps a document's label next to its tokens so the two
    # parallel return lists cannot drift out of step.
    # NOTE(review): the original comment described label 1 as "good" and 0
    # as "not good", yet the documents labelled 1 are the ones containing
    # words such as 'stupid' and 'worthless' -- confirm the intended meaning.
    labelled_posts = [
        (0, ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please']),
        (1, ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid']),
        (0, ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him']),
        (1, ['stop', 'posting', 'stupid', 'worthless', 'garbage']),
        (0, ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him']),
        (1, ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']),
    ]
    dataset = [tokens for _, tokens in labelled_posts]
    classVec = [label for label, _ in labelled_posts]
    return dataset, classVec
 
 
def feature_select(list_words):
    """Score every word in a tokenized corpus with a TF-IDF style weight.

    For each distinct word ``w``:

    * ``tf(w)``  = occurrences of ``w`` across the whole corpus divided by
      the total number of tokens in the corpus (a corpus-wide frequency,
      not a per-document one);
    * ``idf(w)`` = ``log(N / (df(w) + 1))`` where ``N`` is the number of
      documents and ``df(w)`` the number of documents containing ``w``;
    * score      = ``tf(w) * idf(w)``.

    Because of the ``+ 1`` in the denominator, a word that occurs in every
    document gets a negative score and a word that occurs in all but one
    document gets a score of zero.

    Parameters:
        list_words: list of documents, each document being a list of
            (hashable) word tokens.

    Returns:
        list of ``(word, score)`` tuples sorted by score in descending
        order. Despite the historical name "dict_feature_select" this is a
        list, not a dict. Words with equal scores keep the order in which
        they were first seen in the corpus. An empty corpus yields ``[]``.
    """
    term_count = defaultdict(int)      # total occurrences of each word
    containing_docs = defaultdict(int)  # number of documents holding the word

    # One pass over the corpus gathers both statistics. De-duplicating the
    # document with set() makes a word count once per document for df.
    for document in list_words:
        for word in document:
            term_count[word] += 1
        for word in set(document):
            containing_docs[word] += 1

    # Hoisted out of the scoring loop: both values are loop-invariant.
    total_terms = sum(term_count.values())
    doc_num = len(list_words)

    # Iterating term_count preserves first-seen order, which in turn fixes
    # the order of ties after the (stable) sort below.
    word_tf_idf = {
        word: (count / total_terms)
        * math.log(doc_num / (containing_docs[word] + 1))
        for word, count in term_count.items()
    }

    # Highest-scoring words first.
    dict_feature_select = sorted(
        word_tf_idf.items(), key=operator.itemgetter(1), reverse=True
    )
    return dict_feature_select
 
if __name__ == '__main__':
    # Load the sample corpus; the labels are returned too but are not
    # needed for TF-IDF scoring.
    data_list, label_list = loadDataSet()

    # Score every distinct word in the corpus, highest score first.
    features = feature_select(data_list)

    # Show the ranked (word, score) pairs, then how many words were scored,
    # each on its own line.
    print(features, len(features), sep='\n')


[('worthless', 0.0322394037469742), ('stop', 0.0322394037469742), ('to', 0.0322394037469742), ('dog', 0.028288263356383563), ('my', 0.028288263356383563), ('stupid', 0.028288263356383563), ('him', 0.028288263356383563), ('buying', 0.025549122992281622), ('maybe', 0.025549122992281622), ('is', 0.025549122992281622), ('I', 0.025549122992281622), ('not', 0.025549122992281622), ('dalmation', 0.025549122992281622), ('take', 0.025549122992281622), ('love', 0.025549122992281622), ('park', 0.025549122992281622), ('cute', 0.025549122992281622), ('garbage', 0.025549122992281622), ('so', 0.025549122992281622), ('steak', 0.025549122992281622), ('flea', 0.025549122992281622), ('licks', 0.025549122992281622), ('help', 0.025549122992281622), ('has', 0.025549122992281622), ('how', 0.025549122992281622), ('quit', 0.025549122992281622), ('food', 0.025549122992281622), ('ate', 0.025549122992281622), ('please', 0.025549122992281622), ('posting', 0.025549122992281622), ('problems', 0.025549122992281622), (