In [1]:
import math
import numpy as np
import jieba
import jieba.posseg as psg
import warnings
from jieba import analyse
from _utils import u_constant
from collections import defaultdict
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
# Directory that holds this chapter's resource files (stopword.txt, corpus.txt),
# built from the project root constant u_constant.PATH_ROOT.
path = u_constant.PATH_ROOT + "for learn/Python/NLP_in_Action/chapter-5/"

In [2]:
def get_stopword_list():
    """
    Load the stop words from ``stopword.txt`` under the module-level ``path``.

    The file is read as UTF-8 and every line becomes one entry; only the
    newline character is removed from each line.

    :return: stop words in file order, List[str]
    """
    stopword_file = path + "stopword.txt"
    # Leaving the ``with`` block closes the file, so returning from inside it is safe.
    with open(stopword_file, "r", encoding="UTF-8") as handle:
        return [line.replace("\n", "") for line in handle]

def seg_to_list(sentence, pos=False):
    """
    Segment a sentence with jieba.

    :param sentence: text to segment, str
    :param pos: when truthy, segment with part-of-speech tagging
                (``jieba.posseg``); otherwise use the plain tokenizer
    :return: the list returned by ``psg.lcut`` (tagging) or ``jieba.lcut`` (plain)
    """
    # Choose the tokenizer first, then run it once on the sentence.
    cut = psg.lcut if pos else jieba.lcut
    return cut(sentence)
        
def word_filter(seg_list, stopword_list):
    """
    Remove noise words from a segmentation result.

    A word is kept when it is at least two characters long and is not a stop
    word. When the input carries part-of-speech tags, only words whose tag
    starts with "n" (nouns) are kept.

    :param seg_list: iterable of words (str), or of (word, flag) pairs such as
                     the ``pair`` objects produced by ``jieba.posseg``; lists
                     and generators are both accepted
    :param stopword_list: collection of stop words, e.g. List[str]
    :return: kept words in input order, List[str]; ``[]`` for empty input
    """
    # Materialize once so generators work and the first element can be inspected.
    tokens = list(seg_list)
    if not tokens:
        # Empty text / blank line: nothing to filter (indexing [0] would raise).
        return []

    # Set membership is O(1) per token instead of scanning the whole list.
    stopwords = set(stopword_list)

    # Plain segmentation yields str items; anything else is treated as a
    # (word, flag) pair. The decision is made from the first element only.
    tagged = not isinstance(tokens[0], str)

    kept = []
    for token in tokens:
        if tagged:
            word, flag = token
            # Keep nouns only.
            if not flag.startswith("n"):
                continue
        else:
            word = token
        # Drop words shorter than two characters and stop words.
        if len(word) < 2 or word in stopwords:
            continue
        kept.append(word)
    return kept

def load_data(corpus_path, pos=False):
    """
    Load a corpus file and convert every non-blank line into a filtered word list.

    Each line is stripped, segmented with ``seg_to_list`` and cleaned with
    ``word_filter`` using the stop words from ``get_stopword_list``.

    :param corpus_path: path of the UTF-8 corpus file, one document per line
    :param pos: whether to segment with part-of-speech tagging (passed through
                to ``seg_to_list``)
    :return: list of documents, each a list of the words (str) that survived
             filtering; lines that are blank after stripping are skipped
    """
    stopword_list = get_stopword_list()
    doc_list = []
    # The ``with`` block closes the file; no explicit close() is required.
    with open(corpus_path, "r", encoding="UTF-8") as f:
        for line in f:
            content = line.strip()
            if not content:
                # A blank line produces no tokens and is not a document.
                continue
            seg_list = seg_to_list(content, pos)
            doc_list.append(word_filter(seg_list, stopword_list))
    return doc_list
        

In [3]:
class TFIDF:
    """TF-IDF keyword extractor: learn idf values from a corpus, then rank the words of a text."""

    def __init__(self, keyword_num):
        """
        :param keyword_num: number of keywords to return
        """
        self.keyword_num = keyword_num

    def fit(self, doc_list):
        """
        Compute idf values from the training corpus.

        Stores ``self.idf_dic`` (word -> log(N / (df + 1))) and
        ``self.default_idf`` (log(N), used for words absent from the corpus),
        where N is the number of documents and df the number of documents
        containing the word.

        :param doc_list: training corpus, an iterable of word lists
        """
        doc_freq = {}
        n_docs = 0
        for n_docs, words in enumerate(doc_list, start=1):
            # Count each word once per document.
            for term in set(words):
                doc_freq[term] = doc_freq.get(term, 0) + 1

        idf = {}
        for term, freq in doc_freq.items():
            idf[term] = math.log(n_docs / (freq + 1.0))
        self.idf_dic = idf
        self.default_idf = math.log(n_docs / 1.0)

    def get_keyword(self, word_list):
        """
        Score the words of a text by tf * idf, print the top ones and return them.

        The full score table is kept in ``self.tfidf_dic``.

        :param word_list: words of the target text
        :return: up to ``keyword_num`` (word, score) tuples, highest score first
        """
        # Raw term frequency, keyed in order of first appearance.
        term_freq = {}
        for term in word_list:
            term_freq[term] = term_freq.get(term, 0) + 1

        scores = {}
        for term, count in term_freq.items():
            scores[term] = count * self.idf_dic.get(term, self.default_idf)
        self.tfidf_dic = scores

        # Stable sort: equal scores keep first-appearance order.
        ranked = list(scores.items())
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        keywords = ranked[:self.keyword_num]

        rank = 0
        for word, score in keywords:
            print("%d: %s %.4f" % (rank, word, score))
            rank += 1
        return keywords

In [4]:
class TopicModel:
    """
    LDA-based keyword extractor.

    ``fit`` trains a TF-IDF vectorizer and an LDA model on a corpus;
    ``get_keyword`` ranks the words of a text by the cosine similarity between
    each word's topic distribution and the topic distribution of the whole text.
    """

    def __init__(self, keyword_num, num_topics=4):
        """
        :param keyword_num: number of keywords to return
        :param num_topics: number of LDA topics
        """
        self.keyword_num = keyword_num
        self.num_topics = num_topics

    def fit(self, doc_list):
        """
        Train the vectorizer (``self.tfidf``) and the LDA model (``self.model``).

        :param doc_list: training corpus, an iterable of word lists
        """
        joined_docs = [" ".join(doc) for doc in doc_list]
        # An integer min_df must be >= 1 (current scikit-learn rejects 0);
        # min_df=1 keeps every term, which is what min_df=0 was meant to do.
        self.tfidf = TfidfVectorizer(min_df=1)
        vec = self.tfidf.fit_transform(joined_docs)
        # Fixed random_state keeps the fitted topics reproducible.
        self.model = LatentDirichletAllocation(n_components=self.num_topics, random_state=0)
        self.model.fit(vec)

    def __normalize(self, arr):
        """
        L2-normalize every row of a 2-D array.

        Rows whose norm is zero are returned unchanged (all zeros) instead of
        producing NaN through a division by zero.
        """
        arr = np.asarray(arr, dtype=float)
        norm = np.linalg.norm(arr, 2, axis=1, keepdims=True)
        return arr / np.where(norm == 0, 1.0, norm)

    def get_keyword(self, word_list):
        """
        Rank the words of a text by topic similarity, print the top ones and return them.

        Each distinct word is treated as a one-word document; its topic
        distribution is compared (cosine) with the topic distribution of the
        whole text.

        :param word_list: words of the target text
        :return: up to ``keyword_num`` (word, similarity) tuples, most similar
                 first; ``[]`` when ``word_list`` is empty
        """
        if not word_list:
            return []

        content = " ".join(word_list)
        # De-duplicate while keeping first-appearance order so that ties are
        # broken the same way on every run (set order varies between runs).
        words = list(dict.fromkeys(word_list))  # m distinct words

        seq_vec = self.tfidf.transform([content])  # sparse, 1 x vocabulary
        seq_topic_vec = self.model.transform(seq_vec)  # 1 x num_topics
        word_mat = self.tfidf.transform(words)  # sparse, m x vocabulary
        word_topic_mat = self.model.transform(word_mat)  # m x num_topics

        normed_seq_topic_vec = self.__normalize(seq_topic_vec)  # 1 x num_topics
        normed_word_topic_mat = self.__normalize(word_topic_mat)  # m x num_topics

        # Dot product of unit vectors = cosine similarity, one value per word.
        sims = np.dot(normed_word_topic_mat, normed_seq_topic_vec.T).flatten()

        sorted_words = sorted(zip(words, sims), key=lambda tup: tup[1], reverse=True)
        keywords = sorted_words[:self.keyword_num]
        for i, (word, score) in enumerate(keywords):
            print("%d: %s %.4f" % (i, word, score))
        return keywords
            

In [5]:
def keyword_extract(text, train_corpus=None, keyword_num=10, method="tfidf", seg_pos=False):
    """
    Unified keyword-extraction entry point.

    :param text: target text to extract keywords from
    :param train_corpus: training corpus (list of word lists) used by the
                         tf-idf and LDA methods; when None it is loaded from
                         ``corpus.txt`` under the module-level ``path``
    :param keyword_num: number of keywords to extract
    :param method: algorithm name, case-insensitive: tf-idf
                   ("tfidf", "tf-idf", "tf_idf"), text-rank ("textrank",
                   "text rank", "text-rank", "text_rank") or LDA ("lda", "topic")
    :param seg_pos: when True only nouns are kept as keyword candidates
    :return: list of (word, score) tuples as produced by the chosen method;
             the keywords are also printed
    :raises ValueError: if ``method`` is not one of the supported names
    """
    method_key = method.lower()

    if method_key in ("textrank", "text rank", "text-rank", "text_rank"):
        # jieba's TextRank segments the raw text itself, so the stop-word
        # filtering used by the other methods is not needed here.
        params = {"topK": keyword_num, "sentence": text, "withWeight": True}
        params["allowPOS"] = ["n", "ns", "nz"] if seg_pos else ["n", "ns", "nz", "vn", "v"]
        keywords = analyse.textrank(**params)
        for i, (word, score) in enumerate(keywords):
            print("%d: %s %.4f" % (i, word, score))
        return keywords

    # Validate the method before doing any segmentation or corpus loading.
    if method_key in ("tfidf", "tf-idf", "tf_idf"):
        model = TFIDF(keyword_num)
    elif method_key in ("lda", "topic"):
        model = TopicModel(keyword_num)
    else:
        raise ValueError("method %s is invalid!" % method)

    stop_word_list = get_stopword_list()
    seg_list = seg_to_list(text, seg_pos)
    filter_list = word_filter(seg_list, stop_word_list)

    # Both corpus-based methods share the same fallback corpus.
    if train_corpus is None:
        warnings.warn("train corpus is None. Loaded from default path!")
        train_corpus = load_data(path + "corpus.txt", seg_pos)

    model.fit(train_corpus)
    return model.get_keyword(filter_list)
        

In [6]:
# Sample news paragraph used as the target text for the keyword-extraction demos below.
text = '6月19日,《2012年度“中国爱心城市”公益活动新闻发布会》在京举行。' + \
       '中华社会救助基金会理事长许嘉璐到会讲话。基金会高级顾问朱发忠,全国老龄' + \
       '办副主任朱勇,民政部社会救助司助理巡视员周萍,中华社会救助基金会副理事长耿志远,' + \
       '重庆市民政局巡视员谭明政。晋江市人大常委会主任陈健倩,以及10余个省、市、自治区民政局' + \
       '领导及四十多家媒体参加了发布会。中华社会救助基金会秘书长时正新介绍本年度“中国爱心城' + \
       '市”公益活动将以“爱心城市宣传、孤老关爱救助项目及第二届中国爱心城市大会”为主要内容,重庆市' + \
       '、呼和浩特市、长沙市、太原市、蚌埠市、南昌市、汕头市、沧州市、晋江市及遵化市将会积极参加' + \
       '这一公益活动。中国雅虎副总编张银生和凤凰网城市频道总监赵耀分别以各自媒体优势介绍了活动' + \
       '的宣传方案。会上,中华社会救助基金会与“第二届中国爱心城市大会”承办方晋江市签约,许嘉璐理' + \
       '事长接受晋江市参与“百万孤老关爱行动”向国家重点扶贫地区捐赠的价值400万元的款物。晋江市人大' + \
       '常委会主任陈健倩介绍了大会的筹备情况。'

In [7]:
# Run all three extraction methods on the sample text without part-of-speech
# filtering (seg_pos=False); keyword_extract prints the ranked keywords itself.
for method in ["tfidf", "text-rank", "topic"]:
    print(method)
    keyword_extract(text, keyword_num=10, method=method, seg_pos=False)
    print("")

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/5p/db2l5pk14pd2d9bwhhtqbrgc0000gn/T/jieba.cache


tfidf


Loading model cost 0.765 seconds.
Prefix dict has been built succesfully.
  app.launch_new_instance()


0: 晋江市 22.0336
1: 救助 12.6248
2: 城市 12.0529
3: 大会 12.0038
4: 爱心 11.6364
5: 中华 10.1397
6: 基金会 9.8219
7: 许嘉璐 8.8134
8: 巡视员 8.8134
9: 重庆市 8.8134

text-rank
0: 城市 1.0000
1: 救助 0.8287
2: 爱心 0.8102
3: 中国 0.7955
4: 社会 0.7491
5: 中华 0.7053
6: 基金会 0.6890
7: 晋江市 0.6873
8: 大会 0.5803
9: 介绍 0.5147

topic




0: 捐赠 0.9605
1: 基金会 0.9605
2: 项目 0.9604
3: 年度 0.9604
4: 活动 0.9603
5: 行动 0.9603
6: 中华 0.9601
7: 爱心 0.9599
8: 老龄 0.9597
9: 2012 0.9596



In [8]:
# Same comparison as above, but with seg_pos=True so that only nouns are kept
# as keyword candidates.
for method in ["tfidf", "text-rank", "topic"]:
    print(method)
    keyword_extract(text, keyword_num=10, method=method, seg_pos=True)
    print("")

tfidf


  app.launch_new_instance()


0: 晋江市 22.0336
1: 城市 12.0529
2: 大会 12.0038
3: 爱心 11.6364
4: 中华 10.1397
5: 基金会 9.8219
6: 许嘉璐 8.8134
7: 巡视员 8.8134
8: 重庆市 8.8134
9: 人大常委会 8.8134

text-rank
0: 城市 1.0000
1: 爱心 0.8025
2: 中国 0.7980
3: 社会 0.6994
4: 基金会 0.6903
5: 中华 0.6882
6: 晋江市 0.5972
7: 公益活动 0.5059
8: 大会 0.4981
9: 发布会 0.4191

topic




0: 民政部 0.9640
1: 中华 0.9638
2: 基金会 0.9637
3: 老龄 0.9636
4: 社会 0.9636
5: 理事长 0.9627
6: 重点 0.9622
7: 爱心 0.9622
8: 公益活动 0.9620
9: 晋江市 0.9619

