In [15]:
import math
import numpy as np
import jieba
import jieba.posseg as psg
from gensim import corpora, models
from collections import defaultdict
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer

path = "F:/for learn/Python/NLP_in_Action/chapter-5/"

In [11]:
def get_stopword_list():
    """
    加载停用词
    """
    file_path = path + "stopword.txt"
    with open(file_path, "r", encoding="UTF-8") as f:
        l = [s.replace("\n", "") for s in f.readlines()]
        f.close()
    return l

def seg_to_list(sentence, pos=False):
    """
    分词方法，调用jieba接口
    input:
    - sentence: 句子, string
    - pos: 是否采用词性标注分词, boolean
    """
    if not pos:
        # 不进行词性标注的分词方法
        seg_list = jieba.cut(sentence)
    else:
        # 采用词性标注的分词方法
        seg_list = psg.cut(sentence)
    return seg_list
        
def word_filter(seg_list, stopword_list):
    """
    去除干扰词
    input:
    - seg_list: 分词列表，generator[String]或generator[(String, String)]
    - stopword_list: 停用词列表， List[String]
    """
    filter_list = []
    # 由于分词列表可能是由词性标注生成的，因此接下来处理时需要判断
    pos = True if len(seg_list[0]) > 1 else False
    for seg in seg_list:
        if not pos:
            word = seg
        else:
            word, flag = seg
            # 只保留名词
            if not flag.startswith("n"):
                continue
        # 过滤掉长度小于2的词 和 停用词
        if len(word) < 2 | word in stopword_list:
            continue
        
        filter_list.append(word)
    return filter_list

def load_data(corpus_path, pos=False):
    """
    加载语料数据
    input: 
    - corpus_path: 语料地址
    - pos: 决定是否用词性分词
    :return: 语料二重列表， 第一重为语料，第二重为该语料的词（已去除停用词），如果pos为True，则为（词、词性）tuple
    """
    doc_list = []
    stopword_list = get_stopword_list()
    with open(corpus_path, "r", encoding="UTf-8") as f:
        for line in f:
            content = line.strip()
            seg_list = seg_to_list(content, pos)
            filter_list = word_filter(seg_list, stopword_list)
            doc_list.append(filter_list)
        f.close()
    return doc_list  
        

In [2]:
class TFIDF:
    def __init__(self, keyword_num):
        """
        :param keyword_num: 关键词数量
        """
        self.keyword_num = keyword_num
    def fit(self, doc_list):
        """
        根据训练语料，得到词的idf值
        :param doc_list: 训练语料
        :return: 包含语料各个词的idf字典，以及未登录词的默认idf值
        """
        idf_dic = defaultdict(int)
        tt_count = 0
        for word_list in doc_list:
            for word in set(word_list):
                idf_dic[word] += 1
            tt_count += 1

        self.idf_dic = {word: math.log(tt_count / (num + 1.0)) for word, num in idf_dic.items()}
        self.default_idf = math.log(tt_count / 1.0)
    
    def transform(self, word_list):
        """
        对目标词列表进行tf-idf转换
        :param word_list: 目标词列表
        """
        # 统计词频
        tf_dic = defaultdict(int)
        for word in word_list:
            tf_dic[word] += 1
        
        self.tfidf_dic = {word: tf * self.idf_dic.get(word, self.default_idf) for word, tf in tf_dic.items()}
        keywords = sorted(list(self.tfidf_dic.items()), key=lambda tup: tup[1], reverse=True)[:self.keyword_num]
        return keywords

In [None]:
class TopicModel:
    def __init__(self, keyword_num, num_topics=4):
        self.keyword_num = keyword_num
        self.num_topics = num_topics
    def fit(self, doc_list):
        self.tfidf = TfidfVectorizer(min_df=0)
        vec = self.tfidf.fit_transform(doc_list)
        self.model = LatentDirichletAllocation(n_topics=self.num_topics, random_state=0)
        self.model.fit(vec)
    def transform(self, word_list):
        content = " ".join(word_list)
        seq_vec = self.tfidf.transform([content])  # sparsed
        seq_topic_vec = self.model.transform(seq_vec)  # 1 * num_topis 语句的主题分布
        word_mat = self.tfidf.transform([word])
        word_topic_mat = self.model.transform(word_mat)
        
        
            

In [44]:
word_mat = tfidf.transform(docs[0])
word_topic_mat = model.transform(word_mat)

In [46]:
word_topic_mat.shape

(8, 4)

In [48]:
seq_vec = tfidf.transform([content])
seq_topic_vec = model.transform(seq_vec)
seq_topic_vec

array([[0.76664136, 0.07755041, 0.07790411, 0.07790411]])

In [51]:
np.dot(word_topic_mat, seq_topic_vec.T)

array([[0.25      ],
       [0.50631757],
       [0.25      ],
       [0.25      ],
       [0.50631757],
       [0.50631757],
       [0.50631757],
       [0.50631757]])

In [40]:
content = " ".join(docs[0])
vec = tfidf.transform([content])

In [28]:
tfidf = TfidfVectorizer(min_df=0)
vec = tfidf.fit_transform(doc_list)

In [29]:
model = LatentDirichletAllocation(n_topics=4, random_state=0)
model.fit(vec)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=10, n_jobs=None, n_topics=4, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)