### 简单demo

In [26]:
import gensim
from gensim import corpora

from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string

1.准备文档数据

In [55]:
doc1 = "Sugar sugar sugar is bad to consume. My sister likes to have sugar, but not my father."
doc2 = "sugar My father spends a lot of time driving my sister around to dance practice."
doc3 = "Doctors suggest that driving may cause increased stress and blood pressure."
doc4 = "Sometimes I feel pressure to perform well at school, but my father never se\
ems to drive my sister to do better."
doc5 = "Health experts say that Sugar is not good for your lifestyle."

mydoc = [doc1, doc2, doc3, doc4, doc5]

2.数据清洗和预处理，包括移除标点符号，停用词和标准化语料库（Lemmatize，将英文词归元）

In [56]:
def clean_words(doc):
    """ 数据清洗 """
    stop = set(stopwords.words('english')) # 停止词
    exclude = set(string.punctuation) # 标点符号
    lemma = WordNetLemmatizer() # 词性还原
    
    clean_stopwords = " ".join([c for c in doc.lower().split() if c not in stop])
    clean_exclude = "".join(c for c in clean_stopwords if c not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in clean_exclude.split())
    return normalized

clean_mydoc = [clean_words(doc).split() for doc in mydoc]
'''
[['sugar', 'bad', 'consume', 'sister', 'like', 'sugar', 'father'], 
 ['father','spends','lot','time','driving','sister','around','dance','practice'],...
'''

"\n[['sugar', 'bad', 'consume', 'sister', 'like', 'sugar', 'father'], \n ['father','spends','lot','time','driving','sister','around','dance','practice'],...\n"

3.将预料转化为Document-Term矩阵

In [57]:
#  创建预料词典，每个词都给予一个索引
dictionary = corpora.Dictionary(clean_mydoc)

# 将文本变成词袋矩阵
doc_item_matrix = [dictionary.doc2bow(doc) for doc in clean_mydoc]
'''
doc_item_matrix:
[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 2)],
[(2, 1), (4, 1), (5, 3), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1)],...
这里第一行的(5, 2)可能代表sugar出现2次。
'''

'\ndoc_item_matrix:\n[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 2)],\n[(2, 1), (4, 1), (5, 3), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1)],...\n这里第一行的(5, 1)代表sugar出现一次。\n'

4.构建一个LDA对象，使用DT矩阵进行训练。采用训练好的模型，对新文档做主题分布的推断。

ldamodel = LdaModel(text, num_topics=10, id2word=dictionary, passes=20)

参数:
    
    - text 词袋，这里已经表示成DT矩阵了
    - num_topics 主题数
    - id2word 词典
    - passes 训练的轮数

In [58]:
lda = gensim.models.ldamodel.LdaModel

# 在 DT 矩阵上运行和训练 LDA 模型
ldamodel = lda(doc_item_matrix, num_topics=3, id2word=dictionary, passes=50)

5.输出结果

In [59]:
ldamodel.print_topics(num_topics=3, num_words=3)

[(0, '0.062*"father" + 0.062*"sister" + 0.062*"driving"'),
 (1, '0.150*"sugar" + 0.037*"cause" + 0.037*"doctor"'),
 (2, '0.057*"sister" + 0.057*"father" + 0.057*"pressure"')]

### 工程实现

In [37]:
from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary
import sys 
import argparse

In [None]:
class LDA(object):
    def __init__(self, topics=10,
                worker=3,
                pretrained_model=None,
                dictionary=None):
        """ lda模型训练初始化
        Args:
            topics -- 指定主题个数
            worker -- 并行化参数，一般为core数量减一
            pretrained_model -- 预训练模型，由于支持在线更新，所以可以加载上次训练的模型
            dictionary -- 训练时词需要转换成ID，所以跟模型配套有一个ID映射的字典
        Example:
            >>> lda = LDA(topics = 20, worker = 2, 
                          pretrained_model = model_file, 
                          dictionary = dictionary_file)
            >>> corpus = read_file(corpus_file) # [['word1', 'word2'], ['word3', 'word4']]
            >>> lda.update(corpus)
            >>> lda.save(model_file, dictionary_file)
            >>> topics = lda.inference(['word5', 'word6'])
        """
        
        self._topics = topics
        self._workers = worker
        self._model = None
        self._common_dictionary = None
        
        if pretrained_model and dictionary:
            self._model = LdaModel.load(pretrained_model)
            self._common_dictionary = Dictionary.load(dictionary)
        
        

参考

深入浅出讲解LDA主题模型（一）

https://blog.csdn.net/Love_wanling/article/details/72872180

主题模型 LDA 入门（附 Python 代码）

https://blog.csdn.net/selinda001/article/details/80446766

NLP Lemmatisation（词性还原） 和 Stemming（词干提取） NLTK pos_tag word_tokenize

https://blog.csdn.net/qq_16234613/article/details/79430381