In [1]:
import sklearn

sklearn.__version__

'0.20.1'

In [2]:
categories = ['alt.atheism', 'soc.religion.christian',
              'comp.graphics', 'sci.med']

from sklearn.datasets import fetch_20newsgroups
twenty_train = fetch_20newsgroups(subset='train',
                                  categories=categories,
                                  shuffle=True,
                                  random_state=42)

print(type(twenty_train))

<class 'sklearn.utils.Bunch'>


In [3]:
"""得到的对象是bunch,它具有一些性质"""
for k,w in enumerate(twenty_train.target_names):
    print(f'标签{k+1}：{w}')

标签1：alt.atheism
标签2：comp.graphics
标签3：sci.med
标签4：soc.religion.christian


In [4]:
"""bunch.data是所有的文档组成的列表"""
print('twenty_train.data的数据类型：',type(twenty_train.data))
print('twenty_train.data的样本数：',len(twenty_train.data))      

twenty_train.data的数据类型： <class 'list'>
twenty_train.data的样本数： 2257


In [5]:
"""从data中去取出一篇来看看"""
raw_data = twenty_train.data
raw_data[0]

'From: sd345@city.ac.uk (Michael Collier)\nSubject: Converting images to HP LaserJet III?\nNntp-Posting-Host: hampton\nOrganization: The City University\nLines: 14\n\nDoes anyone know of a good way (standard PC application/PD utility) to\nconvert tif/img/tga files into LaserJet III format.  We would also like to\ndo the same, converting to HPGL (HP plotter) files.\n\nPlease email any response.\n\nIs this the correct group?\n\nThanks in advance.  Michael.\n-- \nMichael Collier (Programmer)                 The Computer Unit,\nEmail: M.P.Collier@uk.ac.city                The City University,\nTel: 071 477-8000 x3769                      London,\nFax: 071 477-8565                            EC1V 0HB.\n'

In [6]:
"""定义清除特殊字符的方法clean"""
import re

def clean(text):
    if type(text) == str:
        text_droped = re.sub("[\d+\s+\.\!\/_,$%^*()+\"\'\?]+|[+——！，。？、~@#￥%……&*（）:]+",
                             " ",  # 用空格来代替
                             text)
    else:
        text_droped = []
        for i in text:
            text_droped.append(clean(i))
            
    return text_droped

clean(twenty_train.data)[0]

'From  sd  city ac uk Michael Collier Subject  Converting images to HP LaserJet III Nntp-Posting-Host  hampton Organization  The City University Lines  Does anyone know of a good way standard PC application PD utility to convert tif img tga files into LaserJet III format We would also like to do the same converting to HPGL HP plotter files Please email any response Is this the correct group Thanks in advance Michael -- Michael Collier Programmer The Computer Unit Email  M P Collier uk ac city The City University Tel  - x London Fax  - EC V HB '

In [7]:
"""使用tf-idf对2257篇文档进行文本向量化"""
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# 只取1000个出现次数最多的词作为特征
n_features = 1000
tf_vectorizer = CountVectorizer(strip_accents='unicode',
                                max_features=n_features,
                                stop_words='english',
                                max_df=0.5,
                                min_df=10)
tf = tf_vectorizer.fit_transform(clean(twenty_train.data))
tf.shape,tf_vectorizer.get_feature_names()

((2257, 1000),
 ['ability',
  'able',
  'absolute',
  'absolutes',
  'ac',
  'accept',
  'accepted',
  'access',
  'according',
  'account',
  'act',
  'action',
  'actions',
  'acts',
  'actually',
  'ad',
  'add',
  'address',
  'admit',
  'advance',
  'advice',
  'age',
  'ago',
  'agree',
  'ai',
  'aids',
  'algorithm',
  'allan',
  'allow',
  'alt',
  'alternative',
  'american',
  'amiga',
  'analysis',
  'ancient',
  'andrew',
  'animals',
  'animation',
  'answer',
  'answers',
  'anti',
  'anybody',
  'apparently',
  'appears',
  'apple',
  'apply',
  'appreciate',
  'appreciated',
  'approach',
  'appropriate',
  'apr',
  'april',
  'archive',
  'area',
  'aren',
  'argue',
  'argument',
  'arguments',
  'arrogance',
  'art',
  'article',
  'articles',
  'ask',
  'asked',
  'assume',
  'assumption',
  'atheism',
  'atheist',
  'atheists',
  'athens',
  'athos',
  'au',
  'austin',
  'australia',
  'author',
  'authority',
  'available',
  'avoid',
  'away',
  'bad',
  'banks

In [8]:
"""创建LDA模型，拟合数据"""
from sklearn.decomposition import LatentDirichletAllocation

n_topics = 5
lda = LatentDirichletAllocation(n_components=n_topics,
                                max_iter=100,
                                learning_method='online',
                                learning_offset=50,
                                random_state=0)
# 开始查找主题
lda.fit(tf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50,
             max_doc_update_iter=100, max_iter=100, mean_change_tol=0.001,
             n_components=5, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [9]:
"""定义模型输出结果函数"""
def print_top_words(model, feature_name, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        for i in topic.argsort()[:-n_top_words - 1:-1]:
            print(feature_name[i],end=' ')

In [12]:
"""输出结果"""

n_top_words = 10

tf_feature_name = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_name, n_top_words)


Topic #0:
com article msg people medical health use food like know 
Topic #1:
graphics image com university posting file software host mail nntp 
Topic #2:
god jesus people believe does bible christians christian know think 
Topic #3:
com keith pitt article cs gordon banks caltech geb sgi 
Topic #4:
think people article don science just com like university uk 

In [None]:
"""数据可视化，会弹出一个网页"""
import pyLDAvis.sklearn

data = pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)

pyLDAvis.show(data)#可视化主题模型


Note: if you're in the IPython notebook, pyLDAvis.show() is not the best command
      to use. Consider using pyLDAvis.display(), or pyLDAvis.enable_notebook().
      See more information at http://pyLDAvis.github.io/quickstart.html .

You must interrupt the kernel to end this command

Serving to http://127.0.0.1:8889/    [Ctrl-C to exit]


127.0.0.1 - - [06/Aug/2019 15:50:49] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [06/Aug/2019 15:51:10] "GET /LDAvis.css HTTP/1.1" 200 -
127.0.0.1 - - [06/Aug/2019 15:51:10] "GET /d3.js HTTP/1.1" 200 -
127.0.0.1 - - [06/Aug/2019 15:51:10] "GET /LDAvis.js HTTP/1.1" 200 -
