In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2
# Display the value of every top-level expression in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" 

In [2]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import randomized_svd, NMF
from glob import glob
import os

## 获取数据

In [3]:
# Gather the path of every .txt file directly inside the corpus directory.
# NOTE: glob makes no ordering guarantee, so the row order of everything
# derived from this list simply follows whatever order it returns.
filenames = glob("data/british-fiction-corpus" + "/*.txt")

In [4]:
len(filenames)

27

- 向量化-数值化

In [5]:
# Read each file from disk and build the document-term TF-IDF matrix,
# dropping scikit-learn's built-in English stop words.
vectorizer = TfidfVectorizer(input='filename', stop_words='english')
# Densify: the NMF and randomized_svd cells below are fed a plain ndarray.
tfidf = vectorizer.fit_transform(filenames).toarray()
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
feature_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
# dtype=str keeps vocab a fixed-width unicode array (as before) rather than
# the object array that get_feature_names_out() returns.
vocab = np.array(feature_names(), dtype=str)
tfidf.shape, vocab.shape

((27, 55035), (55035,))

In [6]:
tfidf[0, 245: 270]

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.00396722, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ])

- 文档存储名

In [7]:
# Show just the file name of each document. os.path.basename works for any
# directory depth and path separator, unlike indexing into f.split("/").
[os.path.basename(f) for f in filenames]

['ABronte_Tenant.txt',
 'Thackeray_Barry.txt',
 'Dickens_Hard.txt',
 'Thackeray_Pendennis.txt',
 'Trollope_Phineas.txt',
 'Richardson_Pamela.txt',
 'CBronte_Villette.txt',
 'Austen_Pride.txt',
 'Fielding_Joseph.txt',
 'CBronte_Professor.txt',
 'Dickens_Bleak.txt',
 'Trollope_Prime.txt',
 'Fielding_Tom.txt',
 'CBronte_Jane.txt',
 'Sterne_Tristram.txt',
 'EBronte_Wuthering.txt',
 'Dickens_David.txt',
 'Eliot_Middlemarch.txt',
 'Austen_Sense.txt',
 'Richardson_Clarissa.txt',
 'Sterne_Sentimental.txt',
 'Austen_Emma.txt',
 'Eliot_Mill.txt',
 'ABronte_Agnes.txt',
 'Trollope_Barchester.txt',
 'Eliot_Adam.txt',
 'Thackeray_Vanity.txt']

## 基于 NMF 主题模型

In [8]:
# Number of topics (components) each model is asked to extract.
num_topics = 8
# Number of highest-weighted words kept per topic.
num_top_words = 10

In [9]:
# One NMF component per topic; the fixed random_state keeps the factorisation repeatable.
model = NMF(n_components=num_topics, random_state=123)

In [10]:
# Factorise tfidf into W @ H.
# W has one row per document and one column per topic.
W = model.fit_transform(tfidf)
# H has one row per topic and one column per vocabulary term.
H = model.components_

In [11]:
W.shape, H.shape

((27, 8), (8, 55035))

- 关键词

In [12]:
def show_topics(a):
    """Return one space-separated string of top words per row of ``a``.

    Each row of ``a`` is read as term weights aligned with the module-level
    ``vocab``; the ``num_top_words`` (module-level) largest entries are
    listed in descending order of weight.
    """
    summaries = []
    for weights in a:
        # Ascending argsort, then walk backwards over the last num_top_words positions.
        ranked = np.argsort(weights)[:-num_top_words - 1:-1]
        summaries.append(' '.join([vocab[term_id] for term_id in ranked]))
    return summaries

In [13]:
show_topics(H)

['mr said mrs little know sir good old bounderby miss',
 'said little like did time good thought know monsieur hunsden',
 'adams said jones lady man allworthy sophia joseph good mr',
 'mr elinor emma darcy mrs weston marianne miss knightley elton',
 'toby said uncle father corporal quoth tis trim yorick man',
 'heathcliff linton hareton catherine earnshaw cathy edgar ll said ellen',
 'maggie tulliver said tom glegg philip mr stephen wakem lucy',
 'phineas said lopez mr finn man wharton laura duke kennedy']

- 所有的不重复关键词

In [14]:
def get_all_topic_words(H):
    """Return the sorted, de-duplicated vocabulary indices of every topic's top words.

    For each row of ``H`` the indices of its ``num_top_words`` (module-level)
    largest entries are collected; the union over all rows is returned as an
    ascending list.

    An empty ``H`` yields an empty list (the previous ``set.union(*[])`` form
    raised ``TypeError`` in that case).
    """
    top_indices = set()
    for topic in H:
        # Last num_top_words positions of the ascending argsort = largest weights.
        top_indices.update(np.argsort(topic)[:-num_top_words - 1:-1])
    return sorted(top_indices)

In [15]:
idx = get_all_topic_words(H)

In [16]:
vocab[idx]

array(['adams', 'allworthy', 'bounderby', 'catherine', 'cathy',
       'corporal', 'darcy', 'did', 'duke', 'earnshaw', 'edgar', 'elinor',
       'ellen', 'elton', 'emma', 'father', 'finn', 'glegg', 'good',
       'hareton', 'heathcliff', 'hunsden', 'jones', 'joseph', 'kennedy',
       'knightley', 'know', 'lady', 'laura', 'like', 'linton', 'little',
       'll', 'lopez', 'lucy', 'maggie', 'man', 'marianne', 'miss',
       'monsieur', 'mr', 'mrs', 'old', 'philip', 'phineas', 'quoth',
       'said', 'sir', 'sophia', 'stephen', 'thought', 'time', 'tis',
       'toby', 'tom', 'trim', 'tulliver', 'uncle', 'wakem', 'weston',
       'wharton', 'yorick'], dtype='<U31')

In [17]:
H[:, idx].shape

(8, 62)

In [18]:
# Cross-check: rebuild the keyword sets from the topic strings instead of from indices.
y = show_topics(H)
# One set of words per topic (each topic string is space-separated).
v = [set(t.split(' ')) for t in y]

In [19]:
np.array(sorted(set.union(*v)))

array(['adams', 'allworthy', 'bounderby', 'catherine', 'cathy',
       'corporal', 'darcy', 'did', 'duke', 'earnshaw', 'edgar', 'elinor',
       'ellen', 'elton', 'emma', 'father', 'finn', 'glegg', 'good',
       'hareton', 'heathcliff', 'hunsden', 'jones', 'joseph', 'kennedy',
       'knightley', 'know', 'lady', 'laura', 'like', 'linton', 'little',
       'll', 'lopez', 'lucy', 'maggie', 'man', 'marianne', 'miss',
       'monsieur', 'mr', 'mrs', 'old', 'philip', 'phineas', 'quoth',
       'said', 'sir', 'sophia', 'stephen', 'thought', 'time', 'tis',
       'toby', 'tom', 'trim', 'tulliver', 'uncle', 'wakem', 'weston',
       'wharton', 'yorick'], dtype='<U10')

In [20]:
# Confirm the string-based and index-based routes give the same keyword list.
# np.array_equal checks shapes first, so a length mismatch yields False
# instead of failing when the element-wise comparison cannot be iterated.
np.array_equal(sorted(set.union(*v)), vocab[idx])

True

## 基于SVD的模型

In [21]:
# Truncated SVD of the TF-IDF matrix with one component per topic.
# randomized_svd relies on a random projection, so the seed is pinned for
# repeatable results; 0 was the library default before it changed to None.
U, s, Vh = randomized_svd(tfidf, n_components=num_topics, random_state=0)

In [22]:
# Same keyword extraction as for NMF, applied to the right singular vectors.
# NOTE(review): singular vectors are only defined up to sign, so ranking by raw
# value ignores terms with large negative loadings — confirm that is intended.
idx1 = get_all_topic_words(Vh)

In [23]:
vocab[idx1]

array(['adams', 'allworthy', 'bounderby', 'bretton', 'catherine', 'cathy',
       'crimsworth', 'darcy', 'dashwood', 'did', 'earnshaw', 'edgar',
       'elinor', 'elton', 'emma', 'fairfax', 'father', 'finn', 'fleur',
       'good', 'hareton', 'harriet', 'hath', 'heathcliff', 'hunsden',
       'jennings', 'jones', 'joseph', 'knightley', 'know', 'la', 'lady',
       'linton', 'little', 'lopez', 'lovelace', 'madame', 'maggie', 'man',
       'marianne', 'mdlle', 'miss', 'monsieur', 'mr', 'mrs', 'pelet',
       'phineas', 'said', 'sophia', 'think', 'time', 'toby', 'tom',
       'tulliver', 'uncle', 'weston', 'woodhouse'], dtype='<U31')

In [24]:
len(idx1)

57

In [25]:
U.shape, s.shape, Vh.shape

((27, 8), (8,), (8, 55035))

- 仅出现在 SVD 结果中、NMF 结果中没有的关键词(差集)

In [26]:
# Keywords selected by the SVD model that the NMF model did not select.
vs = set(vocab[idx1]) - set(vocab[idx])

In [27]:
len(vs)
vs

15

{'bretton',
 'crimsworth',
 'dashwood',
 'fairfax',
 'fleur',
 'harriet',
 'hath',
 'jennings',
 'la',
 'lovelace',
 'madame',
 'mdlle',
 'pelet',
 'think',
 'woodhouse'}

精确的(非随机)SVD 可以用 `np.linalg.svd(tfidf, full_matrices=False)` 计算;`full_matrices=False` 返回的是精简(reduced)SVD,下面两张图对比了完整 SVD 与精简 SVD。

![](../imgs/full_svd.JPG)

![](../imgs/reduced_svd.JPG)