In [12]:
import numpy as np
from nltk.corpus import stopwords #停用词处理
from nltk.tokenize import word_tokenize #分词 
from sklearn.feature_extraction.text import TfidfVectorizer #TfidfVectorizer可以把原始文本转化为tf-idf的特征矩阵
from sklearn.decomposition import TruncatedSVD #截断奇异值分解,可降维

In [13]:
# Corpus for this notebook: ten short English text passages. A later cell
# iterates over them for preprocessing, and the final report cell looks
# entries up by their position in this list, so the order matters.
original_docs = ["Snap Inc.'s stock price surged more than 20% in after-hours trading after the social media company reported quarterly revenue and user growth that topped estimates, fueled by advertisers’ stepped-up spending and consumers’ increased use of their phones for messaging and entertainment during the pandemic.",
        "Stocks gave up some of their recent gains Monday as hopes faded on Wall Street that Washington will come through with badly needed aid for the economy before election day.",
        "Facing increased pressure from business owners and politicians in places with theme parks, California released protocols for operating the parks, giving destinations such as Disneyland, Universal Studios and Knott’s Berry Farm a path to reopening that is tied to how well each park’s home county throttles the spread of the coronavirus.",
        "Stocks closed broadly higher Tuesday as Wall Street welcomed a batch of solid earnings reports from U.S. companies.",
        "At 90 years old and living through a raging pandemic, Hannah Carson knows time may be short. She wasted no time returning her absentee ballot for this year’s election.",
        "California’s unemployment rate ticked down last month as the state slowly recouped some of its lost jobs, even as it continued to rack up coronavirus cases.",
        "Americans have lost trust across the board in the people and institutions informing them about the coronavirus and COVID-19 since the beginning of the pandemic, according to a new poll from The Associated Press-NORC Center for Public Affairs Research and USAFacts.",
        "Job seeking in an uncertain economy is hard enough. Throw in coronavirus fears, home quarantines and hiring freezes at many companies, and the hunt for work becomes even more difficult.",
        "The pandemic’s effects on workers has been nothing short of devastating. In addition to fueling the highest unemployment rate since the Great Depression, it has caused even Americans who have jobs to earn less.",
        "The embattled president of Kyrgyzstan ordered a nearly two-week state of emergency Friday in the capital, Bishkek, in a bid to end the political turmoil sparked by a disputed parliamentary election."
        ]

In [14]:
# Stop-word set: NLTK's English list extended with a few extra words.
stop_words = set(stopwords.words('english'))
new_stopwords = ['the','that', 'for','from','even']
new_stopwords_list = stop_words.union(new_stopwords)  # note: a set, despite the name

# docs holds each original document lower-cased, tokenized, stripped of
# stop words, and re-joined with single spaces.
docs = [
    " ".join(
        token
        for token in word_tokenize(raw_text.lower())
        if token not in new_stopwords_list
    )
    for raw_text in original_docs
]
print(docs)

["snap inc. 's stock price surged 20 % after-hours trading social media company reported quarterly revenue user growth topped estimates , fueled advertisers ’ stepped-up spending consumers ’ increased use phones messaging entertainment pandemic .", 'stocks gave recent gains monday hopes faded wall street washington come badly needed aid economy election day .', 'facing increased pressure business owners politicians places theme parks , california released protocols operating parks , giving destinations disneyland , universal studios knott ’ berry farm path reopening tied well park ’ home county throttles spread coronavirus .', 'stocks closed broadly higher tuesday wall street welcomed batch solid earnings reports u.s. companies .', '90 years old living raging pandemic , hannah carson knows time may short . wasted time returning absentee ballot year ’ election .', 'california ’ unemployment rate ticked last month state slowly recouped lost jobs , continued rack coronavirus cases .', 'am

In [15]:
# Fit a TF-IDF model on the preprocessed documents. fit_transform returns a
# sparse matrix with one row per document and one column per vocabulary term.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(docs)
print("----------单词—文本矩阵----------")
print(X)  # sparse representation: "(row, column)  value" for non-zero entries
print(X.toarray())  # dense form, easier to read
print(X.shape)
print("----------特征单词----------")
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases; the
# result is converted to a plain list of str so `terms` keeps its old type.
if hasattr(vectorizer, "get_feature_names_out"):
    terms = vectorizer.get_feature_names_out().tolist()
else:
    terms = vectorizer.get_feature_names()
print(terms)
print("一共",len(terms), "个特征单词")

----------单词—文本矩阵----------
  (0, 107)	0.12040290649547312
  (0, 55)	0.1820896258758233
  (0, 95)	0.1820896258758233
  (0, 113)	0.1820896258758233
  (0, 172)	0.1820896258758233
  (0, 80)	0.15479281769875788
  (0, 34)	0.1820896258758233
  (0, 146)	0.1820896258758233
  (0, 170)	0.1820896258758233
  (0, 149)	0.1820896258758233
  (0, 7)	0.1820896258758233
  (0, 63)	0.1820896258758233
  (0, 56)	0.1820896258758233
  (0, 161)	0.1820896258758233
  (0, 69)	0.1820896258758233
  (0, 173)	0.1820896258758233
  (0, 137)	0.1820896258758233
  (0, 125)	0.1820896258758233
  (0, 133)	0.1820896258758233
  (0, 33)	0.1820896258758233
  (0, 94)	0.1820896258758233
  (0, 143)	0.1820896258758233
  (0, 162)	0.1820896258758233
  (0, 77)	0.1820896258758233
  (0, 9)	0.1820896258758233
  :	:
  (8, 84)	0.20756695586038015
  (8, 128)	0.20756695586038015
  (8, 168)	0.20756695586038015
  (8, 139)	0.20756695586038015
  (8, 107)	0.16145235385948978
  (9, 110)	0.22776863526061358
  (9, 45)	0.22776863526061358
  (9, 145)	0.

In [16]:
topics = 4  # number of latent topics to extract
# Truncated singular value decomposition of the TF-IDF matrix (LSA).
# TruncatedSVD's default solver is randomized, so pin random_state to make
# the decomposition reproducible on every run.
lsa = TruncatedSVD(n_components=topics, random_state=42)
X_reduce = lsa.fit_transform(X)  # documents projected into the topic space
print("----------单词-话题矩阵----------")
print(lsa.components_)  # one row per topic, one column per term
print(lsa.components_.shape)
print("----------LSA奇异值----------")
print(lsa.singular_values_)
print("----------10个文本在4个话题向量空间下的表示----------")
print(X_reduce)
print(X_reduce.shape)

----------单词-话题矩阵----------
[[ 7.76916428e-02  1.72737973e-02  4.54447119e-02  4.54447119e-02
   7.76916428e-02  7.76916428e-02  1.20247640e-01  1.72737973e-02
   7.76916428e-02  1.72737973e-02  4.35646842e-02  1.68266474e-01
   7.76916428e-02  4.35646842e-02  4.54447119e-02  4.28254519e-02
   4.40149624e-02  7.76916428e-02  3.14887752e-02  3.07761352e-02
   3.07761352e-02  7.76916428e-02  4.28254519e-02  3.14887752e-02
   1.41873843e-01  3.07761352e-02  4.54447119e-02  1.35403690e-01
   1.20247640e-01  7.76916428e-02  4.28254519e-02  4.35646842e-02
   7.38222859e-02  1.72737973e-02  1.72737973e-02  1.35403690e-01
   1.90830020e-01  3.14887752e-02  7.76916428e-02  4.35646842e-02
   1.20247640e-01  3.14887752e-02  1.20247640e-01  4.40149624e-02
   3.14887752e-02  3.07761352e-02  1.20247640e-01  4.28254519e-02
   7.44507010e-02  1.20247640e-01  8.90880318e-02  3.07761352e-02
   3.07761352e-02  3.07761352e-02  4.40149624e-02  1.72737973e-02
   1.72737973e-02  3.14887752e-02  4.35646842e-0

In [6]:
pick_docs = 2  # how many representative documents to keep per topic
# X_reduce[i, t] is the weight of document i on topic t. The documents with
# the largest values in column t are treated as most representative of that
# topic, so rank each column in descending order and keep the top entries.
topic_docid = []
for topic_idx in range(topics):
    ranked_doc_ids = X_reduce[:, topic_idx].argsort()[::-1]  # highest weight first
    topic_docid.append(ranked_doc_ids[:pick_docs])
print("----------每个话题挑出2个最具代表性的文档----------")
print(topic_docid)

----------每个话题挑出2个最具代表性的文档----------
[array([8, 5]), array([1, 3]), array([2, 7]), array([0, 4])]


In [7]:
pick_keywords = 4  # how many keywords to keep per topic
# lsa.components_[t, j] is the weight of term j in topic t; the terms with the
# largest weights in row t serve as that topic's keywords. argsort is
# ascending, so the reversed slice takes the top pick_keywords indices.
topic_keywdid = [lsa.components_[t].argsort()[:-(pick_keywords + 1):-1] for t in range(topics)]
# The header used to hard-code "3" while 4 keywords were selected; derive the
# number from pick_keywords so the label always matches the data.
print("----------每个话题挑出{}个关键词----------".format(pick_keywords))
print(topic_keywdid)

----------每个话题挑出3个关键词----------
[array([128, 168,  84,  36]), array([152, 151, 174,  32]), array([109,  75,  36,  24]), array([160,  80,   1, 149])]


In [11]:
# Final report: for every topic, print its keywords followed by the original
# (unprocessed) text of its most representative documents.
print("----------LSA分析结果----------")
for topic_idx in range(topics):
    print("话题 {}".format(topic_idx + 1))
    keyword_ids = topic_keywdid[topic_idx]
    keywords = [terms[keyword_ids[k]] for k in range(pick_keywords)]
    print("关键词：{}".format(", ".join(keywords)))
    doc_ids = topic_docid[topic_idx]
    for rank in range(pick_docs):
        print("文档{}: ".format(rank + 1))
        print(original_docs[doc_ids[rank]])
    print("\n")

----------LSA分析结果----------
话题 1
关键词：rate, unemployment, jobs, coronavirus
文档1: 
The pandemic’s effects on workers has been nothing short of devastating. In addition to fueling the highest unemployment rate since the Great Depression, it has caused even Americans who have jobs to earn less.
文档2: 
California’s unemployment rate ticked down last month as the state slowly recouped some of its lost jobs, even as it continued to rack up coronavirus cases.


话题 2
关键词：street, stocks, wall, companies
文档1: 
Stocks gave up some of their recent gains Monday as hopes faded on Wall Street that Washington will come through with badly needed aid for the economy before election day.
文档2: 
Stocks closed broadly higher Tuesday as Wall Street welcomed a batch of solid earnings reports from U.S. companies.


话题 3
关键词：parks, home, coronavirus, california
文档1: 
Facing increased pressure from business owners and politicians in places with theme parks, California released protocols for operating the parks, gi